@artemiskit/core 0.1.6 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. package/CHANGELOG.md +116 -0
  2. package/dist/adapters/types.d.ts +8 -1
  3. package/dist/adapters/types.d.ts.map +1 -1
  4. package/dist/artifacts/types.d.ts +39 -0
  5. package/dist/artifacts/types.d.ts.map +1 -1
  6. package/dist/cost/index.d.ts +5 -0
  7. package/dist/cost/index.d.ts.map +1 -0
  8. package/dist/cost/pricing.d.ts +67 -0
  9. package/dist/cost/pricing.d.ts.map +1 -0
  10. package/dist/evaluators/combined.d.ts +10 -0
  11. package/dist/evaluators/combined.d.ts.map +1 -0
  12. package/dist/evaluators/index.d.ts +4 -0
  13. package/dist/evaluators/index.d.ts.map +1 -1
  14. package/dist/evaluators/inline.d.ts +22 -0
  15. package/dist/evaluators/inline.d.ts.map +1 -0
  16. package/dist/evaluators/llm-grader.d.ts.map +1 -1
  17. package/dist/evaluators/not-contains.d.ts +10 -0
  18. package/dist/evaluators/not-contains.d.ts.map +1 -0
  19. package/dist/evaluators/similarity.d.ts +16 -0
  20. package/dist/evaluators/similarity.d.ts.map +1 -0
  21. package/dist/index.d.ts +1 -0
  22. package/dist/index.d.ts.map +1 -1
  23. package/dist/index.js +13212 -12018
  24. package/dist/scenario/discovery.d.ts +72 -0
  25. package/dist/scenario/discovery.d.ts.map +1 -0
  26. package/dist/scenario/index.d.ts +1 -0
  27. package/dist/scenario/index.d.ts.map +1 -1
  28. package/dist/scenario/schema.d.ts +1253 -9
  29. package/dist/scenario/schema.d.ts.map +1 -1
  30. package/dist/storage/local.d.ts +44 -2
  31. package/dist/storage/local.d.ts.map +1 -1
  32. package/dist/storage/types.d.ts +62 -0
  33. package/dist/storage/types.d.ts.map +1 -1
  34. package/package.json +1 -1
  35. package/src/adapters/types.ts +8 -1
  36. package/src/artifacts/types.ts +39 -0
  37. package/src/cost/index.ts +14 -0
  38. package/src/cost/pricing.ts +450 -0
  39. package/src/evaluators/combined.test.ts +172 -0
  40. package/src/evaluators/combined.ts +95 -0
  41. package/src/evaluators/index.ts +12 -0
  42. package/src/evaluators/inline.test.ts +409 -0
  43. package/src/evaluators/inline.ts +393 -0
  44. package/src/evaluators/llm-grader.ts +45 -13
  45. package/src/evaluators/not-contains.test.ts +105 -0
  46. package/src/evaluators/not-contains.ts +45 -0
  47. package/src/evaluators/similarity.test.ts +333 -0
  48. package/src/evaluators/similarity.ts +258 -0
  49. package/src/index.ts +3 -0
  50. package/src/scenario/discovery.test.ts +153 -0
  51. package/src/scenario/discovery.ts +277 -0
  52. package/src/scenario/index.ts +1 -0
  53. package/src/scenario/schema.ts +47 -2
  54. package/src/storage/local.test.ts +243 -0
  55. package/src/storage/local.ts +162 -2
  56. package/src/storage/types.ts +73 -0
@@ -162,4 +162,247 @@ describe('LocalStorageAdapter', () => {
162
162
  const runs = await emptyStorage.list();
163
163
  expect(runs).toEqual([]);
164
164
  });
165
+
166
+ // ==================== Baseline Tests ====================
167
+
168
describe('Baseline Management', () => {
  /**
   * Build a run manifest derived from `mockManifest` with its own run ID and
   * scenario, optionally overriding individual metrics.
   */
  const makeRun = (
    runId: string,
    scenario: string,
    metrics: Partial<RunManifest['metrics']> = {}
  ): RunManifest => ({
    ...mockManifest,
    run_id: runId,
    config: { ...mockManifest.config, scenario },
    metrics: { ...mockManifest.metrics, ...metrics },
  });

  const baselineManifest = makeRun('baseline-test-run', 'baseline-test-scenario');

  test('sets a baseline for a scenario', async () => {
    await storage.save(baselineManifest);

    const baseline = await storage.setBaseline('baseline-test-scenario', 'baseline-test-run');

    expect(baseline.scenario).toBe('baseline-test-scenario');
    expect(baseline.runId).toBe('baseline-test-run');
    expect(baseline.metrics.successRate).toBe(0.8);
    expect(baseline.metrics.totalCases).toBe(10);
    expect(baseline.metrics.passedCases).toBe(8);
    expect(baseline.createdAt).toBeDefined();
  });

  test('sets a baseline with a tag', async () => {
    await storage.save(baselineManifest);

    const baseline = await storage.setBaseline(
      'baseline-test-scenario',
      'baseline-test-run',
      'v1.0.0-release'
    );

    expect(baseline.tag).toBe('v1.0.0-release');
  });

  test('gets a baseline for a scenario', async () => {
    await storage.save(baselineManifest);
    await storage.setBaseline('baseline-test-scenario', 'baseline-test-run');

    const baseline = await storage.getBaseline('baseline-test-scenario');

    expect(baseline).not.toBeNull();
    expect(baseline?.runId).toBe('baseline-test-run');
    expect(baseline?.metrics.successRate).toBe(0.8);
  });

  test('returns null for non-existent baseline', async () => {
    const baseline = await storage.getBaseline('non-existent-scenario');
    expect(baseline).toBeNull();
  });

  test('gets a baseline by run ID', async () => {
    await storage.save(baselineManifest);
    await storage.setBaseline('baseline-test-scenario', 'baseline-test-run', 'by-run-id-test');

    const baseline = await storage.getBaselineByRunId('baseline-test-run');

    expect(baseline).not.toBeNull();
    expect(baseline?.scenario).toBe('baseline-test-scenario');
    expect(baseline?.runId).toBe('baseline-test-run');
    expect(baseline?.tag).toBe('by-run-id-test');
  });

  test('returns null for non-existent baseline by run ID', async () => {
    const baseline = await storage.getBaselineByRunId('non-existent-run-id');
    expect(baseline).toBeNull();
  });

  test('lists all baselines', async () => {
    // A second adapter rooted at TEST_DIR reads the same baselines file, so it
    // does NOT start empty: entries written by other tests may still be
    // present. The assertions below therefore only require that the two
    // baselines created here are included.
    const freshStorage = new LocalStorageAdapter(TEST_DIR);

    // Create multiple runs and baselines
    await freshStorage.save(makeRun('list-baseline-1', 'scenario-1'));
    await freshStorage.save(makeRun('list-baseline-2', 'scenario-2'));
    await freshStorage.setBaseline('scenario-1', 'list-baseline-1');
    await freshStorage.setBaseline('scenario-2', 'list-baseline-2');

    const baselines = await freshStorage.listBaselines();

    expect(baselines.length).toBeGreaterThanOrEqual(2);
    expect(baselines.some((b) => b.scenario === 'scenario-1')).toBe(true);
    expect(baselines.some((b) => b.scenario === 'scenario-2')).toBe(true);
  });

  test('removes a baseline', async () => {
    await storage.save(baselineManifest);
    await storage.setBaseline('baseline-test-scenario', 'baseline-test-run');

    // Verify it exists
    const before = await storage.getBaseline('baseline-test-scenario');
    expect(before).not.toBeNull();

    // Remove
    const removed = await storage.removeBaseline('baseline-test-scenario');
    expect(removed).toBe(true);

    // Verify it's gone
    const after = await storage.getBaseline('baseline-test-scenario');
    expect(after).toBeNull();
  });

  test('returns false when removing non-existent baseline', async () => {
    const removed = await storage.removeBaseline('non-existent-baseline');
    expect(removed).toBe(false);
  });

  test('removes a baseline by run ID', async () => {
    await storage.save(makeRun('remove-by-id-run', 'remove-by-id-scenario'));
    await storage.setBaseline('remove-by-id-scenario', 'remove-by-id-run');

    // Verify it exists
    const before = await storage.getBaselineByRunId('remove-by-id-run');
    expect(before).not.toBeNull();

    // Remove by run ID
    const removed = await storage.removeBaselineByRunId('remove-by-id-run');
    expect(removed).toBe(true);

    // Verify it's gone
    const after = await storage.getBaselineByRunId('remove-by-id-run');
    expect(after).toBeNull();
  });

  test('returns false when removing non-existent baseline by run ID', async () => {
    const removed = await storage.removeBaselineByRunId('non-existent-run-id');
    expect(removed).toBe(false);
  });

  test('updates existing baseline when set again', async () => {
    await storage.save(baselineManifest);

    // Set initial baseline
    await storage.setBaseline('baseline-test-scenario', 'baseline-test-run', 'v1.0');

    // Create a new run for the same scenario with different metrics
    const newManifest = makeRun('baseline-test-run-2', 'baseline-test-scenario', {
      success_rate: 0.95,
      passed_cases: 9,
      failed_cases: 1,
    });
    await storage.save(newManifest);

    // Update baseline
    await storage.setBaseline('baseline-test-scenario', 'baseline-test-run-2', 'v2.0');

    const baseline = await storage.getBaseline('baseline-test-scenario');
    expect(baseline?.runId).toBe('baseline-test-run-2');
    expect(baseline?.tag).toBe('v2.0');
    expect(baseline?.metrics.successRate).toBe(0.95);
  });

  test('compares run to baseline and detects no regression', async () => {
    // Create baseline run
    await storage.save(makeRun('compare-baseline-run', 'compare-scenario', { success_rate: 0.8 }));
    await storage.setBaseline('compare-scenario', 'compare-baseline-run');

    // Create current run with same or better success rate
    await storage.save(makeRun('compare-current-run', 'compare-scenario', { success_rate: 0.85 }));

    const result = await storage.compareToBaseline('compare-current-run', 0.05);

    expect(result).not.toBeNull();
    expect(result?.hasRegression).toBe(false);
    expect(result?.comparison.delta.successRate).toBeCloseTo(0.05, 2);
  });

  test('compares run to baseline and detects regression', async () => {
    // Create baseline run
    await storage.save(
      makeRun('regression-baseline-run', 'regression-scenario', { success_rate: 0.9 })
    );
    await storage.setBaseline('regression-scenario', 'regression-baseline-run');

    // Create current run with worse success rate (regression)
    await storage.save(
      makeRun('regression-current-run', 'regression-scenario', { success_rate: 0.7 })
    );

    const result = await storage.compareToBaseline('regression-current-run', 0.05);

    expect(result).not.toBeNull();
    expect(result?.hasRegression).toBe(true);
    expect(result?.comparison.delta.successRate).toBeCloseTo(-0.2, 2);
  });

  test('returns null when no baseline exists for scenario', async () => {
    await storage.save(makeRun('no-baseline-run', 'no-baseline-scenario'));

    const result = await storage.compareToBaseline('no-baseline-run');

    expect(result).toBeNull();
  });
});
165
408
  });
@@ -5,7 +5,13 @@
5
5
  import { mkdir, readFile, readdir, unlink, writeFile } from 'node:fs/promises';
6
6
  import { join, resolve } from 'node:path';
7
7
  import type { AnyManifest, RedTeamManifest, RunManifest, StressManifest } from '../artifacts/types';
8
- import type { ComparisonResult, ListOptions, RunListItem, StorageAdapter } from './types';
8
+ import type {
9
+ BaselineMetadata,
10
+ BaselineStorageAdapter,
11
+ ComparisonResult,
12
+ ListOptions,
13
+ RunListItem,
14
+ } from './types';
9
15
 
10
16
  /**
11
17
  * Get manifest type from a manifest object
@@ -39,11 +45,21 @@ function getScenario(manifest: AnyManifest): string {
39
45
  return manifest.config.scenario;
40
46
  }
41
47
 
42
- export class LocalStorageAdapter implements StorageAdapter {
48
/**
 * Shape of the JSON document that stores baselines on disk.
 */
interface BaselinesFile {
  /** Format version of the file; an empty store is created with '1.0'. */
  version: string;
  /** Baseline metadata, keyed by scenario name. */
  baselines: { [scenario: string]: BaselineMetadata };
}
55
+
56
+ export class LocalStorageAdapter implements BaselineStorageAdapter {
43
57
  private basePath: string;
58
+ private baselinesPath: string;
44
59
 
45
60
  constructor(basePath = './artemis-runs') {
46
61
  this.basePath = resolve(basePath);
62
+ this.baselinesPath = join(this.basePath, '.artemis', 'baselines.json');
47
63
  }
48
64
 
49
65
  async save(manifest: AnyManifest): Promise<string> {
@@ -191,4 +207,148 @@ export class LocalStorageAdapter implements StorageAdapter {
191
207
  return [];
192
208
  }
193
209
  }
210
+
211
+ // ==================== Baseline Methods ====================
212
+
213
/**
 * Read and parse the baselines file.
 *
 * A missing file is the normal "no baselines yet" state and yields an empty
 * store. Any other failure (unreadable file, invalid JSON, unexpected shape)
 * is surfaced instead of being treated as empty: callers follow a
 * load-modify-save cycle, so silently returning an empty store here would
 * overwrite every existing baseline on the next save.
 *
 * @returns The parsed store, or `{ version: '1.0', baselines: {} }` when the
 *   file does not exist.
 * @throws Error when the file exists but cannot be read, is not valid JSON,
 *   or does not contain a `baselines` object.
 */
private async loadBaselinesFile(): Promise<BaselinesFile> {
  let content: string;
  try {
    content = await readFile(this.baselinesPath, 'utf-8');
  } catch (error: unknown) {
    const code = (error as { code?: unknown } | null | undefined)?.code;
    if (code === 'ENOENT') {
      return { version: '1.0', baselines: {} };
    }
    throw error;
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(content);
  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to parse baselines file at ${this.baselinesPath}: ${reason}`);
  }

  const baselines =
    typeof parsed === 'object' && parsed !== null
      ? (parsed as { baselines?: unknown }).baselines
      : undefined;
  if (typeof baselines !== 'object' || baselines === null || Array.isArray(baselines)) {
    throw new Error(
      `Invalid baselines file at ${this.baselinesPath}: expected an object with a "baselines" map`
    );
  }

  return parsed as BaselinesFile;
}
224
+
225
/**
 * Write the baselines store to disk as 2-space-indented JSON, creating the
 * `.artemis` directory under the base path first if it is missing.
 *
 * @param data - Complete store contents; replaces whatever is on disk.
 */
private async saveBaselinesFile(data: BaselinesFile): Promise<void> {
  const artemisDir = join(this.basePath, '.artemis');
  await mkdir(artemisDir, { recursive: true });

  const serialized = JSON.stringify(data, null, 2);
  await writeFile(this.baselinesPath, serialized);
}
233
+
234
/**
 * Record a run as the baseline for a scenario, replacing any baseline
 * already stored under that scenario name.
 *
 * @param scenario - Scenario name to store the baseline under; when empty,
 *   the scenario recorded in the run's manifest is used instead.
 * @param runId - ID of the run whose metrics are snapshotted.
 * @param tag - Optional label stored with the baseline.
 * @returns The baseline metadata that was written.
 */
async setBaseline(scenario: string, runId: string, tag?: string): Promise<BaselineMetadata> {
  // The run is loaded first so its metrics can be copied into the baseline.
  const run = await this.loadRun(runId);
  const key = scenario || getScenario(run);
  const source = run.metrics;

  const entry: BaselineMetadata = {
    scenario: key,
    runId,
    createdAt: new Date().toISOString(),
    metrics: {
      successRate: source.success_rate,
      medianLatencyMs: source.median_latency_ms,
      totalTokens: source.total_tokens,
      passedCases: source.passed_cases,
      failedCases: source.failed_cases,
      totalCases: source.total_cases,
    },
    tag,
  };

  // Insert or overwrite the entry for this scenario, then persist the store.
  const store = await this.loadBaselinesFile();
  store.baselines[key] = entry;
  await this.saveBaselinesFile(store);

  return entry;
}
264
+
265
/**
 * Look up the baseline stored for a scenario.
 *
 * Only the store's own keys are consulted. A plain property read would also
 * resolve names inherited from `Object.prototype` (for example `constructor`
 * or `toString`) and return those members as if they were baseline metadata.
 *
 * @param scenario - Scenario name the baseline was stored under.
 * @returns The stored baseline, or `null` when none exists for `scenario`.
 */
async getBaseline(scenario: string): Promise<BaselineMetadata | null> {
  const { baselines } = await this.loadBaselinesFile();
  if (!Object.prototype.hasOwnProperty.call(baselines, scenario)) {
    return null;
  }
  return baselines[scenario] ?? null;
}
272
+
273
/**
 * Find the first stored baseline that references the given run.
 *
 * @param runId - Run ID to search for.
 * @returns The matching baseline, or `null` when no baseline uses `runId`.
 */
async getBaselineByRunId(runId: string): Promise<BaselineMetadata | null> {
  const { baselines } = await this.loadBaselinesFile();
  for (const candidate of Object.values(baselines)) {
    if (candidate.runId === runId) {
      return candidate;
    }
  }
  return null;
}
281
+
282
/**
 * Return every stored baseline, most recently created first
 * (ordered by `createdAt`, descending).
 */
async listBaselines(): Promise<BaselineMetadata[]> {
  const { baselines } = await this.loadBaselinesFile();
  const createdAtMs = (entry: BaselineMetadata): number => new Date(entry.createdAt).getTime();

  const entries = Object.values(baselines);
  entries.sort((left, right) => createdAtMs(right) - createdAtMs(left));
  return entries;
}
291
+
292
/**
 * Delete the baseline stored for a scenario.
 *
 * Existence is checked on the store's own keys only. A truthiness test on a
 * plain property read would also match names inherited from
 * `Object.prototype` (for example `toString`), causing a pointless rewrite
 * of the file and a false "removed" result.
 *
 * @param scenario - Scenario name whose baseline should be removed.
 * @returns `true` when a baseline was removed, `false` when none was stored.
 */
async removeBaseline(scenario: string): Promise<boolean> {
  const data = await this.loadBaselinesFile();
  if (!Object.prototype.hasOwnProperty.call(data.baselines, scenario)) {
    return false;
  }

  delete data.baselines[scenario];
  await this.saveBaselinesFile(data);
  return true;
}
304
+
305
/**
 * Delete the first stored baseline that references the given run.
 *
 * @param runId - Run ID of the baseline to remove.
 * @returns `true` when a baseline was removed, `false` when no baseline
 *   references `runId`.
 */
async removeBaselineByRunId(runId: string): Promise<boolean> {
  const store = await this.loadBaselinesFile();

  for (const [scenarioKey, meta] of Object.entries(store.baselines)) {
    if (meta.runId !== runId) {
      continue;
    }
    // Only the first match is removed; the store is persisted immediately.
    delete store.baselines[scenarioKey];
    await this.saveBaselinesFile(store);
    return true;
  }

  return false;
}
318
+
319
/**
 * Compare a run with the baseline stored for the run's scenario.
 *
 * @param runId - ID of the run to evaluate.
 * @param regressionThreshold - A regression is reported when the success
 *   rate delta is below the negative of this value (default 0.05).
 * @returns The baseline, the comparison of the baseline run against `runId`,
 *   the regression verdict and the threshold used; `null` when the run's
 *   scenario has no baseline.
 */
async compareToBaseline(
  runId: string,
  regressionThreshold = 0.05
): Promise<{
  baseline: BaselineMetadata;
  comparison: ComparisonResult;
  hasRegression: boolean;
  regressionThreshold: number;
} | null> {
  // The scenario is taken from the run itself, not supplied by the caller.
  const scenario = getScenario(await this.loadRun(runId));

  const baseline = await this.getBaseline(scenario);
  if (!baseline) {
    return null;
  }

  // Baseline run is passed first, the run under test second.
  const comparison = await this.compare(baseline.runId, runId);
  const successRateDelta = comparison.delta.successRate;
  const hasRegression = successRateDelta < -regressionThreshold;

  return { baseline, comparison, hasRegression, regressionThreshold };
}
194
354
  }
@@ -96,3 +96,76 @@ export interface StorageConfig {
96
96
  bucket?: string;
97
97
  basePath?: string;
98
98
  }
99
+
100
/**
 * Snapshot describing the run chosen as the regression baseline for a
 * scenario.
 */
export interface BaselineMetadata {
  /** Name or identifier of the scenario this baseline belongs to. */
  scenario: string;
  /** ID of the run the baseline was taken from. */
  runId: string;
  /** ISO-8601 timestamp of when the baseline was set. */
  createdAt: string;
  /** Headline metrics of the run, captured when the baseline was set. */
  metrics: {
    successRate: number;
    medianLatencyMs: number;
    totalTokens: number;
    passedCases: number;
    failedCases: number;
    totalCases: number;
  };
  /** Optional free-form label or description for the baseline. */
  tag?: string;
}
122
+
123
/**
 * Storage adapter that can additionally manage per-scenario baselines.
 */
export interface BaselineStorageAdapter extends StorageAdapter {
  /**
   * Mark a run as the baseline for a scenario.
   * @param scenario - Scenario name the baseline is stored under
   * @param runId - Run to use as the baseline
   * @param tag - Optional label stored with the baseline
   */
  setBaseline(scenario: string, runId: string, tag?: string): Promise<BaselineMetadata>;

  /**
   * Fetch the baseline stored for a scenario, or `null` if there is none.
   */
  getBaseline(scenario: string): Promise<BaselineMetadata | null>;

  /**
   * Fetch the baseline that references a run ID, or `null` if there is none.
   */
  getBaselineByRunId(runId: string): Promise<BaselineMetadata | null>;

  /**
   * Enumerate every stored baseline.
   */
  listBaselines(): Promise<BaselineMetadata[]>;

  /**
   * Delete the baseline stored for a scenario; resolves to whether one was removed.
   */
  removeBaseline(scenario: string): Promise<boolean>;

  /**
   * Delete the baseline that references a run ID; resolves to whether one was removed.
   */
  removeBaselineByRunId(runId: string): Promise<boolean>;

  /**
   * Compare a run against the baseline of its scenario. Optional: adapters
   * may omit this method.
   * @param runId - The run ID to compare
   * @param regressionThreshold - Threshold for regression detection (0-1), default 0.05
   */
  compareToBaseline?(
    runId: string,
    regressionThreshold?: number
  ): Promise<{
    baseline: BaselineMetadata;
    comparison: ComparisonResult;
    hasRegression: boolean;
    regressionThreshold: number;
  } | null>;
}