@artemiskit/core 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +42 -0
- package/dist/adapters/types.d.ts +5 -0
- package/dist/adapters/types.d.ts.map +1 -1
- package/dist/cost/pricing.d.ts +2 -1
- package/dist/cost/pricing.d.ts.map +1 -1
- package/dist/evaluators/llm-grader.d.ts.map +1 -1
- package/dist/index.js +299 -68
- package/dist/scenario/schema.d.ts +8 -0
- package/dist/scenario/schema.d.ts.map +1 -1
- package/dist/storage/local.d.ts +44 -2
- package/dist/storage/local.d.ts.map +1 -1
- package/dist/storage/types.d.ts +62 -0
- package/dist/storage/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/adapters/types.ts +5 -0
- package/src/cost/pricing.ts +242 -65
- package/src/evaluators/llm-grader.ts +45 -13
- package/src/scenario/schema.ts +4 -0
- package/src/storage/local.test.ts +243 -0
- package/src/storage/local.ts +162 -2
- package/src/storage/types.ts +73 -0
- package/dist/events/emitter.d.ts +0 -111
- package/dist/events/emitter.d.ts.map +0 -1
- package/dist/events/index.d.ts +0 -6
- package/dist/events/index.d.ts.map +0 -1
- package/dist/events/types.d.ts +0 -177
- package/dist/events/types.d.ts.map +0 -1
|
@@ -5,22 +5,27 @@
|
|
|
5
5
|
import type { Expected } from '../scenario/schema';
|
|
6
6
|
import type { Evaluator, EvaluatorContext, EvaluatorResult } from './types';
|
|
7
7
|
|
|
8
|
-
const GRADER_PROMPT = `You are
|
|
8
|
+
const GRADER_PROMPT = `You are a strict JSON-only evaluator. You grade AI responses based on rubrics.
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
RUBRIC:
|
|
11
11
|
{{rubric}}
|
|
12
12
|
|
|
13
|
-
|
|
13
|
+
RESPONSE TO EVALUATE:
|
|
14
14
|
{{response}}
|
|
15
15
|
|
|
16
|
-
|
|
17
|
-
Score the response from 0.0 to 1.0 based on the rubric.
|
|
18
|
-
Be objective and consistent in your scoring.
|
|
16
|
+
TASK: Score the response from 0.0 to 1.0 based on the rubric above.
|
|
19
17
|
|
|
20
|
-
|
|
21
|
-
{"score":
|
|
18
|
+
OUTPUT FORMAT: You MUST respond with ONLY this exact JSON structure, nothing else:
|
|
19
|
+
{"score":0.0,"reason":"explanation"}
|
|
22
20
|
|
|
23
|
-
|
|
21
|
+
RULES:
|
|
22
|
+
- Output ONLY valid JSON, no markdown, no code blocks, no extra text
|
|
23
|
+
- "score" must be a number between 0.0 and 1.0
|
|
24
|
+
- "reason" must be a brief string explaining the score
|
|
25
|
+
- Do NOT wrap in \`\`\`json or any formatting
|
|
26
|
+
- Your entire response must be parseable by JSON.parse()
|
|
27
|
+
|
|
28
|
+
JSON OUTPUT:`;
|
|
24
29
|
|
|
25
30
|
export class LLMGraderEvaluator implements Evaluator {
|
|
26
31
|
readonly type = 'llm_grader';
|
|
@@ -44,11 +49,13 @@ export class LLMGraderEvaluator implements Evaluator {
|
|
|
44
49
|
);
|
|
45
50
|
|
|
46
51
|
try {
|
|
52
|
+
// Note: Some models (like o1, o3, gpt-5-mini, reasoning models) only support temperature=1
|
|
53
|
+
// We omit temperature to let the API use its default for maximum compatibility
|
|
54
|
+
// Use higher maxTokens for reasoning models which use tokens for internal "thinking"
|
|
47
55
|
const result = await context.client.generate({
|
|
48
56
|
prompt,
|
|
49
57
|
model: expected.model,
|
|
50
|
-
|
|
51
|
-
maxTokens: 200,
|
|
58
|
+
maxTokens: 1000,
|
|
52
59
|
});
|
|
53
60
|
|
|
54
61
|
const parsed = this.parseGraderResponse(result.text);
|
|
@@ -76,9 +83,25 @@ export class LLMGraderEvaluator implements Evaluator {
|
|
|
76
83
|
}
|
|
77
84
|
|
|
78
85
|
private parseGraderResponse(text: string): { score: number; reason?: string } {
|
|
79
|
-
|
|
86
|
+
// Clean up the response - remove markdown code blocks if present
|
|
87
|
+
const cleanedText = text
|
|
88
|
+
.replace(/```json\s*/gi, '')
|
|
89
|
+
.replace(/```\s*/g, '')
|
|
90
|
+
.trim();
|
|
91
|
+
|
|
92
|
+
// Try to find JSON object in the response
|
|
93
|
+
const jsonMatch = cleanedText.match(/\{[\s\S]*?\}/);
|
|
94
|
+
|
|
80
95
|
if (!jsonMatch) {
|
|
81
|
-
|
|
96
|
+
// Fallback: try to extract score from plain text patterns like "Score: 0.8" or "0.85"
|
|
97
|
+
const scoreMatch = cleanedText.match(/(?:score[:\s]*)?(\d+\.?\d*)/i);
|
|
98
|
+
if (scoreMatch) {
|
|
99
|
+
const score = Number(scoreMatch[1]);
|
|
100
|
+
if (!Number.isNaN(score) && score >= 0 && score <= 1) {
|
|
101
|
+
return { score, reason: cleanedText };
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
throw new Error(`No JSON found in grader response: ${text.substring(0, 100)}...`);
|
|
82
105
|
}
|
|
83
106
|
|
|
84
107
|
try {
|
|
@@ -94,6 +117,15 @@ export class LLMGraderEvaluator implements Evaluator {
|
|
|
94
117
|
reason: parsed.reason,
|
|
95
118
|
};
|
|
96
119
|
} catch (error) {
|
|
120
|
+
// If JSON parsing fails, try extracting score directly
|
|
121
|
+
const scoreMatch = jsonMatch[0].match(/"score"[:\s]*(\d+\.?\d*)/i);
|
|
122
|
+
if (scoreMatch) {
|
|
123
|
+
const score = Number(scoreMatch[1]);
|
|
124
|
+
if (!Number.isNaN(score) && score >= 0 && score <= 1) {
|
|
125
|
+
const reasonMatch = jsonMatch[0].match(/"reason"[:\s]*"([^"]+)"/i);
|
|
126
|
+
return { score, reason: reasonMatch?.[1] };
|
|
127
|
+
}
|
|
128
|
+
}
|
|
97
129
|
throw new Error(`Failed to parse grader response: ${(error as Error).message}`);
|
|
98
130
|
}
|
|
99
131
|
}
|
package/src/scenario/schema.ts
CHANGED
|
@@ -44,6 +44,10 @@ export const ProviderConfigSchema = z
|
|
|
44
44
|
apiVersion: z.string().optional(),
|
|
45
45
|
embeddingDeploymentName: z.string().optional(),
|
|
46
46
|
|
|
47
|
+
// Model family for parameter detection (e.g., 'gpt-5-mini' when deployment is '5-mini')
|
|
48
|
+
// Used by OpenAI/Azure to determine which API parameters to use (max_tokens vs max_completion_tokens)
|
|
49
|
+
modelFamily: z.string().optional(),
|
|
50
|
+
|
|
47
51
|
// Vercel AI specific
|
|
48
52
|
underlyingProvider: z.enum(['openai', 'azure', 'anthropic', 'google', 'mistral']).optional(),
|
|
49
53
|
})
|
|
@@ -162,4 +162,247 @@ describe('LocalStorageAdapter', () => {
|
|
|
162
162
|
const runs = await emptyStorage.list();
|
|
163
163
|
expect(runs).toEqual([]);
|
|
164
164
|
});
|
|
165
|
+
|
|
166
|
+
// ==================== Baseline Tests ====================
|
|
167
|
+
|
|
168
|
+
describe('Baseline Management', () => {
|
|
169
|
+
const baselineManifest: RunManifest = {
|
|
170
|
+
...mockManifest,
|
|
171
|
+
run_id: 'baseline-test-run',
|
|
172
|
+
config: {
|
|
173
|
+
...mockManifest.config,
|
|
174
|
+
scenario: 'baseline-test-scenario',
|
|
175
|
+
},
|
|
176
|
+
};
|
|
177
|
+
|
|
178
|
+
test('sets a baseline for a scenario', async () => {
|
|
179
|
+
await storage.save(baselineManifest);
|
|
180
|
+
|
|
181
|
+
const baseline = await storage.setBaseline('baseline-test-scenario', 'baseline-test-run');
|
|
182
|
+
|
|
183
|
+
expect(baseline.scenario).toBe('baseline-test-scenario');
|
|
184
|
+
expect(baseline.runId).toBe('baseline-test-run');
|
|
185
|
+
expect(baseline.metrics.successRate).toBe(0.8);
|
|
186
|
+
expect(baseline.metrics.totalCases).toBe(10);
|
|
187
|
+
expect(baseline.metrics.passedCases).toBe(8);
|
|
188
|
+
expect(baseline.createdAt).toBeDefined();
|
|
189
|
+
});
|
|
190
|
+
|
|
191
|
+
test('sets a baseline with a tag', async () => {
|
|
192
|
+
await storage.save(baselineManifest);
|
|
193
|
+
|
|
194
|
+
const baseline = await storage.setBaseline(
|
|
195
|
+
'baseline-test-scenario',
|
|
196
|
+
'baseline-test-run',
|
|
197
|
+
'v1.0.0-release'
|
|
198
|
+
);
|
|
199
|
+
|
|
200
|
+
expect(baseline.tag).toBe('v1.0.0-release');
|
|
201
|
+
});
|
|
202
|
+
|
|
203
|
+
test('gets a baseline for a scenario', async () => {
|
|
204
|
+
await storage.save(baselineManifest);
|
|
205
|
+
await storage.setBaseline('baseline-test-scenario', 'baseline-test-run');
|
|
206
|
+
|
|
207
|
+
const baseline = await storage.getBaseline('baseline-test-scenario');
|
|
208
|
+
|
|
209
|
+
expect(baseline).not.toBeNull();
|
|
210
|
+
expect(baseline?.runId).toBe('baseline-test-run');
|
|
211
|
+
expect(baseline?.metrics.successRate).toBe(0.8);
|
|
212
|
+
});
|
|
213
|
+
|
|
214
|
+
test('returns null for non-existent baseline', async () => {
|
|
215
|
+
const baseline = await storage.getBaseline('non-existent-scenario');
|
|
216
|
+
expect(baseline).toBeNull();
|
|
217
|
+
});
|
|
218
|
+
|
|
219
|
+
test('gets a baseline by run ID', async () => {
|
|
220
|
+
await storage.save(baselineManifest);
|
|
221
|
+
await storage.setBaseline('baseline-test-scenario', 'baseline-test-run', 'by-run-id-test');
|
|
222
|
+
|
|
223
|
+
const baseline = await storage.getBaselineByRunId('baseline-test-run');
|
|
224
|
+
|
|
225
|
+
expect(baseline).not.toBeNull();
|
|
226
|
+
expect(baseline?.scenario).toBe('baseline-test-scenario');
|
|
227
|
+
expect(baseline?.runId).toBe('baseline-test-run');
|
|
228
|
+
expect(baseline?.tag).toBe('by-run-id-test');
|
|
229
|
+
});
|
|
230
|
+
|
|
231
|
+
test('returns null for non-existent baseline by run ID', async () => {
|
|
232
|
+
const baseline = await storage.getBaselineByRunId('non-existent-run-id');
|
|
233
|
+
expect(baseline).toBeNull();
|
|
234
|
+
});
|
|
235
|
+
|
|
236
|
+
test('lists all baselines', async () => {
|
|
237
|
+
// Clear any existing baselines by creating a new storage instance
|
|
238
|
+
const freshStorage = new LocalStorageAdapter(TEST_DIR);
|
|
239
|
+
|
|
240
|
+
// Create multiple runs and baselines
|
|
241
|
+
const manifest1 = {
|
|
242
|
+
...mockManifest,
|
|
243
|
+
run_id: 'list-baseline-1',
|
|
244
|
+
config: { ...mockManifest.config, scenario: 'scenario-1' },
|
|
245
|
+
};
|
|
246
|
+
const manifest2 = {
|
|
247
|
+
...mockManifest,
|
|
248
|
+
run_id: 'list-baseline-2',
|
|
249
|
+
config: { ...mockManifest.config, scenario: 'scenario-2' },
|
|
250
|
+
};
|
|
251
|
+
|
|
252
|
+
await freshStorage.save(manifest1);
|
|
253
|
+
await freshStorage.save(manifest2);
|
|
254
|
+
await freshStorage.setBaseline('scenario-1', 'list-baseline-1');
|
|
255
|
+
await freshStorage.setBaseline('scenario-2', 'list-baseline-2');
|
|
256
|
+
|
|
257
|
+
const baselines = await freshStorage.listBaselines();
|
|
258
|
+
|
|
259
|
+
expect(baselines.length).toBeGreaterThanOrEqual(2);
|
|
260
|
+
expect(baselines.some((b) => b.scenario === 'scenario-1')).toBe(true);
|
|
261
|
+
expect(baselines.some((b) => b.scenario === 'scenario-2')).toBe(true);
|
|
262
|
+
});
|
|
263
|
+
|
|
264
|
+
test('removes a baseline', async () => {
|
|
265
|
+
await storage.save(baselineManifest);
|
|
266
|
+
await storage.setBaseline('baseline-test-scenario', 'baseline-test-run');
|
|
267
|
+
|
|
268
|
+
// Verify it exists
|
|
269
|
+
const before = await storage.getBaseline('baseline-test-scenario');
|
|
270
|
+
expect(before).not.toBeNull();
|
|
271
|
+
|
|
272
|
+
// Remove
|
|
273
|
+
const removed = await storage.removeBaseline('baseline-test-scenario');
|
|
274
|
+
expect(removed).toBe(true);
|
|
275
|
+
|
|
276
|
+
// Verify it's gone
|
|
277
|
+
const after = await storage.getBaseline('baseline-test-scenario');
|
|
278
|
+
expect(after).toBeNull();
|
|
279
|
+
});
|
|
280
|
+
|
|
281
|
+
test('returns false when removing non-existent baseline', async () => {
|
|
282
|
+
const removed = await storage.removeBaseline('non-existent-baseline');
|
|
283
|
+
expect(removed).toBe(false);
|
|
284
|
+
});
|
|
285
|
+
|
|
286
|
+
test('removes a baseline by run ID', async () => {
|
|
287
|
+
const removeByIdManifest = {
|
|
288
|
+
...mockManifest,
|
|
289
|
+
run_id: 'remove-by-id-run',
|
|
290
|
+
config: { ...mockManifest.config, scenario: 'remove-by-id-scenario' },
|
|
291
|
+
};
|
|
292
|
+
await storage.save(removeByIdManifest);
|
|
293
|
+
await storage.setBaseline('remove-by-id-scenario', 'remove-by-id-run');
|
|
294
|
+
|
|
295
|
+
// Verify it exists
|
|
296
|
+
const before = await storage.getBaselineByRunId('remove-by-id-run');
|
|
297
|
+
expect(before).not.toBeNull();
|
|
298
|
+
|
|
299
|
+
// Remove by run ID
|
|
300
|
+
const removed = await storage.removeBaselineByRunId('remove-by-id-run');
|
|
301
|
+
expect(removed).toBe(true);
|
|
302
|
+
|
|
303
|
+
// Verify it's gone
|
|
304
|
+
const after = await storage.getBaselineByRunId('remove-by-id-run');
|
|
305
|
+
expect(after).toBeNull();
|
|
306
|
+
});
|
|
307
|
+
|
|
308
|
+
test('returns false when removing non-existent baseline by run ID', async () => {
|
|
309
|
+
const removed = await storage.removeBaselineByRunId('non-existent-run-id');
|
|
310
|
+
expect(removed).toBe(false);
|
|
311
|
+
});
|
|
312
|
+
|
|
313
|
+
test('updates existing baseline when set again', async () => {
|
|
314
|
+
await storage.save(baselineManifest);
|
|
315
|
+
|
|
316
|
+
// Set initial baseline
|
|
317
|
+
await storage.setBaseline('baseline-test-scenario', 'baseline-test-run', 'v1.0');
|
|
318
|
+
|
|
319
|
+
// Create a new run with different metrics
|
|
320
|
+
const newManifest = {
|
|
321
|
+
...baselineManifest,
|
|
322
|
+
run_id: 'baseline-test-run-2',
|
|
323
|
+
metrics: {
|
|
324
|
+
...baselineManifest.metrics,
|
|
325
|
+
success_rate: 0.95,
|
|
326
|
+
passed_cases: 9,
|
|
327
|
+
failed_cases: 1,
|
|
328
|
+
},
|
|
329
|
+
};
|
|
330
|
+
await storage.save(newManifest);
|
|
331
|
+
|
|
332
|
+
// Update baseline
|
|
333
|
+
await storage.setBaseline('baseline-test-scenario', 'baseline-test-run-2', 'v2.0');
|
|
334
|
+
|
|
335
|
+
const baseline = await storage.getBaseline('baseline-test-scenario');
|
|
336
|
+
expect(baseline?.runId).toBe('baseline-test-run-2');
|
|
337
|
+
expect(baseline?.tag).toBe('v2.0');
|
|
338
|
+
expect(baseline?.metrics.successRate).toBe(0.95);
|
|
339
|
+
});
|
|
340
|
+
|
|
341
|
+
test('compares run to baseline and detects no regression', async () => {
|
|
342
|
+
// Create baseline run
|
|
343
|
+
const baselineRun = {
|
|
344
|
+
...mockManifest,
|
|
345
|
+
run_id: 'compare-baseline-run',
|
|
346
|
+
config: { ...mockManifest.config, scenario: 'compare-scenario' },
|
|
347
|
+
metrics: { ...mockManifest.metrics, success_rate: 0.8 },
|
|
348
|
+
};
|
|
349
|
+
await storage.save(baselineRun);
|
|
350
|
+
await storage.setBaseline('compare-scenario', 'compare-baseline-run');
|
|
351
|
+
|
|
352
|
+
// Create current run with same or better success rate
|
|
353
|
+
const currentRun = {
|
|
354
|
+
...mockManifest,
|
|
355
|
+
run_id: 'compare-current-run',
|
|
356
|
+
config: { ...mockManifest.config, scenario: 'compare-scenario' },
|
|
357
|
+
metrics: { ...mockManifest.metrics, success_rate: 0.85 },
|
|
358
|
+
};
|
|
359
|
+
await storage.save(currentRun);
|
|
360
|
+
|
|
361
|
+
const result = await storage.compareToBaseline('compare-current-run', 0.05);
|
|
362
|
+
|
|
363
|
+
expect(result).not.toBeNull();
|
|
364
|
+
expect(result?.hasRegression).toBe(false);
|
|
365
|
+
expect(result?.comparison.delta.successRate).toBeCloseTo(0.05, 2);
|
|
366
|
+
});
|
|
367
|
+
|
|
368
|
+
test('compares run to baseline and detects regression', async () => {
|
|
369
|
+
// Create baseline run
|
|
370
|
+
const baselineRun = {
|
|
371
|
+
...mockManifest,
|
|
372
|
+
run_id: 'regression-baseline-run',
|
|
373
|
+
config: { ...mockManifest.config, scenario: 'regression-scenario' },
|
|
374
|
+
metrics: { ...mockManifest.metrics, success_rate: 0.9 },
|
|
375
|
+
};
|
|
376
|
+
await storage.save(baselineRun);
|
|
377
|
+
await storage.setBaseline('regression-scenario', 'regression-baseline-run');
|
|
378
|
+
|
|
379
|
+
// Create current run with worse success rate (regression)
|
|
380
|
+
const currentRun = {
|
|
381
|
+
...mockManifest,
|
|
382
|
+
run_id: 'regression-current-run',
|
|
383
|
+
config: { ...mockManifest.config, scenario: 'regression-scenario' },
|
|
384
|
+
metrics: { ...mockManifest.metrics, success_rate: 0.7 },
|
|
385
|
+
};
|
|
386
|
+
await storage.save(currentRun);
|
|
387
|
+
|
|
388
|
+
const result = await storage.compareToBaseline('regression-current-run', 0.05);
|
|
389
|
+
|
|
390
|
+
expect(result).not.toBeNull();
|
|
391
|
+
expect(result?.hasRegression).toBe(true);
|
|
392
|
+
expect(result?.comparison.delta.successRate).toBeCloseTo(-0.2, 2);
|
|
393
|
+
});
|
|
394
|
+
|
|
395
|
+
test('returns null when no baseline exists for scenario', async () => {
|
|
396
|
+
const noBaselineRun = {
|
|
397
|
+
...mockManifest,
|
|
398
|
+
run_id: 'no-baseline-run',
|
|
399
|
+
config: { ...mockManifest.config, scenario: 'no-baseline-scenario' },
|
|
400
|
+
};
|
|
401
|
+
await storage.save(noBaselineRun);
|
|
402
|
+
|
|
403
|
+
const result = await storage.compareToBaseline('no-baseline-run');
|
|
404
|
+
|
|
405
|
+
expect(result).toBeNull();
|
|
406
|
+
});
|
|
407
|
+
});
|
|
165
408
|
});
|
package/src/storage/local.ts
CHANGED
|
@@ -5,7 +5,13 @@
|
|
|
5
5
|
import { mkdir, readFile, readdir, unlink, writeFile } from 'node:fs/promises';
|
|
6
6
|
import { join, resolve } from 'node:path';
|
|
7
7
|
import type { AnyManifest, RedTeamManifest, RunManifest, StressManifest } from '../artifacts/types';
|
|
8
|
-
import type {
|
|
8
|
+
import type {
|
|
9
|
+
BaselineMetadata,
|
|
10
|
+
BaselineStorageAdapter,
|
|
11
|
+
ComparisonResult,
|
|
12
|
+
ListOptions,
|
|
13
|
+
RunListItem,
|
|
14
|
+
} from './types';
|
|
9
15
|
|
|
10
16
|
/**
|
|
11
17
|
* Get manifest type from a manifest object
|
|
@@ -39,11 +45,21 @@ function getScenario(manifest: AnyManifest): string {
|
|
|
39
45
|
return manifest.config.scenario;
|
|
40
46
|
}
|
|
41
47
|
|
|
42
|
-
|
|
48
|
+
/**
|
|
49
|
+
* Baselines file structure
|
|
50
|
+
*/
|
|
51
|
+
interface BaselinesFile {
|
|
52
|
+
version: string;
|
|
53
|
+
baselines: Record<string, BaselineMetadata>;
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
export class LocalStorageAdapter implements BaselineStorageAdapter {
|
|
43
57
|
private basePath: string;
|
|
58
|
+
private baselinesPath: string;
|
|
44
59
|
|
|
45
60
|
constructor(basePath = './artemis-runs') {
|
|
46
61
|
this.basePath = resolve(basePath);
|
|
62
|
+
this.baselinesPath = join(this.basePath, '.artemis', 'baselines.json');
|
|
47
63
|
}
|
|
48
64
|
|
|
49
65
|
async save(manifest: AnyManifest): Promise<string> {
|
|
@@ -191,4 +207,148 @@ export class LocalStorageAdapter implements StorageAdapter {
|
|
|
191
207
|
return [];
|
|
192
208
|
}
|
|
193
209
|
}
|
|
210
|
+
|
|
211
|
+
// ==================== Baseline Methods ====================
|
|
212
|
+
|
|
213
|
+
/**
|
|
214
|
+
* Load baselines file
|
|
215
|
+
*/
|
|
216
|
+
private async loadBaselinesFile(): Promise<BaselinesFile> {
|
|
217
|
+
try {
|
|
218
|
+
const content = await readFile(this.baselinesPath, 'utf-8');
|
|
219
|
+
return JSON.parse(content);
|
|
220
|
+
} catch {
|
|
221
|
+
return { version: '1.0', baselines: {} };
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
/**
|
|
226
|
+
* Save baselines file
|
|
227
|
+
*/
|
|
228
|
+
private async saveBaselinesFile(data: BaselinesFile): Promise<void> {
|
|
229
|
+
const dir = join(this.basePath, '.artemis');
|
|
230
|
+
await mkdir(dir, { recursive: true });
|
|
231
|
+
await writeFile(this.baselinesPath, JSON.stringify(data, null, 2));
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
/**
|
|
235
|
+
* Set a baseline for a scenario
|
|
236
|
+
*/
|
|
237
|
+
async setBaseline(scenario: string, runId: string, tag?: string): Promise<BaselineMetadata> {
|
|
238
|
+
// Load the run to extract metrics
|
|
239
|
+
const manifest = await this.loadRun(runId);
|
|
240
|
+
const scenarioName = scenario || getScenario(manifest);
|
|
241
|
+
|
|
242
|
+
const baseline: BaselineMetadata = {
|
|
243
|
+
scenario: scenarioName,
|
|
244
|
+
runId,
|
|
245
|
+
createdAt: new Date().toISOString(),
|
|
246
|
+
metrics: {
|
|
247
|
+
successRate: manifest.metrics.success_rate,
|
|
248
|
+
medianLatencyMs: manifest.metrics.median_latency_ms,
|
|
249
|
+
totalTokens: manifest.metrics.total_tokens,
|
|
250
|
+
passedCases: manifest.metrics.passed_cases,
|
|
251
|
+
failedCases: manifest.metrics.failed_cases,
|
|
252
|
+
totalCases: manifest.metrics.total_cases,
|
|
253
|
+
},
|
|
254
|
+
tag,
|
|
255
|
+
};
|
|
256
|
+
|
|
257
|
+
// Load existing baselines and add/update
|
|
258
|
+
const data = await this.loadBaselinesFile();
|
|
259
|
+
data.baselines[scenarioName] = baseline;
|
|
260
|
+
await this.saveBaselinesFile(data);
|
|
261
|
+
|
|
262
|
+
return baseline;
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
/**
|
|
266
|
+
* Get the baseline for a scenario
|
|
267
|
+
*/
|
|
268
|
+
async getBaseline(scenario: string): Promise<BaselineMetadata | null> {
|
|
269
|
+
const data = await this.loadBaselinesFile();
|
|
270
|
+
return data.baselines[scenario] || null;
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
/**
|
|
274
|
+
* Get a baseline by run ID
|
|
275
|
+
*/
|
|
276
|
+
async getBaselineByRunId(runId: string): Promise<BaselineMetadata | null> {
|
|
277
|
+
const data = await this.loadBaselinesFile();
|
|
278
|
+
const baselines = Object.values(data.baselines);
|
|
279
|
+
return baselines.find((b) => b.runId === runId) || null;
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
/**
|
|
283
|
+
* List all baselines
|
|
284
|
+
*/
|
|
285
|
+
async listBaselines(): Promise<BaselineMetadata[]> {
|
|
286
|
+
const data = await this.loadBaselinesFile();
|
|
287
|
+
return Object.values(data.baselines).sort(
|
|
288
|
+
(a, b) => new Date(b.createdAt).getTime() - new Date(a.createdAt).getTime()
|
|
289
|
+
);
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
/**
|
|
293
|
+
* Remove a baseline by scenario name
|
|
294
|
+
*/
|
|
295
|
+
async removeBaseline(scenario: string): Promise<boolean> {
|
|
296
|
+
const data = await this.loadBaselinesFile();
|
|
297
|
+
if (data.baselines[scenario]) {
|
|
298
|
+
delete data.baselines[scenario];
|
|
299
|
+
await this.saveBaselinesFile(data);
|
|
300
|
+
return true;
|
|
301
|
+
}
|
|
302
|
+
return false;
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
/**
|
|
306
|
+
* Remove a baseline by run ID
|
|
307
|
+
*/
|
|
308
|
+
async removeBaselineByRunId(runId: string): Promise<boolean> {
|
|
309
|
+
const data = await this.loadBaselinesFile();
|
|
310
|
+
const entry = Object.entries(data.baselines).find(([_, b]) => b.runId === runId);
|
|
311
|
+
if (entry) {
|
|
312
|
+
delete data.baselines[entry[0]];
|
|
313
|
+
await this.saveBaselinesFile(data);
|
|
314
|
+
return true;
|
|
315
|
+
}
|
|
316
|
+
return false;
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
/**
|
|
320
|
+
* Compare a run against its baseline (if exists)
|
|
321
|
+
*/
|
|
322
|
+
async compareToBaseline(
|
|
323
|
+
runId: string,
|
|
324
|
+
regressionThreshold = 0.05
|
|
325
|
+
): Promise<{
|
|
326
|
+
baseline: BaselineMetadata;
|
|
327
|
+
comparison: ComparisonResult;
|
|
328
|
+
hasRegression: boolean;
|
|
329
|
+
regressionThreshold: number;
|
|
330
|
+
} | null> {
|
|
331
|
+
// Load the current run
|
|
332
|
+
const currentManifest = await this.loadRun(runId);
|
|
333
|
+
const scenario = getScenario(currentManifest);
|
|
334
|
+
|
|
335
|
+
// Get baseline for this scenario
|
|
336
|
+
const baseline = await this.getBaseline(scenario);
|
|
337
|
+
if (!baseline) {
|
|
338
|
+
return null;
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
// Load baseline manifest for full comparison
|
|
342
|
+
const comparison = await this.compare(baseline.runId, runId);
|
|
343
|
+
|
|
344
|
+
// Check for regression (negative delta in success rate)
|
|
345
|
+
const hasRegression = comparison.delta.successRate < -regressionThreshold;
|
|
346
|
+
|
|
347
|
+
return {
|
|
348
|
+
baseline,
|
|
349
|
+
comparison,
|
|
350
|
+
hasRegression,
|
|
351
|
+
regressionThreshold,
|
|
352
|
+
};
|
|
353
|
+
}
|
|
194
354
|
}
|
package/src/storage/types.ts
CHANGED
|
@@ -96,3 +96,76 @@ export interface StorageConfig {
|
|
|
96
96
|
bucket?: string;
|
|
97
97
|
basePath?: string;
|
|
98
98
|
}
|
|
99
|
+
|
|
100
|
+
/**
|
|
101
|
+
* Baseline metadata for regression comparison
|
|
102
|
+
*/
|
|
103
|
+
export interface BaselineMetadata {
|
|
104
|
+
/** Scenario name or identifier */
|
|
105
|
+
scenario: string;
|
|
106
|
+
/** Run ID of the baseline */
|
|
107
|
+
runId: string;
|
|
108
|
+
/** ISO timestamp when baseline was set */
|
|
109
|
+
createdAt: string;
|
|
110
|
+
/** Key metrics captured at baseline time */
|
|
111
|
+
metrics: {
|
|
112
|
+
successRate: number;
|
|
113
|
+
medianLatencyMs: number;
|
|
114
|
+
totalTokens: number;
|
|
115
|
+
passedCases: number;
|
|
116
|
+
failedCases: number;
|
|
117
|
+
totalCases: number;
|
|
118
|
+
};
|
|
119
|
+
/** Optional description or tag */
|
|
120
|
+
tag?: string;
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
/**
|
|
124
|
+
* Extended storage adapter with baseline support
|
|
125
|
+
*/
|
|
126
|
+
export interface BaselineStorageAdapter extends StorageAdapter {
|
|
127
|
+
/**
|
|
128
|
+
* Set a baseline for a scenario
|
|
129
|
+
*/
|
|
130
|
+
setBaseline(scenario: string, runId: string, tag?: string): Promise<BaselineMetadata>;
|
|
131
|
+
|
|
132
|
+
/**
|
|
133
|
+
* Get the baseline by scenario name
|
|
134
|
+
*/
|
|
135
|
+
getBaseline(scenario: string): Promise<BaselineMetadata | null>;
|
|
136
|
+
|
|
137
|
+
/**
|
|
138
|
+
* Get the baseline by run ID
|
|
139
|
+
*/
|
|
140
|
+
getBaselineByRunId(runId: string): Promise<BaselineMetadata | null>;
|
|
141
|
+
|
|
142
|
+
/**
|
|
143
|
+
* List all baselines
|
|
144
|
+
*/
|
|
145
|
+
listBaselines(): Promise<BaselineMetadata[]>;
|
|
146
|
+
|
|
147
|
+
/**
|
|
148
|
+
* Remove a baseline by scenario name
|
|
149
|
+
*/
|
|
150
|
+
removeBaseline(scenario: string): Promise<boolean>;
|
|
151
|
+
|
|
152
|
+
/**
|
|
153
|
+
* Remove a baseline by run ID
|
|
154
|
+
*/
|
|
155
|
+
removeBaselineByRunId(runId: string): Promise<boolean>;
|
|
156
|
+
|
|
157
|
+
/**
|
|
158
|
+
* Compare a run against its baseline (if exists)
|
|
159
|
+
* @param runId - The run ID to compare
|
|
160
|
+
* @param regressionThreshold - Threshold for regression detection (0-1), default 0.05
|
|
161
|
+
*/
|
|
162
|
+
compareToBaseline?(
|
|
163
|
+
runId: string,
|
|
164
|
+
regressionThreshold?: number
|
|
165
|
+
): Promise<{
|
|
166
|
+
baseline: BaselineMetadata;
|
|
167
|
+
comparison: ComparisonResult;
|
|
168
|
+
hasRegression: boolean;
|
|
169
|
+
regressionThreshold: number;
|
|
170
|
+
} | null>;
|
|
171
|
+
}
|