@artemiskit/core 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,21 @@
1
1
  /**
2
- * Supabase storage adapter
2
+ * Supabase storage adapter with analytics capabilities
3
3
  */
4
4
 
5
5
  import { type SupabaseClient, createClient } from '@supabase/supabase-js';
6
- import type { RunManifest } from '../artifacts/types';
7
- import type { ComparisonResult, ListOptions, RunListItem, StorageAdapter } from './types';
6
+ import type { CaseResult, RunManifest } from '../artifacts/types';
7
+ import type {
8
+ AnalyticsStorageAdapter,
9
+ BaselineMetadata,
10
+ CaseResultQueryOptions,
11
+ CaseResultRecord,
12
+ ComparisonResult,
13
+ ListOptions,
14
+ MetricsSnapshot,
15
+ MetricsTrendOptions,
16
+ RunListItem,
17
+ TrendDataPoint,
18
+ } from './types';
8
19
 
9
20
  export interface SupabaseStorageConfig {
10
21
  url: string;
@@ -12,15 +23,43 @@ export interface SupabaseStorageConfig {
12
23
  bucket?: string;
13
24
  }
14
25
 
15
- export class SupabaseStorageAdapter implements StorageAdapter {
26
/**
 * Flatten one case from a run manifest into the record shape used for storage.
 *
 * The status is derived from the case itself: a truthy `error` yields
 * `'error'`; otherwise a truthy `ok` yields `'passed'`; anything else is
 * `'failed'`. Token counts are lifted out of `caseResult.tokens` into
 * top-level fields.
 *
 * @param runId - Identifier of the run the case belongs to.
 * @param caseResult - Case entry taken from the manifest.
 * @returns The storage record for this case.
 */
function mapCaseToRecord(runId: string, caseResult: CaseResult): CaseResultRecord {
  const { tokens } = caseResult;

  // An error takes precedence over the pass/fail flag.
  let status: CaseResultRecord['status'];
  if (caseResult.error) {
    status = 'error';
  } else if (caseResult.ok) {
    status = 'passed';
  } else {
    status = 'failed';
  }

  return {
    runId,
    caseId: caseResult.id,
    caseName: caseResult.name,
    status,
    score: caseResult.score,
    matcherType: caseResult.matcherType,
    reason: caseResult.reason,
    response: caseResult.response,
    latencyMs: caseResult.latencyMs,
    promptTokens: tokens.prompt,
    completionTokens: tokens.completion,
    totalTokens: tokens.total,
    error: caseResult.error,
    tags: caseResult.tags,
  };
}
47
+
48
+ export class SupabaseStorageAdapter implements AnalyticsStorageAdapter {
16
49
  private client: SupabaseClient;
17
50
  private bucket: string;
51
+ private project: string;
18
52
 
19
- constructor(config: SupabaseStorageConfig) {
53
/**
 * Create an adapter backed by a Supabase client.
 *
 * @param config - Connection settings; `bucket` falls back to `'artemis-runs'` when falsy.
 * @param project - Project name stored on the adapter; falls back to `'default'` when falsy.
 */
constructor(config: SupabaseStorageConfig, project?: string) {
  const { url, anonKey, bucket } = config;

  this.client = createClient(url, anonKey);
  this.bucket = bucket || 'artemis-runs';
  this.project = project || 'default';
}
23
58
 
59
+ // ============================================================================
60
+ // Core Storage Methods
61
+ // ============================================================================
62
+
24
63
  async save(manifest: RunManifest): Promise<string> {
25
64
  const filePath = `${manifest.project}/${manifest.run_id}.json`;
26
65
 
@@ -62,6 +101,12 @@ export class SupabaseStorageAdapter implements StorageAdapter {
62
101
  throw new Error(`Failed to save run metadata: ${dbError.message}`);
63
102
  }
64
103
 
104
+ // Also save individual case results for granular analytics
105
+ if (manifest.cases && manifest.cases.length > 0) {
106
+ const caseRecords = manifest.cases.map((c) => mapCaseToRecord(manifest.run_id, c));
107
+ await this.saveCaseResults(caseRecords);
108
+ }
109
+
65
110
  return filePath;
66
111
  }
67
112
 
@@ -132,6 +177,7 @@ export class SupabaseStorageAdapter implements StorageAdapter {
132
177
  await this.client.storage.from(this.bucket).remove([run.manifest_path]);
133
178
  }
134
179
 
180
+ // Case results are deleted via CASCADE
135
181
  await this.client.from('runs').delete().eq('run_id', runId);
136
182
  }
137
183
 
@@ -148,4 +194,552 @@ export class SupabaseStorageAdapter implements StorageAdapter {
148
194
  },
149
195
  };
150
196
  }
197
+
198
+ // ============================================================================
199
+ // Baseline Methods
200
+ // ============================================================================
201
+
202
/**
 * Record a stored run as the baseline for a scenario.
 *
 * The run row is loaded from `runs`, its metric columns are copied into a
 * `baselines` row, and that row is upserted with `project,scenario` as the
 * conflict target.
 *
 * NOTE(review): the baseline is written under the run's own `project`, while
 * the read methods on this class filter by `this.project` — confirm the two
 * always agree.
 *
 * @param scenario - Scenario name the baseline applies to.
 * @param runId - Run whose metrics become the baseline.
 * @param tag - Optional label stored with the baseline.
 * @returns Baseline metadata; `createdAt` is the local time of this call, not a value read back from the database.
 * @throws Error when the run cannot be loaded or the upsert fails.
 */
async setBaseline(scenario: string, runId: string, tag?: string): Promise<BaselineMetadata> {
  const lookup = await this.client.from('runs').select('*').eq('run_id', runId).single();

  if (lookup.error || !lookup.data) {
    throw new Error(`Run not found: ${runId}`);
  }

  const row = lookup.data;

  const upsertOutcome = await this.client.from('baselines').upsert(
    {
      project: row.project,
      scenario,
      run_id: runId,
      success_rate: row.success_rate,
      median_latency_ms: row.median_latency_ms,
      total_tokens: row.total_tokens,
      passed_cases: row.passed_cases,
      failed_cases: row.failed_cases,
      total_cases: row.total_cases,
      tag,
      created_by: row.run_by,
    },
    { onConflict: 'project,scenario' }
  );

  if (upsertOutcome.error) {
    throw new Error(`Failed to set baseline: ${upsertOutcome.error.message}`);
  }

  const createdAt = new Date().toISOString();

  return {
    scenario,
    runId,
    createdAt,
    metrics: {
      successRate: row.success_rate,
      medianLatencyMs: row.median_latency_ms,
      totalTokens: row.total_tokens,
      passedCases: row.passed_cases,
      failedCases: row.failed_cases,
      totalCases: row.total_cases,
    },
    tag,
  };
}
251
+
252
/**
 * Look up the baseline stored for a scenario within this adapter's project.
 *
 * @param scenario - Scenario name to look up.
 * @returns The baseline metadata, or `null` when the query reports an error or yields no row.
 */
async getBaseline(scenario: string): Promise<BaselineMetadata | null> {
  const result = await this.client
    .from('baselines')
    .select('*')
    .eq('project', this.project)
    .eq('scenario', scenario)
    .single();

  const row = result.data;

  // Query errors are not surfaced to the caller; they read as "no baseline".
  if (result.error || !row) {
    return null;
  }

  return {
    scenario: row.scenario,
    runId: row.run_id,
    createdAt: row.created_at,
    metrics: {
      successRate: row.success_rate,
      medianLatencyMs: row.median_latency_ms,
      totalTokens: row.total_tokens,
      passedCases: row.passed_cases,
      failedCases: row.failed_cases,
      totalCases: row.total_cases,
    },
    tag: row.tag,
  };
}
279
+
280
/**
 * Look up a baseline by the run it was created from.
 *
 * The query filters on `run_id` only; it is not restricted to this adapter's
 * project.
 *
 * NOTE(review): the query expects a single row, so a run that backs more than
 * one baseline would presumably surface as an error and be reported as `null`
 * — confirm whether that can occur.
 *
 * @param runId - Run identifier stored on the baseline row.
 * @returns The baseline metadata, or `null` when the query reports an error or yields no row.
 */
async getBaselineByRunId(runId: string): Promise<BaselineMetadata | null> {
  const result = await this.client.from('baselines').select('*').eq('run_id', runId).single();

  const row = result.data;

  if (result.error || !row) {
    return null;
  }

  return {
    scenario: row.scenario,
    runId: row.run_id,
    createdAt: row.created_at,
    metrics: {
      successRate: row.success_rate,
      medianLatencyMs: row.median_latency_ms,
      totalTokens: row.total_tokens,
      passedCases: row.passed_cases,
      failedCases: row.failed_cases,
      totalCases: row.total_cases,
    },
    tag: row.tag,
  };
}
306
+
307
/**
 * List every baseline stored for this adapter's project, newest first
 * (ordered by `created_at` descending).
 *
 * @returns Baseline metadata entries; an empty array when there are none.
 * @throws Error when the query fails.
 */
async listBaselines(): Promise<BaselineMetadata[]> {
  const result = await this.client
    .from('baselines')
    .select('*')
    .eq('project', this.project)
    .order('created_at', { ascending: false });

  if (result.error) {
    throw new Error(`Failed to list baselines: ${result.error.message}`);
  }

  const rows = result.data || [];

  return rows.map((row) => {
    const metrics = {
      successRate: row.success_rate,
      medianLatencyMs: row.median_latency_ms,
      totalTokens: row.total_tokens,
      passedCases: row.passed_cases,
      failedCases: row.failed_cases,
      totalCases: row.total_cases,
    };

    return {
      scenario: row.scenario,
      runId: row.run_id,
      createdAt: row.created_at,
      metrics,
      tag: row.tag,
    };
  });
}
333
+
334
/**
 * Delete the baseline stored for a scenario within this adapter's project.
 *
 * @param scenario - Scenario whose baseline should be removed.
 * @returns `true` when at least one row was deleted, `false` otherwise.
 * @throws Error when the delete request fails.
 */
async removeBaseline(scenario: string): Promise<boolean> {
  // The affected-row count is only returned when explicitly requested;
  // without the `count` option it is always null and this method could
  // never report a successful removal.
  const { error, count } = await this.client
    .from('baselines')
    .delete({ count: 'exact' })
    .eq('project', this.project)
    .eq('scenario', scenario);

  if (error) {
    throw new Error(`Failed to remove baseline: ${error.message}`);
  }

  return (count ?? 0) > 0;
}
347
+
348
/**
 * Delete every baseline row that references the given run.
 *
 * The delete filters on `run_id` only; it is not restricted to this
 * adapter's project.
 *
 * @param runId - Run identifier stored on the baseline row(s).
 * @returns `true` when at least one row was deleted, `false` otherwise.
 * @throws Error when the delete request fails.
 */
async removeBaselineByRunId(runId: string): Promise<boolean> {
  // Request an exact count: without the `count` option the response carries
  // no row count and the result below would always be false.
  const { error, count } = await this.client
    .from('baselines')
    .delete({ count: 'exact' })
    .eq('run_id', runId);

  if (error) {
    throw new Error(`Failed to remove baseline: ${error.message}`);
  }

  return (count ?? 0) > 0;
}
357
+
358
/**
 * Compare a run against the baseline registered for its scenario.
 *
 * The run's scenario is read from `runs`, the matching baseline is fetched
 * through `getBaseline`, and the two runs are diffed with `compare`
 * (baseline first, candidate second).
 *
 * @param runId - Run to evaluate.
 * @param regressionThreshold - Allowed drop in success rate; a regression is
 *   flagged when the success-rate delta is below the negated threshold.
 * @returns The baseline, the comparison, the regression flag and the
 *   threshold used; `null` when the run cannot be loaded or no baseline
 *   exists for its scenario.
 */
async compareToBaseline(
  runId: string,
  regressionThreshold = 0.05
): Promise<{
  baseline: BaselineMetadata;
  comparison: ComparisonResult;
  hasRegression: boolean;
  regressionThreshold: number;
} | null> {
  const runLookup = await this.client
    .from('runs')
    .select('scenario')
    .eq('run_id', runId)
    .single();

  if (runLookup.error || !runLookup.data) {
    return null;
  }

  const baseline = await this.getBaseline(runLookup.data.scenario);
  if (!baseline) {
    return null;
  }

  const comparison = await this.compare(baseline.runId, runId);

  // Only a drop larger than the threshold counts as a regression.
  const allowedDrop = -regressionThreshold;
  const hasRegression = comparison.delta.successRate < allowedDrop;

  return { baseline, comparison, hasRegression, regressionThreshold };
}
397
+
398
+ // ============================================================================
399
+ // Case Results Methods
400
+ // ============================================================================
401
+
402
/**
 * Insert or update a single case result, keyed on `run_id,case_id`.
 *
 * @param result - Case result to persist; missing `tags` are stored as an empty array.
 * @returns The stored row's `id`, or `result.caseId` when no id comes back.
 * @throws Error when the upsert fails.
 */
async saveCaseResult(result: CaseResultRecord): Promise<string> {
  const {
    runId,
    caseId,
    caseName,
    status,
    score,
    matcherType,
    reason,
    response,
    latencyMs,
    promptTokens,
    completionTokens,
    totalTokens,
    error: caseError,
    tags,
  } = result;

  // Translate camelCase fields to the snake_case column names.
  const row = {
    run_id: runId,
    case_id: caseId,
    case_name: caseName,
    status,
    score,
    matcher_type: matcherType,
    reason,
    response,
    latency_ms: latencyMs,
    prompt_tokens: promptTokens,
    completion_tokens: completionTokens,
    total_tokens: totalTokens,
    error: caseError,
    tags: tags || [],
  };

  const outcome = await this.client
    .from('case_results')
    .upsert(row, { onConflict: 'run_id,case_id' })
    .select('id')
    .single();

  if (outcome.error) {
    throw new Error(`Failed to save case result: ${outcome.error.message}`);
  }

  return outcome.data?.id || caseId;
}
432
+
433
/**
 * Insert or update a batch of case results in one request, keyed on
 * `run_id,case_id`.
 *
 * @param results - Case results to persist; an empty list short-circuits without a request.
 * @returns The `id` of each row returned by the upsert.
 * @throws Error when the upsert fails.
 */
async saveCaseResults(results: CaseResultRecord[]): Promise<string[]> {
  if (results.length === 0) {
    return [];
  }

  // Translate one record from camelCase fields to snake_case columns.
  const toRow = (record: CaseResultRecord) => ({
    run_id: record.runId,
    case_id: record.caseId,
    case_name: record.caseName,
    status: record.status,
    score: record.score,
    matcher_type: record.matcherType,
    reason: record.reason,
    response: record.response,
    latency_ms: record.latencyMs,
    prompt_tokens: record.promptTokens,
    completion_tokens: record.completionTokens,
    total_tokens: record.totalTokens,
    error: record.error,
    tags: record.tags || [],
  });

  const rows = results.map((record) => toRow(record));

  const outcome = await this.client
    .from('case_results')
    .upsert(rows, { onConflict: 'run_id,case_id' })
    .select('id');

  if (outcome.error) {
    throw new Error(`Failed to save case results: ${outcome.error.message}`);
  }

  const saved = outcome.data || [];
  return saved.map((entry) => entry.id);
}
466
+
467
/**
 * Fetch every case result stored for a run, oldest first (ordered by
 * `created_at` ascending).
 *
 * @param runId - Run whose case results are requested.
 * @returns The case results; an empty array when there are none.
 * @throws Error when the query fails.
 */
async getCaseResults(runId: string): Promise<CaseResultRecord[]> {
  const outcome = await this.client
    .from('case_results')
    .select('*')
    .eq('run_id', runId)
    .order('created_at', { ascending: true });

  if (outcome.error) {
    throw new Error(`Failed to get case results: ${outcome.error.message}`);
  }

  const rows = outcome.data || [];

  // Translate snake_case columns back to the camelCase record shape.
  return rows.map((row) => {
    return {
      id: row.id,
      runId: row.run_id,
      caseId: row.case_id,
      caseName: row.case_name,
      status: row.status,
      score: row.score,
      matcherType: row.matcher_type,
      reason: row.reason,
      response: row.response,
      latencyMs: row.latency_ms,
      promptTokens: row.prompt_tokens,
      completionTokens: row.completion_tokens,
      totalTokens: row.total_tokens,
      error: row.error,
      tags: row.tags,
      createdAt: row.created_at,
    };
  });
}
497
+
498
/**
 * Query case results with optional filters and pagination, newest first
 * (ordered by `created_at` descending).
 *
 * Filters are applied only for options that are set: `runId`, `caseId` and
 * `status` match exactly; `tags` matches rows whose tags overlap the given
 * list.
 *
 * Pagination:
 * - `offset` and `limit`: rows `offset .. offset + limit - 1`.
 * - `limit` only: the first `limit` rows.
 * - `offset` only: rows starting at `offset`, capped at a fallback page size
 *   of 1000. Previously an offset without a limit was silently ignored and
 *   the result started from the first row.
 *
 * @param options - Filter and pagination options.
 * @returns The matching case results; an empty array when nothing matches.
 * @throws Error when the query fails.
 */
async queryCaseResults(options: CaseResultQueryOptions): Promise<CaseResultRecord[]> {
  // Page size used when the caller supplies an offset without a limit.
  const FALLBACK_PAGE_SIZE = 1000;

  let query = this.client
    .from('case_results')
    .select('*')
    .order('created_at', { ascending: false });

  if (options.runId) {
    query = query.eq('run_id', options.runId);
  }
  if (options.caseId) {
    query = query.eq('case_id', options.caseId);
  }
  if (options.status) {
    query = query.eq('status', options.status);
  }
  if (options.tags && options.tags.length > 0) {
    query = query.overlaps('tags', options.tags);
  }

  const { offset, limit } = options;
  if (offset) {
    // range() takes inclusive bounds, hence the "- 1".
    const pageSize = limit || FALLBACK_PAGE_SIZE;
    query = query.range(offset, offset + pageSize - 1);
  } else if (limit) {
    query = query.limit(limit);
  }

  const { data, error } = await query;

  if (error) {
    throw new Error(`Failed to query case results: ${error.message}`);
  }

  // Translate snake_case columns back to the camelCase record shape.
  return (data || []).map((r) => ({
    id: r.id,
    runId: r.run_id,
    caseId: r.case_id,
    caseName: r.case_name,
    status: r.status,
    score: r.score,
    matcherType: r.matcher_type,
    reason: r.reason,
    response: r.response,
    latencyMs: r.latency_ms,
    promptTokens: r.prompt_tokens,
    completionTokens: r.completion_tokens,
    totalTokens: r.total_tokens,
    error: r.error,
    tags: r.tags,
    createdAt: r.created_at,
  }));
}
547
+
548
+ // ============================================================================
549
+ // Metrics History Methods
550
+ // ============================================================================
551
+
552
/**
 * Insert or update a metrics snapshot in `metrics_history`, keyed on
 * `date,project,scenario`.
 *
 * A falsy `scenario` is stored as `null`.
 *
 * NOTE(review): whether a `null` scenario participates in the conflict
 * target depends on how the table's uniqueness is defined — confirm that
 * repeated saves of a scenario-less snapshot update one row rather than
 * adding new ones.
 *
 * @param snapshot - Aggregated metrics to persist.
 * @returns The stored row's `id`, or `"<date>-<project>"` when no id comes back.
 * @throws Error when the upsert fails.
 */
async saveMetricsSnapshot(snapshot: MetricsSnapshot): Promise<string> {
  const row = {
    date: snapshot.date,
    project: snapshot.project,
    scenario: snapshot.scenario || null,
    total_runs: snapshot.totalRuns,
    total_cases: snapshot.totalCases,
    passed_cases: snapshot.passedCases,
    failed_cases: snapshot.failedCases,
    avg_success_rate: snapshot.avgSuccessRate,
    avg_latency_ms: snapshot.avgLatencyMs,
    avg_tokens_per_run: snapshot.avgTokensPerRun,
    min_success_rate: snapshot.minSuccessRate,
    max_success_rate: snapshot.maxSuccessRate,
    min_latency_ms: snapshot.minLatencyMs,
    max_latency_ms: snapshot.maxLatencyMs,
    total_tokens: snapshot.totalTokens,
  };

  const outcome = await this.client
    .from('metrics_history')
    .upsert(row, { onConflict: 'date,project,scenario' })
    .select('id')
    .single();

  if (outcome.error) {
    throw new Error(`Failed to save metrics snapshot: ${outcome.error.message}`);
  }

  const fallbackId = `${snapshot.date}-${snapshot.project}`;
  return outcome.data?.id || fallbackId;
}
583
+
584
/**
 * Read a time series of stored metrics for a project, ordered by `date`
 * ascending.
 *
 * With `scenario` set, only that scenario's snapshots are returned; without
 * it, only snapshots whose scenario is `null` are returned. `startDate` and
 * `endDate` are inclusive bounds on `date`; `limit` caps the number of rows.
 *
 * @param options - Project, optional scenario, date bounds and limit.
 * @returns One trend point per stored snapshot; an empty array when none match.
 * @throws Error when the query fails.
 */
async getMetricsTrend(options: MetricsTrendOptions): Promise<TrendDataPoint[]> {
  const { project, scenario, startDate, endDate, limit } = options;

  let query = this.client
    .from('metrics_history')
    .select('date, avg_success_rate, avg_latency_ms, total_runs, total_tokens')
    .eq('project', project)
    .order('date', { ascending: true });

  query = scenario ? query.eq('scenario', scenario) : query.is('scenario', null);

  if (startDate) {
    query = query.gte('date', startDate);
  }
  if (endDate) {
    query = query.lte('date', endDate);
  }
  if (limit) {
    query = query.limit(limit);
  }

  const outcome = await query;

  if (outcome.error) {
    throw new Error(`Failed to get metrics trend: ${outcome.error.message}`);
  }

  const rows = outcome.data || [];
  return rows.map((row) => ({
    date: row.date,
    successRate: row.avg_success_rate,
    latencyMs: row.avg_latency_ms,
    totalRuns: row.total_runs,
    totalTokens: row.total_tokens,
  }));
}
621
+
622
/**
 * Fetch the stored metrics snapshot for one date and project.
 *
 * With `scenario` set, the snapshot for that scenario is requested; without
 * it, the snapshot whose scenario is `null` is requested.
 *
 * @param date - Snapshot date to match against the `date` column.
 * @param project - Project name.
 * @param scenario - Optional scenario name.
 * @returns The snapshot, or `null` when the query reports an error or yields no row.
 */
async getMetricsSnapshot(
  date: string,
  project: string,
  scenario?: string
): Promise<MetricsSnapshot | null> {
  const base = this.client
    .from('metrics_history')
    .select('*')
    .eq('date', date)
    .eq('project', project);

  const scoped = scenario ? base.eq('scenario', scenario) : base.is('scenario', null);

  const outcome = await scoped.single();
  const row = outcome.data;

  if (outcome.error || !row) {
    return null;
  }

  // Translate snake_case columns back to the camelCase snapshot shape.
  return {
    id: row.id,
    date: row.date,
    project: row.project,
    scenario: row.scenario,
    totalRuns: row.total_runs,
    totalCases: row.total_cases,
    passedCases: row.passed_cases,
    failedCases: row.failed_cases,
    avgSuccessRate: row.avg_success_rate,
    avgLatencyMs: row.avg_latency_ms,
    avgTokensPerRun: row.avg_tokens_per_run,
    minSuccessRate: row.min_success_rate,
    maxSuccessRate: row.max_success_rate,
    minLatencyMs: row.min_latency_ms,
    maxLatencyMs: row.max_latency_ms,
    totalTokens: row.total_tokens,
    createdAt: row.created_at,
    updatedAt: row.updated_at,
  };
}
666
+
667
/**
 * Aggregate the runs of one UTC day into a metrics snapshot and persist it.
 *
 * Runs are selected by `project`, by `started_at` falling within the given
 * day, and by `scenario` when one is supplied. The resulting snapshot is
 * saved through `saveMetricsSnapshot` before being returned; when no runs
 * match, a zeroed snapshot (without min/max fields) is saved and returned.
 *
 * The day is matched as the half-open range `[date 00:00Z, next day 00:00Z)`.
 * The earlier inclusive upper bound of `23:59:59.999Z` excluded runs whose
 * timestamp fell after that millisecond but before midnight.
 *
 * @param date - Day to aggregate, used as the date part of an ISO timestamp (e.g. `2024-05-01`).
 * @param project - Project whose runs are aggregated.
 * @param scenario - Optional scenario filter.
 * @returns The snapshot that was saved.
 * @throws Error when the runs query fails or the snapshot cannot be saved.
 */
async aggregateDailyMetrics(
  date: string,
  project: string,
  scenario?: string
): Promise<MetricsSnapshot> {
  const MS_PER_DAY = 24 * 60 * 60 * 1000;

  const startOfDay = `${date}T00:00:00.000Z`;
  const startMs = Date.parse(startOfDay);

  let query = this.client
    .from('runs')
    .select('*')
    .eq('project', project)
    .gte('started_at', startOfDay);

  // Prefer an exclusive upper bound at the next midnight. If the date cannot
  // be parsed locally, keep the legacy inclusive bound and let the database
  // validate the value as before.
  query = Number.isNaN(startMs)
    ? query.lte('started_at', `${date}T23:59:59.999Z`)
    : query.lt('started_at', new Date(startMs + MS_PER_DAY).toISOString());

  if (scenario) {
    query = query.eq('scenario', scenario);
  }

  const { data: runs, error } = await query;

  if (error) {
    throw new Error(`Failed to aggregate metrics: ${error.message}`);
  }

  const runList = runs || [];

  if (runList.length === 0) {
    // No runs that day: persist a zeroed snapshot so the day is still recorded.
    const emptySnapshot: MetricsSnapshot = {
      date,
      project,
      scenario,
      totalRuns: 0,
      totalCases: 0,
      passedCases: 0,
      failedCases: 0,
      avgSuccessRate: 0,
      avgLatencyMs: 0,
      avgTokensPerRun: 0,
      totalTokens: 0,
    };
    await this.saveMetricsSnapshot(emptySnapshot);
    return emptySnapshot;
  }

  // Totals across all runs of the day.
  const totalRuns = runList.length;
  const totalCases = runList.reduce((sum, r) => sum + r.total_cases, 0);
  const passedCases = runList.reduce((sum, r) => sum + r.passed_cases, 0);
  const failedCases = runList.reduce((sum, r) => sum + r.failed_cases, 0);
  const totalTokens = runList.reduce((sum, r) => sum + r.total_tokens, 0);

  // Per-run series used for the averages and the min/max spread.
  const successRates = runList.map((r) => r.success_rate);
  const latencies = runList.map((r) => r.median_latency_ms);

  const snapshot: MetricsSnapshot = {
    date,
    project,
    scenario,
    totalRuns,
    totalCases,
    passedCases,
    failedCases,
    avgSuccessRate: successRates.reduce((a, b) => a + b, 0) / totalRuns,
    // Average of each run's median latency, not a latency over all cases.
    avgLatencyMs: latencies.reduce((a, b) => a + b, 0) / totalRuns,
    avgTokensPerRun: totalTokens / totalRuns,
    minSuccessRate: Math.min(...successRates),
    maxSuccessRate: Math.max(...successRates),
    minLatencyMs: Math.min(...latencies),
    maxLatencyMs: Math.max(...latencies),
    totalTokens,
  };

  await this.saveMetricsSnapshot(snapshot);
  return snapshot;
}
151
745
  }