@agentlensai/server 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36):
  1. package/dist/db/benchmark-store.d.ts +74 -0
  2. package/dist/db/benchmark-store.d.ts.map +1 -0
  3. package/dist/db/benchmark-store.js +268 -0
  4. package/dist/db/benchmark-store.js.map +1 -0
  5. package/dist/db/migrate.d.ts.map +1 -1
  6. package/dist/db/migrate.js +48 -0
  7. package/dist/db/migrate.js.map +1 -1
  8. package/dist/index.d.ts +1 -0
  9. package/dist/index.d.ts.map +1 -1
  10. package/dist/index.js +12 -0
  11. package/dist/index.js.map +1 -1
  12. package/dist/lib/benchmark/engine.d.ts +24 -0
  13. package/dist/lib/benchmark/engine.d.ts.map +1 -0
  14. package/dist/lib/benchmark/engine.js +159 -0
  15. package/dist/lib/benchmark/engine.js.map +1 -0
  16. package/dist/lib/benchmark/metric-aggregator.d.ts +38 -0
  17. package/dist/lib/benchmark/metric-aggregator.d.ts.map +1 -0
  18. package/dist/lib/benchmark/metric-aggregator.js +159 -0
  19. package/dist/lib/benchmark/metric-aggregator.js.map +1 -0
  20. package/dist/lib/benchmark/statistical.d.ts +51 -0
  21. package/dist/lib/benchmark/statistical.d.ts.map +1 -0
  22. package/dist/lib/benchmark/statistical.js +381 -0
  23. package/dist/lib/benchmark/statistical.js.map +1 -0
  24. package/dist/lib/replay/builder.d.ts +28 -0
  25. package/dist/lib/replay/builder.d.ts.map +1 -0
  26. package/dist/lib/replay/builder.js +482 -0
  27. package/dist/lib/replay/builder.js.map +1 -0
  28. package/dist/routes/benchmarks.d.ts +18 -0
  29. package/dist/routes/benchmarks.d.ts.map +1 -0
  30. package/dist/routes/benchmarks.js +312 -0
  31. package/dist/routes/benchmarks.js.map +1 -0
  32. package/dist/routes/replay.d.ts +28 -0
  33. package/dist/routes/replay.d.ts.map +1 -0
  34. package/dist/routes/replay.js +140 -0
  35. package/dist/routes/replay.js.map +1 -0
  36. package/package.json +2 -2
@@ -0,0 +1,159 @@
1
+ /**
2
+ * BenchmarkEngine (Story 3.4)
3
+ *
4
+ * Orchestrator that combines MetricAggregator and StatisticalComparator
5
+ * to produce full benchmark results with human-readable summaries.
6
+ */
7
+ import { MetricAggregator } from './metric-aggregator.js';
8
+ import { StatisticalComparator } from './statistical.js';
9
+ // ─── Constants ─────────────────────────────────────────────
10
+ const MIN_SESSIONS_FOR_RELIABLE_RESULTS = 30;
11
+ // ─── Metric display names ──────────────────────────────────
12
+ const METRIC_DISPLAY_NAMES = {
13
+ health_score: 'health score',
14
+ error_rate: 'error rate',
15
+ avg_cost: 'cost',
16
+ avg_latency: 'latency',
17
+ tool_success_rate: 'tool success rate',
18
+ completion_rate: 'completion rate',
19
+ avg_tokens: 'tokens',
20
+ avg_duration: 'duration',
21
+ };
22
+ // ─── BenchmarkEngine ───────────────────────────────────────
23
export class BenchmarkEngine {
    /** Produces per-variant descriptive statistics for the requested metrics. */
    aggregator = new MetricAggregator();
    /** Produces one comparison record per (variant pair, metric). */
    comparator = new StatisticalComparator();
    /**
     * Compute full benchmark results: aggregation + statistical comparison.
     *
     * Completed benchmarks are served from `benchmarkStore.getResults()` when a
     * cached entry exists, and freshly computed results are written back with
     * `benchmarkStore.saveResults()`. Benchmarks in any other status are always
     * recomputed and never cached.
     *
     * @param benchmark - Benchmark definition; this method reads `id`, `tenantId`,
     *   `status`, `variants`, `metrics` and `timeRange`.
     * @param eventStore - Store handed to the aggregator to query sessions.
     * @param benchmarkStore - Optional results cache; caching is skipped when omitted.
     * @returns An object with `benchmarkId`, `tenantId`, per-variant `variants`,
     *   pairwise `comparisons`, a text `summary`, and an ISO-8601 `computedAt`.
     */
    async computeResults(benchmark, eventStore, benchmarkStore) {
        // For completed benchmarks, try to return cached results first.
        if (benchmark.status === 'completed' && benchmarkStore) {
            const cached = benchmarkStore.getResults(benchmark.tenantId, benchmark.id);
            if (cached)
                return cached;
        }
        const { variants, metrics } = benchmark;
        // Step 1: aggregate metrics for each variant. Variants are processed one
        // at a time so the output order matches `benchmark.variants`.
        const variantMetricsList = [];
        const variantData = [];
        for (const variant of variants) {
            const aggregated = await this.aggregator.aggregate(eventStore, variant, metrics, benchmark.timeRange);
            variantMetricsList.push({
                variantId: variant.id,
                variantName: variant.name,
                sessionCount: aggregated.sessionCount,
                metrics: aggregated.metrics,
            });
            variantData.push({
                id: variant.id,
                name: variant.name,
                metrics: aggregated.metrics,
            });
        }
        // Step 2: compare every unordered pair of variants on every metric.
        const comparisons = [];
        for (let i = 0; i < variantData.length; i++) {
            for (let j = i + 1; j < variantData.length; j++) {
                for (const metric of metrics) {
                    try {
                        comparisons.push(this.comparator.compare(variantData[i], variantData[j], metric));
                    }
                    catch {
                        // Deliberate best-effort: skip metrics where data is
                        // missing for a variant instead of failing the whole run.
                    }
                }
            }
        }
        // Step 3: generate the summary and assemble the result record.
        const results = {
            benchmarkId: benchmark.id,
            tenantId: benchmark.tenantId,
            variants: variantMetricsList,
            comparisons,
            summary: this.formatSummary(comparisons, variantMetricsList),
            computedAt: new Date().toISOString(),
        };
        // Cache results for completed benchmarks only.
        if (benchmark.status === 'completed' && benchmarkStore) {
            benchmarkStore.saveResults(benchmark.tenantId, benchmark.id, results);
        }
        return results;
    }
    /**
     * Generate a human-readable summary of benchmark comparisons.
     *
     * The summary is a newline-joined list of sentences, in this order:
     * an insufficient-data warning (if any variant is below the session
     * threshold), one "X outperforms Y on ..." sentence per winner/loser pair,
     * and a closing note listing metrics that had a non-significant comparison.
     * When nothing is significant, a single "no significant differences"
     * sentence with the current sample sizes is emitted instead.
     *
     * @param comparisons - Comparison records; this method reads `significant`,
     *   `winner`, `metric`, `percentDiff`, `confidence`, `variantA` and `variantB`.
     * @param variants - Per-variant entries; this method reads `variantId`,
     *   `variantName` and `sessionCount`.
     * @returns The summary text.
     */
    formatSummary(comparisons, variants) {
        const parts = [];
        const displayName = (metric) => METRIC_DISPLAY_NAMES[metric] ?? metric;
        // Render a percent difference with no decimals. Missing or non-finite
        // values become "n/a" instead of throwing or printing "Infinity%", and a
        // value that rounds to zero is printed without a sign (never "-0%").
        const formatPercent = (percentDiff) => {
            if (!Number.isFinite(percentDiff))
                return 'n/a';
            const rounded = percentDiff.toFixed(0);
            if (Number(rounded) === 0)
                return '0%';
            return `${percentDiff > 0 ? '+' : ''}${rounded}%`;
        };
        // Warning: insufficient data
        const lowDataVariants = variants.filter((v) => v.sessionCount < MIN_SESSIONS_FOR_RELIABLE_RESULTS);
        if (lowDataVariants.length > 0) {
            const names = lowDataVariants
                .map((v) => `${v.variantName}: ${v.sessionCount} sessions`)
                .join(', ');
            parts.push(`⚠️ Insufficient data (${names}). Results may be unreliable.`);
        }
        const significant = comparisons.filter((c) => c.significant);
        if (significant.length === 0) {
            parts.push('No significant differences found between variants.' +
                (variants.length > 0
                    ? ` Current sample sizes: ${variants.map((v) => `${v.variantName}: ${v.sessionCount}`).join(', ')}.`
                    : ''));
            return parts.join('\n');
        }
        // Build wins map: winnerId → list of significant comparisons it won.
        const winsByVariant = new Map();
        for (const comp of significant) {
            if (!comp.winner)
                continue;
            const wins = winsByVariant.get(comp.winner) ?? [];
            wins.push(comp);
            winsByVariant.set(comp.winner, wins);
        }
        // Resolve variant names; names carried on comparisons take precedence
        // because they are written last.
        const variantNameMap = new Map();
        for (const v of variants) {
            variantNameMap.set(v.variantId, v.variantName);
        }
        for (const c of comparisons) {
            variantNameMap.set(c.variantA.id, c.variantA.name);
            variantNameMap.set(c.variantB.id, c.variantB.name);
        }
        // Generate one sentence per (winner, loser) pair.
        for (const [winnerId, wins] of winsByVariant) {
            const winnerName = variantNameMap.get(winnerId) ?? winnerId;
            // Group this winner's comparisons by the variant it beat. Map
            // insertion order keeps losers in first-seen order.
            const winsByLoser = new Map();
            for (const comp of wins) {
                let loserId;
                if (comp.winner === comp.variantA.id) {
                    loserId = comp.variantB.id;
                }
                else if (comp.winner === comp.variantB.id) {
                    loserId = comp.variantA.id;
                }
                else {
                    // The winner is not part of this pair; there is no loser
                    // to report, so do not emit a sentence with no metrics.
                    continue;
                }
                const against = winsByLoser.get(loserId) ?? [];
                against.push(comp);
                winsByLoser.set(loserId, against);
            }
            for (const [loserId, relevantWins] of winsByLoser) {
                const loserName = variantNameMap.get(loserId) ?? loserId;
                const metricTexts = relevantWins.map((comp) => `${displayName(comp.metric)} (${formatPercent(comp.percentDiff)}, ${comp.confidence})`);
                parts.push(`${winnerName} outperforms ${loserName} on ${metricTexts.join(' and ')}.`);
            }
        }
        // Note metrics that had at least one non-significant comparison.
        const notSignificant = comparisons.filter((c) => !c.significant);
        if (notSignificant.length > 0) {
            const nsMetrics = [...new Set(notSignificant.map((c) => displayName(c.metric)))];
            parts.push(`No significant difference on ${nsMetrics.join(', ')}.`);
        }
        return parts.join('\n');
    }
}
159
+ //# sourceMappingURL=engine.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"engine.js","sourceRoot":"","sources":["../../../src/lib/benchmark/engine.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAUH,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,qBAAqB,EAAE,MAAM,kBAAkB,CAAC;AAGzD,8DAA8D;AAE9D,MAAM,iCAAiC,GAAG,EAAE,CAAC;AAE7C,8DAA8D;AAE9D,MAAM,oBAAoB,GAAoC;IAC5D,YAAY,EAAE,cAAc;IAC5B,UAAU,EAAE,YAAY;IACxB,QAAQ,EAAE,MAAM;IAChB,WAAW,EAAE,SAAS;IACtB,iBAAiB,EAAE,mBAAmB;IACtC,eAAe,EAAE,iBAAiB;IAClC,UAAU,EAAE,QAAQ;IACpB,YAAY,EAAE,UAAU;CACzB,CAAC;AAEF,8DAA8D;AAE9D,MAAM,OAAO,eAAe;IACT,UAAU,GAAG,IAAI,gBAAgB,EAAE,CAAC;IACpC,UAAU,GAAG,IAAI,qBAAqB,EAAE,CAAC;IAE1D;;;;;OAKG;IACH,KAAK,CAAC,cAAc,CAClB,SAAgC,EAChC,UAAuB,EACvB,cAA+B;QAE/B,+DAA+D;QAC/D,IAAI,SAAS,CAAC,MAAM,KAAK,WAAW,IAAI,cAAc,EAAE,CAAC;YACvD,MAAM,MAAM,GAAG,cAAc,CAAC,UAAU,CAAC,SAAS,CAAC,QAAQ,EAAE,SAAS,CAAC,EAAE,CAAC,CAAC;YAC3E,IAAI,MAAM;gBAAE,OAAO,MAAM,CAAC;QAC5B,CAAC;QAED,MAAM,QAAQ,GAAG,SAAS,CAAC,QAAQ,CAAC;QACpC,MAAM,OAAO,GAAG,SAAS,CAAC,OAAO,CAAC;QAElC,6CAA6C;QAC7C,MAAM,kBAAkB,GAAqB,EAAE,CAAC;QAChD,MAAM,WAAW,GAIZ,EAAE,CAAC;QAER,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,SAAS,CAChD,UAAU,EACV,OAAO,EACP,OAAO,EACP,SAAS,CAAC,SAAS,CACpB,CAAC;YAEF,kBAAkB,CAAC,IAAI,CAAC;gBACtB,SAAS,EAAE,OAAO,CAAC,EAAE;gBACrB,WAAW,EAAE,OAAO,CAAC,IAAI;gBACzB,YAAY,EAAE,UAAU,CAAC,YAAY;gBACrC,OAAO,EAAE,UAAU,CAAC,OAAO;aAC5B,CAAC,CAAC;YAEH,WAAW,CAAC,IAAI,CAAC;gBACf,EAAE,EAAE,OAAO,CAAC,EAAE;gBACd,IAAI,EAAE,OAAO,CAAC,IAAI;gBAClB,OAAO,EAAE,UAAU,CAAC,OAAO;aAC5B,CAAC,CAAC;QACL,CAAC;QAED,+CAA+C;QAC/C,MAAM,WAAW,GAAuB,EAAE,CAAC;QAE3C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC5C,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAChD,MAAM,EAAE,GAAG,WAAW,CAAC,CAAC,CAAE,CAAC;gBAC3B,MAAM,EAAE,GAAG,WAAW,CAAC,CAAC,CAAE,CAAC;gBAE3B,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;oBAC7B,IAAI,CAAC;wBACH,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,EAAE,EAAE,EAAE,MAAM,CAAC,CAAC;wBAC3D,WAAW,CAAC,IAAI,CAAC,UAA
U,CAAC,CAAC;oBAC/B,CAAC;oBAAC,MAAM,CAAC;wBACP,mDAAmD;oBACrD,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAED,2BAA2B;QAC3B,MAAM,OAAO,GAAG,IAAI,CAAC,aAAa,CAAC,WAAW,EAAE,kBAAkB,CAAC,CAAC;QAEpE,MAAM,OAAO,GAAqB;YAChC,WAAW,EAAE,SAAS,CAAC,EAAE;YACzB,QAAQ,EAAE,SAAS,CAAC,QAAQ;YAC5B,QAAQ,EAAE,kBAAkB;YAC5B,WAAW;YACX,OAAO;YACP,UAAU,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACrC,CAAC;QAEF,yCAAyC;QACzC,IAAI,SAAS,CAAC,MAAM,KAAK,WAAW,IAAI,cAAc,EAAE,CAAC;YACvD,cAAc,CAAC,WAAW,CAAC,SAAS,CAAC,QAAQ,EAAE,SAAS,CAAC,EAAE,EAAE,OAAO,CAAC,CAAC;QACxE,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;IAED;;OAEG;IACH,aAAa,CACX,WAA+B,EAC/B,QAA0B;QAE1B,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,6BAA6B;QAC7B,MAAM,eAAe,GAAG,QAAQ,CAAC,MAAM,CACrC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,YAAY,GAAG,iCAAiC,CAC1D,CAAC;QACF,IAAI,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC/B,MAAM,KAAK,GAAG,eAAe;iBAC1B,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,WAAW,KAAK,CAAC,CAAC,YAAY,WAAW,CAAC;iBAC1D,IAAI,CAAC,IAAI,CAAC,CAAC;YACd,KAAK,CAAC,IAAI,CAAC,yBAAyB,KAAK,+BAA+B,CAAC,CAAC;QAC5E,CAAC;QAED,0CAA0C;QAC1C,MAAM,WAAW,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;QAE7D,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC7B,KAAK,CAAC,IAAI,CACR,oDAAoD;gBAClD,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC;oBAClB,CAAC,CAAC,0BAA0B,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,WAAW,KAAK,CAAC,CAAC,YAAY,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG;oBACpG,CAAC,CAAC,EAAE,CAAC,CACV,CAAC;YACF,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC1B,CAAC;QAED,iDAAiD;QACjD,MAAM,aAAa,GAAG,IAAI,GAAG,EAA8B,CAAC;QAC5D,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;YAC/B,IAAI,CAAC,IAAI,CAAC,MAAM;gBAAE,SAAS;YAC3B,MAAM,IAAI,GAAG,aAAa,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;YAClD,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAChB,aAAa,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;QACvC,CAAC;QAED,qBAAqB;QACrB,MAAM,cAAc,GAAG,IAAI,GAAG,EAAkB,CAAC;QACjD,KAAK,MAAM,CAAC,IAAI,QAAQ,EAAE,CAAC;YACzB,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,WAAW,CAAC,CAAC;QACjD,CAAC;QACD,6CAA6C;QAC7C,KAAK,MAAM,CAAC
,IAAI,WAAW,EAAE,CAAC;YAC5B,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;YACnD,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QACrD,CAAC;QAED,gCAAgC;QAChC,KAAK,MAAM,CAAC,QAAQ,EAAE,IAAI,CAAC,IAAI,aAAa,EAAE,CAAC;YAC7C,MAAM,UAAU,GAAG,cAAc,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,QAAQ,CAAC;YAE5D,iBAAiB;YACjB,MAAM,QAAQ,GAAG,IAAI,GAAG,CACtB,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CACb,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,EAAE,CAC3D,CACF,CAAC;YAEF,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;gBAC/B,MAAM,SAAS,GAAG,cAAc,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,OAAO,CAAC;gBACzD,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,CAC9B,CAAC,CAAC,EAAE,EAAE,CACJ,CAAC,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,QAAQ,CAAC,EAAE,IAAI,CAAC,CAAC,QAAQ,CAAC,EAAE,KAAK,OAAO,CAAC;oBACzD,CAAC,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,QAAQ,CAAC,EAAE,IAAI,CAAC,CAAC,QAAQ,CAAC,EAAE,KAAK,OAAO,CAAC,CAC5D,CAAC;gBACF,MAAM,WAAW,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;oBAC5C,MAAM,UAAU,GAAG,oBAAoB,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,IAAI,CAAC,MAAM,CAAC;oBACpE,MAAM,OAAO,GAAG,IAAI,CAAC,WAAW,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;oBACjD,MAAM,MAAM,GAAG,GAAG,OAAO,GAAG,IAAI,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;oBAC3D,OAAO,GAAG,UAAU,KAAK,MAAM,KAAK,IAAI,CAAC,UAAU,GAAG,CAAC;gBACzD,CAAC,CAAC,CAAC;gBAEH,KAAK,CAAC,IAAI,CACR,GAAG,UAAU,gBAAgB,SAAS,OAAO,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAC1E,CAAC;YACJ,CAAC;QACH,CAAC;QAED,+CAA+C;QAC/C,MAAM,cAAc,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;QACjE,IAAI,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC9B,MAAM,SAAS,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,oBAAoB,CAAC,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;YACtG,KAAK,CAAC,IAAI,CAAC,gCAAgC,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACtE,CAAC;QAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC;CACF"}
@@ -0,0 +1,38 @@
1
+ /**
2
+ * MetricAggregator (Story 3.2)
3
+ *
4
+ * Computes descriptive statistics for each benchmark metric across a variant's sessions.
5
+ * Provides clean inputs for the StatisticalComparator.
6
+ */
7
+ import type { BenchmarkMetric, BenchmarkVariant, IEventStore, MetricStats, Session } from '@agentlensai/core';
8
export declare class MetricAggregator {
    /**
     * Collect the sessions recorded for a variant and summarise the requested
     * metrics over them. Sessions are fetched from the store page by page until
     * a short page is returned.
     *
     * @param store - Event store whose sessions are queried
     * @param variant - Benchmark variant; its tenant, agent and tag select the sessions
     * @param metrics - Metrics to summarise
     * @param timeRange - Optional `from`/`to` bounds forwarded to the session query
     * @returns The number of sessions fetched plus one MetricStats entry per requested metric
     */
    aggregate(
        store: IEventStore,
        variant: BenchmarkVariant,
        metrics: BenchmarkMetric[],
        timeRange?: { from: string; to: string }
    ): Promise<{ sessionCount: number; metrics: Record<BenchmarkMetric, MetricStats> }>;
    /**
     * Derive one numeric value per session for the given metric. Sessions for
     * which the metric cannot be derived are left out, so the result may be
     * shorter than `sessions`.
     */
    extractMetricValues(sessions: Session[], metric: BenchmarkMetric): number[];
    /**
     * Derive the value of one metric for one session (internal helper).
     */
    private extractSingleValue;
    /**
     * Summarise numeric values as mean, median, sample standard deviation,
     * min, max and count. An empty input yields all-zero statistics.
     */
    computeStats(values: number[]): MetricStats;
}
38
+ //# sourceMappingURL=metric-aggregator.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"metric-aggregator.d.ts","sourceRoot":"","sources":["../../../src/lib/benchmark/metric-aggregator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EACV,eAAe,EACf,gBAAgB,EAChB,WAAW,EACX,WAAW,EACX,OAAO,EACR,MAAM,mBAAmB,CAAC;AAI3B,qBAAa,gBAAgB;IAC3B;;;;;;;;OAQG;IACG,SAAS,CACb,KAAK,EAAE,WAAW,EAClB,OAAO,EAAE,gBAAgB,EACzB,OAAO,EAAE,eAAe,EAAE,EAC1B,SAAS,CAAC,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,EAAE,EAAE,MAAM,CAAA;KAAE,GACvC,OAAO,CAAC;QAAE,YAAY,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAC,eAAe,EAAE,WAAW,CAAC,CAAA;KAAE,CAAC;IAiCnF;;OAEG;IACH,mBAAmB,CAAC,QAAQ,EAAE,OAAO,EAAE,EAAE,MAAM,EAAE,eAAe,GAAG,MAAM,EAAE;IAa3E;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAsD1B;;OAEG;IACH,YAAY,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,WAAW;CA6C5C"}
@@ -0,0 +1,159 @@
1
+ /**
2
+ * MetricAggregator (Story 3.2)
3
+ *
4
+ * Computes descriptive statistics for each benchmark metric across a variant's sessions.
5
+ * Provides clean inputs for the StatisticalComparator.
6
+ */
7
+ // ─── Public Interface ──────────────────────────────────────
8
+ export class MetricAggregator {
9
+ /**
10
+ * Aggregate metrics for a variant by querying sessions matching the variant's tag.
11
+ *
12
+ * @param store - Event store to query sessions from
13
+ * @param variant - The benchmark variant (identifies sessions via tag)
14
+ * @param metrics - Which metrics to compute
15
+ * @param timeRange - Optional time range filter
16
+ * @returns Record of metric → MetricStats
17
+ */
18
+ async aggregate(store, variant, metrics, timeRange) {
19
+ // Paginate through all sessions (store caps at 500 per query)
20
+ const sessions = [];
21
+ const pageSize = 500;
22
+ let offset = 0;
23
+ while (true) {
24
+ const page = await store.querySessions({
25
+ tenantId: variant.tenantId,
26
+ agentId: variant.agentId,
27
+ tags: [variant.tag],
28
+ from: timeRange?.from,
29
+ to: timeRange?.to,
30
+ limit: pageSize,
31
+ offset,
32
+ });
33
+ sessions.push(...page.sessions);
34
+ if (page.sessions.length < pageSize)
35
+ break;
36
+ offset += pageSize;
37
+ }
38
+ const result = {};
39
+ for (const metric of metrics) {
40
+ const values = this.extractMetricValues(sessions, metric);
41
+ result[metric] = this.computeStats(values);
42
+ }
43
+ return {
44
+ sessionCount: sessions.length,
45
+ metrics: result,
46
+ };
47
+ }
48
+ /**
49
+ * Extract numeric values for a metric from a list of sessions.
50
+ */
51
+ extractMetricValues(sessions, metric) {
52
+ const values = [];
53
+ for (const session of sessions) {
54
+ const value = this.extractSingleValue(session, metric);
55
+ if (value !== null && !isNaN(value)) {
56
+ values.push(value);
57
+ }
58
+ }
59
+ return values;
60
+ }
61
+ /**
62
+ * Extract a single metric value from a session.
63
+ */
64
+ extractSingleValue(session, metric) {
65
+ switch (metric) {
66
+ case 'avg_cost':
67
+ return session.totalCostUsd;
68
+ case 'error_rate':
69
+ // Error rate = errorCount / eventCount
70
+ if (session.eventCount === 0)
71
+ return null;
72
+ return session.errorCount / session.eventCount;
73
+ case 'completion_rate':
74
+ // Binary: 1 if completed, 0 otherwise
75
+ return session.status === 'completed' ? 1 : 0;
76
+ case 'tool_success_rate':
77
+ // errorCount includes ALL errors (LLM, system, etc.), not just tool errors.
78
+ // Use conservative proxy: cap tool errors at toolCallCount to avoid negative rates.
79
+ if (session.toolCallCount === 0)
80
+ return null;
81
+ const toolErrors = Math.min(session.errorCount, session.toolCallCount);
82
+ return Math.max(0, (session.toolCallCount - toolErrors) / session.toolCallCount);
83
+ case 'avg_tokens':
84
+ return session.totalInputTokens + session.totalOutputTokens;
85
+ case 'avg_duration': {
86
+ // Duration in milliseconds
87
+ if (!session.endedAt || !session.startedAt)
88
+ return null;
89
+ const start = new Date(session.startedAt).getTime();
90
+ const end = new Date(session.endedAt).getTime();
91
+ const durationMs = end - start;
92
+ return durationMs > 0 ? durationMs : null;
93
+ }
94
+ case 'avg_latency':
95
+ // avg_latency requires event-level query; we approximate from session-level data
96
+ // latency ≈ duration / llmCallCount (average time per LLM call)
97
+ if (!session.endedAt || !session.startedAt || session.llmCallCount === 0)
98
+ return null;
99
+ const startMs = new Date(session.startedAt).getTime();
100
+ const endMs = new Date(session.endedAt).getTime();
101
+ const totalDuration = endMs - startMs;
102
+ return totalDuration > 0 ? totalDuration / session.llmCallCount : null;
103
+ case 'health_score':
104
+ // health_score is a composite score that would need the HealthComputer.
105
+ // At the session level, we compute a simplified score:
106
+ // Based on error rate, completion, and tool success
107
+ // This is a placeholder that returns null if not computed elsewhere.
108
+ return null;
109
+ default:
110
+ return null;
111
+ }
112
+ }
113
+ /**
114
+ * Compute descriptive statistics from an array of numeric values.
115
+ */
116
+ computeStats(values) {
117
+ if (values.length === 0) {
118
+ return {
119
+ mean: 0,
120
+ median: 0,
121
+ stddev: 0,
122
+ min: 0,
123
+ max: 0,
124
+ count: 0,
125
+ };
126
+ }
127
+ const n = values.length;
128
+ const sorted = [...values].sort((a, b) => a - b);
129
+ const sum = values.reduce((acc, v) => acc + v, 0);
130
+ const mean = sum / n;
131
+ // Median
132
+ let median;
133
+ if (n % 2 === 0) {
134
+ median = (sorted[n / 2 - 1] + sorted[n / 2]) / 2;
135
+ }
136
+ else {
137
+ median = sorted[Math.floor(n / 2)];
138
+ }
139
+ // Sample standard deviation (Bessel's correction)
140
+ let variance;
141
+ if (n < 2) {
142
+ variance = 0;
143
+ }
144
+ else {
145
+ const sumSquaredDiffs = values.reduce((acc, v) => acc + (v - mean) ** 2, 0);
146
+ variance = sumSquaredDiffs / (n - 1);
147
+ }
148
+ const stddev = Math.sqrt(variance);
149
+ return {
150
+ mean,
151
+ median,
152
+ stddev,
153
+ min: sorted[0],
154
+ max: sorted[n - 1],
155
+ count: n,
156
+ };
157
+ }
158
+ }
159
+ //# sourceMappingURL=metric-aggregator.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"metric-aggregator.js","sourceRoot":"","sources":["../../../src/lib/benchmark/metric-aggregator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAUH,8DAA8D;AAE9D,MAAM,OAAO,gBAAgB;IAC3B;;;;;;;;OAQG;IACH,KAAK,CAAC,SAAS,CACb,KAAkB,EAClB,OAAyB,EACzB,OAA0B,EAC1B,SAAwC;QAExC,8DAA8D;QAC9D,MAAM,QAAQ,GAAc,EAAE,CAAC;QAC/B,MAAM,QAAQ,GAAG,GAAG,CAAC;QACrB,IAAI,MAAM,GAAG,CAAC,CAAC;QACf,OAAO,IAAI,EAAE,CAAC;YACZ,MAAM,IAAI,GAAG,MAAM,KAAK,CAAC,aAAa,CAAC;gBACrC,QAAQ,EAAE,OAAO,CAAC,QAAQ;gBAC1B,OAAO,EAAE,OAAO,CAAC,OAAO;gBACxB,IAAI,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC;gBACnB,IAAI,EAAE,SAAS,EAAE,IAAI;gBACrB,EAAE,EAAE,SAAS,EAAE,EAAE;gBACjB,KAAK,EAAE,QAAQ;gBACf,MAAM;aACP,CAAC,CAAC;YACH,QAAQ,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,QAAQ,CAAC,CAAC;YAChC,IAAI,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,QAAQ;gBAAE,MAAM;YAC3C,MAAM,IAAI,QAAQ,CAAC;QACrB,CAAC;QAED,MAAM,MAAM,GAAG,EAA0C,CAAC;QAE1D,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;YAC7B,MAAM,MAAM,GAAG,IAAI,CAAC,mBAAmB,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;YAC1D,MAAM,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;QAC7C,CAAC;QAED,OAAO;YACL,YAAY,EAAE,QAAQ,CAAC,MAAM;YAC7B,OAAO,EAAE,MAAM;SAChB,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,mBAAmB,CAAC,QAAmB,EAAE,MAAuB;QAC9D,MAAM,MAAM,GAAa,EAAE,CAAC;QAE5B,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,MAAM,KAAK,GAAG,IAAI,CAAC,kBAAkB,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;YACvD,IAAI,KAAK,KAAK,IAAI,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;gBACpC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACrB,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,kBAAkB,CAAC,OAAgB,EAAE,MAAuB;QAClE,QAAQ,MAAM,EAAE,CAAC;YACf,KAAK,UAAU;gBACb,OAAO,OAAO,CAAC,YAAY,CAAC;YAE9B,KAAK,YAAY;gBACf,uCAAuC;gBACvC,IAAI,OAAO,CAAC,UAAU,KAAK,CAAC;oBAAE,OAAO,IAAI,CAAC;gBAC1C,OAAO,OAAO,CAAC,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;YAEjD,KAAK,iBAAiB;gBACpB,sCAAsC;gBACtC,OAAO,OAAO,CAAC,MAAM,KAAK,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAEhD,KAAK,mBAAmB;gBACtB,4EAA4E;gBAC5E,oFAAoF;gBACpF,IAAI,OAAO,CAAC,aAAa,KAAK,CAAC;oBAAE,OAAO,IAAI,CAAC;gBAC7C,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,UAAU,EA
AE,OAAO,CAAC,aAAa,CAAC,CAAC;gBACvE,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,OAAO,CAAC,aAAa,GAAG,UAAU,CAAC,GAAG,OAAO,CAAC,aAAa,CAAC,CAAC;YAEnF,KAAK,YAAY;gBACf,OAAO,OAAO,CAAC,gBAAgB,GAAG,OAAO,CAAC,iBAAiB,CAAC;YAE9D,KAAK,cAAc,CAAC,CAAC,CAAC;gBACpB,2BAA2B;gBAC3B,IAAI,CAAC,OAAO,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,SAAS;oBAAE,OAAO,IAAI,CAAC;gBACxD,MAAM,KAAK,GAAG,IAAI,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,OAAO,EAAE,CAAC;gBACpD,MAAM,GAAG,GAAG,IAAI,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,OAAO,EAAE,CAAC;gBAChD,MAAM,UAAU,GAAG,GAAG,GAAG,KAAK,CAAC;gBAC/B,OAAO,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC;YAC5C,CAAC;YAED,KAAK,aAAa;gBAChB,iFAAiF;gBACjF,gEAAgE;gBAChE,IAAI,CAAC,OAAO,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,SAAS,IAAI,OAAO,CAAC,YAAY,KAAK,CAAC;oBAAE,OAAO,IAAI,CAAC;gBACtF,MAAM,OAAO,GAAG,IAAI,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,OAAO,EAAE,CAAC;gBACtD,MAAM,KAAK,GAAG,IAAI,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,OAAO,EAAE,CAAC;gBAClD,MAAM,aAAa,GAAG,KAAK,GAAG,OAAO,CAAC;gBACtC,OAAO,aAAa,GAAG,CAAC,CAAC,CAAC,CAAC,aAAa,GAAG,OAAO,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC;YAEzE,KAAK,cAAc;gBACjB,wEAAwE;gBACxE,uDAAuD;gBACvD,oDAAoD;gBACpD,qEAAqE;gBACrE,OAAO,IAAI,CAAC;YAEd;gBACE,OAAO,IAAI,CAAC;QAChB,CAAC;IACH,CAAC;IAED;;OAEG;IACH,YAAY,CAAC,MAAgB;QAC3B,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACxB,OAAO;gBACL,IAAI,EAAE,CAAC;gBACP,MAAM,EAAE,CAAC;gBACT,MAAM,EAAE,CAAC;gBACT,GAAG,EAAE,CAAC;gBACN,GAAG,EAAE,CAAC;gBACN,KAAK,EAAE,CAAC;aACT,CAAC;QACJ,CAAC;QAED,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;QACxB,MAAM,MAAM,GAAG,CAAC,GAAG,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QAEjD,MAAM,GAAG,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;QAClD,MAAM,IAAI,GAAG,GAAG,GAAG,CAAC,CAAC;QAErB,SAAS;QACT,IAAI,MAAc,CAAC;QACnB,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;YAChB,MAAM,GAAG,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAE,GAAG,MAAM,CAAC,CAAC,GAAG,CAAC,CAAE,CAAC,GAAG,CAAC,CAAC;QACrD,CAAC;aAAM,CAAC;YACN,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,
CAAC,CAAC,CAAE,CAAC;QACtC,CAAC;QAED,kDAAkD;QAClD,IAAI,QAAgB,CAAC;QACrB,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;YACV,QAAQ,GAAG,CAAC,CAAC;QACf,CAAC;aAAM,CAAC;YACN,MAAM,eAAe,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;YAC5E,QAAQ,GAAG,eAAe,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QACvC,CAAC;QACD,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAEnC,OAAO;YACL,IAAI;YACJ,MAAM;YACN,MAAM;YACN,GAAG,EAAE,MAAM,CAAC,CAAC,CAAE;YACf,GAAG,EAAE,MAAM,CAAC,CAAC,GAAG,CAAC,CAAE;YACnB,KAAK,EAAE,CAAC;SACT,CAAC;IACJ,CAAC;CACF"}
@@ -0,0 +1,51 @@
1
+ /**
2
+ * StatisticalComparator (Story 3.3)
3
+ *
4
+ * Hypothesis testing for benchmark variant comparisons.
5
+ * Implements Welch's t-test and chi-squared test with numerical p-value computation.
6
+ */
7
+ import type { BenchmarkMetric, MetricComparison, MetricStats } from '@agentlensai/core';
8
/**
 * Outcome of a t-test between two samples, as returned by
 * `StatisticalComparator.welchTTest`.
 */
export interface TTestResult {
    /** The computed t statistic. */
    tStatistic: number;
    /** Degrees of freedom used for the test. */
    degreesOfFreedom: number;
    /** p-value of the test. */
    pValue: number;
    /** Interval bounds; NOTE(review): confidence level is not stated here — confirm in the implementation. */
    confidenceInterval: { lower: number; upper: number };
    /** Presumably the Cohen's d effect size — confirm against the implementation. */
    cohenD: number;
}
18
/**
 * Outcome of a chi-squared test between two proportions, as returned by
 * `StatisticalComparator.chiSquaredTest`.
 */
export interface ChiSquaredResult {
    /** The computed chi-squared statistic. */
    chiSquared: number;
    /** p-value of the test. */
    pValue: number;
    /** Presumably the phi coefficient (effect size) — confirm against the implementation. */
    phi: number;
}
23
export declare class StatisticalComparator {
    /**
     * Run Welch's t-test on the summary statistics of two samples of a
     * continuous metric.
     */
    welchTTest(statsA: MetricStats, statsB: MetricStats): TTestResult;
    /**
     * Run a chi-squared test (with Yates' continuity correction) on two
     * proportions, each given as a success count out of a total.
     */
    chiSquaredTest(
        successesA: number,
        totalA: number,
        successesB: number,
        totalB: number
    ): ChiSquaredResult;
    /**
     * Compare two variants on one metric, choosing the appropriate test for
     * that metric.
     *
     * @param variantA - First variant: id, display name and per-metric statistics
     * @param variantB - Second variant, same shape as `variantA`
     * @param metric - The metric to compare on
     */
    compare(
        variantA: { id: string; name: string; metrics: Record<string, MetricStats> },
        variantB: { id: string; name: string; metrics: Record<string, MetricStats> },
        metric: BenchmarkMetric
    ): MetricComparison;
    /** Internal comparison path; presumably used for continuous metrics — confirm. */
    private compareContinuous;
    /** Internal comparison path; presumably used for rate/proportion metrics — confirm. */
    private compareProportions;
    /**
     * Map a p-value to a star rating. The thresholds are defined in the
     * implementation and are not visible from this declaration.
     */
    confidenceStars(pValue: number): '★★★' | '★★' | '★' | '—';
}
51
+ //# sourceMappingURL=statistical.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"statistical.d.ts","sourceRoot":"","sources":["../../../src/lib/benchmark/statistical.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EACV,eAAe,EACf,gBAAgB,EAChB,WAAW,EAEZ,MAAM,mBAAmB,CAAC;AAiC3B,MAAM,WAAW,WAAW;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,gBAAgB,EAAE,MAAM,CAAC;IACzB,MAAM,EAAE,MAAM,CAAC;IACf,kBAAkB,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC;IACrD,MAAM,EAAE,MAAM,CAAC;CAChB;AAID,MAAM,WAAW,gBAAgB;IAC/B,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,MAAM,CAAC;IACf,GAAG,EAAE,MAAM,CAAC;CACb;AAgKD,qBAAa,qBAAqB;IAChC;;OAEG;IACH,UAAU,CAAC,MAAM,EAAE,WAAW,EAAE,MAAM,EAAE,WAAW,GAAG,WAAW;IAmEjE;;OAEG;IACH,cAAc,CACZ,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,EACd,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,GACb,gBAAgB;IA8CnB;;OAEG;IACH,OAAO,CACL,QAAQ,EAAE;QAAE,EAAE,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,WAAW,CAAC,CAAA;KAAE,EAC5E,QAAQ,EAAE;QAAE,EAAE,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,WAAW,CAAC,CAAA;KAAE,EAC5E,MAAM,EAAE,eAAe,GACtB,gBAAgB;IAiBnB,OAAO,CAAC,iBAAiB;IAwCzB,OAAO,CAAC,kBAAkB;IAwD1B;;OAEG;IACH,eAAe,CAAC,MAAM,EAAE,MAAM,GAAG,KAAK,GAAG,IAAI,GAAG,GAAG,GAAG,GAAG;CAM1D"}