@agentlensai/server 0.5.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/db/benchmark-store.d.ts +74 -0
- package/dist/db/benchmark-store.d.ts.map +1 -0
- package/dist/db/benchmark-store.js +268 -0
- package/dist/db/benchmark-store.js.map +1 -0
- package/dist/db/health-snapshot-store.d.ts +33 -0
- package/dist/db/health-snapshot-store.d.ts.map +1 -0
- package/dist/db/health-snapshot-store.js +112 -0
- package/dist/db/health-snapshot-store.js.map +1 -0
- package/dist/db/migrate.d.ts.map +1 -1
- package/dist/db/migrate.js +67 -0
- package/dist/db/migrate.js.map +1 -1
- package/dist/db/sqlite-store.d.ts +5 -0
- package/dist/db/sqlite-store.d.ts.map +1 -1
- package/dist/db/sqlite-store.js +15 -0
- package/dist/db/sqlite-store.js.map +1 -1
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +25 -1
- package/dist/index.js.map +1 -1
- package/dist/lib/alert-engine.d.ts.map +1 -1
- package/dist/lib/alert-engine.js +5 -2
- package/dist/lib/alert-engine.js.map +1 -1
- package/dist/lib/analysis/cost-analysis.d.ts.map +1 -1
- package/dist/lib/analysis/cost-analysis.js +5 -2
- package/dist/lib/analysis/cost-analysis.js.map +1 -1
- package/dist/lib/analysis/error-patterns.d.ts.map +1 -1
- package/dist/lib/analysis/error-patterns.js +5 -2
- package/dist/lib/analysis/error-patterns.js.map +1 -1
- package/dist/lib/analysis/performance-trends.d.ts.map +1 -1
- package/dist/lib/analysis/performance-trends.js +4 -1
- package/dist/lib/analysis/performance-trends.js.map +1 -1
- package/dist/lib/analysis/tool-sequences.d.ts.map +1 -1
- package/dist/lib/analysis/tool-sequences.js +5 -2
- package/dist/lib/analysis/tool-sequences.js.map +1 -1
- package/dist/lib/benchmark/engine.d.ts +24 -0
- package/dist/lib/benchmark/engine.d.ts.map +1 -0
- package/dist/lib/benchmark/engine.js +159 -0
- package/dist/lib/benchmark/engine.js.map +1 -0
- package/dist/lib/benchmark/metric-aggregator.d.ts +38 -0
- package/dist/lib/benchmark/metric-aggregator.d.ts.map +1 -0
- package/dist/lib/benchmark/metric-aggregator.js +159 -0
- package/dist/lib/benchmark/metric-aggregator.js.map +1 -0
- package/dist/lib/benchmark/statistical.d.ts +51 -0
- package/dist/lib/benchmark/statistical.d.ts.map +1 -0
- package/dist/lib/benchmark/statistical.js +381 -0
- package/dist/lib/benchmark/statistical.js.map +1 -0
- package/dist/lib/context/retrieval.d.ts +4 -0
- package/dist/lib/context/retrieval.d.ts.map +1 -1
- package/dist/lib/context/retrieval.js +4 -0
- package/dist/lib/context/retrieval.js.map +1 -1
- package/dist/lib/embeddings/local.js +2 -2
- package/dist/lib/embeddings/local.js.map +1 -1
- package/dist/lib/health/computer.d.ts +28 -0
- package/dist/lib/health/computer.d.ts.map +1 -0
- package/dist/lib/health/computer.js +270 -0
- package/dist/lib/health/computer.js.map +1 -0
- package/dist/lib/optimization/classifier.d.ts +34 -0
- package/dist/lib/optimization/classifier.d.ts.map +1 -0
- package/dist/lib/optimization/classifier.js +108 -0
- package/dist/lib/optimization/classifier.js.map +1 -0
- package/dist/lib/optimization/engine.d.ts +24 -0
- package/dist/lib/optimization/engine.d.ts.map +1 -0
- package/dist/lib/optimization/engine.js +202 -0
- package/dist/lib/optimization/engine.js.map +1 -0
- package/dist/lib/optimization/index.d.ts +10 -0
- package/dist/lib/optimization/index.d.ts.map +1 -0
- package/dist/lib/optimization/index.js +9 -0
- package/dist/lib/optimization/index.js.map +1 -0
- package/dist/lib/replay/builder.d.ts +28 -0
- package/dist/lib/replay/builder.d.ts.map +1 -0
- package/dist/lib/replay/builder.js +482 -0
- package/dist/lib/replay/builder.js.map +1 -0
- package/dist/routes/benchmarks.d.ts +18 -0
- package/dist/routes/benchmarks.d.ts.map +1 -0
- package/dist/routes/benchmarks.js +312 -0
- package/dist/routes/benchmarks.js.map +1 -0
- package/dist/routes/health.d.ts +21 -0
- package/dist/routes/health.d.ts.map +1 -0
- package/dist/routes/health.js +142 -0
- package/dist/routes/health.js.map +1 -0
- package/dist/routes/optimize.d.ts +15 -0
- package/dist/routes/optimize.d.ts.map +1 -0
- package/dist/routes/optimize.js +55 -0
- package/dist/routes/optimize.js.map +1 -0
- package/dist/routes/recall.d.ts +2 -0
- package/dist/routes/recall.d.ts.map +1 -1
- package/dist/routes/recall.js +21 -1
- package/dist/routes/recall.js.map +1 -1
- package/dist/routes/reflect.d.ts.map +1 -1
- package/dist/routes/reflect.js +0 -1
- package/dist/routes/reflect.js.map +1 -1
- package/dist/routes/replay.d.ts +28 -0
- package/dist/routes/replay.d.ts.map +1 -0
- package/dist/routes/replay.js +140 -0
- package/dist/routes/replay.js.map +1 -0
- package/dist/routes/tenant-helper.d.ts.map +1 -1
- package/dist/routes/tenant-helper.js +12 -0
- package/dist/routes/tenant-helper.js.map +1 -1
- package/package.json +2 -2
|
@@ -13,8 +13,10 @@ export async function analyzePerformance(store, opts = {}) {
|
|
|
13
13
|
agentId: opts.agentId,
|
|
14
14
|
from: opts.from,
|
|
15
15
|
to: opts.to,
|
|
16
|
-
limit:
|
|
16
|
+
limit: 1000,
|
|
17
17
|
});
|
|
18
|
+
// Track whether the query hit its limit (results may be truncated)
|
|
19
|
+
const possiblyTruncated = sessionResult.sessions.length === 1000;
|
|
18
20
|
const sessions = sessionResult.sessions;
|
|
19
21
|
const sessionsAnalyzed = sessions.length;
|
|
20
22
|
// Count events for metadata
|
|
@@ -92,6 +94,7 @@ export async function analyzePerformance(store, opts = {}) {
|
|
|
92
94
|
sessionsAnalyzed,
|
|
93
95
|
eventsAnalyzed: eventCount,
|
|
94
96
|
timeRange,
|
|
97
|
+
...(possiblyTruncated ? { truncated: true, note: 'Results may be incomplete — query limit reached' } : {}),
|
|
95
98
|
},
|
|
96
99
|
};
|
|
97
100
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"performance-trends.js","sourceRoot":"","sources":["../../../src/lib/analysis/performance-trends.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAgBH;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,KAAkB,EAClB,OAA8B,EAAE;IAEhC,iBAAiB;IACjB,MAAM,aAAa,GAAG,MAAM,KAAK,CAAC,aAAa,CAAC;QAC9C,OAAO,EAAE,IAAI,CAAC,OAAO;QACrB,IAAI,EAAE,IAAI,CAAC,IAAI;QACf,EAAE,EAAE,IAAI,CAAC,EAAE;QACX,KAAK,EAAE,
|
|
1
|
+
{"version":3,"file":"performance-trends.js","sourceRoot":"","sources":["../../../src/lib/analysis/performance-trends.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAgBH;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,KAAkB,EAClB,OAA8B,EAAE;IAEhC,iBAAiB;IACjB,MAAM,aAAa,GAAG,MAAM,KAAK,CAAC,aAAa,CAAC;QAC9C,OAAO,EAAE,IAAI,CAAC,OAAO;QACrB,IAAI,EAAE,IAAI,CAAC,IAAI;QACf,EAAE,EAAE,IAAI,CAAC,EAAE;QACX,KAAK,EAAE,IAAI;KACZ,CAAC,CAAC;IAEH,mEAAmE;IACnE,MAAM,iBAAiB,GAAG,aAAa,CAAC,QAAQ,CAAC,MAAM,KAAK,IAAI,CAAC;IAEjE,MAAM,QAAQ,GAAG,aAAa,CAAC,QAAQ,CAAC;IACxC,MAAM,gBAAgB,GAAG,QAAQ,CAAC,MAAM,CAAC;IAEzC,4BAA4B;IAC5B,MAAM,UAAU,GAAG,MAAM,KAAK,CAAC,WAAW,CAAC;QACzC,OAAO,EAAE,IAAI,CAAC,OAAO;QACrB,IAAI,EAAE,IAAI,CAAC,IAAI;QACf,EAAE,EAAE,IAAI,CAAC,EAAE;KACZ,CAAC,CAAC;IAEH,MAAM,SAAS,GAAG;QAChB,IAAI,EAAE,QAAQ,CAAC,MAAM,GAAG,CAAC;YACvB,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,QAAQ,CAAC,CAAC,CAAE,CAAC,SAAS,CAAC;YAC9F,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC;QACrB,EAAE,EAAE,QAAQ,CAAC,MAAM,GAAG,CAAC;YACrB,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,QAAQ,CAAC,CAAC,CAAE,CAAC,SAAS,CAAC;YAC9F,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,CAAC;KACpB,CAAC;IAEF,4BAA4B;IAC5B,MAAM,YAAY,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,KAAK,CAAC,CAAC,CAAC,MAAM,CAAC;IACvE,MAAM,WAAW,GAAG,gBAAgB,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAC,CAAC;IAE/E,MAAM,SAAS,GAAG,QAAQ;SACvB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,SAAS,CAAC;SACvC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,IAAI,CAAC,CAAC,CAAC,OAAQ,CAAC,CAAC,OAAO,EAAE,GAAG,IAAI,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC;IAChF,MAAM,WAAW,GAAG,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,SAAS,CAAC,MAAM,CAA
C,CAAC,CAAC,CAAC,CAAC;IAEvG,MAAM,YAAY,GAAG,gBAAgB,GAAG,CAAC;QACvC,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,aAAa,EAAE,CAAC,CAAC,GAAG,gBAAgB;QAC1E,CAAC,CAAC,CAAC,CAAC;IAEN,MAAM,SAAS,GAAG,gBAAgB,GAAG,CAAC;QACpC,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,GAAG,gBAAgB;QACvE,CAAC,CAAC,CAAC,CAAC;IAEN,gCAAgC;IAChC,MAAM,QAAQ,GAAG,IAAI,GAAG,EAKrB,CAAC;IAEJ,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,MAAM,IAAI,GAAG,OAAO,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,aAAa;QAC1D,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE,QAAQ,EAAE,EAAE,EAAE,CAAC;QACrD,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAC7B,QAAQ,CAAC,GAAG,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;IAC5B,CAAC;IAED,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,IAAI,EAAE,CAAC;IAC/B,MAAM,UAAU,GAA6B,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE,CAAC;SACxE,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,EAAE,EAAE;QACpB,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC;QAClC,MAAM,UAAU,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,KAAK,CAAC,CAAC,CAAC,MAAM,CAAC;QACxE,MAAM,YAAY,GAAG,WAAW;aAC7B,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,SAAS,CAAC;aACvC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,IAAI,CAAC,CAAC,CAAC,OAAQ,CAAC,CAAC,OAAO,EAAE,GAAG,IAAI,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC;QAEhF,OAAO;YACL,IAAI;YACJ,WAAW,EAAE,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;YACzE,QAAQ,EACN,YAAY,CAAC,MAAM,GAAG,CAAC;gBACrB,CAAC,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,YAAY,CAAC,MAAM;gBAC/D,CAAC,CAAC,CAAC;YACP,SAAS,EACP,WAAW,CAAC,MAAM,GAAG,CAAC;gBACpB,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,aAAa,EAAE,CAAC,CAAC,GAAG,WAAW,CAAC,MAAM;gBAC/E,CAAC,CAAC,CAAC;YACP,MAAM,EACJ,WAAW,CAAC,MAAM,GAAG,CAAC;gBACpB,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,E
AAE,CAAC,CAAC,GAAG,WAAW,CAAC,MAAM;gBAC5E,CAAC,CAAC,CAAC;SACR,CAAC;IACJ,CAAC,CAAC;SACD,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;IAEhD,iCAAiC;IACjC,MAAM,MAAM,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,CAAC;IAExC,2DAA2D;IAC3D,MAAM,UAAU,GAAG,sBAAsB,CAAC,UAAU,CAAC,CAAC;IAEtD,OAAO;QACL,OAAO,EAAE;YACP,WAAW,EAAE,IAAI,CAAC,KAAK,CAAC,WAAW,GAAG,KAAK,CAAC,GAAG,KAAK;YACpD,WAAW,EAAE,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC;YACpC,YAAY,EAAE,IAAI,CAAC,KAAK,CAAC,YAAY,GAAG,GAAG,CAAC,GAAG,GAAG;YAClD,SAAS,EAAE,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,GAAG,CAAC,GAAG,GAAG;SAC7C;QACD,MAAM;QACN,UAAU;QACV,QAAQ,EAAE;YACR,gBAAgB;YAChB,cAAc,EAAE,UAAU;YAC1B,SAAS;YACT,GAAG,CAAC,iBAAiB,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,IAAI,EAAE,IAAI,EAAE,iDAAiD,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SAC3G;KACF,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,sBAAsB,CAC7B,OAAiC;IAEjC,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,QAAQ,CAAC;IAExC,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAChD,MAAM,MAAM,GAAG,OAAO,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;IACvC,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;IAE9C,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,QAAQ,CAAC;IAEpE,MAAM,gBAAgB,GACpB,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;IAChE,MAAM,oBAAoB,GACxB,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC,GAAG,UAAU,CAAC,MAAM,CAAC;IAExE,MAAM,IAAI,GAAG,gBAAgB,GAAG,oBAAoB,CAAC;IAErD,IAAI,IAAI,GAAG,IAAI;QAAE,OAAO,WAAW,CAAC;IACpC,IAAI,IAAI,GAAG,CAAC,IAAI;QAAE,OAAO,WAAW,CAAC;IACrC,OAAO,QAAQ,CAAC;AAClB,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tool-sequences.d.ts","sourceRoot":"","sources":["../../../src/lib/analysis/tool-sequences.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,KAAK,EACV,kBAAkB,EAGnB,MAAM,mBAAmB,CAAC;AAE3B,MAAM,WAAW,gBAAgB;IAC/B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,EAAE,CAAC,EAAE,MAAM,CAAC;IACZ,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;GAEG;AACH,wBAAsB,oBAAoB,CACxC,KAAK,EAAE,WAAW,EAClB,IAAI,GAAE,gBAAqB,GAC1B,OAAO,CAAC,kBAAkB,CAAC,
|
|
1
|
+
{"version":3,"file":"tool-sequences.d.ts","sourceRoot":"","sources":["../../../src/lib/analysis/tool-sequences.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,KAAK,EACV,kBAAkB,EAGnB,MAAM,mBAAmB,CAAC;AAE3B,MAAM,WAAW,gBAAgB;IAC/B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,EAAE,CAAC,EAAE,MAAM,CAAC;IACZ,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;GAEG;AACH,wBAAsB,oBAAoB,CACxC,KAAK,EAAE,WAAW,EAClB,IAAI,GAAE,gBAAqB,GAC1B,OAAO,CAAC,kBAAkB,CAAC,CAmK7B"}
|
|
@@ -15,7 +15,7 @@ export async function analyzeToolSequences(store, opts = {}) {
|
|
|
15
15
|
from: opts.from,
|
|
16
16
|
to: opts.to,
|
|
17
17
|
eventType: 'tool_call',
|
|
18
|
-
limit:
|
|
18
|
+
limit: 1000,
|
|
19
19
|
order: 'asc',
|
|
20
20
|
});
|
|
21
21
|
// Also query tool_error events to identify error-prone sequences
|
|
@@ -24,9 +24,11 @@ export async function analyzeToolSequences(store, opts = {}) {
|
|
|
24
24
|
from: opts.from,
|
|
25
25
|
to: opts.to,
|
|
26
26
|
eventType: 'tool_error',
|
|
27
|
-
limit:
|
|
27
|
+
limit: 1000,
|
|
28
28
|
order: 'asc',
|
|
29
29
|
});
|
|
30
|
+
// Track whether either query hit its limit (results may be truncated)
|
|
31
|
+
const possiblyTruncated = toolCallResult.events.length === 1000 || toolErrorResult.events.length === 1000;
|
|
30
32
|
const allToolCalls = toolCallResult.events;
|
|
31
33
|
const allToolErrors = toolErrorResult.events;
|
|
32
34
|
const eventsAnalyzed = allToolCalls.length + allToolErrors.length;
|
|
@@ -139,6 +141,7 @@ export async function analyzeToolSequences(store, opts = {}) {
|
|
|
139
141
|
metadata: {
|
|
140
142
|
eventsAnalyzed,
|
|
141
143
|
timeRange,
|
|
144
|
+
...(possiblyTruncated ? { truncated: true, note: 'Results may be incomplete — query limit reached' } : {}),
|
|
142
145
|
},
|
|
143
146
|
};
|
|
144
147
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tool-sequences.js","sourceRoot":"","sources":["../../../src/lib/analysis/tool-sequences.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAgBH;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,KAAkB,EAClB,OAAyB,EAAE;IAE3B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,IAAI,EAAE,CAAC;IAE/B,yBAAyB;IACzB,MAAM,cAAc,GAAG,MAAM,KAAK,CAAC,WAAW,CAAC;QAC7C,OAAO,EAAE,IAAI,CAAC,OAAO;QACrB,IAAI,EAAE,IAAI,CAAC,IAAI;QACf,EAAE,EAAE,IAAI,CAAC,EAAE;QACX,SAAS,EAAE,WAAW;QACtB,KAAK,EAAE,
|
|
1
|
+
{"version":3,"file":"tool-sequences.js","sourceRoot":"","sources":["../../../src/lib/analysis/tool-sequences.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAgBH;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,KAAkB,EAClB,OAAyB,EAAE;IAE3B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,IAAI,EAAE,CAAC;IAE/B,yBAAyB;IACzB,MAAM,cAAc,GAAG,MAAM,KAAK,CAAC,WAAW,CAAC;QAC7C,OAAO,EAAE,IAAI,CAAC,OAAO;QACrB,IAAI,EAAE,IAAI,CAAC,IAAI;QACf,EAAE,EAAE,IAAI,CAAC,EAAE;QACX,SAAS,EAAE,WAAW;QACtB,KAAK,EAAE,IAAI;QACX,KAAK,EAAE,KAAK;KACb,CAAC,CAAC;IAEH,iEAAiE;IACjE,MAAM,eAAe,GAAG,MAAM,KAAK,CAAC,WAAW,CAAC;QAC9C,OAAO,EAAE,IAAI,CAAC,OAAO;QACrB,IAAI,EAAE,IAAI,CAAC,IAAI;QACf,EAAE,EAAE,IAAI,CAAC,EAAE;QACX,SAAS,EAAE,YAAY;QACvB,KAAK,EAAE,IAAI;QACX,KAAK,EAAE,KAAK;KACb,CAAC,CAAC;IAEH,sEAAsE;IACtE,MAAM,iBAAiB,GACrB,cAAc,CAAC,MAAM,CAAC,MAAM,KAAK,IAAI,IAAI,eAAe,CAAC,MAAM,CAAC,MAAM,KAAK,IAAI,CAAC;IAElF,MAAM,YAAY,GAAG,cAAc,CAAC,MAAM,CAAC;IAC3C,MAAM,aAAa,GAAG,eAAe,CAAC,MAAM,CAAC;IAE7C,MAAM,cAAc,GAAG,YAAY,CAAC,MAAM,GAAG,aAAa,CAAC,MAAM,CAAC;IAClE,MAAM,SAAS,GAAG;QAChB,IAAI,EAAE,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAE,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC;QAC9E,EAAE,EAAE,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAE,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,CAAC;KACjG,CAAC;IAEF,oCAAoC;IACpC,MAAM,gBAAgB,GAAG,IAAI,GAAG,EAA4B,CAAC;IAC7D,KAAK,MAAM,KAAK,IAAI,YAAY,EAAE,CAAC;QACjC,MAAM,IAAI,GAAG,gBAAgB,CAAC,GAAG,CAAC,KAAK,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC;QACzD,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACjB,gBAAgB,CAAC,GAAG,CAAC,KAAK,CAAC,SAAS,EAAE,IAAI,CAAC,CAAC;IAC9C,CAAC;IAED,+DAA+D;IAC/D,MAAM,aAAa,GAAG,IAAI,GAAG,EAA4B,CAAC;IAC1D,KAAK,MAAM,KAAK,IAAI,aAAa,EAAE,CAAC;QAClC,MAAM,IAAI,GAAG,aAAa,CAAC,GAAG,CAAC,KAAK,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC;QACtD,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACjB,aAAa,CAAC,GAAG,CAAC,KAAK,CAAC,SAAS,EAAE,IAAI,CAAC,CAAC;IAC3C,CAAC;IAED,wDAAwD;IACxD,MAAM,cAAc,GAAG,IAAI,GAAG,EAAU,CAAC;IACzC,IAAI,UAAU,GAAG,CAAC,CAAC;IACnB,MAAM,gBAAgB,GAAG,IAAI,GAA
G,EAAoB,CAAC;IAErD,KAAK,MAAM,CAAC,SAAS,EAAE,KAAK,CAAC,IAAI,gBAAgB,EAAE,CAAC;QAClD,MAAM,KAAK,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;YAC5B,MAAM,CAAC,GAAG,CAAC,CAAC,OAAkC,CAAC;YAC/C,MAAM,IAAI,GAAG,MAAM,CAAC,CAAC,CAAC,QAAQ,IAAI,SAAS,CAAC,CAAC;YAC7C,cAAc,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YACzB,OAAO,IAAI,CAAC;QACd,CAAC,CAAC,CAAC;QACH,gBAAgB,CAAC,GAAG,CAAC,SAAS,EAAE,KAAK,CAAC,CAAC;QACvC,UAAU,IAAI,KAAK,CAAC,MAAM,CAAC;IAC7B,CAAC;IAED,wCAAwC;IACxC,yGAAyG;IACzG,MAAM,QAAQ,GAAG,IAAI,GAAG,EAGrB,CAAC;IAEJ,KAAK,MAAM,CAAC,SAAS,EAAE,SAAS,CAAC,IAAI,gBAAgB,EAAE,CAAC;QACtD,MAAM,kBAAkB,GAAG,aAAa,CAAC,GAAG,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC;QAE9D,2DAA2D;QAC3D,MAAM,YAAY,GAAG,gBAAgB,CAAC,GAAG,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC;QAE3D,8BAA8B;QAC9B,MAAM,aAAa,GAAG,IAAI,GAAG,EAAU,CAAC,CAAC,8CAA8C;QACvF,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC;YACvB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC/C,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;gBACxC,MAAM,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBAE5B,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;oBAC5B,aAAa,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;oBACvB,IAAI,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;oBAC9B,IAAI,CAAC,KAAK,EAAE,CAAC;wBACX,KAAK,GAAG,EAAE,KAAK,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC,EAAE,QAAQ,EAAE,IAAI,GAAG,EAAE,EAAE,aAAa,EAAE,CAAC,EAAE,CAAC;wBAC9E,QAAQ,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;oBAC3B,CAAC;oBACD,KAAK,CAAC,SAAS,EAAE,CAAC;oBAClB,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;gBAChC,CAAC;gBAED,8DAA8D;gBAC9D,oEAAoE;gBACpE,MAAM,eAAe,GAAG,YAAY,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;gBAChD,IAAI,eAAe,EAAE,CAAC;oBACpB,MAAM,eAAe,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,EAAE;wBACzD,kDAAkD;wBAClD,gDAAgD;wBAChD,IAAI,MAAM,CAAC,SAAS,GAAG,eAAe,CAAC,SAAS;4BAAE,OAAO,KAAK,CAAC;wBAE/D,gFAAgF;wBAChF,MAAM,UAAU,GAAG,YAAY,CAAC,MAAM,CACpC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,GAAG,eAAe,CAAC,SAAS,IAAI,CAAC,CAAC,SAAS,IAAI,MAAM,CAAC,SAAS,CAClF,CAAC;wBACF,OAAO,UAAU,CAAC,MAAM,I
AAI,CAAC,CAAC;oBAChC,CAAC,CAAC,CAAC;oBAEH,IAAI,eAAe,EAAE,CAAC;wBACpB,2EAA2E;wBAC3E,2DAA2D;wBAC3D,IAAI,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,GAAG,CAAE,CAAC;wBAC/B,uCAAuC;wBACvC,IAAI,aAAa,CAAC,GAAG,CAAC,GAAG,GAAG,gBAAgB,CAAC;4BAAE,SAAS;wBACxD,aAAa,CAAC,GAAG,CAAC,GAAG,GAAG,gBAAgB,CAAC,CAAC;wBAC1C,KAAK,CAAC,aAAa,EAAE,CAAC;oBACxB,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED,kBAAkB;IAClB,MAAM,cAAc,GAAG,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC,MAAM,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IAClF,MAAM,iBAAiB,GACrB,cAAc,CAAC,MAAM,GAAG,CAAC;QACvB,CAAC,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,cAAc,CAAC,MAAM;QACnE,CAAC,CAAC,CAAC,CAAC;IAER,uBAAuB;IACvB,MAAM,SAAS,GAAmB,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,EAAE,CAAC;SAC5D,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QACf,KAAK,EAAE,KAAK,CAAC,KAAK;QAClB,SAAS,EAAE,KAAK,CAAC,SAAS;QAC1B,QAAQ,EAAE,KAAK,CAAC,QAAQ,CAAC,IAAI;QAC7B,SAAS,EACP,KAAK,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC;YACrB,CAAC,CAAC,KAAK,CAAC,aAAa,GAAG,KAAK,CAAC,QAAQ,CAAC,IAAI;YAC3C,CAAC,CAAC,CAAC;KACR,CAAC,CAAC;SACF,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS,CAAC;SACzC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC;IAEnB,OAAO;QACL,SAAS;QACT,KAAK,EAAE;YACL,iBAAiB,EAAE,IAAI,CAAC,KAAK,CAAC,iBAAiB,GAAG,GAAG,CAAC,GAAG,GAAG;YAC5D,WAAW,EAAE,cAAc,CAAC,IAAI;YAChC,UAAU;SACX;QACD,QAAQ,EAAE;YACR,cAAc;YACd,SAAS;YACT,GAAG,CAAC,iBAAiB,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,IAAI,EAAE,IAAI,EAAE,iDAAiD,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SAC3G;KACF,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* BenchmarkEngine (Story 3.4)
|
|
3
|
+
*
|
|
4
|
+
* Orchestrator that combines MetricAggregator and StatisticalComparator
|
|
5
|
+
* to produce full benchmark results with human-readable summaries.
|
|
6
|
+
*/
|
|
7
|
+
import type { BenchmarkResults, IEventStore, MetricComparison, VariantMetrics } from '@agentlensai/core';
|
|
8
|
+
import type { BenchmarkStore, BenchmarkWithVariants } from '../../db/benchmark-store.js';
|
|
9
|
+
export declare class BenchmarkEngine {
|
|
10
|
+
private readonly aggregator;
|
|
11
|
+
private readonly comparator;
|
|
12
|
+
/**
|
|
13
|
+
* Compute full benchmark results: aggregation + statistical comparison.
|
|
14
|
+
*
|
|
15
|
+
* For completed benchmarks, results are cached via store.saveResults().
|
|
16
|
+
* For running benchmarks, results are computed on-the-fly (not cached).
|
|
17
|
+
*/
|
|
18
|
+
computeResults(benchmark: BenchmarkWithVariants, eventStore: IEventStore, benchmarkStore?: BenchmarkStore): Promise<BenchmarkResults>;
|
|
19
|
+
/**
|
|
20
|
+
* Generate a human-readable summary of benchmark comparisons.
|
|
21
|
+
*/
|
|
22
|
+
formatSummary(comparisons: MetricComparison[], variants: VariantMetrics[]): string;
|
|
23
|
+
}
|
|
24
|
+
//# sourceMappingURL=engine.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"engine.d.ts","sourceRoot":"","sources":["../../../src/lib/benchmark/engine.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAEV,gBAAgB,EAChB,WAAW,EACX,gBAAgB,EAEhB,cAAc,EACf,MAAM,mBAAmB,CAAC;AAG3B,OAAO,KAAK,EAAE,cAAc,EAAE,qBAAqB,EAAE,MAAM,6BAA6B,CAAC;AAqBzF,qBAAa,eAAe;IAC1B,OAAO,CAAC,QAAQ,CAAC,UAAU,CAA0B;IACrD,OAAO,CAAC,QAAQ,CAAC,UAAU,CAA+B;IAE1D;;;;;OAKG;IACG,cAAc,CAClB,SAAS,EAAE,qBAAqB,EAChC,UAAU,EAAE,WAAW,EACvB,cAAc,CAAC,EAAE,cAAc,GAC9B,OAAO,CAAC,gBAAgB,CAAC;IA+E5B;;OAEG;IACH,aAAa,CACX,WAAW,EAAE,gBAAgB,EAAE,EAC/B,QAAQ,EAAE,cAAc,EAAE,GACzB,MAAM;CAuFV"}
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* BenchmarkEngine (Story 3.4)
|
|
3
|
+
*
|
|
4
|
+
* Orchestrator that combines MetricAggregator and StatisticalComparator
|
|
5
|
+
* to produce full benchmark results with human-readable summaries.
|
|
6
|
+
*/
|
|
7
|
+
import { MetricAggregator } from './metric-aggregator.js';
|
|
8
|
+
import { StatisticalComparator } from './statistical.js';
|
|
9
|
+
// ─── Constants ─────────────────────────────────────────────
|
|
10
|
+
const MIN_SESSIONS_FOR_RELIABLE_RESULTS = 30;
|
|
11
|
+
// ─── Metric display names ──────────────────────────────────
|
|
12
|
+
const METRIC_DISPLAY_NAMES = {
|
|
13
|
+
health_score: 'health score',
|
|
14
|
+
error_rate: 'error rate',
|
|
15
|
+
avg_cost: 'cost',
|
|
16
|
+
avg_latency: 'latency',
|
|
17
|
+
tool_success_rate: 'tool success rate',
|
|
18
|
+
completion_rate: 'completion rate',
|
|
19
|
+
avg_tokens: 'tokens',
|
|
20
|
+
avg_duration: 'duration',
|
|
21
|
+
};
|
|
22
|
+
// ─── BenchmarkEngine ───────────────────────────────────────
|
|
23
|
+
export class BenchmarkEngine {
|
|
24
|
+
aggregator = new MetricAggregator();
|
|
25
|
+
comparator = new StatisticalComparator();
|
|
26
|
+
/**
|
|
27
|
+
* Compute full benchmark results: aggregation + statistical comparison.
|
|
28
|
+
*
|
|
29
|
+
* For completed benchmarks, results are cached via store.saveResults().
|
|
30
|
+
* For running benchmarks, results are computed on-the-fly (not cached).
|
|
31
|
+
*/
|
|
32
|
+
async computeResults(benchmark, eventStore, benchmarkStore) {
|
|
33
|
+
// For completed benchmarks, try to return cached results first
|
|
34
|
+
if (benchmark.status === 'completed' && benchmarkStore) {
|
|
35
|
+
const cached = benchmarkStore.getResults(benchmark.tenantId, benchmark.id);
|
|
36
|
+
if (cached)
|
|
37
|
+
return cached;
|
|
38
|
+
}
|
|
39
|
+
const variants = benchmark.variants;
|
|
40
|
+
const metrics = benchmark.metrics;
|
|
41
|
+
// Step 1: Aggregate metrics for each variant
|
|
42
|
+
const variantMetricsList = [];
|
|
43
|
+
const variantData = [];
|
|
44
|
+
for (const variant of variants) {
|
|
45
|
+
const aggregated = await this.aggregator.aggregate(eventStore, variant, metrics, benchmark.timeRange);
|
|
46
|
+
variantMetricsList.push({
|
|
47
|
+
variantId: variant.id,
|
|
48
|
+
variantName: variant.name,
|
|
49
|
+
sessionCount: aggregated.sessionCount,
|
|
50
|
+
metrics: aggregated.metrics,
|
|
51
|
+
});
|
|
52
|
+
variantData.push({
|
|
53
|
+
id: variant.id,
|
|
54
|
+
name: variant.name,
|
|
55
|
+
metrics: aggregated.metrics,
|
|
56
|
+
});
|
|
57
|
+
}
|
|
58
|
+
// Step 2: Pairwise comparisons for each metric
|
|
59
|
+
const comparisons = [];
|
|
60
|
+
for (let i = 0; i < variantData.length; i++) {
|
|
61
|
+
for (let j = i + 1; j < variantData.length; j++) {
|
|
62
|
+
const vA = variantData[i];
|
|
63
|
+
const vB = variantData[j];
|
|
64
|
+
for (const metric of metrics) {
|
|
65
|
+
try {
|
|
66
|
+
const comparison = this.comparator.compare(vA, vB, metric);
|
|
67
|
+
comparisons.push(comparison);
|
|
68
|
+
}
|
|
69
|
+
catch {
|
|
70
|
+
// Skip metrics where data is missing for a variant
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
// Step 3: Generate summary
|
|
76
|
+
const summary = this.formatSummary(comparisons, variantMetricsList);
|
|
77
|
+
const results = {
|
|
78
|
+
benchmarkId: benchmark.id,
|
|
79
|
+
tenantId: benchmark.tenantId,
|
|
80
|
+
variants: variantMetricsList,
|
|
81
|
+
comparisons,
|
|
82
|
+
summary,
|
|
83
|
+
computedAt: new Date().toISOString(),
|
|
84
|
+
};
|
|
85
|
+
// Cache results for completed benchmarks
|
|
86
|
+
if (benchmark.status === 'completed' && benchmarkStore) {
|
|
87
|
+
benchmarkStore.saveResults(benchmark.tenantId, benchmark.id, results);
|
|
88
|
+
}
|
|
89
|
+
return results;
|
|
90
|
+
}
|
|
91
|
+
/**
|
|
92
|
+
* Generate a human-readable summary of benchmark comparisons.
|
|
93
|
+
*/
|
|
94
|
+
formatSummary(comparisons, variants) {
|
|
95
|
+
const parts = [];
|
|
96
|
+
// Warning: insufficient data
|
|
97
|
+
const lowDataVariants = variants.filter((v) => v.sessionCount < MIN_SESSIONS_FOR_RELIABLE_RESULTS);
|
|
98
|
+
if (lowDataVariants.length > 0) {
|
|
99
|
+
const names = lowDataVariants
|
|
100
|
+
.map((v) => `${v.variantName}: ${v.sessionCount} sessions`)
|
|
101
|
+
.join(', ');
|
|
102
|
+
parts.push(`⚠️ Insufficient data (${names}). Results may be unreliable.`);
|
|
103
|
+
}
|
|
104
|
+
// Group significant comparisons by winner
|
|
105
|
+
const significant = comparisons.filter((c) => c.significant);
|
|
106
|
+
if (significant.length === 0) {
|
|
107
|
+
parts.push('No significant differences found between variants.' +
|
|
108
|
+
(variants.length > 0
|
|
109
|
+
? ` Current sample sizes: ${variants.map((v) => `${v.variantName}: ${v.sessionCount}`).join(', ')}.`
|
|
110
|
+
: ''));
|
|
111
|
+
return parts.join('\n');
|
|
112
|
+
}
|
|
113
|
+
// Build wins map: winnerId → list of comparisons
|
|
114
|
+
const winsByVariant = new Map();
|
|
115
|
+
for (const comp of significant) {
|
|
116
|
+
if (!comp.winner)
|
|
117
|
+
continue;
|
|
118
|
+
const wins = winsByVariant.get(comp.winner) ?? [];
|
|
119
|
+
wins.push(comp);
|
|
120
|
+
winsByVariant.set(comp.winner, wins);
|
|
121
|
+
}
|
|
122
|
+
// Find variant names
|
|
123
|
+
const variantNameMap = new Map();
|
|
124
|
+
for (const v of variants) {
|
|
125
|
+
variantNameMap.set(v.variantId, v.variantName);
|
|
126
|
+
}
|
|
127
|
+
// Also look in comparisons for variant names
|
|
128
|
+
for (const c of comparisons) {
|
|
129
|
+
variantNameMap.set(c.variantA.id, c.variantA.name);
|
|
130
|
+
variantNameMap.set(c.variantB.id, c.variantB.name);
|
|
131
|
+
}
|
|
132
|
+
// Generate per-winner summaries
|
|
133
|
+
for (const [winnerId, wins] of winsByVariant) {
|
|
134
|
+
const winnerName = variantNameMap.get(winnerId) ?? winnerId;
|
|
135
|
+
// Group by loser
|
|
136
|
+
const loserIds = new Set(wins.map((c) => c.winner === c.variantA.id ? c.variantB.id : c.variantA.id));
|
|
137
|
+
for (const loserId of loserIds) {
|
|
138
|
+
const loserName = variantNameMap.get(loserId) ?? loserId;
|
|
139
|
+
const relevantWins = wins.filter((c) => (c.winner === c.variantA.id && c.variantB.id === loserId) ||
|
|
140
|
+
(c.winner === c.variantB.id && c.variantA.id === loserId));
|
|
141
|
+
const metricTexts = relevantWins.map((comp) => {
|
|
142
|
+
const metricName = METRIC_DISPLAY_NAMES[comp.metric] ?? comp.metric;
|
|
143
|
+
const pctSign = comp.percentDiff >= 0 ? '+' : '';
|
|
144
|
+
const pctStr = `${pctSign}${comp.percentDiff.toFixed(0)}%`;
|
|
145
|
+
return `${metricName} (${pctStr}, ${comp.confidence})`;
|
|
146
|
+
});
|
|
147
|
+
parts.push(`${winnerName} outperforms ${loserName} on ${metricTexts.join(' and ')}.`);
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
// Note metrics with no significant differences
|
|
151
|
+
const notSignificant = comparisons.filter((c) => !c.significant);
|
|
152
|
+
if (notSignificant.length > 0) {
|
|
153
|
+
const nsMetrics = [...new Set(notSignificant.map((c) => METRIC_DISPLAY_NAMES[c.metric] ?? c.metric))];
|
|
154
|
+
parts.push(`No significant difference on ${nsMetrics.join(', ')}.`);
|
|
155
|
+
}
|
|
156
|
+
return parts.join('\n');
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
//# sourceMappingURL=engine.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"engine.js","sourceRoot":"","sources":["../../../src/lib/benchmark/engine.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAUH,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,qBAAqB,EAAE,MAAM,kBAAkB,CAAC;AAGzD,8DAA8D;AAE9D,MAAM,iCAAiC,GAAG,EAAE,CAAC;AAE7C,8DAA8D;AAE9D,MAAM,oBAAoB,GAAoC;IAC5D,YAAY,EAAE,cAAc;IAC5B,UAAU,EAAE,YAAY;IACxB,QAAQ,EAAE,MAAM;IAChB,WAAW,EAAE,SAAS;IACtB,iBAAiB,EAAE,mBAAmB;IACtC,eAAe,EAAE,iBAAiB;IAClC,UAAU,EAAE,QAAQ;IACpB,YAAY,EAAE,UAAU;CACzB,CAAC;AAEF,8DAA8D;AAE9D,MAAM,OAAO,eAAe;IACT,UAAU,GAAG,IAAI,gBAAgB,EAAE,CAAC;IACpC,UAAU,GAAG,IAAI,qBAAqB,EAAE,CAAC;IAE1D;;;;;OAKG;IACH,KAAK,CAAC,cAAc,CAClB,SAAgC,EAChC,UAAuB,EACvB,cAA+B;QAE/B,+DAA+D;QAC/D,IAAI,SAAS,CAAC,MAAM,KAAK,WAAW,IAAI,cAAc,EAAE,CAAC;YACvD,MAAM,MAAM,GAAG,cAAc,CAAC,UAAU,CAAC,SAAS,CAAC,QAAQ,EAAE,SAAS,CAAC,EAAE,CAAC,CAAC;YAC3E,IAAI,MAAM;gBAAE,OAAO,MAAM,CAAC;QAC5B,CAAC;QAED,MAAM,QAAQ,GAAG,SAAS,CAAC,QAAQ,CAAC;QACpC,MAAM,OAAO,GAAG,SAAS,CAAC,OAAO,CAAC;QAElC,6CAA6C;QAC7C,MAAM,kBAAkB,GAAqB,EAAE,CAAC;QAChD,MAAM,WAAW,GAIZ,EAAE,CAAC;QAER,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,SAAS,CAChD,UAAU,EACV,OAAO,EACP,OAAO,EACP,SAAS,CAAC,SAAS,CACpB,CAAC;YAEF,kBAAkB,CAAC,IAAI,CAAC;gBACtB,SAAS,EAAE,OAAO,CAAC,EAAE;gBACrB,WAAW,EAAE,OAAO,CAAC,IAAI;gBACzB,YAAY,EAAE,UAAU,CAAC,YAAY;gBACrC,OAAO,EAAE,UAAU,CAAC,OAAO;aAC5B,CAAC,CAAC;YAEH,WAAW,CAAC,IAAI,CAAC;gBACf,EAAE,EAAE,OAAO,CAAC,EAAE;gBACd,IAAI,EAAE,OAAO,CAAC,IAAI;gBAClB,OAAO,EAAE,UAAU,CAAC,OAAO;aAC5B,CAAC,CAAC;QACL,CAAC;QAED,+CAA+C;QAC/C,MAAM,WAAW,GAAuB,EAAE,CAAC;QAE3C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC5C,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAChD,MAAM,EAAE,GAAG,WAAW,CAAC,CAAC,CAAE,CAAC;gBAC3B,MAAM,EAAE,GAAG,WAAW,CAAC,CAAC,CAAE,CAAC;gBAE3B,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;oBAC7B,IAAI,CAAC;wBACH,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,EAAE,EAAE,EAAE,MAAM,CAAC,CAAC;wBAC3D,WAAW,CAAC,IAAI,CAAC,UAAU,
CAAC,CAAC;oBAC/B,CAAC;oBAAC,MAAM,CAAC;wBACP,mDAAmD;oBACrD,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAED,2BAA2B;QAC3B,MAAM,OAAO,GAAG,IAAI,CAAC,aAAa,CAAC,WAAW,EAAE,kBAAkB,CAAC,CAAC;QAEpE,MAAM,OAAO,GAAqB;YAChC,WAAW,EAAE,SAAS,CAAC,EAAE;YACzB,QAAQ,EAAE,SAAS,CAAC,QAAQ;YAC5B,QAAQ,EAAE,kBAAkB;YAC5B,WAAW;YACX,OAAO;YACP,UAAU,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACrC,CAAC;QAEF,yCAAyC;QACzC,IAAI,SAAS,CAAC,MAAM,KAAK,WAAW,IAAI,cAAc,EAAE,CAAC;YACvD,cAAc,CAAC,WAAW,CAAC,SAAS,CAAC,QAAQ,EAAE,SAAS,CAAC,EAAE,EAAE,OAAO,CAAC,CAAC;QACxE,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;IAED;;OAEG;IACH,aAAa,CACX,WAA+B,EAC/B,QAA0B;QAE1B,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,6BAA6B;QAC7B,MAAM,eAAe,GAAG,QAAQ,CAAC,MAAM,CACrC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,YAAY,GAAG,iCAAiC,CAC1D,CAAC;QACF,IAAI,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC/B,MAAM,KAAK,GAAG,eAAe;iBAC1B,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,WAAW,KAAK,CAAC,CAAC,YAAY,WAAW,CAAC;iBAC1D,IAAI,CAAC,IAAI,CAAC,CAAC;YACd,KAAK,CAAC,IAAI,CAAC,yBAAyB,KAAK,+BAA+B,CAAC,CAAC;QAC5E,CAAC;QAED,0CAA0C;QAC1C,MAAM,WAAW,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;QAE7D,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC7B,KAAK,CAAC,IAAI,CACR,oDAAoD;gBAClD,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC;oBAClB,CAAC,CAAC,0BAA0B,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,WAAW,KAAK,CAAC,CAAC,YAAY,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG;oBACpG,CAAC,CAAC,EAAE,CAAC,CACV,CAAC;YACF,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC1B,CAAC;QAED,iDAAiD;QACjD,MAAM,aAAa,GAAG,IAAI,GAAG,EAA8B,CAAC;QAC5D,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;YAC/B,IAAI,CAAC,IAAI,CAAC,MAAM;gBAAE,SAAS;YAC3B,MAAM,IAAI,GAAG,aAAa,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;YAClD,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAChB,aAAa,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;QACvC,CAAC;QAED,qBAAqB;QACrB,MAAM,cAAc,GAAG,IAAI,GAAG,EAAkB,CAAC;QACjD,KAAK,MAAM,CAAC,IAAI,QAAQ,EAAE,CAAC;YACzB,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,WAAW,CAAC,CAAC;QACjD,CAAC;QACD,6CAA6C;QAC7C,KAAK,MAAM,CAAC,I
AAI,WAAW,EAAE,CAAC;YAC5B,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;YACnD,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QACrD,CAAC;QAED,gCAAgC;QAChC,KAAK,MAAM,CAAC,QAAQ,EAAE,IAAI,CAAC,IAAI,aAAa,EAAE,CAAC;YAC7C,MAAM,UAAU,GAAG,cAAc,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,QAAQ,CAAC;YAE5D,iBAAiB;YACjB,MAAM,QAAQ,GAAG,IAAI,GAAG,CACtB,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CACb,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,EAAE,CAC3D,CACF,CAAC;YAEF,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;gBAC/B,MAAM,SAAS,GAAG,cAAc,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,OAAO,CAAC;gBACzD,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,CAC9B,CAAC,CAAC,EAAE,EAAE,CACJ,CAAC,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,QAAQ,CAAC,EAAE,IAAI,CAAC,CAAC,QAAQ,CAAC,EAAE,KAAK,OAAO,CAAC;oBACzD,CAAC,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,QAAQ,CAAC,EAAE,IAAI,CAAC,CAAC,QAAQ,CAAC,EAAE,KAAK,OAAO,CAAC,CAC5D,CAAC;gBACF,MAAM,WAAW,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;oBAC5C,MAAM,UAAU,GAAG,oBAAoB,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,IAAI,CAAC,MAAM,CAAC;oBACpE,MAAM,OAAO,GAAG,IAAI,CAAC,WAAW,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;oBACjD,MAAM,MAAM,GAAG,GAAG,OAAO,GAAG,IAAI,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;oBAC3D,OAAO,GAAG,UAAU,KAAK,MAAM,KAAK,IAAI,CAAC,UAAU,GAAG,CAAC;gBACzD,CAAC,CAAC,CAAC;gBAEH,KAAK,CAAC,IAAI,CACR,GAAG,UAAU,gBAAgB,SAAS,OAAO,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAC1E,CAAC;YACJ,CAAC;QACH,CAAC;QAED,+CAA+C;QAC/C,MAAM,cAAc,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;QACjE,IAAI,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC9B,MAAM,SAAS,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,oBAAoB,CAAC,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;YACtG,KAAK,CAAC,IAAI,CAAC,gCAAgC,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACtE,CAAC;QAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC;CACF"}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MetricAggregator (Story 3.2)
|
|
3
|
+
*
|
|
4
|
+
* Computes descriptive statistics for each benchmark metric across a variant's sessions.
|
|
5
|
+
* Provides clean inputs for the StatisticalComparator.
|
|
6
|
+
*/
|
|
7
|
+
import type { BenchmarkMetric, BenchmarkVariant, IEventStore, MetricStats, Session } from '@agentlensai/core';
|
|
8
|
+
export declare class MetricAggregator {
    /**
     * Collects the sessions carrying the variant's tag and summarises each
     * requested metric over them.
     *
     * @param store - Event store the sessions are queried from
     * @param variant - Benchmark variant whose tag selects the sessions
     * @param metrics - Metrics to compute statistics for
     * @param timeRange - Optional `from`/`to` bounds for the session query
     * @returns The number of sessions found and one MetricStats per requested metric
     */
    aggregate(
        store: IEventStore,
        variant: BenchmarkVariant,
        metrics: BenchmarkMetric[],
        timeRange?: {
            from: string;
            to: string;
        }
    ): Promise<{
        sessionCount: number;
        metrics: Record<BenchmarkMetric, MetricStats>;
    }>;

    /**
     * Pulls the numeric values of one metric out of a list of sessions.
     *
     * @param sessions - Sessions to read from
     * @param metric - Metric to extract
     * @returns The extracted numeric values
     */
    extractMetricValues(sessions: Session[], metric: BenchmarkMetric): number[];

    /**
     * Reads the value of one metric from a single session.
     */
    private extractSingleValue;

    /**
     * Derives descriptive statistics from a list of numbers.
     *
     * @param values - Numeric samples
     * @returns Descriptive statistics for the samples
     */
    computeStats(values: number[]): MetricStats;
}
|
|
38
|
+
//# sourceMappingURL=metric-aggregator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"metric-aggregator.d.ts","sourceRoot":"","sources":["../../../src/lib/benchmark/metric-aggregator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EACV,eAAe,EACf,gBAAgB,EAChB,WAAW,EACX,WAAW,EACX,OAAO,EACR,MAAM,mBAAmB,CAAC;AAI3B,qBAAa,gBAAgB;IAC3B;;;;;;;;OAQG;IACG,SAAS,CACb,KAAK,EAAE,WAAW,EAClB,OAAO,EAAE,gBAAgB,EACzB,OAAO,EAAE,eAAe,EAAE,EAC1B,SAAS,CAAC,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,EAAE,EAAE,MAAM,CAAA;KAAE,GACvC,OAAO,CAAC;QAAE,YAAY,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAC,eAAe,EAAE,WAAW,CAAC,CAAA;KAAE,CAAC;IAiCnF;;OAEG;IACH,mBAAmB,CAAC,QAAQ,EAAE,OAAO,EAAE,EAAE,MAAM,EAAE,eAAe,GAAG,MAAM,EAAE;IAa3E;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAsD1B;;OAEG;IACH,YAAY,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,WAAW;CA6C5C"}
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MetricAggregator (Story 3.2)
|
|
3
|
+
*
|
|
4
|
+
* Computes descriptive statistics for each benchmark metric across a variant's sessions.
|
|
5
|
+
* Provides clean inputs for the StatisticalComparator.
|
|
6
|
+
*/
|
|
7
|
+
// ─── Public Interface ──────────────────────────────────────
|
|
8
|
+
export class MetricAggregator {
|
|
9
|
+
/**
|
|
10
|
+
* Aggregate metrics for a variant by querying sessions matching the variant's tag.
|
|
11
|
+
*
|
|
12
|
+
* @param store - Event store to query sessions from
|
|
13
|
+
* @param variant - The benchmark variant (identifies sessions via tag)
|
|
14
|
+
* @param metrics - Which metrics to compute
|
|
15
|
+
* @param timeRange - Optional time range filter
|
|
16
|
+
* @returns Record of metric → MetricStats
|
|
17
|
+
*/
|
|
18
|
+
async aggregate(store, variant, metrics, timeRange) {
|
|
19
|
+
// Paginate through all sessions (store caps at 500 per query)
|
|
20
|
+
const sessions = [];
|
|
21
|
+
const pageSize = 500;
|
|
22
|
+
let offset = 0;
|
|
23
|
+
while (true) {
|
|
24
|
+
const page = await store.querySessions({
|
|
25
|
+
tenantId: variant.tenantId,
|
|
26
|
+
agentId: variant.agentId,
|
|
27
|
+
tags: [variant.tag],
|
|
28
|
+
from: timeRange?.from,
|
|
29
|
+
to: timeRange?.to,
|
|
30
|
+
limit: pageSize,
|
|
31
|
+
offset,
|
|
32
|
+
});
|
|
33
|
+
sessions.push(...page.sessions);
|
|
34
|
+
if (page.sessions.length < pageSize)
|
|
35
|
+
break;
|
|
36
|
+
offset += pageSize;
|
|
37
|
+
}
|
|
38
|
+
const result = {};
|
|
39
|
+
for (const metric of metrics) {
|
|
40
|
+
const values = this.extractMetricValues(sessions, metric);
|
|
41
|
+
result[metric] = this.computeStats(values);
|
|
42
|
+
}
|
|
43
|
+
return {
|
|
44
|
+
sessionCount: sessions.length,
|
|
45
|
+
metrics: result,
|
|
46
|
+
};
|
|
47
|
+
}
|
|
48
|
+
/**
|
|
49
|
+
* Extract numeric values for a metric from a list of sessions.
|
|
50
|
+
*/
|
|
51
|
+
extractMetricValues(sessions, metric) {
|
|
52
|
+
const values = [];
|
|
53
|
+
for (const session of sessions) {
|
|
54
|
+
const value = this.extractSingleValue(session, metric);
|
|
55
|
+
if (value !== null && !isNaN(value)) {
|
|
56
|
+
values.push(value);
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
return values;
|
|
60
|
+
}
|
|
61
|
+
/**
|
|
62
|
+
* Extract a single metric value from a session.
|
|
63
|
+
*/
|
|
64
|
+
extractSingleValue(session, metric) {
|
|
65
|
+
switch (metric) {
|
|
66
|
+
case 'avg_cost':
|
|
67
|
+
return session.totalCostUsd;
|
|
68
|
+
case 'error_rate':
|
|
69
|
+
// Error rate = errorCount / eventCount
|
|
70
|
+
if (session.eventCount === 0)
|
|
71
|
+
return null;
|
|
72
|
+
return session.errorCount / session.eventCount;
|
|
73
|
+
case 'completion_rate':
|
|
74
|
+
// Binary: 1 if completed, 0 otherwise
|
|
75
|
+
return session.status === 'completed' ? 1 : 0;
|
|
76
|
+
case 'tool_success_rate':
|
|
77
|
+
// errorCount includes ALL errors (LLM, system, etc.), not just tool errors.
|
|
78
|
+
// Use conservative proxy: cap tool errors at toolCallCount to avoid negative rates.
|
|
79
|
+
if (session.toolCallCount === 0)
|
|
80
|
+
return null;
|
|
81
|
+
const toolErrors = Math.min(session.errorCount, session.toolCallCount);
|
|
82
|
+
return Math.max(0, (session.toolCallCount - toolErrors) / session.toolCallCount);
|
|
83
|
+
case 'avg_tokens':
|
|
84
|
+
return session.totalInputTokens + session.totalOutputTokens;
|
|
85
|
+
case 'avg_duration': {
|
|
86
|
+
// Duration in milliseconds
|
|
87
|
+
if (!session.endedAt || !session.startedAt)
|
|
88
|
+
return null;
|
|
89
|
+
const start = new Date(session.startedAt).getTime();
|
|
90
|
+
const end = new Date(session.endedAt).getTime();
|
|
91
|
+
const durationMs = end - start;
|
|
92
|
+
return durationMs > 0 ? durationMs : null;
|
|
93
|
+
}
|
|
94
|
+
case 'avg_latency':
|
|
95
|
+
// avg_latency requires event-level query; we approximate from session-level data
|
|
96
|
+
// latency ≈ duration / llmCallCount (average time per LLM call)
|
|
97
|
+
if (!session.endedAt || !session.startedAt || session.llmCallCount === 0)
|
|
98
|
+
return null;
|
|
99
|
+
const startMs = new Date(session.startedAt).getTime();
|
|
100
|
+
const endMs = new Date(session.endedAt).getTime();
|
|
101
|
+
const totalDuration = endMs - startMs;
|
|
102
|
+
return totalDuration > 0 ? totalDuration / session.llmCallCount : null;
|
|
103
|
+
case 'health_score':
|
|
104
|
+
// health_score is a composite score that would need the HealthComputer.
|
|
105
|
+
// At the session level, we compute a simplified score:
|
|
106
|
+
// Based on error rate, completion, and tool success
|
|
107
|
+
// This is a placeholder that returns null if not computed elsewhere.
|
|
108
|
+
return null;
|
|
109
|
+
default:
|
|
110
|
+
return null;
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
/**
|
|
114
|
+
* Compute descriptive statistics from an array of numeric values.
|
|
115
|
+
*/
|
|
116
|
+
computeStats(values) {
|
|
117
|
+
if (values.length === 0) {
|
|
118
|
+
return {
|
|
119
|
+
mean: 0,
|
|
120
|
+
median: 0,
|
|
121
|
+
stddev: 0,
|
|
122
|
+
min: 0,
|
|
123
|
+
max: 0,
|
|
124
|
+
count: 0,
|
|
125
|
+
};
|
|
126
|
+
}
|
|
127
|
+
const n = values.length;
|
|
128
|
+
const sorted = [...values].sort((a, b) => a - b);
|
|
129
|
+
const sum = values.reduce((acc, v) => acc + v, 0);
|
|
130
|
+
const mean = sum / n;
|
|
131
|
+
// Median
|
|
132
|
+
let median;
|
|
133
|
+
if (n % 2 === 0) {
|
|
134
|
+
median = (sorted[n / 2 - 1] + sorted[n / 2]) / 2;
|
|
135
|
+
}
|
|
136
|
+
else {
|
|
137
|
+
median = sorted[Math.floor(n / 2)];
|
|
138
|
+
}
|
|
139
|
+
// Sample standard deviation (Bessel's correction)
|
|
140
|
+
let variance;
|
|
141
|
+
if (n < 2) {
|
|
142
|
+
variance = 0;
|
|
143
|
+
}
|
|
144
|
+
else {
|
|
145
|
+
const sumSquaredDiffs = values.reduce((acc, v) => acc + (v - mean) ** 2, 0);
|
|
146
|
+
variance = sumSquaredDiffs / (n - 1);
|
|
147
|
+
}
|
|
148
|
+
const stddev = Math.sqrt(variance);
|
|
149
|
+
return {
|
|
150
|
+
mean,
|
|
151
|
+
median,
|
|
152
|
+
stddev,
|
|
153
|
+
min: sorted[0],
|
|
154
|
+
max: sorted[n - 1],
|
|
155
|
+
count: n,
|
|
156
|
+
};
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
//# sourceMappingURL=metric-aggregator.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"metric-aggregator.js","sourceRoot":"","sources":["../../../src/lib/benchmark/metric-aggregator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAUH,8DAA8D;AAE9D,MAAM,OAAO,gBAAgB;IAC3B;;;;;;;;OAQG;IACH,KAAK,CAAC,SAAS,CACb,KAAkB,EAClB,OAAyB,EACzB,OAA0B,EAC1B,SAAwC;QAExC,8DAA8D;QAC9D,MAAM,QAAQ,GAAc,EAAE,CAAC;QAC/B,MAAM,QAAQ,GAAG,GAAG,CAAC;QACrB,IAAI,MAAM,GAAG,CAAC,CAAC;QACf,OAAO,IAAI,EAAE,CAAC;YACZ,MAAM,IAAI,GAAG,MAAM,KAAK,CAAC,aAAa,CAAC;gBACrC,QAAQ,EAAE,OAAO,CAAC,QAAQ;gBAC1B,OAAO,EAAE,OAAO,CAAC,OAAO;gBACxB,IAAI,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC;gBACnB,IAAI,EAAE,SAAS,EAAE,IAAI;gBACrB,EAAE,EAAE,SAAS,EAAE,EAAE;gBACjB,KAAK,EAAE,QAAQ;gBACf,MAAM;aACP,CAAC,CAAC;YACH,QAAQ,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,QAAQ,CAAC,CAAC;YAChC,IAAI,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,QAAQ;gBAAE,MAAM;YAC3C,MAAM,IAAI,QAAQ,CAAC;QACrB,CAAC;QAED,MAAM,MAAM,GAAG,EAA0C,CAAC;QAE1D,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;YAC7B,MAAM,MAAM,GAAG,IAAI,CAAC,mBAAmB,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;YAC1D,MAAM,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;QAC7C,CAAC;QAED,OAAO;YACL,YAAY,EAAE,QAAQ,CAAC,MAAM;YAC7B,OAAO,EAAE,MAAM;SAChB,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,mBAAmB,CAAC,QAAmB,EAAE,MAAuB;QAC9D,MAAM,MAAM,GAAa,EAAE,CAAC;QAE5B,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,MAAM,KAAK,GAAG,IAAI,CAAC,kBAAkB,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;YACvD,IAAI,KAAK,KAAK,IAAI,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;gBACpC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACrB,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,kBAAkB,CAAC,OAAgB,EAAE,MAAuB;QAClE,QAAQ,MAAM,EAAE,CAAC;YACf,KAAK,UAAU;gBACb,OAAO,OAAO,CAAC,YAAY,CAAC;YAE9B,KAAK,YAAY;gBACf,uCAAuC;gBACvC,IAAI,OAAO,CAAC,UAAU,KAAK,CAAC;oBAAE,OAAO,IAAI,CAAC;gBAC1C,OAAO,OAAO,CAAC,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;YAEjD,KAAK,iBAAiB;gBACpB,sCAAsC;gBACtC,OAAO,OAAO,CAAC,MAAM,KAAK,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAEhD,KAAK,mBAAmB;gBACtB,4EAA4E;gBAC5E,oFAAoF;gBACpF,IAAI,OAAO,CAAC,aAAa,KAAK,CAAC;oBAAE,OAAO,IAAI,CAAC;gBAC7C,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,UAAU,EAAE
,OAAO,CAAC,aAAa,CAAC,CAAC;gBACvE,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,OAAO,CAAC,aAAa,GAAG,UAAU,CAAC,GAAG,OAAO,CAAC,aAAa,CAAC,CAAC;YAEnF,KAAK,YAAY;gBACf,OAAO,OAAO,CAAC,gBAAgB,GAAG,OAAO,CAAC,iBAAiB,CAAC;YAE9D,KAAK,cAAc,CAAC,CAAC,CAAC;gBACpB,2BAA2B;gBAC3B,IAAI,CAAC,OAAO,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,SAAS;oBAAE,OAAO,IAAI,CAAC;gBACxD,MAAM,KAAK,GAAG,IAAI,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,OAAO,EAAE,CAAC;gBACpD,MAAM,GAAG,GAAG,IAAI,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,OAAO,EAAE,CAAC;gBAChD,MAAM,UAAU,GAAG,GAAG,GAAG,KAAK,CAAC;gBAC/B,OAAO,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC;YAC5C,CAAC;YAED,KAAK,aAAa;gBAChB,iFAAiF;gBACjF,gEAAgE;gBAChE,IAAI,CAAC,OAAO,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,SAAS,IAAI,OAAO,CAAC,YAAY,KAAK,CAAC;oBAAE,OAAO,IAAI,CAAC;gBACtF,MAAM,OAAO,GAAG,IAAI,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,OAAO,EAAE,CAAC;gBACtD,MAAM,KAAK,GAAG,IAAI,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,OAAO,EAAE,CAAC;gBAClD,MAAM,aAAa,GAAG,KAAK,GAAG,OAAO,CAAC;gBACtC,OAAO,aAAa,GAAG,CAAC,CAAC,CAAC,CAAC,aAAa,GAAG,OAAO,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC;YAEzE,KAAK,cAAc;gBACjB,wEAAwE;gBACxE,uDAAuD;gBACvD,oDAAoD;gBACpD,qEAAqE;gBACrE,OAAO,IAAI,CAAC;YAEd;gBACE,OAAO,IAAI,CAAC;QAChB,CAAC;IACH,CAAC;IAED;;OAEG;IACH,YAAY,CAAC,MAAgB;QAC3B,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACxB,OAAO;gBACL,IAAI,EAAE,CAAC;gBACP,MAAM,EAAE,CAAC;gBACT,MAAM,EAAE,CAAC;gBACT,GAAG,EAAE,CAAC;gBACN,GAAG,EAAE,CAAC;gBACN,KAAK,EAAE,CAAC;aACT,CAAC;QACJ,CAAC;QAED,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;QACxB,MAAM,MAAM,GAAG,CAAC,GAAG,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QAEjD,MAAM,GAAG,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;QAClD,MAAM,IAAI,GAAG,GAAG,GAAG,CAAC,CAAC;QAErB,SAAS;QACT,IAAI,MAAc,CAAC;QACnB,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;YAChB,MAAM,GAAG,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAE,GAAG,MAAM,CAAC,CAAC,GAAG,CAAC,CAAE,CAAC,GAAG,CAAC,CAAC;QACrD,CAAC;aAAM,CAAC;YACN,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,CA
AC,CAAC,CAAE,CAAC;QACtC,CAAC;QAED,kDAAkD;QAClD,IAAI,QAAgB,CAAC;QACrB,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;YACV,QAAQ,GAAG,CAAC,CAAC;QACf,CAAC;aAAM,CAAC;YACN,MAAM,eAAe,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;YAC5E,QAAQ,GAAG,eAAe,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QACvC,CAAC;QACD,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAEnC,OAAO;YACL,IAAI;YACJ,MAAM;YACN,MAAM;YACN,GAAG,EAAE,MAAM,CAAC,CAAC,CAAE;YACf,GAAG,EAAE,MAAM,CAAC,CAAC,GAAG,CAAC,CAAE;YACnB,KAAK,EAAE,CAAC;SACT,CAAC;IACJ,CAAC;CACF"}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* StatisticalComparator (Story 3.3)
|
|
3
|
+
*
|
|
4
|
+
* Hypothesis testing for benchmark variant comparisons.
|
|
5
|
+
* Implements Welch's t-test and chi-squared test with numerical p-value computation.
|
|
6
|
+
*/
|
|
7
|
+
import type { BenchmarkMetric, MetricComparison, MetricStats } from '@agentlensai/core';
|
|
8
|
+
/** Numeric outcome of a t-test as returned by StatisticalComparator.welchTTest. */
export interface TTestResult {
    /** The computed t statistic. */
    tStatistic: number;
    /** Degrees of freedom used for the test. */
    degreesOfFreedom: number;
    /** The p-value of the test. */
    pValue: number;
    /** Lower and upper bounds of the reported confidence interval. */
    confidenceInterval: { lower: number; upper: number };
    /** Effect size (presumably Cohen's d, going by the name; confirm against the implementation). */
    cohenD: number;
}
|
|
18
|
+
/** Numeric outcome of a chi-squared test as returned by StatisticalComparator.chiSquaredTest. */
export interface ChiSquaredResult {
    /** The computed chi-squared statistic. */
    chiSquared: number;
    /** The p-value of the test. */
    pValue: number;
    /** Effect size (presumably the phi coefficient, going by the name; confirm against the implementation). */
    phi: number;
}
|
|
23
|
+
export declare class StatisticalComparator {
    /**
     * Runs Welch's t-test on the summary statistics of two continuous metrics.
     *
     * @param statsA - Summary statistics of the first sample
     * @param statsB - Summary statistics of the second sample
     * @returns The t-test result
     */
    welchTTest(statsA: MetricStats, statsB: MetricStats): TTestResult;

    /**
     * Runs a chi-squared test with Yates' continuity correction to compare two proportions.
     *
     * @param successesA - Success count in the first group
     * @param totalA - Size of the first group
     * @param successesB - Success count in the second group
     * @param totalB - Size of the second group
     * @returns The chi-squared test result
     */
    chiSquaredTest(
        successesA: number,
        totalA: number,
        successesB: number,
        totalB: number
    ): ChiSquaredResult;

    /**
     * Compares two variants on one metric, choosing the appropriate test for it.
     *
     * @param variantA - First variant with its per-metric statistics
     * @param variantB - Second variant with its per-metric statistics
     * @param metric - The metric to compare on
     * @returns The comparison for that metric
     */
    compare(
        variantA: {
            id: string;
            name: string;
            metrics: Record<string, MetricStats>;
        },
        variantB: {
            id: string;
            name: string;
            metrics: Record<string, MetricStats>;
        },
        metric: BenchmarkMetric
    ): MetricComparison;

    private compareContinuous;

    private compareProportions;

    /**
     * Maps a p-value to a confidence star rating.
     *
     * @param pValue - The p-value to rate
     * @returns One of the four rating strings
     */
    confidenceStars(pValue: number): '★★★' | '★★' | '★' | '—';
}
|
|
51
|
+
//# sourceMappingURL=statistical.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"statistical.d.ts","sourceRoot":"","sources":["../../../src/lib/benchmark/statistical.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EACV,eAAe,EACf,gBAAgB,EAChB,WAAW,EAEZ,MAAM,mBAAmB,CAAC;AAiC3B,MAAM,WAAW,WAAW;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,gBAAgB,EAAE,MAAM,CAAC;IACzB,MAAM,EAAE,MAAM,CAAC;IACf,kBAAkB,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC;IACrD,MAAM,EAAE,MAAM,CAAC;CAChB;AAID,MAAM,WAAW,gBAAgB;IAC/B,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,MAAM,CAAC;IACf,GAAG,EAAE,MAAM,CAAC;CACb;AAgKD,qBAAa,qBAAqB;IAChC;;OAEG;IACH,UAAU,CAAC,MAAM,EAAE,WAAW,EAAE,MAAM,EAAE,WAAW,GAAG,WAAW;IAmEjE;;OAEG;IACH,cAAc,CACZ,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,EACd,UAAU,EAAE,MAAM,EAClB,MAAM,EAAE,MAAM,GACb,gBAAgB;IA8CnB;;OAEG;IACH,OAAO,CACL,QAAQ,EAAE;QAAE,EAAE,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,WAAW,CAAC,CAAA;KAAE,EAC5E,QAAQ,EAAE;QAAE,EAAE,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,WAAW,CAAC,CAAA;KAAE,EAC5E,MAAM,EAAE,eAAe,GACtB,gBAAgB;IAiBnB,OAAO,CAAC,iBAAiB;IAwCzB,OAAO,CAAC,kBAAkB;IAwD1B;;OAEG;IACH,eAAe,CAAC,MAAM,EAAE,MAAM,GAAG,KAAK,GAAG,IAAI,GAAG,GAAG,GAAG,GAAG;CAM1D"}
|