@fuzdev/fuz_util 0.54.0 → 0.55.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/benchmark_baseline.d.ts +6 -0
- package/dist/benchmark_baseline.d.ts.map +1 -1
- package/dist/benchmark_baseline.js +15 -8
- package/dist/benchmark_stats.d.ts +25 -5
- package/dist/benchmark_stats.d.ts.map +1 -1
- package/dist/benchmark_stats.js +43 -35
- package/package.json +3 -3
- package/src/lib/benchmark_baseline.ts +26 -9
- package/src/lib/benchmark_stats.ts +61 -39
|
@@ -83,6 +83,12 @@ export interface BenchmarkBaselineCompareOptions extends BenchmarkBaselineLoadOp
|
|
|
83
83
|
* Default: undefined (no staleness warning)
|
|
84
84
|
*/
|
|
85
85
|
staleness_warning_days?: number;
|
|
86
|
+
/**
|
|
87
|
+
* Minimum percentage difference to consider meaningful, as a ratio.
|
|
88
|
+
* Passed through to `benchmark_stats_compare`. See `BenchmarkCompareOptions`.
|
|
89
|
+
* Default: 0.10 (10%)
|
|
90
|
+
*/
|
|
91
|
+
min_percent_difference?: number;
|
|
86
92
|
}
|
|
87
93
|
/**
|
|
88
94
|
* Result of comparing current results against a baseline.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"benchmark_baseline.d.ts","sourceRoot":"../src/lib/","sources":["../src/lib/benchmark_baseline.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,OAAO,EAAC,CAAC,EAAC,MAAM,KAAK,CAAC;AAItB,OAAO,KAAK,EAAC,eAAe,EAAC,MAAM,sBAAsB,CAAC;AAC1D,OAAO,EAEN,KAAK,mBAAmB,EAExB,MAAM,sBAAsB,CAAC;AAM9B;;GAEG;AACH,eAAO,MAAM,sBAAsB;;;;;;;;;;;;;iBAajC,CAAC;AACH,MAAM,MAAM,sBAAsB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,sBAAsB,CAAC,CAAC;AAE5E;;GAEG;AACH,eAAO,MAAM,iBAAiB;;;;;;;;;;;;;;;;;;;;iBAO5B,CAAC;AACH,MAAM,MAAM,iBAAiB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,iBAAiB,CAAC,CAAC;AAElE;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC5C,gEAAgE;IAChE,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,sDAAsD;IACtD,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,sDAAsD;IACtD,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC5C,mEAAmE;IACnE,IAAI,CAAC,EAAE,MAAM,CAAC;CACd;AAED;;GAEG;AACH,MAAM,WAAW,+BAAgC,SAAQ,4BAA4B;IACpF;;;;OAIG;IACH,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAC9B;;;OAGG;IACH,sBAAsB,CAAC,EAAE,MAAM,CAAC;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,iCAAiC;IACjD,mCAAmC;IACnC,cAAc,EAAE,OAAO,CAAC;IACxB,gCAAgC;IAChC,kBAAkB,EAAE,MAAM,GAAG,IAAI,CAAC;IAClC,iCAAiC;IACjC,eAAe,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/B,kCAAkC;IAClC,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC,sFAAsF;IACtF,cAAc,EAAE,OAAO,CAAC;IACxB,kCAAkC;IAClC,WAAW,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACpD,yGAAyG;IACzG,WAAW,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACpD,wGAAwG;IACxG,YAAY,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACrD,uCAAuC;IACvC,SAAS,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IAClD,+CAA+C;IAC/C,SAAS,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACzB,+CAA+C;IAC/C,aAAa,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;CAC7B;AAED;;GAEG;AACH,MAAM,WAAW,+BAA+B;IAC/C,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,sBAAsB,CAAC;IACjC,OAAO,EAAE,sBAAsB,CAAC;IAChC,UAAU,EAAE,mBAAmB,CAAC;CAChC;AAyBD;;;;;;;;;;;;;GAaG;AACH,eAAO,MAAM,uBAAuB,GACnC,SAAS,KAAK,CAAC,eAAe,CAAC,EAC/B,UAAS,4BAAiC,KACxC,OAAO,CAAC,IAAI,CAwBd,CAAC;AAEF;;;;;;;;;;;;;GAaG;AACH,eAAO,MAAM,uBAAuB,GACnC,UAAS,4BAAiC,KACxC,OAAO,CAAC,iBAAiB,GAAG,IAAI,CAiClC,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,eAAO,MAAM,0BAA0B,GACtC,SAAS,KAAK,CAAC,eAAe,CAAC,EAC/B,UAAS,+BAAoC,KAC3C,OAAO,CAAC,iCAAiC,
|
|
1
|
+
{"version":3,"file":"benchmark_baseline.d.ts","sourceRoot":"../src/lib/","sources":["../src/lib/benchmark_baseline.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,OAAO,EAAC,CAAC,EAAC,MAAM,KAAK,CAAC;AAItB,OAAO,KAAK,EAAC,eAAe,EAAC,MAAM,sBAAsB,CAAC;AAC1D,OAAO,EAEN,KAAK,mBAAmB,EAExB,MAAM,sBAAsB,CAAC;AAM9B;;GAEG;AACH,eAAO,MAAM,sBAAsB;;;;;;;;;;;;;iBAajC,CAAC;AACH,MAAM,MAAM,sBAAsB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,sBAAsB,CAAC,CAAC;AAE5E;;GAEG;AACH,eAAO,MAAM,iBAAiB;;;;;;;;;;;;;;;;;;;;iBAO5B,CAAC;AACH,MAAM,MAAM,iBAAiB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,iBAAiB,CAAC,CAAC;AAElE;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC5C,gEAAgE;IAChE,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,sDAAsD;IACtD,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,sDAAsD;IACtD,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC5C,mEAAmE;IACnE,IAAI,CAAC,EAAE,MAAM,CAAC;CACd;AAED;;GAEG;AACH,MAAM,WAAW,+BAAgC,SAAQ,4BAA4B;IACpF;;;;OAIG;IACH,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAC9B;;;OAGG;IACH,sBAAsB,CAAC,EAAE,MAAM,CAAC;IAChC;;;;OAIG;IACH,sBAAsB,CAAC,EAAE,MAAM,CAAC;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,iCAAiC;IACjD,mCAAmC;IACnC,cAAc,EAAE,OAAO,CAAC;IACxB,gCAAgC;IAChC,kBAAkB,EAAE,MAAM,GAAG,IAAI,CAAC;IAClC,iCAAiC;IACjC,eAAe,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/B,kCAAkC;IAClC,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC,sFAAsF;IACtF,cAAc,EAAE,OAAO,CAAC;IACxB,kCAAkC;IAClC,WAAW,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACpD,yGAAyG;IACzG,WAAW,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACpD,wGAAwG;IACxG,YAAY,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACrD,uCAAuC;IACvC,SAAS,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IAClD,+CAA+C;IAC/C,SAAS,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACzB,+CAA+C;IAC/C,aAAa,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;CAC7B;AAED;;GAEG;AACH,MAAM,WAAW,+BAA+B;IAC/C,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,sBAAsB,CAAC;IACjC,OAAO,EAAE,sBAAsB,CAAC;IAChC,UAAU,EAAE,mBAAmB,CAAC;CAChC;AAyBD;;;;;;;;;;;;;GAaG;AACH,eAAO,MAAM,uBAAuB,GACnC,SAAS,KAAK,CAAC,eAAe,CAAC,EAC/B,UAAS,4BAAiC,KACxC,OAAO,CAAC,IAAI,CAwBd,CAAC;AAEF;;;;;;;;;;;;;GAaG;AACH,eAAO,MAAM,uBAAuB,GACnC,UAAS,4BAAiC,KACxC,OAAO,CAAC,iBAAiB,GAAG,IAAI,CAiClC,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,eAAO,MAAM,0BAA0B,GACtC,SAAS,KAAK,CAAC,eAAe,CAAC,EAC/B,UAAS,+BAAoC,KAC3C,OAAO,CAAC,iCAAiC,CAsI3C,CAAC;AAEF;;;;;GAKG;AACH,eAAO,MAAM,yBAAyB,GAAI,QAAQ,iCAAiC,KAAG,MA8ErF,CAAC;AAEF;;;;;;GAMG;AACH,eAAO,MAAM,8BAA8B,GAC1C,QAAQ,iCAAiC,EACzC,UAAS;IAAC,MAAM,CAAC,EAAE,OAAO,CAAA;CAAM,KAC9B,MAyCF,CAAC"}
|
|
@@ -217,7 +217,9 @@ export const benchmark_baseline_compare = async (results, options = {}) => {
|
|
|
217
217
|
sample_size: current.sample_size,
|
|
218
218
|
confidence_interval_ns: stats_confidence_interval_from_summary(current.mean_ns, current.std_dev_ns, current.sample_size),
|
|
219
219
|
};
|
|
220
|
-
const comparison = benchmark_stats_compare(baseline_stats, current_stats
|
|
220
|
+
const comparison = benchmark_stats_compare(baseline_stats, current_stats, {
|
|
221
|
+
min_percent_difference: options.min_percent_difference,
|
|
222
|
+
});
|
|
221
223
|
const task_comparison = {
|
|
222
224
|
name: current.name,
|
|
223
225
|
baseline: baseline_entry,
|
|
@@ -227,7 +229,8 @@ export const benchmark_baseline_compare = async (results, options = {}) => {
|
|
|
227
229
|
comparisons.push(task_comparison);
|
|
228
230
|
// Categorize based on comparison result
|
|
229
231
|
// Note: comparison.faster is 'a' (baseline) or 'b' (current)
|
|
230
|
-
|
|
232
|
+
// significant implies percent_difference >= min_pct, which implies effect_magnitude !== 'negligible'
|
|
233
|
+
if (comparison.significant) {
|
|
231
234
|
if (comparison.faster === 'a') {
|
|
232
235
|
// Baseline was faster = potential regression
|
|
233
236
|
// Only count as regression if it exceeds the threshold
|
|
@@ -256,10 +259,10 @@ export const benchmark_baseline_compare = async (results, options = {}) => {
|
|
|
256
259
|
removed_tasks.push(baseline_entry.name);
|
|
257
260
|
}
|
|
258
261
|
}
|
|
259
|
-
// Sort regressions and improvements by
|
|
260
|
-
const
|
|
261
|
-
regressions.sort(
|
|
262
|
-
improvements.sort(
|
|
262
|
+
// Sort regressions and improvements by percentage difference (largest first)
|
|
263
|
+
const sort_by_percent_difference = (a, b) => b.comparison.percent_difference - a.comparison.percent_difference;
|
|
264
|
+
regressions.sort(sort_by_percent_difference);
|
|
265
|
+
improvements.sort(sort_by_percent_difference);
|
|
263
266
|
return {
|
|
264
267
|
baseline_found: true,
|
|
265
268
|
baseline_timestamp: baseline.timestamp,
|
|
@@ -302,8 +305,9 @@ export const benchmark_baseline_format = (result) => {
|
|
|
302
305
|
lines.push(`Regressions (${result.regressions.length}):`);
|
|
303
306
|
for (const r of result.regressions) {
|
|
304
307
|
const ratio = r.comparison.speedup_ratio.toFixed(2);
|
|
308
|
+
const pct = (r.comparison.percent_difference * 100).toFixed(1);
|
|
305
309
|
const p = r.comparison.p_value.toFixed(3);
|
|
306
|
-
lines.push(` ${r.name}: ${ratio}x slower (p=${p}, ${r.comparison.effect_magnitude})`);
|
|
310
|
+
lines.push(` ${r.name}: ${ratio}x slower (${pct}%, p=${p}, ${r.comparison.effect_magnitude})`);
|
|
307
311
|
}
|
|
308
312
|
lines.push('');
|
|
309
313
|
}
|
|
@@ -311,8 +315,9 @@ export const benchmark_baseline_format = (result) => {
|
|
|
311
315
|
lines.push(`Improvements (${result.improvements.length}):`);
|
|
312
316
|
for (const r of result.improvements) {
|
|
313
317
|
const ratio = r.comparison.speedup_ratio.toFixed(2);
|
|
318
|
+
const pct = (r.comparison.percent_difference * 100).toFixed(1);
|
|
314
319
|
const p = r.comparison.p_value.toFixed(3);
|
|
315
|
-
lines.push(` ${r.name}: ${ratio}x faster (p=${p}, ${r.comparison.effect_magnitude})`);
|
|
320
|
+
lines.push(` ${r.name}: ${ratio}x faster (${pct}%, p=${p}, ${r.comparison.effect_magnitude})`);
|
|
316
321
|
}
|
|
317
322
|
lines.push('');
|
|
318
323
|
}
|
|
@@ -367,6 +372,7 @@ export const benchmark_baseline_format_json = (result, options = {}) => {
|
|
|
367
372
|
regressions: result.regressions.map((r) => ({
|
|
368
373
|
name: r.name,
|
|
369
374
|
speedup_ratio: r.comparison.speedup_ratio,
|
|
375
|
+
percent_difference: r.comparison.percent_difference,
|
|
370
376
|
effect_size: r.comparison.effect_size,
|
|
371
377
|
effect_magnitude: r.comparison.effect_magnitude,
|
|
372
378
|
p_value: r.comparison.p_value,
|
|
@@ -376,6 +382,7 @@ export const benchmark_baseline_format_json = (result, options = {}) => {
|
|
|
376
382
|
improvements: result.improvements.map((r) => ({
|
|
377
383
|
name: r.name,
|
|
378
384
|
speedup_ratio: r.comparison.speedup_ratio,
|
|
385
|
+
percent_difference: r.comparison.percent_difference,
|
|
379
386
|
effect_size: r.comparison.effect_size,
|
|
380
387
|
effect_magnitude: r.comparison.effect_magnitude,
|
|
381
388
|
p_value: r.comparison.p_value,
|
|
@@ -27,13 +27,15 @@ export interface BenchmarkComparison {
|
|
|
27
27
|
faster: 'a' | 'b' | 'equal';
|
|
28
28
|
/** How much faster the winner is (e.g., 1.5 means 1.5x faster) */
|
|
29
29
|
speedup_ratio: number;
|
|
30
|
-
/** Whether the difference is statistically
|
|
30
|
+
/** Whether the difference is both statistically and practically significant */
|
|
31
31
|
significant: boolean;
|
|
32
32
|
/** P-value from Welch's t-test (lower = more confident the difference is real) */
|
|
33
33
|
p_value: number;
|
|
34
|
-
/**
|
|
34
|
+
/** Percentage difference between means as a ratio (0.05 = 5%, 1.0 = 100%) */
|
|
35
|
+
percent_difference: number;
|
|
36
|
+
/** Cohen's d effect size (informational — not used for classification) */
|
|
35
37
|
effect_size: number;
|
|
36
|
-
/** Interpretation of
|
|
38
|
+
/** Interpretation of practical significance based on percentage difference */
|
|
37
39
|
effect_magnitude: EffectMagnitude;
|
|
38
40
|
/** Whether the 95% confidence intervals overlap */
|
|
39
41
|
ci_overlap: boolean;
|
|
@@ -46,6 +48,20 @@ export interface BenchmarkComparison {
|
|
|
46
48
|
export interface BenchmarkCompareOptions {
|
|
47
49
|
/** Significance level for hypothesis testing (default: 0.05) */
|
|
48
50
|
alpha?: number;
|
|
51
|
+
/**
|
|
52
|
+
* Minimum percentage difference to consider practically meaningful, as a ratio.
|
|
53
|
+
* Below this threshold, differences are classified as 'negligible' and
|
|
54
|
+
* `significant` is forced to `false`, regardless of p-value.
|
|
55
|
+
* This prevents the t-test's oversensitivity at large sample sizes from
|
|
56
|
+
* flagging system-level noise (thermal throttle, OS scheduler, cache pressure)
|
|
57
|
+
* as meaningful differences.
|
|
58
|
+
*
|
|
59
|
+
* Effect magnitude thresholds scale from this value:
|
|
60
|
+
* negligible < min, small < min*3, medium < min*5, large >= min*5.
|
|
61
|
+
*
|
|
62
|
+
* Default: 0.10 (10%).
|
|
63
|
+
*/
|
|
64
|
+
min_percent_difference?: number;
|
|
49
65
|
}
|
|
50
66
|
/**
|
|
51
67
|
* Complete statistical analysis of timing measurements.
|
|
@@ -94,8 +110,12 @@ export declare class BenchmarkStats {
|
|
|
94
110
|
toString(): string;
|
|
95
111
|
}
|
|
96
112
|
/**
|
|
97
|
-
* Compare two benchmark results for statistical significance.
|
|
98
|
-
* Uses
|
|
113
|
+
* Compare two benchmark results for practical and statistical significance.
|
|
114
|
+
* Uses percentage difference for effect magnitude classification, with Welch's
|
|
115
|
+
* t-test for statistical confidence. Cohen's d is computed as an informational
|
|
116
|
+
* metric but does not drive classification — its thresholds (0.2/0.5/0.8) are
|
|
117
|
+
* calibrated for social science and produce false positives in benchmarking
|
|
118
|
+
* where within-run variance is tight.
|
|
99
119
|
*
|
|
100
120
|
* @param a - First benchmark stats (or any object with required properties)
|
|
101
121
|
* @param b - Second benchmark stats (or any object with required properties)
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"benchmark_stats.d.ts","sourceRoot":"../src/lib/","sources":["../src/lib/benchmark_stats.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAgBH;;;GAGG;AACH,MAAM,WAAW,wBAAwB;IACxC,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW,EAAE,MAAM,CAAC;IACpB,sBAAsB,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACzC;AAED;;GAEG;AACH,MAAM,MAAM,eAAe,GAAG,YAAY,GAAG,OAAO,GAAG,QAAQ,GAAG,OAAO,CAAC;AAE1E;;GAEG;AACH,MAAM,WAAW,mBAAmB;IACnC,mFAAmF;IACnF,MAAM,EAAE,GAAG,GAAG,GAAG,GAAG,OAAO,CAAC;IAC5B,kEAAkE;IAClE,aAAa,EAAE,MAAM,CAAC;IACtB
|
|
1
|
+
{"version":3,"file":"benchmark_stats.d.ts","sourceRoot":"../src/lib/","sources":["../src/lib/benchmark_stats.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAgBH;;;GAGG;AACH,MAAM,WAAW,wBAAwB;IACxC,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW,EAAE,MAAM,CAAC;IACpB,sBAAsB,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACzC;AAED;;GAEG;AACH,MAAM,MAAM,eAAe,GAAG,YAAY,GAAG,OAAO,GAAG,QAAQ,GAAG,OAAO,CAAC;AAE1E;;GAEG;AACH,MAAM,WAAW,mBAAmB;IACnC,mFAAmF;IACnF,MAAM,EAAE,GAAG,GAAG,GAAG,GAAG,OAAO,CAAC;IAC5B,kEAAkE;IAClE,aAAa,EAAE,MAAM,CAAC;IACtB,+EAA+E;IAC/E,WAAW,EAAE,OAAO,CAAC;IACrB,kFAAkF;IAClF,OAAO,EAAE,MAAM,CAAC;IAChB,6EAA6E;IAC7E,kBAAkB,EAAE,MAAM,CAAC;IAC3B,0EAA0E;IAC1E,WAAW,EAAE,MAAM,CAAC;IACpB,8EAA8E;IAC9E,gBAAgB,EAAE,eAAe,CAAC;IAClC,mDAAmD;IACnD,UAAU,EAAE,OAAO,CAAC;IACpB,sDAAsD;IACtD,cAAc,EAAE,MAAM,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,uBAAuB;IACvC,gEAAgE;IAChE,KAAK,CAAC,EAAE,MAAM,CAAC;IACf;;;;;;;;;;;;OAYG;IACH,sBAAsB,CAAC,EAAE,MAAM,CAAC;CAChC;AAED;;;;GAIG;AACH,qBAAa,cAAc;IAC1B,yCAAyC;IACzC,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,mDAAmD;IACnD,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,wCAAwC;IACxC,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,kCAAkC;IAClC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,kCAAkC;IAClC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,gDAAgD;IAChD,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,0DAA0D;IAC1D,QAAQ,CAAC,sBAAsB,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAClD,sDAAsD;IACtD,QAAQ,CAAC,WAAW,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACpC,yCAAyC;IACzC,QAAQ,CAAC,aAAa,EAAE,MAAM,CAAC;IAC/B,8CAA8C;IAC9C,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,0DAA0D;IAC1D,QAAQ,CAAC,eAAe,EAAE,MAAM,CAAC;IACjC,mDAAmD;IACnD,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,sEAAsE;IACtE,QAAQ,CAAC,iBAAiB,EAAE,MAAM,CAAC;gBAEvB,UAAU,EAAE,KAAK,CAAC,MAAM,CAAC;IAiErC;;OAEG;IACH,QAAQ,IAAI,MAAM;CAGlB;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,eAAO,MAAM,uBAAuB,GACnC,GAAG,wBAAwB,EAC3B,GAAG,wBAAwB,EAC3B,UAAU,uBAAuB,KAC/B,mBA+GF,CAAC"}
|
package/dist/benchmark_stats.js
CHANGED
|
@@ -110,8 +110,12 @@ export class BenchmarkStats {
|
|
|
110
110
|
}
|
|
111
111
|
}
|
|
112
112
|
/**
|
|
113
|
-
* Compare two benchmark results for statistical significance.
|
|
114
|
-
* Uses
|
|
113
|
+
* Compare two benchmark results for practical and statistical significance.
|
|
114
|
+
* Uses percentage difference for effect magnitude classification, with Welch's
|
|
115
|
+
* t-test for statistical confidence. Cohen's d is computed as an informational
|
|
116
|
+
* metric but does not drive classification — its thresholds (0.2/0.5/0.8) are
|
|
117
|
+
* calibrated for social science and produce false positives in benchmarking
|
|
118
|
+
* where within-run variance is tight.
|
|
115
119
|
*
|
|
116
120
|
* @param a - First benchmark stats (or any object with required properties)
|
|
117
121
|
* @param b - Second benchmark stats (or any object with required properties)
|
|
@@ -128,6 +132,7 @@ export class BenchmarkStats {
|
|
|
128
132
|
*/
|
|
129
133
|
export const benchmark_stats_compare = (a, b, options) => {
|
|
130
134
|
const alpha = options?.alpha ?? 0.05;
|
|
135
|
+
const min_pct = options?.min_percent_difference ?? 0.1;
|
|
131
136
|
// Handle edge cases
|
|
132
137
|
if (a.sample_size === 0 || b.sample_size === 0) {
|
|
133
138
|
return {
|
|
@@ -135,6 +140,7 @@ export const benchmark_stats_compare = (a, b, options) => {
|
|
|
135
140
|
speedup_ratio: 1,
|
|
136
141
|
significant: false,
|
|
137
142
|
p_value: 1,
|
|
143
|
+
percent_difference: 0,
|
|
138
144
|
effect_size: 0,
|
|
139
145
|
effect_magnitude: 'negligible',
|
|
140
146
|
ci_overlap: true,
|
|
@@ -144,6 +150,8 @@ export const benchmark_stats_compare = (a, b, options) => {
|
|
|
144
150
|
// Calculate speedup ratio (lower time = faster, so compare by time not ops/sec)
|
|
145
151
|
const speedup_ratio = a.mean_ns < b.mean_ns ? b.mean_ns / a.mean_ns : a.mean_ns / b.mean_ns;
|
|
146
152
|
const faster = a.mean_ns < b.mean_ns ? 'a' : a.mean_ns > b.mean_ns ? 'b' : 'equal';
|
|
153
|
+
// Percentage difference relative to the faster mean (always >= 0)
|
|
154
|
+
const percent_difference = speedup_ratio - 1;
|
|
147
155
|
// Welch's t-test (handles unequal variances)
|
|
148
156
|
// Special case: if both have zero variance, t-test is undefined
|
|
149
157
|
let p_value;
|
|
@@ -156,55 +164,54 @@ export const benchmark_stats_compare = (a, b, options) => {
|
|
|
156
164
|
// Calculate two-tailed p-value using t-distribution approximation
|
|
157
165
|
p_value = stats_t_distribution_p_value(Math.abs(t_statistic), degrees_of_freedom);
|
|
158
166
|
}
|
|
159
|
-
// Cohen's d effect size
|
|
167
|
+
// Cohen's d effect size (informational only — not used for classification)
|
|
160
168
|
const pooled_std_dev = Math.sqrt(((a.sample_size - 1) * a.std_dev_ns ** 2 + (b.sample_size - 1) * b.std_dev_ns ** 2) /
|
|
161
169
|
(a.sample_size + b.sample_size - 2));
|
|
162
|
-
// When pooled_std_dev is 0 but means differ, effect is maximal (infinite)
|
|
163
|
-
// When means are equal, effect is 0
|
|
164
170
|
let effect_size;
|
|
165
|
-
let effect_magnitude;
|
|
166
171
|
if (pooled_std_dev === 0) {
|
|
167
|
-
|
|
168
|
-
if (a.mean_ns === b.mean_ns) {
|
|
169
|
-
effect_size = 0;
|
|
170
|
-
effect_magnitude = 'negligible';
|
|
171
|
-
}
|
|
172
|
-
else {
|
|
173
|
-
// Any difference is 100% reliable when there's no variance
|
|
174
|
-
effect_size = Infinity;
|
|
175
|
-
effect_magnitude = 'large';
|
|
176
|
-
}
|
|
172
|
+
effect_size = a.mean_ns === b.mean_ns ? 0 : Infinity;
|
|
177
173
|
}
|
|
178
174
|
else {
|
|
179
175
|
effect_size = Math.abs(a.mean_ns - b.mean_ns) / pooled_std_dev;
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
176
|
+
}
|
|
177
|
+
// Effect magnitude based on percentage difference, not Cohen's d.
|
|
178
|
+
// Cohen's d thresholds (0.2/0.5/0.8) are calibrated for social science, not benchmarking.
|
|
179
|
+
// Within-run variance is tight, so even small system noise (thermal throttle, OS scheduler)
|
|
180
|
+
// produces large Cohen's d. Percentage thresholds directly answer "is this difference
|
|
181
|
+
// meaningful in practice?" Thresholds scale with min_percent_difference so users can
|
|
182
|
+
// tune one knob for their system's noise floor.
|
|
183
|
+
let effect_magnitude;
|
|
184
|
+
if (percent_difference < min_pct) {
|
|
185
|
+
effect_magnitude = 'negligible';
|
|
186
|
+
}
|
|
187
|
+
else if (percent_difference < min_pct * 3) {
|
|
188
|
+
effect_magnitude = 'small';
|
|
189
|
+
}
|
|
190
|
+
else if (percent_difference < min_pct * 5) {
|
|
191
|
+
effect_magnitude = 'medium';
|
|
192
|
+
}
|
|
193
|
+
else {
|
|
194
|
+
effect_magnitude = 'large';
|
|
189
195
|
}
|
|
190
196
|
// Check confidence interval overlap
|
|
191
197
|
const ci_overlap = a.confidence_interval_ns[0] <= b.confidence_interval_ns[1] &&
|
|
192
198
|
b.confidence_interval_ns[0] <= a.confidence_interval_ns[1];
|
|
193
|
-
//
|
|
194
|
-
|
|
199
|
+
// Significance requires both statistical significance (p < alpha)
|
|
200
|
+
// AND practical significance (percent_difference >= min_pct).
|
|
201
|
+
// With large n, the t-test finds p≈0 for any difference because
|
|
202
|
+
// SE = std_dev/sqrt(n) → 0. Gating on practical significance
|
|
203
|
+
// prevents system noise from being flagged as meaningful.
|
|
204
|
+
const significant = p_value < alpha && percent_difference >= min_pct;
|
|
195
205
|
// Generate recommendation
|
|
196
206
|
let recommendation;
|
|
197
|
-
if (
|
|
198
|
-
recommendation =
|
|
199
|
-
effect_magnitude === 'negligible'
|
|
200
|
-
? 'No meaningful difference detected'
|
|
201
|
-
: `Difference not statistically significant (p=${p_value.toFixed(3)}), but effect size suggests ${effect_magnitude} practical difference`;
|
|
207
|
+
if (percent_difference < min_pct) {
|
|
208
|
+
recommendation = 'No meaningful difference detected';
|
|
202
209
|
}
|
|
203
|
-
else if (
|
|
204
|
-
recommendation =
|
|
210
|
+
else if (!significant) {
|
|
211
|
+
recommendation = `${(percent_difference * 100).toFixed(1)}% difference observed but not statistically significant (p=${p_value.toFixed(3)})`;
|
|
205
212
|
}
|
|
206
213
|
else {
|
|
207
|
-
recommendation = `${faster === 'a' ? 'First' : 'Second'} is ${speedup_ratio.toFixed(2)}x faster with ${effect_magnitude} effect size (p=${p_value.toFixed(3)})`;
|
|
214
|
+
recommendation = `${faster === 'a' ? 'First' : 'Second'} is ${speedup_ratio.toFixed(2)}x faster with ${effect_magnitude} effect size (${(percent_difference * 100).toFixed(1)}%, p=${p_value.toFixed(3)})`;
|
|
208
215
|
}
|
|
209
216
|
// Adjust 'faster' to 'equal' if effect is negligible
|
|
210
217
|
const adjusted_faster = effect_magnitude === 'negligible' ? 'equal' : faster;
|
|
@@ -213,6 +220,7 @@ export const benchmark_stats_compare = (a, b, options) => {
|
|
|
213
220
|
speedup_ratio,
|
|
214
221
|
significant,
|
|
215
222
|
p_value,
|
|
223
|
+
percent_difference,
|
|
216
224
|
effect_size,
|
|
217
225
|
effect_magnitude,
|
|
218
226
|
ci_overlap,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@fuzdev/fuz_util",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.55.0",
|
|
4
4
|
"description": "utility belt for JS",
|
|
5
5
|
"glyph": "🦕",
|
|
6
6
|
"logo": "logo.svg",
|
|
@@ -77,8 +77,8 @@
|
|
|
77
77
|
"@changesets/changelog-git": "^0.2.1",
|
|
78
78
|
"@fuzdev/blake3_wasm": "^0.1.0",
|
|
79
79
|
"@fuzdev/fuz_code": "^0.45.1",
|
|
80
|
-
"@fuzdev/fuz_css": "^0.
|
|
81
|
-
"@fuzdev/fuz_ui": "^0.
|
|
80
|
+
"@fuzdev/fuz_css": "^0.56.0",
|
|
81
|
+
"@fuzdev/fuz_ui": "^0.190.0",
|
|
82
82
|
"@fuzdev/gro": "^0.197.0",
|
|
83
83
|
"@jridgewell/trace-mapping": "^0.3.31",
|
|
84
84
|
"@ryanatkn/eslint-config": "^0.10.1",
|
|
@@ -89,6 +89,12 @@ export interface BenchmarkBaselineCompareOptions extends BenchmarkBaselineLoadOp
|
|
|
89
89
|
* Default: undefined (no staleness warning)
|
|
90
90
|
*/
|
|
91
91
|
staleness_warning_days?: number;
|
|
92
|
+
/**
|
|
93
|
+
* Minimum percentage difference to consider meaningful, as a ratio.
|
|
94
|
+
* Passed through to `benchmark_stats_compare`. See `BenchmarkCompareOptions`.
|
|
95
|
+
* Default: 0.10 (10%)
|
|
96
|
+
*/
|
|
97
|
+
min_percent_difference?: number;
|
|
92
98
|
}
|
|
93
99
|
|
|
94
100
|
/**
|
|
@@ -344,7 +350,9 @@ export const benchmark_baseline_compare = async (
|
|
|
344
350
|
),
|
|
345
351
|
};
|
|
346
352
|
|
|
347
|
-
const comparison = benchmark_stats_compare(baseline_stats, current_stats
|
|
353
|
+
const comparison = benchmark_stats_compare(baseline_stats, current_stats, {
|
|
354
|
+
min_percent_difference: options.min_percent_difference,
|
|
355
|
+
});
|
|
348
356
|
|
|
349
357
|
const task_comparison: BenchmarkBaselineTaskComparison = {
|
|
350
358
|
name: current.name,
|
|
@@ -357,7 +365,8 @@ export const benchmark_baseline_compare = async (
|
|
|
357
365
|
|
|
358
366
|
// Categorize based on comparison result
|
|
359
367
|
// Note: comparison.faster is 'a' (baseline) or 'b' (current)
|
|
360
|
-
|
|
368
|
+
// significant implies percent_difference >= min_pct, which implies effect_magnitude !== 'negligible'
|
|
369
|
+
if (comparison.significant) {
|
|
361
370
|
if (comparison.faster === 'a') {
|
|
362
371
|
// Baseline was faster = potential regression
|
|
363
372
|
// Only count as regression if it exceeds the threshold
|
|
@@ -384,14 +393,14 @@ export const benchmark_baseline_compare = async (
|
|
|
384
393
|
}
|
|
385
394
|
}
|
|
386
395
|
|
|
387
|
-
// Sort regressions and improvements by
|
|
388
|
-
const
|
|
396
|
+
// Sort regressions and improvements by percentage difference (largest first)
|
|
397
|
+
const sort_by_percent_difference = (
|
|
389
398
|
a: BenchmarkBaselineTaskComparison,
|
|
390
399
|
b: BenchmarkBaselineTaskComparison,
|
|
391
|
-
) => b.comparison.
|
|
400
|
+
) => b.comparison.percent_difference - a.comparison.percent_difference;
|
|
392
401
|
|
|
393
|
-
regressions.sort(
|
|
394
|
-
improvements.sort(
|
|
402
|
+
regressions.sort(sort_by_percent_difference);
|
|
403
|
+
improvements.sort(sort_by_percent_difference);
|
|
395
404
|
|
|
396
405
|
return {
|
|
397
406
|
baseline_found: true,
|
|
@@ -440,8 +449,11 @@ export const benchmark_baseline_format = (result: BenchmarkBaselineComparisonRes
|
|
|
440
449
|
lines.push(`Regressions (${result.regressions.length}):`);
|
|
441
450
|
for (const r of result.regressions) {
|
|
442
451
|
const ratio = r.comparison.speedup_ratio.toFixed(2);
|
|
452
|
+
const pct = (r.comparison.percent_difference * 100).toFixed(1);
|
|
443
453
|
const p = r.comparison.p_value.toFixed(3);
|
|
444
|
-
lines.push(
|
|
454
|
+
lines.push(
|
|
455
|
+
` ${r.name}: ${ratio}x slower (${pct}%, p=${p}, ${r.comparison.effect_magnitude})`,
|
|
456
|
+
);
|
|
445
457
|
}
|
|
446
458
|
lines.push('');
|
|
447
459
|
}
|
|
@@ -450,8 +462,11 @@ export const benchmark_baseline_format = (result: BenchmarkBaselineComparisonRes
|
|
|
450
462
|
lines.push(`Improvements (${result.improvements.length}):`);
|
|
451
463
|
for (const r of result.improvements) {
|
|
452
464
|
const ratio = r.comparison.speedup_ratio.toFixed(2);
|
|
465
|
+
const pct = (r.comparison.percent_difference * 100).toFixed(1);
|
|
453
466
|
const p = r.comparison.p_value.toFixed(3);
|
|
454
|
-
lines.push(
|
|
467
|
+
lines.push(
|
|
468
|
+
` ${r.name}: ${ratio}x faster (${pct}%, p=${p}, ${r.comparison.effect_magnitude})`,
|
|
469
|
+
);
|
|
455
470
|
}
|
|
456
471
|
lines.push('');
|
|
457
472
|
}
|
|
@@ -516,6 +531,7 @@ export const benchmark_baseline_format_json = (
|
|
|
516
531
|
regressions: result.regressions.map((r) => ({
|
|
517
532
|
name: r.name,
|
|
518
533
|
speedup_ratio: r.comparison.speedup_ratio,
|
|
534
|
+
percent_difference: r.comparison.percent_difference,
|
|
519
535
|
effect_size: r.comparison.effect_size,
|
|
520
536
|
effect_magnitude: r.comparison.effect_magnitude,
|
|
521
537
|
p_value: r.comparison.p_value,
|
|
@@ -525,6 +541,7 @@ export const benchmark_baseline_format_json = (
|
|
|
525
541
|
improvements: result.improvements.map((r) => ({
|
|
526
542
|
name: r.name,
|
|
527
543
|
speedup_ratio: r.comparison.speedup_ratio,
|
|
544
|
+
percent_difference: r.comparison.percent_difference,
|
|
528
545
|
effect_size: r.comparison.effect_size,
|
|
529
546
|
effect_magnitude: r.comparison.effect_magnitude,
|
|
530
547
|
p_value: r.comparison.p_value,
|
|
@@ -44,13 +44,15 @@ export interface BenchmarkComparison {
|
|
|
44
44
|
faster: 'a' | 'b' | 'equal';
|
|
45
45
|
/** How much faster the winner is (e.g., 1.5 means 1.5x faster) */
|
|
46
46
|
speedup_ratio: number;
|
|
47
|
-
/** Whether the difference is statistically
|
|
47
|
+
/** Whether the difference is both statistically and practically significant */
|
|
48
48
|
significant: boolean;
|
|
49
49
|
/** P-value from Welch's t-test (lower = more confident the difference is real) */
|
|
50
50
|
p_value: number;
|
|
51
|
-
/**
|
|
51
|
+
/** Percentage difference between means as a ratio (0.05 = 5%, 1.0 = 100%) */
|
|
52
|
+
percent_difference: number;
|
|
53
|
+
/** Cohen's d effect size (informational — not used for classification) */
|
|
52
54
|
effect_size: number;
|
|
53
|
-
/** Interpretation of
|
|
55
|
+
/** Interpretation of practical significance based on percentage difference */
|
|
54
56
|
effect_magnitude: EffectMagnitude;
|
|
55
57
|
/** Whether the 95% confidence intervals overlap */
|
|
56
58
|
ci_overlap: boolean;
|
|
@@ -64,6 +66,20 @@ export interface BenchmarkComparison {
|
|
|
64
66
|
export interface BenchmarkCompareOptions {
|
|
65
67
|
/** Significance level for hypothesis testing (default: 0.05) */
|
|
66
68
|
alpha?: number;
|
|
69
|
+
/**
|
|
70
|
+
* Minimum percentage difference to consider practically meaningful, as a ratio.
|
|
71
|
+
* Below this threshold, differences are classified as 'negligible' and
|
|
72
|
+
* `significant` is forced to `false`, regardless of p-value.
|
|
73
|
+
* This prevents the t-test's oversensitivity at large sample sizes from
|
|
74
|
+
* flagging system-level noise (thermal throttle, OS scheduler, cache pressure)
|
|
75
|
+
* as meaningful differences.
|
|
76
|
+
*
|
|
77
|
+
* Effect magnitude thresholds scale from this value:
|
|
78
|
+
* negligible < min, small < min*3, medium < min*5, large >= min*5.
|
|
79
|
+
*
|
|
80
|
+
* Default: 0.10 (10%).
|
|
81
|
+
*/
|
|
82
|
+
min_percent_difference?: number;
|
|
67
83
|
}
|
|
68
84
|
|
|
69
85
|
/**
|
|
@@ -181,8 +197,12 @@ export class BenchmarkStats {
|
|
|
181
197
|
}
|
|
182
198
|
|
|
183
199
|
/**
|
|
184
|
-
* Compare two benchmark results for statistical significance.
|
|
185
|
-
* Uses
|
|
200
|
+
* Compare two benchmark results for practical and statistical significance.
|
|
201
|
+
* Uses percentage difference for effect magnitude classification, with Welch's
|
|
202
|
+
* t-test for statistical confidence. Cohen's d is computed as an informational
|
|
203
|
+
* metric but does not drive classification — its thresholds (0.2/0.5/0.8) are
|
|
204
|
+
* calibrated for social science and produce false positives in benchmarking
|
|
205
|
+
* where within-run variance is tight.
|
|
186
206
|
*
|
|
187
207
|
* @param a - First benchmark stats (or any object with required properties)
|
|
188
208
|
* @param b - Second benchmark stats (or any object with required properties)
|
|
@@ -203,6 +223,7 @@ export const benchmark_stats_compare = (
|
|
|
203
223
|
options?: BenchmarkCompareOptions,
|
|
204
224
|
): BenchmarkComparison => {
|
|
205
225
|
const alpha = options?.alpha ?? 0.05;
|
|
226
|
+
const min_pct = options?.min_percent_difference ?? 0.1;
|
|
206
227
|
|
|
207
228
|
// Handle edge cases
|
|
208
229
|
if (a.sample_size === 0 || b.sample_size === 0) {
|
|
@@ -211,6 +232,7 @@ export const benchmark_stats_compare = (
|
|
|
211
232
|
speedup_ratio: 1,
|
|
212
233
|
significant: false,
|
|
213
234
|
p_value: 1,
|
|
235
|
+
percent_difference: 0,
|
|
214
236
|
effect_size: 0,
|
|
215
237
|
effect_magnitude: 'negligible',
|
|
216
238
|
ci_overlap: true,
|
|
@@ -223,6 +245,9 @@ export const benchmark_stats_compare = (
|
|
|
223
245
|
const faster: 'a' | 'b' | 'equal' =
|
|
224
246
|
a.mean_ns < b.mean_ns ? 'a' : a.mean_ns > b.mean_ns ? 'b' : 'equal';
|
|
225
247
|
|
|
248
|
+
// Percentage difference relative to the faster mean (always >= 0)
|
|
249
|
+
const percent_difference = speedup_ratio - 1;
|
|
250
|
+
|
|
226
251
|
// Welch's t-test (handles unequal variances)
|
|
227
252
|
// Special case: if both have zero variance, t-test is undefined
|
|
228
253
|
let p_value: number;
|
|
@@ -242,38 +267,33 @@ export const benchmark_stats_compare = (
|
|
|
242
267
|
p_value = stats_t_distribution_p_value(Math.abs(t_statistic), degrees_of_freedom);
|
|
243
268
|
}
|
|
244
269
|
|
|
245
|
-
// Cohen's d effect size
|
|
270
|
+
// Cohen's d effect size (informational only — not used for classification)
|
|
246
271
|
const pooled_std_dev = Math.sqrt(
|
|
247
272
|
((a.sample_size - 1) * a.std_dev_ns ** 2 + (b.sample_size - 1) * b.std_dev_ns ** 2) /
|
|
248
273
|
(a.sample_size + b.sample_size - 2),
|
|
249
274
|
);
|
|
250
|
-
|
|
251
|
-
// When pooled_std_dev is 0 but means differ, effect is maximal (infinite)
|
|
252
|
-
// When means are equal, effect is 0
|
|
253
275
|
let effect_size: number;
|
|
254
|
-
let effect_magnitude: EffectMagnitude;
|
|
255
|
-
|
|
256
276
|
if (pooled_std_dev === 0) {
|
|
257
|
-
|
|
258
|
-
if (a.mean_ns === b.mean_ns) {
|
|
259
|
-
effect_size = 0;
|
|
260
|
-
effect_magnitude = 'negligible';
|
|
261
|
-
} else {
|
|
262
|
-
// Any difference is 100% reliable when there's no variance
|
|
263
|
-
effect_size = Infinity;
|
|
264
|
-
effect_magnitude = 'large';
|
|
265
|
-
}
|
|
277
|
+
effect_size = a.mean_ns === b.mean_ns ? 0 : Infinity;
|
|
266
278
|
} else {
|
|
267
279
|
effect_size = Math.abs(a.mean_ns - b.mean_ns) / pooled_std_dev;
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
// Effect magnitude based on percentage difference, not Cohen's d.
|
|
283
|
+
// Cohen's d thresholds (0.2/0.5/0.8) are calibrated for social science, not benchmarking.
|
|
284
|
+
// Within-run variance is tight, so even small system noise (thermal throttle, OS scheduler)
|
|
285
|
+
// produces large Cohen's d. Percentage thresholds directly answer "is this difference
|
|
286
|
+
// meaningful in practice?" Thresholds scale with min_percent_difference so users can
|
|
287
|
+
// tune one knob for their system's noise floor.
|
|
288
|
+
let effect_magnitude: EffectMagnitude;
|
|
289
|
+
if (percent_difference < min_pct) {
|
|
290
|
+
effect_magnitude = 'negligible';
|
|
291
|
+
} else if (percent_difference < min_pct * 3) {
|
|
292
|
+
effect_magnitude = 'small';
|
|
293
|
+
} else if (percent_difference < min_pct * 5) {
|
|
294
|
+
effect_magnitude = 'medium';
|
|
295
|
+
} else {
|
|
296
|
+
effect_magnitude = 'large';
|
|
277
297
|
}
|
|
278
298
|
|
|
279
299
|
// Check confidence interval overlap
|
|
@@ -281,20 +301,21 @@ export const benchmark_stats_compare = (
|
|
|
281
301
|
a.confidence_interval_ns[0] <= b.confidence_interval_ns[1] &&
|
|
282
302
|
b.confidence_interval_ns[0] <= a.confidence_interval_ns[1];
|
|
283
303
|
|
|
284
|
-
//
|
|
285
|
-
|
|
304
|
+
// Significance requires both statistical significance (p < alpha)
|
|
305
|
+
// AND practical significance (percent_difference >= min_pct).
|
|
306
|
+
// With large n, the t-test finds p≈0 for any difference because
|
|
307
|
+
// SE = std_dev/sqrt(n) → 0. Gating on practical significance
|
|
308
|
+
// prevents system noise from being flagged as meaningful.
|
|
309
|
+
const significant = p_value < alpha && percent_difference >= min_pct;
|
|
286
310
|
|
|
287
311
|
// Generate recommendation
|
|
288
312
|
let recommendation: string;
|
|
289
|
-
if (
|
|
290
|
-
recommendation =
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
: `Difference not statistically significant (p=${p_value.toFixed(3)}), but effect size suggests ${effect_magnitude} practical difference`;
|
|
294
|
-
} else if (effect_magnitude === 'negligible') {
|
|
295
|
-
recommendation = `Statistically significant but negligible practical difference (${speedup_ratio.toFixed(2)}x)`;
|
|
313
|
+
if (percent_difference < min_pct) {
|
|
314
|
+
recommendation = 'No meaningful difference detected';
|
|
315
|
+
} else if (!significant) {
|
|
316
|
+
recommendation = `${(percent_difference * 100).toFixed(1)}% difference observed but not statistically significant (p=${p_value.toFixed(3)})`;
|
|
296
317
|
} else {
|
|
297
|
-
recommendation = `${faster === 'a' ? 'First' : 'Second'} is ${speedup_ratio.toFixed(2)}x faster with ${effect_magnitude} effect size (p=${p_value.toFixed(3)})`;
|
|
318
|
+
recommendation = `${faster === 'a' ? 'First' : 'Second'} is ${speedup_ratio.toFixed(2)}x faster with ${effect_magnitude} effect size (${(percent_difference * 100).toFixed(1)}%, p=${p_value.toFixed(3)})`;
|
|
298
319
|
}
|
|
299
320
|
|
|
300
321
|
// Adjust 'faster' to 'equal' if effect is negligible
|
|
@@ -305,6 +326,7 @@ export const benchmark_stats_compare = (
|
|
|
305
326
|
speedup_ratio,
|
|
306
327
|
significant,
|
|
307
328
|
p_value,
|
|
329
|
+
percent_difference,
|
|
308
330
|
effect_size,
|
|
309
331
|
effect_magnitude,
|
|
310
332
|
ci_overlap,
|