@fuzdev/fuz_util 0.53.4 → 0.55.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -83,6 +83,12 @@ export interface BenchmarkBaselineCompareOptions extends BenchmarkBaselineLoadOp
83
83
  * Default: undefined (no staleness warning)
84
84
  */
85
85
  staleness_warning_days?: number;
86
+ /**
87
+ * Minimum percentage difference to consider meaningful, as a ratio.
88
+ * Passed through to `benchmark_stats_compare`. See `BenchmarkCompareOptions`.
89
+ * Default: 0.10 (10%)
90
+ */
91
+ min_percent_difference?: number;
86
92
  }
87
93
  /**
88
94
  * Result of comparing current results against a baseline.
@@ -1 +1 @@
1
- {"version":3,"file":"benchmark_baseline.d.ts","sourceRoot":"../src/lib/","sources":["../src/lib/benchmark_baseline.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,OAAO,EAAC,CAAC,EAAC,MAAM,KAAK,CAAC;AAItB,OAAO,KAAK,EAAC,eAAe,EAAC,MAAM,sBAAsB,CAAC;AAC1D,OAAO,EAEN,KAAK,mBAAmB,EAExB,MAAM,sBAAsB,CAAC;AAM9B;;GAEG;AACH,eAAO,MAAM,sBAAsB;;;;;;;;;;;;;iBAajC,CAAC;AACH,MAAM,MAAM,sBAAsB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,sBAAsB,CAAC,CAAC;AAE5E;;GAEG;AACH,eAAO,MAAM,iBAAiB;;;;;;;;;;;;;;;;;;;;iBAO5B,CAAC;AACH,MAAM,MAAM,iBAAiB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,iBAAiB,CAAC,CAAC;AAElE;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC5C,gEAAgE;IAChE,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,sDAAsD;IACtD,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,sDAAsD;IACtD,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC5C,mEAAmE;IACnE,IAAI,CAAC,EAAE,MAAM,CAAC;CACd;AAED;;GAEG;AACH,MAAM,WAAW,+BAAgC,SAAQ,4BAA4B;IACpF;;;;OAIG;IACH,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAC9B;;;OAGG;IACH,sBAAsB,CAAC,EAAE,MAAM,CAAC;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,iCAAiC;IACjD,mCAAmC;IACnC,cAAc,EAAE,OAAO,CAAC;IACxB,gCAAgC;IAChC,kBAAkB,EAAE,MAAM,GAAG,IAAI,CAAC;IAClC,iCAAiC;IACjC,eAAe,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/B,kCAAkC;IAClC,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC,sFAAsF;IACtF,cAAc,EAAE,OAAO,CAAC;IACxB,kCAAkC;IAClC,WAAW,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACpD,yGAAyG;IACzG,WAAW,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACpD,wGAAwG;IACxG,YAAY,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACrD,uCAAuC;IACvC,SAAS,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IAClD,+CAA+C;IAC/C,SAAS,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACzB,+CAA+C;IAC/C,aAAa,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;CAC7B;AAED;;GAEG;AACH,MAAM,WAAW,+BAA+B;IAC/C,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,sBAAsB,CAAC;IACjC,OAAO,EAAE,sBAAsB,CAAC;IAChC,UAAU,EAAE,mBAAmB,CAAC;CAChC;AAyBD;;;;;;;;;;;;;GAaG;AACH,eAAO,MAAM,uBAAuB,GACnC,SAAS,KAAK,CAAC,eAAe,CAAC,EAC/B,UAAS,4BAAiC,KACxC,OAAO,CAAC,IAAI,CAwBd,CAAC;AAEF;;;;;;;;;;;;;GAaG;AACH,eAAO,MAAM,uBAAuB,GACnC,UAAS,4BAAiC,KACxC,OAAO,CAAC,iBAAiB,GAAG,IAAI,CAiClC,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,eAAO,MAAM,0BAA0B,GACtC,SAAS,KAAK,CAAC,eAAe,CAAC,EAC/B,UAAS,+BAAoC,KAC3C,OAAO,CAAC,iCAAiC,CAmI3C,CAAC;AAEF;;;;;GAKG;AACH,eAAO,MAAM,yBAAyB,GAAI,QAAQ,iCAAiC,KAAG,MAwErF,CAAC;AAEF;;;;;;GAMG;AACH,eAAO,MAAM,8BAA8B,GAC1C,QAAQ,iCAAiC,EACzC,UAAS;IAAC,MAAM,CAAC,EAAE,OAAO,CAAA;CAAM,KAC9B,MAuCF,CAAC"}
1
+ {"version":3,"file":"benchmark_baseline.d.ts","sourceRoot":"../src/lib/","sources":["../src/lib/benchmark_baseline.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,OAAO,EAAC,CAAC,EAAC,MAAM,KAAK,CAAC;AAItB,OAAO,KAAK,EAAC,eAAe,EAAC,MAAM,sBAAsB,CAAC;AAC1D,OAAO,EAEN,KAAK,mBAAmB,EAExB,MAAM,sBAAsB,CAAC;AAM9B;;GAEG;AACH,eAAO,MAAM,sBAAsB;;;;;;;;;;;;;iBAajC,CAAC;AACH,MAAM,MAAM,sBAAsB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,sBAAsB,CAAC,CAAC;AAE5E;;GAEG;AACH,eAAO,MAAM,iBAAiB;;;;;;;;;;;;;;;;;;;;iBAO5B,CAAC;AACH,MAAM,MAAM,iBAAiB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,iBAAiB,CAAC,CAAC;AAElE;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC5C,gEAAgE;IAChE,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,sDAAsD;IACtD,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,sDAAsD;IACtD,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC5C,mEAAmE;IACnE,IAAI,CAAC,EAAE,MAAM,CAAC;CACd;AAED;;GAEG;AACH,MAAM,WAAW,+BAAgC,SAAQ,4BAA4B;IACpF;;;;OAIG;IACH,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAC9B;;;OAGG;IACH,sBAAsB,CAAC,EAAE,MAAM,CAAC;IAChC;;;;OAIG;IACH,sBAAsB,CAAC,EAAE,MAAM,CAAC;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,iCAAiC;IACjD,mCAAmC;IACnC,cAAc,EAAE,OAAO,CAAC;IACxB,gCAAgC;IAChC,kBAAkB,EAAE,MAAM,GAAG,IAAI,CAAC;IAClC,iCAAiC;IACjC,eAAe,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/B,kCAAkC;IAClC,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC,sFAAsF;IACtF,cAAc,EAAE,OAAO,CAAC;IACxB,kCAAkC;IAClC,WAAW,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACpD,yGAAyG;IACzG,WAAW,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACpD,wGAAwG;IACxG,YAAY,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACrD,uCAAuC;IACvC,SAAS,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IAClD,+CAA+C;IAC/C,SAAS,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACzB,+CAA+C;IAC/C,aAAa,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;CAC7B;AAED;;GAEG;AACH,MAAM,WAAW,+BAA+B;IAC/C,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,sBAAsB,CAAC;IACjC,OAAO,EAAE,sBAAsB,CAAC;IAChC,UAAU,EAAE,mBAAmB,CAAC;CAChC;AAyBD;;;;;;;;;;;;;GAaG;AACH,eAAO,MAAM,uBAAuB,GACnC,SAAS,KAAK,CAAC,eAAe,CAAC,EAC/B,UAAS,4BAAiC,KACxC,OAAO,CAAC,IAAI,CAwBd,CAAC;AAEF;;;;;;;;;;;;;GAaG;AACH,eAAO,MAAM,uBAAuB,GACnC,UAAS,4BAAiC,KACxC,OAAO,CAAC,iBAAiB,GAAG,IAAI,CAiClC,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,eAAO,MAAM,0BAA0B,GACtC,SAAS,KAAK,CAAC,eAAe,CAAC,EAC/B,UAAS,+BAAoC,KAC3C,OAAO,CAAC,iCAAiC,CAsI3C,CAAC;AAEF;;;;;GAKG;AACH,eAAO,MAAM,yBAAyB,GAAI,QAAQ,iCAAiC,KAAG,MA8ErF,CAAC;AAEF;;;;;;GAMG;AACH,eAAO,MAAM,8BAA8B,GAC1C,QAAQ,iCAAiC,EACzC,UAAS;IAAC,MAAM,CAAC,EAAE,OAAO,CAAA;CAAM,KAC9B,MAyCF,CAAC"}
@@ -217,7 +217,9 @@ export const benchmark_baseline_compare = async (results, options = {}) => {
217
217
  sample_size: current.sample_size,
218
218
  confidence_interval_ns: stats_confidence_interval_from_summary(current.mean_ns, current.std_dev_ns, current.sample_size),
219
219
  };
220
- const comparison = benchmark_stats_compare(baseline_stats, current_stats);
220
+ const comparison = benchmark_stats_compare(baseline_stats, current_stats, {
221
+ min_percent_difference: options.min_percent_difference,
222
+ });
221
223
  const task_comparison = {
222
224
  name: current.name,
223
225
  baseline: baseline_entry,
@@ -227,7 +229,8 @@ export const benchmark_baseline_compare = async (results, options = {}) => {
227
229
  comparisons.push(task_comparison);
228
230
  // Categorize based on comparison result
229
231
  // Note: comparison.faster is 'a' (baseline) or 'b' (current)
230
- if (comparison.significant && comparison.effect_magnitude !== 'negligible') {
232
+ // significant implies percent_difference >= min_pct, which implies effect_magnitude !== 'negligible'
233
+ if (comparison.significant) {
231
234
  if (comparison.faster === 'a') {
232
235
  // Baseline was faster = potential regression
233
236
  // Only count as regression if it exceeds the threshold
@@ -256,10 +259,10 @@ export const benchmark_baseline_compare = async (results, options = {}) => {
256
259
  removed_tasks.push(baseline_entry.name);
257
260
  }
258
261
  }
259
- // Sort regressions and improvements by effect size (largest first)
260
- const sort_by_effect_size = (a, b) => b.comparison.effect_size - a.comparison.effect_size;
261
- regressions.sort(sort_by_effect_size);
262
- improvements.sort(sort_by_effect_size);
262
+ // Sort regressions and improvements by percentage difference (largest first)
263
+ const sort_by_percent_difference = (a, b) => b.comparison.percent_difference - a.comparison.percent_difference;
264
+ regressions.sort(sort_by_percent_difference);
265
+ improvements.sort(sort_by_percent_difference);
263
266
  return {
264
267
  baseline_found: true,
265
268
  baseline_timestamp: baseline.timestamp,
@@ -302,8 +305,9 @@ export const benchmark_baseline_format = (result) => {
302
305
  lines.push(`Regressions (${result.regressions.length}):`);
303
306
  for (const r of result.regressions) {
304
307
  const ratio = r.comparison.speedup_ratio.toFixed(2);
308
+ const pct = (r.comparison.percent_difference * 100).toFixed(1);
305
309
  const p = r.comparison.p_value.toFixed(3);
306
- lines.push(` ${r.name}: ${ratio}x slower (p=${p}, ${r.comparison.effect_magnitude})`);
310
+ lines.push(` ${r.name}: ${ratio}x slower (${pct}%, p=${p}, ${r.comparison.effect_magnitude})`);
307
311
  }
308
312
  lines.push('');
309
313
  }
@@ -311,8 +315,9 @@ export const benchmark_baseline_format = (result) => {
311
315
  lines.push(`Improvements (${result.improvements.length}):`);
312
316
  for (const r of result.improvements) {
313
317
  const ratio = r.comparison.speedup_ratio.toFixed(2);
318
+ const pct = (r.comparison.percent_difference * 100).toFixed(1);
314
319
  const p = r.comparison.p_value.toFixed(3);
315
- lines.push(` ${r.name}: ${ratio}x faster (p=${p}, ${r.comparison.effect_magnitude})`);
320
+ lines.push(` ${r.name}: ${ratio}x faster (${pct}%, p=${p}, ${r.comparison.effect_magnitude})`);
316
321
  }
317
322
  lines.push('');
318
323
  }
@@ -367,6 +372,7 @@ export const benchmark_baseline_format_json = (result, options = {}) => {
367
372
  regressions: result.regressions.map((r) => ({
368
373
  name: r.name,
369
374
  speedup_ratio: r.comparison.speedup_ratio,
375
+ percent_difference: r.comparison.percent_difference,
370
376
  effect_size: r.comparison.effect_size,
371
377
  effect_magnitude: r.comparison.effect_magnitude,
372
378
  p_value: r.comparison.p_value,
@@ -376,6 +382,7 @@ export const benchmark_baseline_format_json = (result, options = {}) => {
376
382
  improvements: result.improvements.map((r) => ({
377
383
  name: r.name,
378
384
  speedup_ratio: r.comparison.speedup_ratio,
385
+ percent_difference: r.comparison.percent_difference,
379
386
  effect_size: r.comparison.effect_size,
380
387
  effect_magnitude: r.comparison.effect_magnitude,
381
388
  p_value: r.comparison.p_value,
@@ -27,13 +27,15 @@ export interface BenchmarkComparison {
27
27
  faster: 'a' | 'b' | 'equal';
28
28
  /** How much faster the winner is (e.g., 1.5 means 1.5x faster) */
29
29
  speedup_ratio: number;
30
- /** Whether the difference is statistically significant at the given alpha */
30
+ /** Whether the difference is both statistically and practically significant */
31
31
  significant: boolean;
32
32
  /** P-value from Welch's t-test (lower = more confident the difference is real) */
33
33
  p_value: number;
34
- /** Cohen's d effect size (magnitude of difference independent of sample size) */
34
+ /** Percentage difference between means as a ratio (0.05 = 5%, 1.0 = 100%) */
35
+ percent_difference: number;
36
+ /** Cohen's d effect size (informational — not used for classification) */
35
37
  effect_size: number;
36
- /** Interpretation of effect size */
38
+ /** Interpretation of practical significance based on percentage difference */
37
39
  effect_magnitude: EffectMagnitude;
38
40
  /** Whether the 95% confidence intervals overlap */
39
41
  ci_overlap: boolean;
@@ -46,6 +48,20 @@ export interface BenchmarkComparison {
46
48
  export interface BenchmarkCompareOptions {
47
49
  /** Significance level for hypothesis testing (default: 0.05) */
48
50
  alpha?: number;
51
+ /**
52
+ * Minimum percentage difference to consider practically meaningful, as a ratio.
53
+ * Below this threshold, differences are classified as 'negligible' and
54
+ * `significant` is forced to `false`, regardless of p-value.
55
+ * This prevents the t-test's oversensitivity at large sample sizes from
56
+ * flagging system-level noise (thermal throttle, OS scheduler, cache pressure)
57
+ * as meaningful differences.
58
+ *
59
+ * Effect magnitude thresholds scale from this value:
60
+ * negligible < min, small < min*3, medium < min*5, large >= min*5.
61
+ *
62
+ * Default: 0.10 (10%).
63
+ */
64
+ min_percent_difference?: number;
49
65
  }
50
66
  /**
51
67
  * Complete statistical analysis of timing measurements.
@@ -94,8 +110,12 @@ export declare class BenchmarkStats {
94
110
  toString(): string;
95
111
  }
96
112
  /**
97
- * Compare two benchmark results for statistical significance.
98
- * Uses Welch's t-test (handles unequal variances) and Cohen's d effect size.
113
+ * Compare two benchmark results for practical and statistical significance.
114
+ * Uses percentage difference for effect magnitude classification, with Welch's
115
+ * t-test for statistical confidence. Cohen's d is computed as an informational
116
+ * metric but does not drive classification — its thresholds (0.2/0.5/0.8) are
117
+ * calibrated for social science and produce false positives in benchmarking
118
+ * where within-run variance is tight.
99
119
  *
100
120
  * @param a - First benchmark stats (or any object with required properties)
101
121
  * @param b - Second benchmark stats (or any object with required properties)
@@ -1 +1 @@
1
- {"version":3,"file":"benchmark_stats.d.ts","sourceRoot":"../src/lib/","sources":["../src/lib/benchmark_stats.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAgBH;;;GAGG;AACH,MAAM,WAAW,wBAAwB;IACxC,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW,EAAE,MAAM,CAAC;IACpB,sBAAsB,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACzC;AAED;;GAEG;AACH,MAAM,MAAM,eAAe,GAAG,YAAY,GAAG,OAAO,GAAG,QAAQ,GAAG,OAAO,CAAC;AAE1E;;GAEG;AACH,MAAM,WAAW,mBAAmB;IACnC,mFAAmF;IACnF,MAAM,EAAE,GAAG,GAAG,GAAG,GAAG,OAAO,CAAC;IAC5B,kEAAkE;IAClE,aAAa,EAAE,MAAM,CAAC;IACtB,6EAA6E;IAC7E,WAAW,EAAE,OAAO,CAAC;IACrB,kFAAkF;IAClF,OAAO,EAAE,MAAM,CAAC;IAChB,iFAAiF;IACjF,WAAW,EAAE,MAAM,CAAC;IACpB,oCAAoC;IACpC,gBAAgB,EAAE,eAAe,CAAC;IAClC,mDAAmD;IACnD,UAAU,EAAE,OAAO,CAAC;IACpB,sDAAsD;IACtD,cAAc,EAAE,MAAM,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,uBAAuB;IACvC,gEAAgE;IAChE,KAAK,CAAC,EAAE,MAAM,CAAC;CACf;AAED;;;;GAIG;AACH,qBAAa,cAAc;IAC1B,yCAAyC;IACzC,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,mDAAmD;IACnD,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,wCAAwC;IACxC,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,kCAAkC;IAClC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,kCAAkC;IAClC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,gDAAgD;IAChD,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,0DAA0D;IAC1D,QAAQ,CAAC,sBAAsB,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAClD,sDAAsD;IACtD,QAAQ,CAAC,WAAW,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACpC,yCAAyC;IACzC,QAAQ,CAAC,aAAa,EAAE,MAAM,CAAC;IAC/B,8CAA8C;IAC9C,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,0DAA0D;IAC1D,QAAQ,CAAC,eAAe,EAAE,MAAM,CAAC;IACjC,mDAAmD;IACnD,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,sEAAsE;IACtE,QAAQ,CAAC,iBAAiB,EAAE,MAAM,CAAC;gBAEvB,UAAU,EAAE,KAAK,CAAC,MAAM,CAAC;IAiErC;;OAEG;IACH,QAAQ,IAAI,MAAM;CAGlB;AAED;;;;;;;;;;;;;;;;GAgBG;AACH,eAAO,MAAM,uBAAuB,GACnC,GAAG,wBAAwB,EAC3B,GAAG,wBAAwB,EAC3B,UAAU,uBAAuB,KAC/B,mBA6GF,CAAC"}
1
+ {"version":3,"file":"benchmark_stats.d.ts","sourceRoot":"../src/lib/","sources":["../src/lib/benchmark_stats.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAgBH;;;GAGG;AACH,MAAM,WAAW,wBAAwB;IACxC,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW,EAAE,MAAM,CAAC;IACpB,sBAAsB,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACzC;AAED;;GAEG;AACH,MAAM,MAAM,eAAe,GAAG,YAAY,GAAG,OAAO,GAAG,QAAQ,GAAG,OAAO,CAAC;AAE1E;;GAEG;AACH,MAAM,WAAW,mBAAmB;IACnC,mFAAmF;IACnF,MAAM,EAAE,GAAG,GAAG,GAAG,GAAG,OAAO,CAAC;IAC5B,kEAAkE;IAClE,aAAa,EAAE,MAAM,CAAC;IACtB,+EAA+E;IAC/E,WAAW,EAAE,OAAO,CAAC;IACrB,kFAAkF;IAClF,OAAO,EAAE,MAAM,CAAC;IAChB,6EAA6E;IAC7E,kBAAkB,EAAE,MAAM,CAAC;IAC3B,0EAA0E;IAC1E,WAAW,EAAE,MAAM,CAAC;IACpB,8EAA8E;IAC9E,gBAAgB,EAAE,eAAe,CAAC;IAClC,mDAAmD;IACnD,UAAU,EAAE,OAAO,CAAC;IACpB,sDAAsD;IACtD,cAAc,EAAE,MAAM,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,uBAAuB;IACvC,gEAAgE;IAChE,KAAK,CAAC,EAAE,MAAM,CAAC;IACf;;;;;;;;;;;;OAYG;IACH,sBAAsB,CAAC,EAAE,MAAM,CAAC;CAChC;AAED;;;;GAIG;AACH,qBAAa,cAAc;IAC1B,yCAAyC;IACzC,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,mDAAmD;IACnD,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,wCAAwC;IACxC,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,kCAAkC;IAClC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,kCAAkC;IAClC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,gDAAgD;IAChD,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,0DAA0D;IAC1D,QAAQ,CAAC,sBAAsB,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAClD,sDAAsD;IACtD,QAAQ,CAAC,WAAW,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACpC,yCAAyC;IACzC,QAAQ,CAAC,aAAa,EAAE,MAAM,CAAC;IAC/B,8CAA8C;IAC9C,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,0DAA0D;IAC1D,QAAQ,CAAC,eAAe,EAAE,MAAM,CAAC;IACjC,mDAAmD;IACnD,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,sEAAsE;IACtE,QAAQ,CAAC,iBAAiB,EAAE,MAAM,CAAC;gBAEvB,UAAU,EAAE,KAAK,CAAC,MAAM,CAAC;IAiErC;;OAEG;IACH,QAAQ,IAAI,MAAM;CAGlB;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,eAAO,MAAM,uBAAuB,GACnC,GAAG,wBAAwB,EAC3B,GAAG,wBAAwB,EAC3B,UAAU,uBAAuB,KAC/B,mBA+GF,CAAC"}
@@ -110,8 +110,12 @@ export class BenchmarkStats {
110
110
  }
111
111
  }
112
112
  /**
113
- * Compare two benchmark results for statistical significance.
114
- * Uses Welch's t-test (handles unequal variances) and Cohen's d effect size.
113
+ * Compare two benchmark results for practical and statistical significance.
114
+ * Uses percentage difference for effect magnitude classification, with Welch's
115
+ * t-test for statistical confidence. Cohen's d is computed as an informational
116
+ * metric but does not drive classification — its thresholds (0.2/0.5/0.8) are
117
+ * calibrated for social science and produce false positives in benchmarking
118
+ * where within-run variance is tight.
115
119
  *
116
120
  * @param a - First benchmark stats (or any object with required properties)
117
121
  * @param b - Second benchmark stats (or any object with required properties)
@@ -128,6 +132,7 @@ export class BenchmarkStats {
128
132
  */
129
133
  export const benchmark_stats_compare = (a, b, options) => {
130
134
  const alpha = options?.alpha ?? 0.05;
135
+ const min_pct = options?.min_percent_difference ?? 0.1;
131
136
  // Handle edge cases
132
137
  if (a.sample_size === 0 || b.sample_size === 0) {
133
138
  return {
@@ -135,6 +140,7 @@ export const benchmark_stats_compare = (a, b, options) => {
135
140
  speedup_ratio: 1,
136
141
  significant: false,
137
142
  p_value: 1,
143
+ percent_difference: 0,
138
144
  effect_size: 0,
139
145
  effect_magnitude: 'negligible',
140
146
  ci_overlap: true,
@@ -144,6 +150,8 @@ export const benchmark_stats_compare = (a, b, options) => {
144
150
  // Calculate speedup ratio (lower time = faster, so compare by time not ops/sec)
145
151
  const speedup_ratio = a.mean_ns < b.mean_ns ? b.mean_ns / a.mean_ns : a.mean_ns / b.mean_ns;
146
152
  const faster = a.mean_ns < b.mean_ns ? 'a' : a.mean_ns > b.mean_ns ? 'b' : 'equal';
153
+ // Percentage difference relative to the faster mean (always >= 0)
154
+ const percent_difference = speedup_ratio - 1;
147
155
  // Welch's t-test (handles unequal variances)
148
156
  // Special case: if both have zero variance, t-test is undefined
149
157
  let p_value;
@@ -156,55 +164,54 @@ export const benchmark_stats_compare = (a, b, options) => {
156
164
  // Calculate two-tailed p-value using t-distribution approximation
157
165
  p_value = stats_t_distribution_p_value(Math.abs(t_statistic), degrees_of_freedom);
158
166
  }
159
- // Cohen's d effect size
167
+ // Cohen's d effect size (informational only — not used for classification)
160
168
  const pooled_std_dev = Math.sqrt(((a.sample_size - 1) * a.std_dev_ns ** 2 + (b.sample_size - 1) * b.std_dev_ns ** 2) /
161
169
  (a.sample_size + b.sample_size - 2));
162
- // When pooled_std_dev is 0 but means differ, effect is maximal (infinite)
163
- // When means are equal, effect is 0
164
170
  let effect_size;
165
- let effect_magnitude;
166
171
  if (pooled_std_dev === 0) {
167
- // Zero variance case - if means differ, it's a definitive difference
168
- if (a.mean_ns === b.mean_ns) {
169
- effect_size = 0;
170
- effect_magnitude = 'negligible';
171
- }
172
- else {
173
- // Any difference is 100% reliable when there's no variance
174
- effect_size = Infinity;
175
- effect_magnitude = 'large';
176
- }
172
+ effect_size = a.mean_ns === b.mean_ns ? 0 : Infinity;
177
173
  }
178
174
  else {
179
175
  effect_size = Math.abs(a.mean_ns - b.mean_ns) / pooled_std_dev;
180
- // Interpret effect size (Cohen's conventions)
181
- effect_magnitude =
182
- effect_size < 0.2
183
- ? 'negligible'
184
- : effect_size < 0.5
185
- ? 'small'
186
- : effect_size < 0.8
187
- ? 'medium'
188
- : 'large';
176
+ }
177
+ // Effect magnitude based on percentage difference, not Cohen's d.
178
+ // Cohen's d thresholds (0.2/0.5/0.8) are calibrated for social science, not benchmarking.
179
+ // Within-run variance is tight, so even small system noise (thermal throttle, OS scheduler)
180
+ // produces large Cohen's d. Percentage thresholds directly answer "is this difference
181
+ // meaningful in practice?" Thresholds scale with min_percent_difference so users can
182
+ // tune one knob for their system's noise floor.
183
+ let effect_magnitude;
184
+ if (percent_difference < min_pct) {
185
+ effect_magnitude = 'negligible';
186
+ }
187
+ else if (percent_difference < min_pct * 3) {
188
+ effect_magnitude = 'small';
189
+ }
190
+ else if (percent_difference < min_pct * 5) {
191
+ effect_magnitude = 'medium';
192
+ }
193
+ else {
194
+ effect_magnitude = 'large';
189
195
  }
190
196
  // Check confidence interval overlap
191
197
  const ci_overlap = a.confidence_interval_ns[0] <= b.confidence_interval_ns[1] &&
192
198
  b.confidence_interval_ns[0] <= a.confidence_interval_ns[1];
193
- // Determine significance
194
- const significant = p_value < alpha;
199
+ // Significance requires both statistical significance (p < alpha)
200
+ // AND practical significance (percent_difference >= min_pct).
201
+ // With large n, the t-test finds p≈0 for any difference because
202
+ // SE = std_dev/sqrt(n) → 0. Gating on practical significance
203
+ // prevents system noise from being flagged as meaningful.
204
+ const significant = p_value < alpha && percent_difference >= min_pct;
195
205
  // Generate recommendation
196
206
  let recommendation;
197
- if (!significant) {
198
- recommendation =
199
- effect_magnitude === 'negligible'
200
- ? 'No meaningful difference detected'
201
- : `Difference not statistically significant (p=${p_value.toFixed(3)}), but effect size suggests ${effect_magnitude} practical difference`;
207
+ if (percent_difference < min_pct) {
208
+ recommendation = 'No meaningful difference detected';
202
209
  }
203
- else if (effect_magnitude === 'negligible') {
204
- recommendation = `Statistically significant but negligible practical difference (${speedup_ratio.toFixed(2)}x)`;
210
+ else if (!significant) {
211
+ recommendation = `${(percent_difference * 100).toFixed(1)}% difference observed but not statistically significant (p=${p_value.toFixed(3)})`;
205
212
  }
206
213
  else {
207
- recommendation = `${faster === 'a' ? 'First' : 'Second'} is ${speedup_ratio.toFixed(2)}x faster with ${effect_magnitude} effect size (p=${p_value.toFixed(3)})`;
214
+ recommendation = `${faster === 'a' ? 'First' : 'Second'} is ${speedup_ratio.toFixed(2)}x faster with ${effect_magnitude} effect size (${(percent_difference * 100).toFixed(1)}%, p=${p_value.toFixed(3)})`;
208
215
  }
209
216
  // Adjust 'faster' to 'equal' if effect is negligible
210
217
  const adjusted_faster = effect_magnitude === 'negligible' ? 'equal' : faster;
@@ -213,6 +220,7 @@ export const benchmark_stats_compare = (a, b, options) => {
213
220
  speedup_ratio,
214
221
  significant,
215
222
  p_value,
223
+ percent_difference,
216
224
  effect_size,
217
225
  effect_magnitude,
218
226
  ci_overlap,
@@ -143,7 +143,7 @@ export const deep_equal = (a, b) => {
143
143
  return false;
144
144
  for (let i = 0; i < a_keys_length; i++) {
145
145
  const key = a_keys[i];
146
- if (!(key in b))
146
+ if (!Object.hasOwn(b, key))
147
147
  return false;
148
148
  if (!deep_equal(a[key], b[key]))
149
149
  return false;
@@ -34,7 +34,7 @@ export declare const PackageJson: z.ZodObject<{
34
34
  version: z.ZodString;
35
35
  private: z.ZodOptional<z.ZodBoolean>;
36
36
  description: z.ZodOptional<z.ZodString>;
37
- motto: z.ZodOptional<z.ZodString>;
37
+ tagline: z.ZodOptional<z.ZodString>;
38
38
  glyph: z.ZodOptional<z.ZodString>;
39
39
  logo: z.ZodOptional<z.ZodString>;
40
40
  logo_alt: z.ZodOptional<z.ZodString>;
@@ -55,26 +55,26 @@ export const PackageJson = z.looseObject({
55
55
  .meta({ description: 'disallow publishing to the configured registry' })
56
56
  .optional(),
57
57
  description: z.string().optional(),
58
- motto: z
58
+ tagline: z
59
59
  .string()
60
- .meta({ description: "a Gro extension that's a short phrase that represents this project" })
60
+ .meta({ description: "a Fuz extension that's a short phrase that represents this project" })
61
61
  .optional(),
62
62
  glyph: z
63
63
  .string()
64
64
  .meta({
65
- description: "a Gro extension that's a single unicode character that represents this project",
65
+ description: "a Fuz extension that's a single unicode character that represents this project",
66
66
  })
67
67
  .refine((v) => count_graphemes(v) === 1, 'must be a single unicode character')
68
68
  .optional(),
69
69
  logo: z
70
70
  .string()
71
71
  .meta({
72
- description: "a Gro extension that's a link relative to the `homepage` to an image that represents this project",
72
+ description: "a Fuz extension that's a link relative to the `homepage` to an image that represents this project",
73
73
  })
74
74
  .optional(),
75
75
  logo_alt: z
76
76
  .string()
77
- .meta({ description: "a Gro extension that's the alt text for the `logo`" })
77
+ .meta({ description: "a Fuz extension that's the alt text for the `logo`" })
78
78
  .optional(),
79
79
  license: z.string().optional(),
80
80
  scripts: z.record(z.string(), z.string()).optional(),
package/package.json CHANGED
@@ -1,11 +1,11 @@
1
1
  {
2
2
  "name": "@fuzdev/fuz_util",
3
- "version": "0.53.4",
3
+ "version": "0.55.0",
4
4
  "description": "utility belt for JS",
5
5
  "glyph": "🦕",
6
6
  "logo": "logo.svg",
7
7
  "logo_alt": "a green sauropod wearing a brown utility belt",
8
- "motto": "ancient not extinct",
8
+ "tagline": "ancient not extinct",
9
9
  "license": "MIT",
10
10
  "homepage": "https://util.fuz.dev/",
11
11
  "author": {
@@ -77,13 +77,14 @@
77
77
  "@changesets/changelog-git": "^0.2.1",
78
78
  "@fuzdev/blake3_wasm": "^0.1.0",
79
79
  "@fuzdev/fuz_code": "^0.45.1",
80
- "@fuzdev/fuz_css": "^0.55.0",
81
- "@fuzdev/fuz_ui": "^0.185.2",
80
+ "@fuzdev/fuz_css": "^0.56.0",
81
+ "@fuzdev/fuz_ui": "^0.190.0",
82
82
  "@fuzdev/gro": "^0.197.0",
83
83
  "@jridgewell/trace-mapping": "^0.3.31",
84
84
  "@ryanatkn/eslint-config": "^0.10.1",
85
+ "@sveltejs/acorn-typescript": "^1.0.9",
85
86
  "@sveltejs/adapter-static": "^3.0.10",
86
- "@sveltejs/kit": "^2.50.1",
87
+ "@sveltejs/kit": "^2.54.0",
87
88
  "@sveltejs/package": "^2.5.7",
88
89
  "@sveltejs/vite-plugin-svelte": "^6.2.4",
89
90
  "@types/estree": "^1.0.8",
@@ -97,13 +98,14 @@
97
98
  "magic-string": "^0.30.21",
98
99
  "prettier": "^3.7.4",
99
100
  "prettier-plugin-svelte": "^3.4.1",
100
- "svelte": "^5.49.1",
101
+ "svelte": "^5.53.10",
101
102
  "svelte-check": "^4.3.6",
102
103
  "svelte2tsx": "^0.7.47",
103
104
  "tslib": "^2.8.1",
104
105
  "typescript": "^5.9.3",
105
106
  "typescript-eslint": "^8.48.1",
106
- "vitest": "^4.0.15",
107
+ "vite": "^7.3.1",
108
+ "vitest": "^4.0.18",
107
109
  "zimmerframe": "^1.1.4",
108
110
  "zod": "^4.3.6"
109
111
  },
@@ -89,6 +89,12 @@ export interface BenchmarkBaselineCompareOptions extends BenchmarkBaselineLoadOp
89
89
  * Default: undefined (no staleness warning)
90
90
  */
91
91
  staleness_warning_days?: number;
92
+ /**
93
+ * Minimum percentage difference to consider meaningful, as a ratio.
94
+ * Passed through to `benchmark_stats_compare`. See `BenchmarkCompareOptions`.
95
+ * Default: 0.10 (10%)
96
+ */
97
+ min_percent_difference?: number;
92
98
  }
93
99
 
94
100
  /**
@@ -344,7 +350,9 @@ export const benchmark_baseline_compare = async (
344
350
  ),
345
351
  };
346
352
 
347
- const comparison = benchmark_stats_compare(baseline_stats, current_stats);
353
+ const comparison = benchmark_stats_compare(baseline_stats, current_stats, {
354
+ min_percent_difference: options.min_percent_difference,
355
+ });
348
356
 
349
357
  const task_comparison: BenchmarkBaselineTaskComparison = {
350
358
  name: current.name,
@@ -357,7 +365,8 @@ export const benchmark_baseline_compare = async (
357
365
 
358
366
  // Categorize based on comparison result
359
367
  // Note: comparison.faster is 'a' (baseline) or 'b' (current)
360
- if (comparison.significant && comparison.effect_magnitude !== 'negligible') {
368
+ // significant implies percent_difference >= min_pct, which implies effect_magnitude !== 'negligible'
369
+ if (comparison.significant) {
361
370
  if (comparison.faster === 'a') {
362
371
  // Baseline was faster = potential regression
363
372
  // Only count as regression if it exceeds the threshold
@@ -384,14 +393,14 @@ export const benchmark_baseline_compare = async (
384
393
  }
385
394
  }
386
395
 
387
- // Sort regressions and improvements by effect size (largest first)
388
- const sort_by_effect_size = (
396
+ // Sort regressions and improvements by percentage difference (largest first)
397
+ const sort_by_percent_difference = (
389
398
  a: BenchmarkBaselineTaskComparison,
390
399
  b: BenchmarkBaselineTaskComparison,
391
- ) => b.comparison.effect_size - a.comparison.effect_size;
400
+ ) => b.comparison.percent_difference - a.comparison.percent_difference;
392
401
 
393
- regressions.sort(sort_by_effect_size);
394
- improvements.sort(sort_by_effect_size);
402
+ regressions.sort(sort_by_percent_difference);
403
+ improvements.sort(sort_by_percent_difference);
395
404
 
396
405
  return {
397
406
  baseline_found: true,
@@ -440,8 +449,11 @@ export const benchmark_baseline_format = (result: BenchmarkBaselineComparisonRes
440
449
  lines.push(`Regressions (${result.regressions.length}):`);
441
450
  for (const r of result.regressions) {
442
451
  const ratio = r.comparison.speedup_ratio.toFixed(2);
452
+ const pct = (r.comparison.percent_difference * 100).toFixed(1);
443
453
  const p = r.comparison.p_value.toFixed(3);
444
- lines.push(` ${r.name}: ${ratio}x slower (p=${p}, ${r.comparison.effect_magnitude})`);
454
+ lines.push(
455
+ ` ${r.name}: ${ratio}x slower (${pct}%, p=${p}, ${r.comparison.effect_magnitude})`,
456
+ );
445
457
  }
446
458
  lines.push('');
447
459
  }
@@ -450,8 +462,11 @@ export const benchmark_baseline_format = (result: BenchmarkBaselineComparisonRes
450
462
  lines.push(`Improvements (${result.improvements.length}):`);
451
463
  for (const r of result.improvements) {
452
464
  const ratio = r.comparison.speedup_ratio.toFixed(2);
465
+ const pct = (r.comparison.percent_difference * 100).toFixed(1);
453
466
  const p = r.comparison.p_value.toFixed(3);
454
- lines.push(` ${r.name}: ${ratio}x faster (p=${p}, ${r.comparison.effect_magnitude})`);
467
+ lines.push(
468
+ ` ${r.name}: ${ratio}x faster (${pct}%, p=${p}, ${r.comparison.effect_magnitude})`,
469
+ );
455
470
  }
456
471
  lines.push('');
457
472
  }
@@ -516,6 +531,7 @@ export const benchmark_baseline_format_json = (
516
531
  regressions: result.regressions.map((r) => ({
517
532
  name: r.name,
518
533
  speedup_ratio: r.comparison.speedup_ratio,
534
+ percent_difference: r.comparison.percent_difference,
519
535
  effect_size: r.comparison.effect_size,
520
536
  effect_magnitude: r.comparison.effect_magnitude,
521
537
  p_value: r.comparison.p_value,
@@ -525,6 +541,7 @@ export const benchmark_baseline_format_json = (
525
541
  improvements: result.improvements.map((r) => ({
526
542
  name: r.name,
527
543
  speedup_ratio: r.comparison.speedup_ratio,
544
+ percent_difference: r.comparison.percent_difference,
528
545
  effect_size: r.comparison.effect_size,
529
546
  effect_magnitude: r.comparison.effect_magnitude,
530
547
  p_value: r.comparison.p_value,
@@ -44,13 +44,15 @@ export interface BenchmarkComparison {
44
44
  faster: 'a' | 'b' | 'equal';
45
45
  /** How much faster the winner is (e.g., 1.5 means 1.5x faster) */
46
46
  speedup_ratio: number;
47
- /** Whether the difference is statistically significant at the given alpha */
47
+ /** Whether the difference is both statistically and practically significant */
48
48
  significant: boolean;
49
49
  /** P-value from Welch's t-test (lower = more confident the difference is real) */
50
50
  p_value: number;
51
- /** Cohen's d effect size (magnitude of difference independent of sample size) */
51
+ /** Percentage difference between means as a ratio (0.05 = 5%, 1.0 = 100%) */
52
+ percent_difference: number;
53
+ /** Cohen's d effect size (informational — not used for classification) */
52
54
  effect_size: number;
53
- /** Interpretation of effect size */
55
+ /** Interpretation of practical significance based on percentage difference */
54
56
  effect_magnitude: EffectMagnitude;
55
57
  /** Whether the 95% confidence intervals overlap */
56
58
  ci_overlap: boolean;
@@ -64,6 +66,20 @@ export interface BenchmarkComparison {
64
66
  export interface BenchmarkCompareOptions {
65
67
  /** Significance level for hypothesis testing (default: 0.05) */
66
68
  alpha?: number;
69
+ /**
70
+ * Minimum percentage difference to consider practically meaningful, as a ratio.
71
+ * Below this threshold, differences are classified as 'negligible' and
72
+ * `significant` is forced to `false`, regardless of p-value.
73
+ * This prevents the t-test's oversensitivity at large sample sizes from
74
+ * flagging system-level noise (thermal throttle, OS scheduler, cache pressure)
75
+ * as meaningful differences.
76
+ *
77
+ * Effect magnitude thresholds scale from this value:
78
+ * negligible < min, small < min*3, medium < min*5, large >= min*5.
79
+ *
80
+ * Default: 0.10 (10%).
81
+ */
82
+ min_percent_difference?: number;
67
83
  }
68
84
 
69
85
  /**
@@ -181,8 +197,12 @@ export class BenchmarkStats {
181
197
  }
182
198
 
183
199
  /**
184
- * Compare two benchmark results for statistical significance.
185
- * Uses Welch's t-test (handles unequal variances) and Cohen's d effect size.
200
+ * Compare two benchmark results for practical and statistical significance.
201
+ * Uses percentage difference for effect magnitude classification, with Welch's
202
+ * t-test for statistical confidence. Cohen's d is computed as an informational
203
+ * metric but does not drive classification — its thresholds (0.2/0.5/0.8) are
204
+ * calibrated for social science and produce false positives in benchmarking
205
+ * where within-run variance is tight.
186
206
  *
187
207
  * @param a - First benchmark stats (or any object with required properties)
188
208
  * @param b - Second benchmark stats (or any object with required properties)
@@ -203,6 +223,7 @@ export const benchmark_stats_compare = (
203
223
  options?: BenchmarkCompareOptions,
204
224
  ): BenchmarkComparison => {
205
225
  const alpha = options?.alpha ?? 0.05;
226
+ const min_pct = options?.min_percent_difference ?? 0.1;
206
227
 
207
228
  // Handle edge cases
208
229
  if (a.sample_size === 0 || b.sample_size === 0) {
@@ -211,6 +232,7 @@ export const benchmark_stats_compare = (
211
232
  speedup_ratio: 1,
212
233
  significant: false,
213
234
  p_value: 1,
235
+ percent_difference: 0,
214
236
  effect_size: 0,
215
237
  effect_magnitude: 'negligible',
216
238
  ci_overlap: true,
@@ -223,6 +245,9 @@ export const benchmark_stats_compare = (
223
245
  const faster: 'a' | 'b' | 'equal' =
224
246
  a.mean_ns < b.mean_ns ? 'a' : a.mean_ns > b.mean_ns ? 'b' : 'equal';
225
247
 
248
+ // Percentage difference relative to the faster mean (always >= 0)
249
+ const percent_difference = speedup_ratio - 1;
250
+
226
251
  // Welch's t-test (handles unequal variances)
227
252
  // Special case: if both have zero variance, t-test is undefined
228
253
  let p_value: number;
@@ -242,38 +267,33 @@ export const benchmark_stats_compare = (
242
267
  p_value = stats_t_distribution_p_value(Math.abs(t_statistic), degrees_of_freedom);
243
268
  }
244
269
 
245
- // Cohen's d effect size
270
+ // Cohen's d effect size (informational only — not used for classification)
246
271
  const pooled_std_dev = Math.sqrt(
247
272
  ((a.sample_size - 1) * a.std_dev_ns ** 2 + (b.sample_size - 1) * b.std_dev_ns ** 2) /
248
273
  (a.sample_size + b.sample_size - 2),
249
274
  );
250
-
251
- // When pooled_std_dev is 0 but means differ, effect is maximal (infinite)
252
- // When means are equal, effect is 0
253
275
  let effect_size: number;
254
- let effect_magnitude: EffectMagnitude;
255
-
256
276
  if (pooled_std_dev === 0) {
257
- // Zero variance case - if means differ, it's a definitive difference
258
- if (a.mean_ns === b.mean_ns) {
259
- effect_size = 0;
260
- effect_magnitude = 'negligible';
261
- } else {
262
- // Any difference is 100% reliable when there's no variance
263
- effect_size = Infinity;
264
- effect_magnitude = 'large';
265
- }
277
+ effect_size = a.mean_ns === b.mean_ns ? 0 : Infinity;
266
278
  } else {
267
279
  effect_size = Math.abs(a.mean_ns - b.mean_ns) / pooled_std_dev;
268
- // Interpret effect size (Cohen's conventions)
269
- effect_magnitude =
270
- effect_size < 0.2
271
- ? 'negligible'
272
- : effect_size < 0.5
273
- ? 'small'
274
- : effect_size < 0.8
275
- ? 'medium'
276
- : 'large';
280
+ }
281
+
282
+ // Effect magnitude based on percentage difference, not Cohen's d.
283
+ // Cohen's d thresholds (0.2/0.5/0.8) are calibrated for social science, not benchmarking.
284
+ // Within-run variance is tight, so even small system noise (thermal throttle, OS scheduler)
285
+ // produces large Cohen's d. Percentage thresholds directly answer "is this difference
286
+ // meaningful in practice?" Thresholds scale with min_percent_difference so users can
287
+ // tune one knob for their system's noise floor.
288
+ let effect_magnitude: EffectMagnitude;
289
+ if (percent_difference < min_pct) {
290
+ effect_magnitude = 'negligible';
291
+ } else if (percent_difference < min_pct * 3) {
292
+ effect_magnitude = 'small';
293
+ } else if (percent_difference < min_pct * 5) {
294
+ effect_magnitude = 'medium';
295
+ } else {
296
+ effect_magnitude = 'large';
277
297
  }
278
298
 
279
299
  // Check confidence interval overlap
@@ -281,20 +301,21 @@ export const benchmark_stats_compare = (
281
301
  a.confidence_interval_ns[0] <= b.confidence_interval_ns[1] &&
282
302
  b.confidence_interval_ns[0] <= a.confidence_interval_ns[1];
283
303
 
284
- // Determine significance
285
- const significant = p_value < alpha;
304
+ // Significance requires both statistical significance (p < alpha)
305
+ // AND practical significance (percent_difference >= min_pct).
306
+ // With large n, the t-test finds p≈0 for any difference because
307
+ // SE = std_dev/sqrt(n) → 0. Gating on practical significance
308
+ // prevents system noise from being flagged as meaningful.
309
+ const significant = p_value < alpha && percent_difference >= min_pct;
286
310
 
287
311
  // Generate recommendation
288
312
  let recommendation: string;
289
- if (!significant) {
290
- recommendation =
291
- effect_magnitude === 'negligible'
292
- ? 'No meaningful difference detected'
293
- : `Difference not statistically significant (p=${p_value.toFixed(3)}), but effect size suggests ${effect_magnitude} practical difference`;
294
- } else if (effect_magnitude === 'negligible') {
295
- recommendation = `Statistically significant but negligible practical difference (${speedup_ratio.toFixed(2)}x)`;
313
+ if (percent_difference < min_pct) {
314
+ recommendation = 'No meaningful difference detected';
315
+ } else if (!significant) {
316
+ recommendation = `${(percent_difference * 100).toFixed(1)}% difference observed but not statistically significant (p=${p_value.toFixed(3)})`;
296
317
  } else {
297
- recommendation = `${faster === 'a' ? 'First' : 'Second'} is ${speedup_ratio.toFixed(2)}x faster with ${effect_magnitude} effect size (p=${p_value.toFixed(3)})`;
318
+ recommendation = `${faster === 'a' ? 'First' : 'Second'} is ${speedup_ratio.toFixed(2)}x faster with ${effect_magnitude} effect size (${(percent_difference * 100).toFixed(1)}%, p=${p_value.toFixed(3)})`;
298
319
  }
299
320
 
300
321
  // Adjust 'faster' to 'equal' if effect is negligible
@@ -305,6 +326,7 @@ export const benchmark_stats_compare = (
305
326
  speedup_ratio,
306
327
  significant,
307
328
  p_value,
329
+ percent_difference,
308
330
  effect_size,
309
331
  effect_magnitude,
310
332
  ci_overlap,
@@ -147,7 +147,7 @@ export const deep_equal = (a: unknown, b: unknown): boolean => {
147
147
  if (a_keys_length !== Object.keys(b!).length) return false;
148
148
  for (let i = 0; i < a_keys_length; i++) {
149
149
  const key = a_keys[i]!;
150
- if (!(key in (b as any))) return false;
150
+ if (!Object.hasOwn(b as object, key)) return false;
151
151
  if (!deep_equal((a as any)[key], (b as any)[key])) return false;
152
152
  }
153
153
 
@@ -72,14 +72,14 @@ export const PackageJson = z.looseObject({
72
72
  .meta({description: 'disallow publishing to the configured registry'})
73
73
  .optional(),
74
74
  description: z.string().optional(),
75
- motto: z
75
+ tagline: z
76
76
  .string()
77
- .meta({description: "a Gro extension that's a short phrase that represents this project"})
77
+ .meta({description: "a Fuz extension that's a short phrase that represents this project"})
78
78
  .optional(),
79
79
  glyph: z
80
80
  .string()
81
81
  .meta({
82
- description: "a Gro extension that's a single unicode character that represents this project",
82
+ description: "a Fuz extension that's a single unicode character that represents this project",
83
83
  })
84
84
  .refine((v) => count_graphemes(v) === 1, 'must be a single unicode character')
85
85
  .optional(),
@@ -87,12 +87,12 @@ export const PackageJson = z.looseObject({
87
87
  .string()
88
88
  .meta({
89
89
  description:
90
- "a Gro extension that's a link relative to the `homepage` to an image that represents this project",
90
+ "a Fuz extension that's a link relative to the `homepage` to an image that represents this project",
91
91
  })
92
92
  .optional(),
93
93
  logo_alt: z
94
94
  .string()
95
- .meta({description: "a Gro extension that's the alt text for the `logo`"})
95
+ .meta({description: "a Fuz extension that's the alt text for the `logo`"})
96
96
  .optional(),
97
97
  license: z.string().optional(),
98
98
  scripts: z.record(z.string(), z.string()).optional(),