npm - @fuzdev/fuz_util - Versions diffs - 0.54.0 → 0.55.0 - Mend

@fuzdev/fuz_util 0.54.0 → 0.55.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (9):

package/dist/benchmark_baseline.d.ts +6 -0
package/dist/benchmark_baseline.d.ts.map +1 -1
package/dist/benchmark_baseline.js +15 -8
package/dist/benchmark_stats.d.ts +25 -5
package/dist/benchmark_stats.d.ts.map +1 -1
package/dist/benchmark_stats.js +43 -35
package/package.json +3 -3
package/src/lib/benchmark_baseline.ts +26 -9
package/src/lib/benchmark_stats.ts +61 -39

package/dist/benchmark_baseline.d.ts CHANGED Viewed

@@ -83,6 +83,12 @@ export interface BenchmarkBaselineCompareOptions extends BenchmarkBaselineLoadOp
      * Default: undefined (no staleness warning)
      */
     staleness_warning_days?: number;
+    /**
+     * Minimum percentage difference to consider meaningful, as a ratio.
+     * Passed through to `benchmark_stats_compare`. See `BenchmarkCompareOptions`.
+     * Default: 0.10 (10%)
+     */
+    min_percent_difference?: number;
 }
 /**
  * Result of comparing current results against a baseline.

package/dist/benchmark_baseline.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"benchmark_baseline.d.ts","sourceRoot":"../src/lib/","sources":["../src/lib/benchmark_baseline.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,OAAO,EAAC,CAAC,EAAC,MAAM,KAAK,CAAC;AAItB,OAAO,KAAK,EAAC,eAAe,EAAC,MAAM,sBAAsB,CAAC;AAC1D,OAAO,EAEN,KAAK,mBAAmB,EAExB,MAAM,sBAAsB,CAAC;AAM9B;;GAEG;AACH,eAAO,MAAM,sBAAsB;;;;;;;;;;;;;iBAajC,CAAC;AACH,MAAM,MAAM,sBAAsB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,sBAAsB,CAAC,CAAC;AAE5E;;GAEG;AACH,eAAO,MAAM,iBAAiB;;;;;;;;;;;;;;;;;;;;iBAO5B,CAAC;AACH,MAAM,MAAM,iBAAiB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,iBAAiB,CAAC,CAAC;AAElE;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC5C,gEAAgE;IAChE,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,sDAAsD;IACtD,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,sDAAsD;IACtD,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC5C,mEAAmE;IACnE,IAAI,CAAC,EAAE,MAAM,CAAC;CACd;AAED;;GAEG;AACH,MAAM,WAAW,+BAAgC,SAAQ,4BAA4B;IACpF;;;;OAIG;IACH,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAC9B;;;OAGG;IACH,sBAAsB,CAAC,EAAE,MAAM,CAAC;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,iCAAiC;IACjD,mCAAmC;IACnC,cAAc,EAAE,OAAO,CAAC;IACxB,gCAAgC;IAChC,kBAAkB,EAAE,MAAM,GAAG,IAAI,CAAC;IAClC,iCAAiC;IACjC,eAAe,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/B,kCAAkC;IAClC,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC,sFAAsF;IACtF,cAAc,EAAE,OAAO,CAAC;IACxB,kCAAkC;IAClC,WAAW,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACpD,yGAAyG;IACzG,WAAW,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACpD,wGAAwG;IACxG,YAAY,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACrD,uCAAuC;IACvC,SAAS,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IAClD,+CAA+C;IAC/C,SAAS,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACzB,+CAA+C;IAC/C,aAAa,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;CAC7B;AAED;;GAEG;AACH,MAAM,WAAW,+BAA+B;IAC/C,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,sBAAsB,CAAC;IACjC,OAAO,EAAE,sBAAsB,CAAC;IAChC,UAAU,EAAE,mBAAmB,CAAC;CAChC;AAyBD;;;;;;;;;;;;;GAaG;AACH,eAAO,MAAM,uBAAuB,GACnC,SAAS,KAAK,CAAC,eAAe,CAAC,EAC/B,UAAS,4BAAiC,KACxC,OAAO,CAAC,IAAI,CAwBd,CAAC;AAEF;;;;;;;;;;;;;GAaG;AACH,eAAO,MAAM,uBAAuB,GACnC,UAAS,4BAAiC,KACxC,OAAO,CAAC,iBAAiB,GAAG,IAAI,CAiClC,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,eAA
O,MAAM,0BAA0B,GACtC,SAAS,KAAK,CAAC,eAAe,CAAC,EAC/B,UAAS,+BAAoC,KAC3C,OAAO,CAAC,iCAAiC,~~CAmI3C~~,CAAC;AAEF;;;;;GAKG;AACH,eAAO,MAAM,yBAAyB,GAAI,QAAQ,iCAAiC,KAAG,~~MAwErF~~,CAAC;AAEF;;;;;;GAMG;AACH,eAAO,MAAM,8BAA8B,GAC1C,QAAQ,iCAAiC,EACzC,UAAS;IAAC,MAAM,CAAC,EAAE,OAAO,CAAA;CAAM,KAC9B,~~MAuCF~~,CAAC"}
1	+ {"version":3,"file":"benchmark_baseline.d.ts","sourceRoot":"../src/lib/","sources":["../src/lib/benchmark_baseline.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,OAAO,EAAC,CAAC,EAAC,MAAM,KAAK,CAAC;AAItB,OAAO,KAAK,EAAC,eAAe,EAAC,MAAM,sBAAsB,CAAC;AAC1D,OAAO,EAEN,KAAK,mBAAmB,EAExB,MAAM,sBAAsB,CAAC;AAM9B;;GAEG;AACH,eAAO,MAAM,sBAAsB;;;;;;;;;;;;;iBAajC,CAAC;AACH,MAAM,MAAM,sBAAsB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,sBAAsB,CAAC,CAAC;AAE5E;;GAEG;AACH,eAAO,MAAM,iBAAiB;;;;;;;;;;;;;;;;;;;;iBAO5B,CAAC;AACH,MAAM,MAAM,iBAAiB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,iBAAiB,CAAC,CAAC;AAElE;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC5C,gEAAgE;IAChE,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,sDAAsD;IACtD,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,sDAAsD;IACtD,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC5C,mEAAmE;IACnE,IAAI,CAAC,EAAE,MAAM,CAAC;CACd;AAED;;GAEG;AACH,MAAM,WAAW,+BAAgC,SAAQ,4BAA4B;IACpF;;;;OAIG;IACH,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAC9B;;;OAGG;IACH,sBAAsB,CAAC,EAAE,MAAM,CAAC;IAChC;;;;OAIG;IACH,sBAAsB,CAAC,EAAE,MAAM,CAAC;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,iCAAiC;IACjD,mCAAmC;IACnC,cAAc,EAAE,OAAO,CAAC;IACxB,gCAAgC;IAChC,kBAAkB,EAAE,MAAM,GAAG,IAAI,CAAC;IAClC,iCAAiC;IACjC,eAAe,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/B,kCAAkC;IAClC,iBAAiB,EAAE,MAAM,GAAG,IAAI,CAAC;IACjC,sFAAsF;IACtF,cAAc,EAAE,OAAO,CAAC;IACxB,kCAAkC;IAClC,WAAW,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACpD,yGAAyG;IACzG,WAAW,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACpD,wGAAwG;IACxG,YAAY,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IACrD,uCAAuC;IACvC,SAAS,EAAE,KAAK,CAAC,+BAA+B,CAAC,CAAC;IAClD,+CAA+C;IAC/C,SAAS,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACzB,+CAA+C;IAC/C,aAAa,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;CAC7B;AAED;;GAEG;AACH,MAAM,WAAW,+BAA+B;IAC/C,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,sBAAsB,CAAC;IACjC,OAAO,EAAE,sBAAsB,CAAC;IAChC,UAAU,EAAE,mBAAmB,CAAC;CAChC;AAyBD;;;;;;;;;;;;;GAaG;AACH,eAAO,MAAM,uBAAuB,GACnC,SAAS,KAAK,CAAC,eAAe,CAAC,EAC/B,UAAS,4BAAiC,KACxC,OAAO,CAAC,IAAI,CAwBd,CAAC;AAEF;;;;;;;;;;;;;GAaG;AACH,eAAO,MAAM,uBAAuB,GACnC,UAAS,4BAAiC,KACxC,OAAO,CAAC,iBAAiB,GAAG,IAAI,CAiClC,CA
AC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,eAAO,MAAM,0BAA0B,GACtC,SAAS,KAAK,CAAC,eAAe,CAAC,EAC/B,UAAS,+BAAoC,KAC3C,OAAO,CAAC,iCAAiC,CAsI3C,CAAC;AAEF;;;;;GAKG;AACH,eAAO,MAAM,yBAAyB,GAAI,QAAQ,iCAAiC,KAAG,MA8ErF,CAAC;AAEF;;;;;;GAMG;AACH,eAAO,MAAM,8BAA8B,GAC1C,QAAQ,iCAAiC,EACzC,UAAS;IAAC,MAAM,CAAC,EAAE,OAAO,CAAA;CAAM,KAC9B,MAyCF,CAAC"}

package/dist/benchmark_baseline.js CHANGED Viewed

@@ -217,7 +217,9 @@ export const benchmark_baseline_compare = async (results, options = {}) => {
             sample_size: current.sample_size,
             confidence_interval_ns: stats_confidence_interval_from_summary(current.mean_ns, current.std_dev_ns, current.sample_size),
         };
-        const comparison = benchmark_stats_compare(baseline_stats, current_stats);
+        const comparison = benchmark_stats_compare(baseline_stats, current_stats, {
+            min_percent_difference: options.min_percent_difference,
+        });
         const task_comparison = {
             name: current.name,
             baseline: baseline_entry,
@@ -227,7 +229,8 @@ export const benchmark_baseline_compare = async (results, options = {}) => {
         comparisons.push(task_comparison);
         // Categorize based on comparison result
         // Note: comparison.faster is 'a' (baseline) or 'b' (current)
-        if (comparison.significant && comparison.effect_magnitude !== 'negligible') {
+        // significant implies percent_difference >= min_pct, which implies effect_magnitude !== 'negligible'
+        if (comparison.significant) {
             if (comparison.faster === 'a') {
                 // Baseline was faster = potential regression
                 // Only count as regression if it exceeds the threshold
@@ -256,10 +259,10 @@ export const benchmark_baseline_compare = async (results, options = {}) => {
             removed_tasks.push(baseline_entry.name);
         }
     }
-    // Sort regressions and improvements by effect size (largest first)
-    const sort_by_effect_size = (a, b) => b.comparison.effect_size - a.comparison.effect_size;
-    regressions.sort(sort_by_effect_size);
-    improvements.sort(sort_by_effect_size);
+    // Sort regressions and improvements by percentage difference (largest first)
+    const sort_by_percent_difference = (a, b) => b.comparison.percent_difference - a.comparison.percent_difference;
+    regressions.sort(sort_by_percent_difference);
+    improvements.sort(sort_by_percent_difference);
     return {
         baseline_found: true,
         baseline_timestamp: baseline.timestamp,
@@ -302,8 +305,9 @@ export const benchmark_baseline_format = (result) => {
         lines.push(`Regressions (${result.regressions.length}):`);
         for (const r of result.regressions) {
             const ratio = r.comparison.speedup_ratio.toFixed(2);
+            const pct = (r.comparison.percent_difference * 100).toFixed(1);
             const p = r.comparison.p_value.toFixed(3);
-            lines.push(`  ${r.name}: ${ratio}x slower (p=${p}, ${r.comparison.effect_magnitude})`);
+            lines.push(`  ${r.name}: ${ratio}x slower (${pct}%, p=${p}, ${r.comparison.effect_magnitude})`);
         }
         lines.push('');
     }
@@ -311,8 +315,9 @@ export const benchmark_baseline_format = (result) => {
         lines.push(`Improvements (${result.improvements.length}):`);
         for (const r of result.improvements) {
             const ratio = r.comparison.speedup_ratio.toFixed(2);
+            const pct = (r.comparison.percent_difference * 100).toFixed(1);
             const p = r.comparison.p_value.toFixed(3);
-            lines.push(`  ${r.name}: ${ratio}x faster (p=${p}, ${r.comparison.effect_magnitude})`);
+            lines.push(`  ${r.name}: ${ratio}x faster (${pct}%, p=${p}, ${r.comparison.effect_magnitude})`);
         }
         lines.push('');
     }
@@ -367,6 +372,7 @@ export const benchmark_baseline_format_json = (result, options = {}) => {
         regressions: result.regressions.map((r) => ({
             name: r.name,
             speedup_ratio: r.comparison.speedup_ratio,
+            percent_difference: r.comparison.percent_difference,
             effect_size: r.comparison.effect_size,
             effect_magnitude: r.comparison.effect_magnitude,
             p_value: r.comparison.p_value,
@@ -376,6 +382,7 @@ export const benchmark_baseline_format_json = (result, options = {}) => {
         improvements: result.improvements.map((r) => ({
             name: r.name,
             speedup_ratio: r.comparison.speedup_ratio,
+            percent_difference: r.comparison.percent_difference,
             effect_size: r.comparison.effect_size,
             effect_magnitude: r.comparison.effect_magnitude,
             p_value: r.comparison.p_value,

package/dist/benchmark_stats.d.ts CHANGED Viewed

@@ -27,13 +27,15 @@ export interface BenchmarkComparison {
     faster: 'a' | 'b' | 'equal';
     /** How much faster the winner is (e.g., 1.5 means 1.5x faster) */
     speedup_ratio: number;
-    /** Whether the difference is statistically significant at the given alpha */
+    /** Whether the difference is both statistically and practically significant */
     significant: boolean;
     /** P-value from Welch's t-test (lower = more confident the difference is real) */
     p_value: number;
-    /** Cohen's d effect size (magnitude of difference independent of sample size) */
+    /** Percentage difference between means as a ratio (0.05 = 5%, 1.0 = 100%) */
+    percent_difference: number;
+    /** Cohen's d effect size (informational — not used for classification) */
     effect_size: number;
-    /** Interpretation of effect size */
+    /** Interpretation of practical significance based on percentage difference */
     effect_magnitude: EffectMagnitude;
     /** Whether the 95% confidence intervals overlap */
     ci_overlap: boolean;
@@ -46,6 +48,20 @@ export interface BenchmarkComparison {
 export interface BenchmarkCompareOptions {
     /** Significance level for hypothesis testing (default: 0.05) */
     alpha?: number;
+    /**
+     * Minimum percentage difference to consider practically meaningful, as a ratio.
+     * Below this threshold, differences are classified as 'negligible' and
+     * `significant` is forced to `false`, regardless of p-value.
+     * This prevents the t-test's oversensitivity at large sample sizes from
+     * flagging system-level noise (thermal throttle, OS scheduler, cache pressure)
+     * as meaningful differences.
+     *
+     * Effect magnitude thresholds scale from this value:
+     * negligible < min, small < min*3, medium < min*5, large >= min*5.
+     *
+     * Default: 0.10 (10%).
+     */
+    min_percent_difference?: number;
 }
 /**
  * Complete statistical analysis of timing measurements.
@@ -94,8 +110,12 @@ export declare class BenchmarkStats {
     toString(): string;
 }
 /**
- * Compare two benchmark results for statistical significance.
- * Uses Welch's t-test (handles unequal variances) and Cohen's d effect size.
+ * Compare two benchmark results for practical and statistical significance.
+ * Uses percentage difference for effect magnitude classification, with Welch's
+ * t-test for statistical confidence. Cohen's d is computed as an informational
+ * metric but does not drive classification — its thresholds (0.2/0.5/0.8) are
+ * calibrated for social science and produce false positives in benchmarking
+ * where within-run variance is tight.
  *
  * @param a - First benchmark stats (or any object with required properties)
  * @param b - Second benchmark stats (or any object with required properties)

package/dist/benchmark_stats.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"benchmark_stats.d.ts","sourceRoot":"../src/lib/","sources":["../src/lib/benchmark_stats.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAgBH;;;GAGG;AACH,MAAM,WAAW,wBAAwB;IACxC,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW,EAAE,MAAM,CAAC;IACpB,sBAAsB,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACzC;AAED;;GAEG;AACH,MAAM,MAAM,eAAe,GAAG,YAAY,GAAG,OAAO,GAAG,QAAQ,GAAG,OAAO,CAAC;AAE1E;;GAEG;AACH,MAAM,WAAW,mBAAmB;IACnC,mFAAmF;IACnF,MAAM,EAAE,GAAG,GAAG,GAAG,GAAG,OAAO,CAAC;IAC5B,kEAAkE;IAClE,aAAa,EAAE,MAAM,CAAC;IACtB~~,6EAA6E~~;~~IAC7E~~,WAAW,EAAE,OAAO,CAAC;IACrB,kFAAkF;IAClF,OAAO,EAAE,MAAM,CAAC;IAChB,~~iFAAiF~~;~~IACjF~~,WAAW,EAAE,MAAM,CAAC;IACpB,~~oCAAoC~~;~~IACpC~~,gBAAgB,EAAE,eAAe,CAAC;IAClC,mDAAmD;IACnD,UAAU,EAAE,OAAO,CAAC;IACpB,sDAAsD;IACtD,cAAc,EAAE,MAAM,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,uBAAuB;IACvC,gEAAgE;IAChE,KAAK,CAAC,EAAE,MAAM,CAAC;~~CACf~~;AAED;;;;GAIG;AACH,qBAAa,cAAc;IAC1B,yCAAyC;IACzC,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,mDAAmD;IACnD,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,wCAAwC;IACxC,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,kCAAkC;IAClC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,kCAAkC;IAClC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,gDAAgD;IAChD,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,0DAA0D;IAC1D,QAAQ,CAAC,sBAAsB,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAClD,sDAAsD;IACtD,QAAQ,CAAC,WAAW,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACpC,yCAAyC;IACzC,QAAQ,CAAC,aAAa,EAAE,MAAM,CAAC;IAC/B,8CAA8C;IAC9C,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,0DAA0D;IAC1D,QAAQ,CAAC,eAAe,EAAE,MAAM,CAAC;IACjC,mDAAmD;IACnD,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,sEAAsE;IACtE,QAAQ,CAAC,iBAAiB,EAAE,MAAM,CAAC;gBAEvB,UAAU,EAAE,KAAK,CAAC,MAAM,CAAC;IAiErC;;OAEG;IACH,QAAQ,IAAI,MAAM;CAGlB;AAED~~;;;;;;;;;;;;;;;;GAgBG~~;AACH,eAAO,MAAM,uBAAuB,GACnC,GAAG,wBAAwB,EAC3B,GAAG,wBAAwB,EAC3B,UAAU,uBAAuB,KAC/B,~~mBA6GF~~,CAAC"}
1	+ {"version":3,"file":"benchmark_stats.d.ts","sourceRoot":"../src/lib/","sources":["../src/lib/benchmark_stats.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAgBH;;;GAGG;AACH,MAAM,WAAW,wBAAwB;IACxC,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW,EAAE,MAAM,CAAC;IACpB,sBAAsB,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACzC;AAED;;GAEG;AACH,MAAM,MAAM,eAAe,GAAG,YAAY,GAAG,OAAO,GAAG,QAAQ,GAAG,OAAO,CAAC;AAE1E;;GAEG;AACH,MAAM,WAAW,mBAAmB;IACnC,mFAAmF;IACnF,MAAM,EAAE,GAAG,GAAG,GAAG,GAAG,OAAO,CAAC;IAC5B,kEAAkE;IAClE,aAAa,EAAE,MAAM,CAAC;IACtB,+EAA+E;IAC/E,WAAW,EAAE,OAAO,CAAC;IACrB,kFAAkF;IAClF,OAAO,EAAE,MAAM,CAAC;IAChB,6EAA6E;IAC7E,kBAAkB,EAAE,MAAM,CAAC;IAC3B,0EAA0E;IAC1E,WAAW,EAAE,MAAM,CAAC;IACpB,8EAA8E;IAC9E,gBAAgB,EAAE,eAAe,CAAC;IAClC,mDAAmD;IACnD,UAAU,EAAE,OAAO,CAAC;IACpB,sDAAsD;IACtD,cAAc,EAAE,MAAM,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,uBAAuB;IACvC,gEAAgE;IAChE,KAAK,CAAC,EAAE,MAAM,CAAC;IACf;;;;;;;;;;;;OAYG;IACH,sBAAsB,CAAC,EAAE,MAAM,CAAC;CAChC;AAED;;;;GAIG;AACH,qBAAa,cAAc;IAC1B,yCAAyC;IACzC,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,mDAAmD;IACnD,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,wCAAwC;IACxC,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,kCAAkC;IAClC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,kCAAkC;IAClC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,qCAAqC;IACrC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,gDAAgD;IAChD,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,0DAA0D;IAC1D,QAAQ,CAAC,sBAAsB,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAClD,sDAAsD;IACtD,QAAQ,CAAC,WAAW,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IACpC,yCAAyC;IACzC,QAAQ,CAAC,aAAa,EAAE,MAAM,CAAC;IAC/B,8CAA8C;IAC9C,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,0DAA0D;IAC1D,QAAQ,CAAC,eAAe,EAAE,MAAM,CAAC;IACjC,mDAAmD;IACnD,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,sEAAsE;IACtE,QAAQ,CAAC,iBAAiB,EAAE,MAAM,CAAC;gBAEvB,UAAU,EAAE,KAAK,CAAC,MAAM,CAAC;IAiErC;;OAEG;IACH,QAAQ,IAAI,MAAM;CAGlB;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,eAAO,MAAM,uBAAuB,GACnC,GAAG,wBAAwB,EAC
3B,GAAG,wBAAwB,EAC3B,UAAU,uBAAuB,KAC/B,mBA+GF,CAAC"}

package/dist/benchmark_stats.js CHANGED Viewed

@@ -110,8 +110,12 @@ export class BenchmarkStats {
     }
 }
 /**
- * Compare two benchmark results for statistical significance.
- * Uses Welch's t-test (handles unequal variances) and Cohen's d effect size.
+ * Compare two benchmark results for practical and statistical significance.
+ * Uses percentage difference for effect magnitude classification, with Welch's
+ * t-test for statistical confidence. Cohen's d is computed as an informational
+ * metric but does not drive classification — its thresholds (0.2/0.5/0.8) are
+ * calibrated for social science and produce false positives in benchmarking
+ * where within-run variance is tight.
  *
  * @param a - First benchmark stats (or any object with required properties)
  * @param b - Second benchmark stats (or any object with required properties)
@@ -128,6 +132,7 @@ export class BenchmarkStats {
  */
 export const benchmark_stats_compare = (a, b, options) => {
     const alpha = options?.alpha ?? 0.05;
+    const min_pct = options?.min_percent_difference ?? 0.1;
     // Handle edge cases
     if (a.sample_size === 0 || b.sample_size === 0) {
         return {
@@ -135,6 +140,7 @@ export const benchmark_stats_compare = (a, b, options) => {
             speedup_ratio: 1,
             significant: false,
             p_value: 1,
+            percent_difference: 0,
             effect_size: 0,
             effect_magnitude: 'negligible',
             ci_overlap: true,
@@ -144,6 +150,8 @@ export const benchmark_stats_compare = (a, b, options) => {
     // Calculate speedup ratio (lower time = faster, so compare by time not ops/sec)
     const speedup_ratio = a.mean_ns < b.mean_ns ? b.mean_ns / a.mean_ns : a.mean_ns / b.mean_ns;
     const faster = a.mean_ns < b.mean_ns ? 'a' : a.mean_ns > b.mean_ns ? 'b' : 'equal';
+    // Percentage difference relative to the faster mean (always >= 0)
+    const percent_difference = speedup_ratio - 1;
     // Welch's t-test (handles unequal variances)
     // Special case: if both have zero variance, t-test is undefined
     let p_value;
@@ -156,55 +164,54 @@ export const benchmark_stats_compare = (a, b, options) => {
         // Calculate two-tailed p-value using t-distribution approximation
         p_value = stats_t_distribution_p_value(Math.abs(t_statistic), degrees_of_freedom);
     }
-    // Cohen's d effect size
+    // Cohen's d effect size (informational only — not used for classification)
     const pooled_std_dev = Math.sqrt(((a.sample_size - 1) * a.std_dev_ns ** 2 + (b.sample_size - 1) * b.std_dev_ns ** 2) /
         (a.sample_size + b.sample_size - 2));
-    // When pooled_std_dev is 0 but means differ, effect is maximal (infinite)
-    // When means are equal, effect is 0
     let effect_size;
-    let effect_magnitude;
     if (pooled_std_dev === 0) {
-        // Zero variance case - if means differ, it's a definitive difference
-        if (a.mean_ns === b.mean_ns) {
-            effect_size = 0;
-            effect_magnitude = 'negligible';
-        }
-        else {
-            // Any difference is 100% reliable when there's no variance
-            effect_size = Infinity;
-            effect_magnitude = 'large';
-        }
+        effect_size = a.mean_ns === b.mean_ns ? 0 : Infinity;
     }
     else {
         effect_size = Math.abs(a.mean_ns - b.mean_ns) / pooled_std_dev;
-        // Interpret effect size (Cohen's conventions)
-        effect_magnitude =
-            effect_size < 0.2
-                ? 'negligible'
-                : effect_size < 0.5
-                    ? 'small'
-                    : effect_size < 0.8
-                        ? 'medium'
-                        : 'large';
+    }
+    // Effect magnitude based on percentage difference, not Cohen's d.
+    // Cohen's d thresholds (0.2/0.5/0.8) are calibrated for social science, not benchmarking.
+    // Within-run variance is tight, so even small system noise (thermal throttle, OS scheduler)
+    // produces large Cohen's d. Percentage thresholds directly answer "is this difference
+    // meaningful in practice?" Thresholds scale with min_percent_difference so users can
+    // tune one knob for their system's noise floor.
+    let effect_magnitude;
+    if (percent_difference < min_pct) {
+        effect_magnitude = 'negligible';
+    }
+    else if (percent_difference < min_pct * 3) {
+        effect_magnitude = 'small';
+    }
+    else if (percent_difference < min_pct * 5) {
+        effect_magnitude = 'medium';
+    }
+    else {
+        effect_magnitude = 'large';
     }
     // Check confidence interval overlap
     const ci_overlap = a.confidence_interval_ns[0] <= b.confidence_interval_ns[1] &&
         b.confidence_interval_ns[0] <= a.confidence_interval_ns[1];
-    // Determine significance
-    const significant = p_value < alpha;
+    // Significance requires both statistical significance (p < alpha)
+    // AND practical significance (percent_difference >= min_pct).
+    // With large n, the t-test finds p≈0 for any difference because
+    // SE = std_dev/sqrt(n) → 0. Gating on practical significance
+    // prevents system noise from being flagged as meaningful.
+    const significant = p_value < alpha && percent_difference >= min_pct;
     // Generate recommendation
     let recommendation;
-    if (!significant) {
-        recommendation =
-            effect_magnitude === 'negligible'
-                ? 'No meaningful difference detected'
-                : `Difference not statistically significant (p=${p_value.toFixed(3)}), but effect size suggests ${effect_magnitude} practical difference`;
+    if (percent_difference < min_pct) {
+        recommendation = 'No meaningful difference detected';
     }
-    else if (effect_magnitude === 'negligible') {
-        recommendation = `Statistically significant but negligible practical difference (${speedup_ratio.toFixed(2)}x)`;
+    else if (!significant) {
+        recommendation = `${(percent_difference * 100).toFixed(1)}% difference observed but not statistically significant (p=${p_value.toFixed(3)})`;
     }
     else {
-        recommendation = `${faster === 'a' ? 'First' : 'Second'} is ${speedup_ratio.toFixed(2)}x faster with ${effect_magnitude} effect size (p=${p_value.toFixed(3)})`;
+        recommendation = `${faster === 'a' ? 'First' : 'Second'} is ${speedup_ratio.toFixed(2)}x faster with ${effect_magnitude} effect size (${(percent_difference * 100).toFixed(1)}%, p=${p_value.toFixed(3)})`;
     }
     // Adjust 'faster' to 'equal' if effect is negligible
     const adjusted_faster = effect_magnitude === 'negligible' ? 'equal' : faster;
@@ -213,6 +220,7 @@ export const benchmark_stats_compare = (a, b, options) => {
         speedup_ratio,
         significant,
         p_value,
+        percent_difference,
         effect_size,
         effect_magnitude,
         ci_overlap,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@fuzdev/fuz_util",
-  "version": "0.54.0",
+  "version": "0.55.0",
   "description": "utility belt for JS",
   "glyph": "🦕",
   "logo": "logo.svg",
@@ -77,8 +77,8 @@
     "@changesets/changelog-git": "^0.2.1",
     "@fuzdev/blake3_wasm": "^0.1.0",
     "@fuzdev/fuz_code": "^0.45.1",
-    "@fuzdev/fuz_css": "^0.55.0",
-    "@fuzdev/fuz_ui": "^0.188.0",
+    "@fuzdev/fuz_css": "^0.56.0",
+    "@fuzdev/fuz_ui": "^0.190.0",
     "@fuzdev/gro": "^0.197.0",
     "@jridgewell/trace-mapping": "^0.3.31",
     "@ryanatkn/eslint-config": "^0.10.1",

package/src/lib/benchmark_baseline.ts CHANGED Viewed

@@ -89,6 +89,12 @@ export interface BenchmarkBaselineCompareOptions extends BenchmarkBaselineLoadOp
 	 * Default: undefined (no staleness warning)
 	 */
 	staleness_warning_days?: number;
+	/**
+	 * Minimum percentage difference to consider meaningful, as a ratio.
+	 * Passed through to `benchmark_stats_compare`. See `BenchmarkCompareOptions`.
+	 * Default: 0.10 (10%)
+	 */
+	min_percent_difference?: number;
 }
 /**
@@ -344,7 +350,9 @@ export const benchmark_baseline_compare = async (
 			),
 		};
-		const comparison = benchmark_stats_compare(baseline_stats, current_stats);
+		const comparison = benchmark_stats_compare(baseline_stats, current_stats, {
+			min_percent_difference: options.min_percent_difference,
+		});
 		const task_comparison: BenchmarkBaselineTaskComparison = {
 			name: current.name,
@@ -357,7 +365,8 @@ export const benchmark_baseline_compare = async (
 		// Categorize based on comparison result
 		// Note: comparison.faster is 'a' (baseline) or 'b' (current)
-		if (comparison.significant && comparison.effect_magnitude !== 'negligible') {
+		// significant implies percent_difference >= min_pct, which implies effect_magnitude !== 'negligible'
+		if (comparison.significant) {
 			if (comparison.faster === 'a') {
 				// Baseline was faster = potential regression
 				// Only count as regression if it exceeds the threshold
@@ -384,14 +393,14 @@ export const benchmark_baseline_compare = async (
 		}
 	}
-	// Sort regressions and improvements by effect size (largest first)
-	const sort_by_effect_size = (
+	// Sort regressions and improvements by percentage difference (largest first)
+	const sort_by_percent_difference = (
 		a: BenchmarkBaselineTaskComparison,
 		b: BenchmarkBaselineTaskComparison,
-	) => b.comparison.effect_size - a.comparison.effect_size;
+	) => b.comparison.percent_difference - a.comparison.percent_difference;
-	regressions.sort(sort_by_effect_size);
-	improvements.sort(sort_by_effect_size);
+	regressions.sort(sort_by_percent_difference);
+	improvements.sort(sort_by_percent_difference);
 	return {
 		baseline_found: true,
@@ -440,8 +449,11 @@ export const benchmark_baseline_format = (result: BenchmarkBaselineComparisonRes
 		lines.push(`Regressions (${result.regressions.length}):`);
 		for (const r of result.regressions) {
 			const ratio = r.comparison.speedup_ratio.toFixed(2);
+			const pct = (r.comparison.percent_difference * 100).toFixed(1);
 			const p = r.comparison.p_value.toFixed(3);
-			lines.push(`  ${r.name}: ${ratio}x slower (p=${p}, ${r.comparison.effect_magnitude})`);
+			lines.push(
+				`  ${r.name}: ${ratio}x slower (${pct}%, p=${p}, ${r.comparison.effect_magnitude})`,
+			);
 		}
 		lines.push('');
 	}
@@ -450,8 +462,11 @@ export const benchmark_baseline_format = (result: BenchmarkBaselineComparisonRes
 		lines.push(`Improvements (${result.improvements.length}):`);
 		for (const r of result.improvements) {
 			const ratio = r.comparison.speedup_ratio.toFixed(2);
+			const pct = (r.comparison.percent_difference * 100).toFixed(1);
 			const p = r.comparison.p_value.toFixed(3);
-			lines.push(`  ${r.name}: ${ratio}x faster (p=${p}, ${r.comparison.effect_magnitude})`);
+			lines.push(
+				`  ${r.name}: ${ratio}x faster (${pct}%, p=${p}, ${r.comparison.effect_magnitude})`,
+			);
 		}
 		lines.push('');
 	}
@@ -516,6 +531,7 @@ export const benchmark_baseline_format_json = (
 		regressions: result.regressions.map((r) => ({
 			name: r.name,
 			speedup_ratio: r.comparison.speedup_ratio,
+			percent_difference: r.comparison.percent_difference,
 			effect_size: r.comparison.effect_size,
 			effect_magnitude: r.comparison.effect_magnitude,
 			p_value: r.comparison.p_value,
@@ -525,6 +541,7 @@ export const benchmark_baseline_format_json = (
 		improvements: result.improvements.map((r) => ({
 			name: r.name,
 			speedup_ratio: r.comparison.speedup_ratio,
+			percent_difference: r.comparison.percent_difference,
 			effect_size: r.comparison.effect_size,
 			effect_magnitude: r.comparison.effect_magnitude,
 			p_value: r.comparison.p_value,

package/src/lib/benchmark_stats.ts CHANGED Viewed

@@ -44,13 +44,15 @@ export interface BenchmarkComparison {
 	faster: 'a' | 'b' | 'equal';
 	/** How much faster the winner is (e.g., 1.5 means 1.5x faster) */
 	speedup_ratio: number;
-	/** Whether the difference is statistically significant at the given alpha */
+	/** Whether the difference is both statistically and practically significant */
 	significant: boolean;
 	/** P-value from Welch's t-test (lower = more confident the difference is real) */
 	p_value: number;
-	/** Cohen's d effect size (magnitude of difference independent of sample size) */
+	/** Percentage difference between means as a ratio (0.05 = 5%, 1.0 = 100%) */
+	percent_difference: number;
+	/** Cohen's d effect size (informational — not used for classification) */
 	effect_size: number;
-	/** Interpretation of effect size */
+	/** Interpretation of practical significance based on percentage difference */
 	effect_magnitude: EffectMagnitude;
 	/** Whether the 95% confidence intervals overlap */
 	ci_overlap: boolean;
@@ -64,6 +66,20 @@ export interface BenchmarkComparison {
 export interface BenchmarkCompareOptions {
 	/** Significance level for hypothesis testing (default: 0.05) */
 	alpha?: number;
+	/**
+	 * Minimum percentage difference to consider practically meaningful, as a ratio.
+	 * Below this threshold, differences are classified as 'negligible' and
+	 * `significant` is forced to `false`, regardless of p-value.
+	 * This prevents the t-test's oversensitivity at large sample sizes from
+	 * flagging system-level noise (thermal throttle, OS scheduler, cache pressure)
+	 * as meaningful differences.
+	 *
+	 * Effect magnitude thresholds scale from this value:
+	 * negligible < min, small < min*3, medium < min*5, large >= min*5.
+	 *
+	 * Default: 0.10 (10%).
+	 */
+	min_percent_difference?: number;
 }
 /**
@@ -181,8 +197,12 @@ export class BenchmarkStats {
 }
 /**
- * Compare two benchmark results for statistical significance.
- * Uses Welch's t-test (handles unequal variances) and Cohen's d effect size.
+ * Compare two benchmark results for practical and statistical significance.
+ * Uses percentage difference for effect magnitude classification, with Welch's
+ * t-test for statistical confidence. Cohen's d is computed as an informational
+ * metric but does not drive classification — its thresholds (0.2/0.5/0.8) are
+ * calibrated for social science and produce false positives in benchmarking
+ * where within-run variance is tight.
  *
  * @param a - First benchmark stats (or any object with required properties)
  * @param b - Second benchmark stats (or any object with required properties)
@@ -203,6 +223,7 @@ export const benchmark_stats_compare = (
 	options?: BenchmarkCompareOptions,
 ): BenchmarkComparison => {
 	const alpha = options?.alpha ?? 0.05;
+	const min_pct = options?.min_percent_difference ?? 0.1;
 	// Handle edge cases
 	if (a.sample_size === 0 || b.sample_size === 0) {
@@ -211,6 +232,7 @@ export const benchmark_stats_compare = (
 			speedup_ratio: 1,
 			significant: false,
 			p_value: 1,
+			percent_difference: 0,
 			effect_size: 0,
 			effect_magnitude: 'negligible',
 			ci_overlap: true,
@@ -223,6 +245,9 @@ export const benchmark_stats_compare = (
 	const faster: 'a' | 'b' | 'equal' =
 		a.mean_ns < b.mean_ns ? 'a' : a.mean_ns > b.mean_ns ? 'b' : 'equal';
+	// Percentage difference relative to the faster mean (always >= 0)
+	const percent_difference = speedup_ratio - 1;
 	// Welch's t-test (handles unequal variances)
 	// Special case: if both have zero variance, t-test is undefined
 	let p_value: number;
@@ -242,38 +267,33 @@ export const benchmark_stats_compare = (
 		p_value = stats_t_distribution_p_value(Math.abs(t_statistic), degrees_of_freedom);
 	}
-	// Cohen's d effect size
+	// Cohen's d effect size (informational only — not used for classification)
 	const pooled_std_dev = Math.sqrt(
 		((a.sample_size - 1) * a.std_dev_ns ** 2 + (b.sample_size - 1) * b.std_dev_ns ** 2) /
 			(a.sample_size + b.sample_size - 2),
 	);
-	// When pooled_std_dev is 0 but means differ, effect is maximal (infinite)
-	// When means are equal, effect is 0
 	let effect_size: number;
-	let effect_magnitude: EffectMagnitude;
 	if (pooled_std_dev === 0) {
-		// Zero variance case - if means differ, it's a definitive difference
-		if (a.mean_ns === b.mean_ns) {
-			effect_size = 0;
-			effect_magnitude = 'negligible';
-		} else {
-			// Any difference is 100% reliable when there's no variance
-			effect_size = Infinity;
-			effect_magnitude = 'large';
-		}
+		effect_size = a.mean_ns === b.mean_ns ? 0 : Infinity;
 	} else {
 		effect_size = Math.abs(a.mean_ns - b.mean_ns) / pooled_std_dev;
-		// Interpret effect size (Cohen's conventions)
-		effect_magnitude =
-			effect_size < 0.2
-				? 'negligible'
-				: effect_size < 0.5
-					? 'small'
-					: effect_size < 0.8
-						? 'medium'
-						: 'large';
+	}
+	// Effect magnitude based on percentage difference, not Cohen's d.
+	// Cohen's d thresholds (0.2/0.5/0.8) are calibrated for social science, not benchmarking.
+	// Within-run variance is tight, so even small system noise (thermal throttle, OS scheduler)
+	// produces large Cohen's d. Percentage thresholds directly answer "is this difference
+	// meaningful in practice?" Thresholds scale with min_percent_difference so users can
+	// tune one knob for their system's noise floor.
+	let effect_magnitude: EffectMagnitude;
+	if (percent_difference < min_pct) {
+		effect_magnitude = 'negligible';
+	} else if (percent_difference < min_pct * 3) {
+		effect_magnitude = 'small';
+	} else if (percent_difference < min_pct * 5) {
+		effect_magnitude = 'medium';
+	} else {
+		effect_magnitude = 'large';
 	}
 	// Check confidence interval overlap
@@ -281,20 +301,21 @@ export const benchmark_stats_compare = (
 		a.confidence_interval_ns[0] <= b.confidence_interval_ns[1] &&
 		b.confidence_interval_ns[0] <= a.confidence_interval_ns[1];
-	// Determine significance
-	const significant = p_value < alpha;
+	// Significance requires both statistical significance (p < alpha)
+	// AND practical significance (percent_difference >= min_pct).
+	// With large n, the t-test finds p≈0 for any difference because
+	// SE = std_dev/sqrt(n) → 0. Gating on practical significance
+	// prevents system noise from being flagged as meaningful.
+	const significant = p_value < alpha && percent_difference >= min_pct;
 	// Generate recommendation
 	let recommendation: string;
-	if (!significant) {
-		recommendation =
-			effect_magnitude === 'negligible'
-				? 'No meaningful difference detected'
-				: `Difference not statistically significant (p=${p_value.toFixed(3)}), but effect size suggests ${effect_magnitude} practical difference`;
-	} else if (effect_magnitude === 'negligible') {
-		recommendation = `Statistically significant but negligible practical difference (${speedup_ratio.toFixed(2)}x)`;
+	if (percent_difference < min_pct) {
+		recommendation = 'No meaningful difference detected';
+	} else if (!significant) {
+		recommendation = `${(percent_difference * 100).toFixed(1)}% difference observed but not statistically significant (p=${p_value.toFixed(3)})`;
 	} else {
-		recommendation = `${faster === 'a' ? 'First' : 'Second'} is ${speedup_ratio.toFixed(2)}x faster with ${effect_magnitude} effect size (p=${p_value.toFixed(3)})`;
+		recommendation = `${faster === 'a' ? 'First' : 'Second'} is ${speedup_ratio.toFixed(2)}x faster with ${effect_magnitude} effect size (${(percent_difference * 100).toFixed(1)}%, p=${p_value.toFixed(3)})`;
 	}
 	// Adjust 'faster' to 'equal' if effect is negligible
@@ -305,6 +326,7 @@ export const benchmark_stats_compare = (
 		speedup_ratio,
 		significant,
 		p_value,
+		percent_difference,
 		effect_size,
 		effect_magnitude,
 		ci_overlap,