PyPI - diffcb - Versions diffs - 0.1.0__tar.gz - Mend

diffcb 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

diffcb-0.1.0/.gitignore +11 -0
diffcb-0.1.0/.zenodo.json +30 -0
diffcb-0.1.0/LICENSE +21 -0
diffcb-0.1.0/PKG-INFO +148 -0
diffcb-0.1.0/README.md +91 -0
diffcb-0.1.0/dcb/__init__.py +22 -0
diffcb-0.1.0/dcb/diagnostics.py +163 -0
diffcb-0.1.0/dcb/fft_kde.py +128 -0
diffcb-0.1.0/dcb/kde.py +394 -0
diffcb-0.1.0/dcb/layer.py +231 -0
diffcb-0.1.0/dcb/solver.py +604 -0
diffcb-0.1.0/dcb/utils.py +183 -0
diffcb-0.1.0/notebooks/.gitkeep +0 -0
diffcb-0.1.0/pyproject.toml +63 -0
diffcb-0.1.0/tests/test_kde.py +312 -0
diffcb-0.1.0/tests/test_layer.py +165 -0
diffcb-0.1.0/tests/test_r18c_denom_audit.py +118 -0
diffcb-0.1.0/tests/test_r18c_deprecation_warn.py +64 -0
diffcb-0.1.0/tests/test_r19_default_fft.py +52 -0
diffcb-0.1.0/tests/test_r19_diagnostics.py +80 -0
diffcb-0.1.0/tests/test_solver.py +179 -0

diffcb-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,11 @@
+__pycache__/
+*.py[cod]
+*.egg-info/
+dist/
+build/
+.venv/
+venv/
+.ipynb_checkpoints/
+.DS_Store
+outputs/
+.claude/

diffcb-0.1.0/.zenodo.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+  "title": "Differentiable Critical Bandwidth (DCB) v0.1.0",
+  "description": "A PyTorch package making Silverman's critical bandwidth test fully differentiable via a smooth mode-counting integral and an Implicit Function Theorem backward pass.",
+  "upload_type": "software",
+  "license": "MIT",
+  "creators": [
+    {
+      "name": "Zhang, Ruiyu",
+      "affiliation": "University of Hong Kong"
+    }
+  ],
+  "keywords": [
+    "nonparametric statistics",
+    "kernel density estimation",
+    "differentiable programming",
+    "critical bandwidth",
+    "mode counting",
+    "implicit function theorem",
+    "PyTorch",
+    "JMLR",
+    "Silverman 1981"
+  ],
+  "related_identifiers": [
+    {
+      "relation": "isSupplementTo",
+      "identifier": "10.48550/arXiv.XXXX.XXXXX",
+      "scheme": "doi"
+    }
+  ]
+}

diffcb-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Ruiyu Zhang
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

diffcb-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,148 @@
+Metadata-Version: 2.4
+Name: diffcb
+Version: 0.1.0
+Summary: Differentiable Critical Bandwidth: Silverman's modality test as a differentiable PyTorch layer with IFT backward pass.
+Project-URL: Homepage, https://github.com/ryZhangHason/differentiable-critical-bandwidth
+Project-URL: Repository, https://github.com/ryZhangHason/differentiable-critical-bandwidth
+Project-URL: Documentation, https://github.com/ryZhangHason/differentiable-critical-bandwidth#readme
+Project-URL: Bug Tracker, https://github.com/ryZhangHason/differentiable-critical-bandwidth/issues
+Author-email: Ruiyu Zhang <dhhhason@gmail.com>
+License: MIT License
+        Copyright (c) 2026 Ruiyu Zhang
+        Permission is hereby granted, free of charge, to any person obtaining a copy
+        of this software and associated documentation files (the "Software"), to deal
+        in the Software without restriction, including without limitation the rights
+        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+        copies of the Software, and to permit persons to whom the Software is
+        furnished to do so, subject to the following conditions:
+        The above copyright notice and this permission notice shall be included in all
+        copies or substantial portions of the Software.
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+        SOFTWARE.
+License-File: LICENSE
+Keywords: PyTorch,anomaly detection,critical bandwidth,differentiable programming,generative models,kernel density estimation,mode counting,nonparametric statistics
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Scientific/Engineering :: Mathematics
+Requires-Python: >=3.9
+Requires-Dist: matplotlib>=3.7.0
+Requires-Dist: numpy>=1.24.0
+Requires-Dist: scikit-learn>=1.3.0
+Requires-Dist: scipy>=1.10.0
+Requires-Dist: torch>=2.0.0
+Provides-Extra: dev
+Requires-Dist: black>=23.0.0; extra == 'dev'
+Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
+Requires-Dist: pytest>=7.4.0; extra == 'dev'
+Requires-Dist: ruff>=0.1.0; extra == 'dev'
+Provides-Extra: notebooks
+Requires-Dist: ipywidgets>=8.0.0; extra == 'notebooks'
+Requires-Dist: jupyter>=1.0.0; extra == 'notebooks'
+Description-Content-Type: text/markdown
+# DCB — Differentiable Critical Bandwidth
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
+[![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/)
+A PyTorch package that makes **Silverman's critical bandwidth test (1981)** fully differentiable, enabling end-to-end gradient-based optimization over the modal structure of continuous distributions.
+## Overview
+The critical bandwidth `h_crit` is the minimum KDE bandwidth at which a distribution appears to have at most `m` modes — a classical nonparametric statistic for modality testing. DCB replaces every non-differentiable operation in its computation with a smooth surrogate, then uses the **Implicit Function Theorem** to compute exact gradients through the root-finding step at O(1) memory cost.
+```python
+import torch
+from dcb import DCBLayer
+X = torch.randn(256, requires_grad=True)   # 1D samples
+layer = DCBLayer(target_modes=1)
+h_crit = layer(X)                          # differentiable scalar
+h_crit.backward()                          # exact IFT gradients
+```
+## Installation
+```bash
+pip install diffcb
+```
+Or from source:
+```bash
+git clone https://github.com/ryZhangHason/differentiable-critical-bandwidth
+cd differentiable-critical-bandwidth
+pip install -e ".[dev]"
+```
+## Paper
+> Ruiyu Zhang. "Differentiable Critical Bandwidth: Making Silverman's Modality Test End-to-End Trainable." *Journal of Machine Learning Research*, 2026 (in preparation).
+## Confirmed Experimental Results
+All results produced on Kaggle GPU (T4 / P100) — see `experiments/` and `outputs/`.
+| Experiment | Result | Criterion |
+|---|---|---|
+| **Validation (m≥2)** | R²=0.91, MAE=0.07, Spearman ρ=0.89 | R²≥0.85, MAE≤0.10 ✓ |
+| **Speedup vs scipy (n=8192)** | **10.5×** on T4 | ≥3× ✓ |
+| **GAN mode preservation** | h_crit=1.232 >> 0.3 | h_crit>0.3 ✓ |
+| **Anomaly AUC (KDDCup99)** | DCB=**0.9982** vs IF=0.9867 | DCB≥IF ✓ |
+## Repository Structure
+```
+dcb/            Core PyTorch package (layer.py, solver.py, kde.py, fft_kde.py, diagnostics.py, utils.py)
+experiments/    Reproduction scripts for all paper figures and tables
+  phase1_validation.py   Figure 1: DCB vs reference h_crit scatter
+  phase1_speedup.py      Figure 2: GPU speedup benchmark
+  phase1_ablation.py     Figures S1–S2: ε/τ sensitivity heatmaps
+  phase2_gan.py          Figure 3: GAN mode-collapse prevention
+  phase3_anomaly.py      Table 2 + Figure 5: anomaly detection benchmark
+tests/          Unit tests (pytest, 35/35 passing)
+outputs/        All generated figures and tables (PDFs, PNGs, CSVs)
+notebooks/      Quickstart and demo notebooks
+```
+## Reproducing Paper Results
+```bash
+# Phase 1: validation, speedup, ablation
+python experiments/phase1_validation.py
+python experiments/phase1_speedup.py
+python experiments/phase1_ablation.py
+# Phase 2: GAN mode collapse experiment
+python experiments/phase2_gan.py
+# Phase 3: anomaly detection benchmark
+python experiments/phase3_anomaly.py
+```
+For GPU runs, use the provided Kaggle kernels:
+- Phase 1–2: `hsingle/dcb-full-experiments`
+- Phase 3: `hsingle/dcb-phase-3-anomaly-detection`
+## Kaggle GPU Notes
+Kaggle may assign a P100 (sm_60) instead of T4. The Phase 3 kernel handles this automatically by installing `torch==2.2.2+cu118` (the earliest PyTorch release with both Python 3.12 and sm_60 support) when P100 is detected.
+## License
+MIT — see [LICENSE](LICENSE).

diffcb-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,91 @@
+# DCB — Differentiable Critical Bandwidth
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
+[![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/)
+A PyTorch package that makes **Silverman's critical bandwidth test (1981)** fully differentiable, enabling end-to-end gradient-based optimization over the modal structure of continuous distributions.
+## Overview
+The critical bandwidth `h_crit` is the minimum KDE bandwidth at which a distribution appears to have at most `m` modes — a classical nonparametric statistic for modality testing. DCB replaces every non-differentiable operation in its computation with a smooth surrogate, then uses the **Implicit Function Theorem** to compute exact gradients through the root-finding step at O(1) memory cost.
+```python
+import torch
+from dcb import DCBLayer
+X = torch.randn(256, requires_grad=True)   # 1D samples
+layer = DCBLayer(target_modes=1)
+h_crit = layer(X)                          # differentiable scalar
+h_crit.backward()                          # exact IFT gradients
+```
+## Installation
+```bash
+pip install diffcb
+```
+Or from source:
+```bash
+git clone https://github.com/ryZhangHason/differentiable-critical-bandwidth
+cd differentiable-critical-bandwidth
+pip install -e ".[dev]"
+```
+## Paper
+> Ruiyu Zhang. "Differentiable Critical Bandwidth: Making Silverman's Modality Test End-to-End Trainable." *Journal of Machine Learning Research*, 2026 (in preparation).
+## Confirmed Experimental Results
+All results produced on Kaggle GPU (T4 / P100) — see `experiments/` and `outputs/`.
+| Experiment | Result | Criterion |
+|---|---|---|
+| **Validation (m≥2)** | R²=0.91, MAE=0.07, Spearman ρ=0.89 | R²≥0.85, MAE≤0.10 ✓ |
+| **Speedup vs scipy (n=8192)** | **10.5×** on T4 | ≥3× ✓ |
+| **GAN mode preservation** | h_crit=1.232 >> 0.3 | h_crit>0.3 ✓ |
+| **Anomaly AUC (KDDCup99)** | DCB=**0.9982** vs IF=0.9867 | DCB≥IF ✓ |
+## Repository Structure
+```
+dcb/            Core PyTorch package (layer.py, solver.py, kde.py, fft_kde.py, diagnostics.py, utils.py)
+experiments/    Reproduction scripts for all paper figures and tables
+  phase1_validation.py   Figure 1: DCB vs reference h_crit scatter
+  phase1_speedup.py      Figure 2: GPU speedup benchmark
+  phase1_ablation.py     Figures S1–S2: ε/τ sensitivity heatmaps
+  phase2_gan.py          Figure 3: GAN mode-collapse prevention
+  phase3_anomaly.py      Table 2 + Figure 5: anomaly detection benchmark
+tests/          Unit tests (pytest, 35/35 passing)
+outputs/        All generated figures and tables (PDFs, PNGs, CSVs)
+notebooks/      Quickstart and demo notebooks
+```
+## Reproducing Paper Results
+```bash
+# Phase 1: validation, speedup, ablation
+python experiments/phase1_validation.py
+python experiments/phase1_speedup.py
+python experiments/phase1_ablation.py
+# Phase 2: GAN mode collapse experiment
+python experiments/phase2_gan.py
+# Phase 3: anomaly detection benchmark
+python experiments/phase3_anomaly.py
+```
+For GPU runs, use the provided Kaggle kernels:
+- Phase 1–2: `hsingle/dcb-full-experiments`
+- Phase 3: `hsingle/dcb-phase-3-anomaly-detection`
+## Kaggle GPU Notes
+Kaggle may assign a P100 (sm_60) instead of T4. The Phase 3 kernel handles this automatically by installing `torch==2.2.2+cu118` (the earliest PyTorch release with both Python 3.12 and sm_60 support) when P100 is detected.
+## License
+MIT — see [LICENSE](LICENSE).

diffcb-0.1.0/dcb/__init__.py ADDED Viewed

@@ -0,0 +1,22 @@
+"""
+dcb — Differentiable Critical Bandwidth
+A PyTorch package that makes Silverman's critical bandwidth test (1981) fully
+differentiable via a smooth mode-counting integral and an Implicit Function
+Theorem (IFT) backward pass. The primary public API is the
+`DifferentiableCriticalBandwidth` class, which behaves as a standard
+`torch.nn.Module` and can be used as a loss component or regularizer in any
+gradient-based learning pipeline. Import as `from dcb import DCBLayer` for
+the layer, or `from dcb.kde import gaussian_kde_grid` for lower-level KDE
+utilities. Requires PyTorch >= 2.0, NumPy >= 1.24, and SciPy >= 1.10.
+"""
+from dcb.layer import DCBLayer, DifferentiableCriticalBandwidth
+from dcb.utils import anneal_eps_tau
+from dcb.kde import soft_mode_count_cross, soft_mode_count
+# Public names exported by `from dcb import *`; each one is imported above.
+__all__ = [
+    "DCBLayer", "DifferentiableCriticalBandwidth",
+    "anneal_eps_tau", "soft_mode_count_cross", "soft_mode_count",
+]
+# Package version string.
+__version__ = "0.1.0"

diffcb-0.1.0/dcb/diagnostics.py ADDED Viewed

@@ -0,0 +1,163 @@
+"""
+dcb.diagnostics — Gradient Stability Diagnostics for DCB
+Provides `denom_profile()` which maps M̃(h) and ∂M̃/∂h over a bandwidth grid
+to assess gradient conditioning before training.  A stable IFT gradient at
+h_crit requires |∂M̃/∂h| > 0 (non-zero denominator in the IFT formula).
+Use case: call denom_profile() on your dataset before fitting DCBLayer to
+verify that the IFT gradient is well-conditioned at h_crit.  If
+stability_mask=False at h_crit, consider using safe_backward=True or
+widening the bandwidth search range.
+"""
+from __future__ import annotations
+import torch
+from torch import Tensor
+from dcb.kde import soft_mode_count_cross
+from dcb.utils import make_grid
+def denom_profile(
+    X: Tensor,
+    h_grid: Tensor,
+    formula: str = 'cross',
+    eps: float = 0.1,
+    tau: float = 0.2,
+    chunk_size: int = 50_000,
+    guard: float = 0.01,
+) -> dict:
+    """Compute M̃(h) and ∂M̃/∂h over a bandwidth grid for gradient stability diagnosis.
+    Evaluates the soft mode count M̃_cross at each bandwidth in h_grid, then
+    computes the finite-difference derivative ∂M̃/∂h.  The stability_mask
+    identifies bandwidths where the IFT denominator is large enough for
+    well-conditioned gradients.
+    Parameters
+    ----------
+    X : Tensor, shape (n,)
+        Observed data points.
+    h_grid : Tensor, shape (H,)
+        Bandwidth grid to evaluate. Should cover the expected h_crit.
+    formula : str
+        Mode-count formula to use. Only 'cross' is supported (matches DCBLayer).
+    eps : float
+        Sigmoid temperature for the zero-crossing detector. Default 0.1.
+    tau : float
+        Sigmoid temperature for the local-max selector. Default 0.2.
+    chunk_size : int
+        Chunk size for KDE computation (not used in dense path, kept for API
+        consistency with large-n paths).
+    guard : float
+        Threshold for stability_mask: True where |dM_dh| > guard. Default 0.01.
+    Returns
+    -------
+    dict with keys:
+        'h_grid'         : Tensor (H,) — input bandwidth grid
+        'M_tilde'        : Tensor (H,) — soft mode count at each h
+        'dM_dh'          : Tensor (H,) — ∂M̃/∂h via central finite differences
+        'stability_mask' : BoolTensor (H,) — True where |dM_dh| > guard
+        'h_crit_approx'  : float — approximate h_crit (smallest h where M̃ ≤ 1.5);
+                           float('nan') if not found
+    Notes
+    -----
+    All computation runs under torch.no_grad() — this function is diagnostic
+    only and does not build a computation graph.
+    Use case: call before training to confirm gradients are well-conditioned
+    at h_crit. stability_mask=True at h_crit means IFT gradient is valid for
+    that dataset.
+    """
+    if formula != 'cross':
+        raise ValueError(f"Only formula='cross' is supported; got {formula!r}")
+    H = h_grid.shape[0]
+    grid = make_grid(X, G=512)
+    M_tilde = torch.zeros(H, dtype=X.dtype, device=X.device)
+    with torch.no_grad():
+        for i in range(H):
+            h_val = h_grid[i].item()
+            M_tilde[i] = soft_mode_count_cross(X, h_val, grid, eps, tau)
+        # Central finite differences for interior; forward/backward at edges
+        dM_dh = torch.zeros(H, dtype=X.dtype, device=X.device)
+        for i in range(H):
+            if i == 0:
+                # Forward difference
+                dM_dh[i] = (M_tilde[1] - M_tilde[0]) / (h_grid[1] - h_grid[0])
+            elif i == H - 1:
+                # Backward difference
+                dM_dh[i] = (M_tilde[H - 1] - M_tilde[H - 2]) / (h_grid[H - 1] - h_grid[H - 2])
+            else:
+                # Central difference
+                dM_dh[i] = (M_tilde[i + 1] - M_tilde[i - 1]) / (h_grid[i + 1] - h_grid[i - 1])
+        stability_mask = dM_dh.abs() > guard
+        # h_crit_approx: smallest h where M̃ ≤ 1.5 (threshold for target_modes=1)
+        below_threshold = (M_tilde <= 1.5).nonzero(as_tuple=False)
+        if below_threshold.numel() > 0:
+            first_idx = below_threshold[0].item()
+            h_crit_approx = h_grid[first_idx].item()
+        else:
+            h_crit_approx = float('nan')
+    return {
+        'h_grid': h_grid,
+        'M_tilde': M_tilde,
+        'dM_dh': dM_dh,
+        'stability_mask': stability_mask,
+        'h_crit_approx': h_crit_approx,
+    }
+def print_stability_report(profile: dict) -> None:
+    """Print a human-readable stability report from denom_profile output.
+    Parameters
+    ----------
+    profile : dict
+        Output from `denom_profile()`.
+    """
+    h_grid = profile['h_grid']
+    M_tilde = profile['M_tilde']
+    dM_dh = profile['dM_dh']
+    stability_mask = profile['stability_mask']
+    h_crit_approx = profile['h_crit_approx']
+    H = h_grid.shape[0]
+    n_stable = stability_mask.sum().item()
+    pct_stable = 100.0 * n_stable / H
+    print("=" * 60)
+    print("DCB Gradient Stability Report")
+    print("=" * 60)
+    print(f"  h_grid range  : [{h_grid.min().item():.4f}, {h_grid.max().item():.4f}]  (H={H})")
+    print(f"  M_tilde range : [{M_tilde.min().item():.4f}, {M_tilde.max().item():.4f}]")
+    print(f"  dM_dh range   : [{dM_dh.min().item():.4f}, {dM_dh.max().item():.4f}]")
+    print(f"  h_crit_approx : {h_crit_approx:.4f}" if h_crit_approx == h_crit_approx
+          else "  h_crit_approx : NaN (M_tilde never <= 1.5 in grid)")
+    print(f"  Stable points : {n_stable}/{H} ({pct_stable:.1f}%)")
+    if h_crit_approx == h_crit_approx:  # not NaN
+        # Find index of h_crit_approx
+        idx = (h_grid - h_crit_approx).abs().argmin().item()
+        stable_at_hcrit = stability_mask[idx].item()
+        dM_at_hcrit = dM_dh[idx].abs().item()
+        print(f"  At h_crit     : stability={stable_at_hcrit}, |dM_dh|={dM_at_hcrit:.4f}")
+        if not stable_at_hcrit:
+            print()
+            print("  WARNING: h_crit_approx falls in an UNSTABLE region.")
+            print("  IFT gradient may be ill-conditioned at h_crit.")
+            print("  Consider: safe_backward=True, wider h_grid, or larger n.")
+        else:
+            print()
+            print("  OK: IFT gradient is well-conditioned at h_crit.")
+    print("=" * 60)

diffcb-0.1.0/dcb/fft_kde.py ADDED Viewed

@@ -0,0 +1,128 @@
+"""
+dcb.fft_kde — FFT-based KDE Mode Counter
+Implements mode counting via FFT convolution of the histogram with a
+Gaussian derivative kernel. Complexity is O(n + G log G), avoiding the
+O(n × G) cost of the direct KDE approach and — crucially — requiring NO
+subsampling. This eliminates the (brentq_n_max / n)^{-1/5} upward bias
+that affects the standard bisection path when n > brentq_n_max.
+Round 18b: forward kernel only. The IFT backward is unchanged (still uses
+the analytical chunked KDE derivatives on all n points).
+"""
+from __future__ import annotations
+import math
+import torch
+from torch import Tensor
+def fft_mode_count(
+    X: Tensor,
+    h: float,
+    G: int = 4096,
+    pad_factor: int = 4,
+) -> int:
+    """Count KDE modes via FFT convolution — O(n + G log G), no subsampling.
+    Bins X into G histogram bins, zero-pads to pad_factor*G, convolves with
+    the Gaussian derivative kernel in the frequency domain (applying iω·exp(−½(ωh)²)),
+    back-transforms, and counts positive-to-negative sign changes of the
+    resulting f' estimate.
+    Parameters
+    ----------
+    X : Tensor, shape (n,)
+        1D data tensor (may be on CPU or CUDA).
+    h : float
+        Bandwidth for the Gaussian kernel.
+    G : int
+        Number of histogram bins. Must satisfy h > 8 * (data_range / G) for
+        reliable derivative estimation. Use `adaptive_fft_G` to choose G
+        automatically before bisection.
+    pad_factor : int
+        Zero-padding multiplier (default 4). Mandatory ≥ 2 for circular-wrap
+        correctness; 4 is recommended at the largest h encountered.
+    Returns
+    -------
+    int
+        Number of KDE modes (downward zero-crossings of f').
+    """
+    with torch.no_grad():
+        # Domain: extend 3σ beyond data range to avoid boundary effects
+        sigma = X.std().item()
+        if sigma == 0.0:
+            sigma = 1.0  # degenerate case: all points identical
+        lo = X.min().item() - 3 * sigma
+        hi = X.max().item() + 3 * sigma
+        data_range = hi - lo
+        if data_range == 0.0:
+            return 1  # single-point distribution has 1 mode
+        # Histogram (O(n), CUDA-native)
+        counts = torch.histc(X.float(), bins=G, min=lo, max=hi)
+        # Zero-pad to pad_factor*G (4× mandatory for circular wrap correctness at h_hi)
+        N = pad_factor * G
+        counts_padded = torch.zeros(N, dtype=torch.float32, device=X.device)
+        counts_padded[:G] = counts
+        # FFT of histogram
+        C = torch.fft.rfft(counts_padded)
+        # Derivative kernel in frequency domain: iω * exp(-0.5*(ω*h)²)
+        # ω_k = 2π*k / (N * bin_width), bin_width = data_range / G
+        bin_width = data_range / G
+        k = torch.arange(N // 2 + 1, device=X.device, dtype=torch.float32)
+        omega = 2 * math.pi * k / (N * bin_width)
+        K_deriv = 1j * omega * torch.exp(-0.5 * (omega * h) ** 2)
+        # Convolve and back-transform
+        f_prime_padded = torch.fft.irfft(C * K_deriv, n=N)
+        # Trim to original G grid (discard zero-padded tail)
+        f_prime = f_prime_padded[:G]
+        # Count (+→-) sign changes = number of modes
+        # A mode is a local max of f, i.e., f' crosses zero from + to -
+        # Remove zeros (flat segments) — carry forward last nonzero sign
+        nonzero_mask = f_prime != 0
+        if not nonzero_mask.any():
+            return 0
+        s = f_prime[nonzero_mask]
+        transitions = int(((s[:-1] > 0) & (s[1:] < 0)).sum().item())
+        return transitions
+def adaptive_fft_G(data_range: float, h_hi: float, G_min: int = 4096) -> int:
+    """Choose FFT grid size G so that the derivative kernel is well-resolved.
+    Requires h > 8 * bin_width = 8 * data_range / G, equivalently
+    G > 8 * data_range / h_hi. We use a factor of 16 for safety margin,
+    then round up to the next power of 2 for efficient FFT.
+    Parameters
+    ----------
+    data_range : float
+        hi - lo of the data domain (typically X.max() - X.min() + 6σ).
+    h_hi : float
+        Upper bracket of the bisection (smallest h needing resolution).
+    G_min : int
+        Minimum returned G (default 4096).
+    Returns
+    -------
+    int
+        Grid size G, a power of 2, at least G_min.
+    """
+    needed = 16 * math.ceil(data_range / h_hi)
+    # Round up to next power of 2
+    p = 1
+    while p < needed:
+        p <<= 1
+    return max(G_min, p)