PyPI - diffcb - Versions diffs - 0.1.6__tar.gz → 0.1.8__tar.gz - Mend

diffcb 0.1.6tar.gz → 0.1.8tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

{diffcb-0.1.6 → diffcb-0.1.8}/PKG-INFO +131 -44
diffcb-0.1.8/README.md +182 -0
{diffcb-0.1.6 → diffcb-0.1.8}/dcb/__init__.py +1 -1
{diffcb-0.1.6 → diffcb-0.1.8}/dcb/fft_kde.py +112 -4
{diffcb-0.1.6 → diffcb-0.1.8}/dcb/layer.py +83 -5
{diffcb-0.1.6 → diffcb-0.1.8}/dcb/solver.py +14 -2
diffcb-0.1.8/dcb/training.py +475 -0
{diffcb-0.1.6 → diffcb-0.1.8}/pyproject.toml +1 -1
diffcb-0.1.8/round24_v016_test.py +469 -0
diffcb-0.1.8/round25_full_range_sweep.py +500 -0
diffcb-0.1.8/round25_write_csv.py +137 -0
diffcb-0.1.8/tests/test_gradcheck.py +104 -0
{diffcb-0.1.6 → diffcb-0.1.8}/tests/test_layer.py +9 -6
diffcb-0.1.6/README.md +0 -95
diffcb-0.1.6/dcb/training.py +0 -231
{diffcb-0.1.6 → diffcb-0.1.8}/.gitignore +0 -0
{diffcb-0.1.6 → diffcb-0.1.8}/.zenodo.json +0 -0
{diffcb-0.1.6 → diffcb-0.1.8}/LICENSE +0 -0
{diffcb-0.1.6 → diffcb-0.1.8}/dcb/diagnostics.py +0 -0
{diffcb-0.1.6 → diffcb-0.1.8}/dcb/kde.py +0 -0
{diffcb-0.1.6 → diffcb-0.1.8}/dcb/utils.py +0 -0
{diffcb-0.1.6 → diffcb-0.1.8}/notebooks/.gitkeep +0 -0
{diffcb-0.1.6 → diffcb-0.1.8}/round24_cumulative_bench.py +0 -0
{diffcb-0.1.6 → diffcb-0.1.8}/tests/test_kde.py +0 -0
{diffcb-0.1.6 → diffcb-0.1.8}/tests/test_r18c_denom_audit.py +0 -0
{diffcb-0.1.6 → diffcb-0.1.8}/tests/test_r18c_deprecation_warn.py +0 -0
{diffcb-0.1.6 → diffcb-0.1.8}/tests/test_r19_default_fft.py +0 -0
{diffcb-0.1.6 → diffcb-0.1.8}/tests/test_r19_diagnostics.py +0 -0
{diffcb-0.1.6 → diffcb-0.1.8}/tests/test_solver.py +0 -0

{diffcb-0.1.6 → diffcb-0.1.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diffcb
-Version: 0.1.6
+Version: 0.1.8
 Summary: Differentiable Critical Bandwidth: Silverman's modality test as a differentiable PyTorch layer with IFT backward pass.
 Project-URL: Homepage, https://github.com/ryZhangHason/differentiable-critical-bandwidth
 Project-URL: Repository, https://github.com/ryZhangHason/differentiable-critical-bandwidth
@@ -226,20 +226,25 @@ Description-Content-Type: text/markdown
 [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE)
 [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/)
-A PyTorch package that makes **Silverman's critical bandwidth test (1981)** fully differentiable, enabling end-to-end gradient-based optimization over the modal structure of continuous distributions.
+A PyTorch package that makes **Silverman's critical bandwidth test (1981)** fully differentiable, enabling end-to-end gradient-based optimisation over the modal structure of continuous distributions.
 ## Overview
-The critical bandwidth `h_crit` is the minimum KDE bandwidth at which a distribution appears to have at most `m` modes — a classical nonparametric statistic for modality testing. DCB replaces every non-differentiable operation in its computation with a smooth surrogate, then uses the **Implicit Function Theorem** to compute exact gradients through the root-finding step at O(1) memory cost.
+The critical bandwidth `h_crit` is the minimum KDE bandwidth at which a distribution appears to have at most `target_modes` modes (unimodal for the default `target_modes=1`) — a classical nonparametric statistic for modality testing. DCB replaces every non-differentiable step in its computation with a smooth surrogate, then uses the **Implicit Function Theorem** (IFT) to compute exact gradients through the root-finding step at O(1) memory cost.
 ```python
 import torch
-from dcb import DCBLayer
+from dcb import DCBLayer, TrainingLayer
-X = torch.randn(1000, requires_grad=True)   # 1D samples
-layer = DCBLayer(target_modes=1)
-h_crit = layer(X)                           # differentiable scalar
-h_crit.backward()                           # exact IFT gradients
+X = torch.randn(10_000, requires_grad=True)  # 1D samples, any n from 5K to 1B
+layer = DCBLayer()
+h_crit = layer(X)       # differentiable scalar
+h_crit.backward()       # exact IFT gradients
+# For repeated training-loop use with warm-start bracket caching:
+layer = TrainingLayer(warm_start=True)
+for batch in dataloader:
+    h = layer(batch)    # 1.8× faster after first call on CPU; ~10× on CUDA with compile=True
 ```
 ## Installation
@@ -249,7 +254,6 @@ pip install diffcb
 ```
 Or from source:
 ```bash
 git clone https://github.com/ryZhangHason/differentiable-critical-bandwidth
 cd differentiable-critical-bandwidth
@@ -258,58 +262,141 @@ pip install -e ".[dev]"
 ## Accuracy vs R's `bw.crit`
-DCB is validated against R's `multimode::bw.crit(data, mod0=1)` — the standard reference implementation of Hall & York (2001). On **identical data**:
+Validated against R's `multimode::bw.crit(data, mod0=1)` (Hall & York 2001).
+**Same-sample protocol** (identical data fed to both Python and R):
-| n | DCB vs R (same sample) | DCB vs R (independent samples) |
-|---|---|---|
-| 100K | **0.004%** | ~0.5% (MC noise from independent RNG) |
-| 1M | **0.005%** | ~0.2% |
-| 10M | **0.004%** | ~0.1% |
+| n | DCB error vs R | Notes |
+|---|---------------|-------|
+| 5K–25K | **< 0.005%** | Direct-KDE path, zero histogram bias |
+| 100K | **0.003%** | FFT histogram path, G=16384 |
+| 1M | **0.003%** | FFT path |
+| 10M | **0.003%** | FFT path |
+| 100M+ | **< 0.01%** | Histogram-dominated; sketch available |
+Independent-sample error (~0.2–0.5%) reflects natural sampling variability (Python and R each draw their own sample from separate RNGs), not algorithmic error. The 0.003% same-sample error is of the same order as R's own ~0.001% numerical noise floor.
+## Hardware Performance (v0.1.6)
+| n | CPU (Apple M) | MPS | P100 GPU |
+|---|:---:|:---:|:---:|
+| 10K | 2,300 ms | 1,400 ms | **107 ms** |
+| 50K | 2,900 ms | 1,700 ms | **167 ms** |
+| 100K | 265 ms | 248 ms | **35 ms** |
+| 1M | 269 ms | 189 ms | **36 ms** |
+| 10M | 544 ms | — | **44 ms** |
+P100 speedup: **43–116× vs CPU**. Peak 116× at n=50K (direct-KDE GPU parallelism).
+Cumulative speedup vs v0.1.4 on CPU: 1.1× (100K), 1.7× (1M), **4.2× (10M)**.
-The independent-sample figures reflect natural sampling variability (two unbiased estimators drawing different data), not algorithmic error. On identical data, DCB agrees with R to within **0.005%** at all tested n. DCB is 43× faster than R at n=100M (1.1 s vs 50 s) and handles n=2B in 24 s while R OOMs.
+## API Reference
-## Key Parameters
+### `DCBLayer`
 ```python
 DCBLayer(
-    target_modes=1,       # target number of modes
-    G=512,                # IFT evaluation grid points
-    use_fft=True,         # FFT forward (default); eliminates subsampling bias for n>50K
-    max_n_exact=1_000_000,# sketch to sketch_size when n exceeds this (None = always exact)
-    sketch_size=500_000,  # sketch target; 500K matches full-n accuracy (O(n^{-2/9}) rate)
-    safe_backward=False,  # clamp IFT denominator near bifurcations
+    target_modes=1,           # target number of modes (default 1)
+    use_fft=True,             # FFT path for n > 50K (default True)
+    max_n_exact=None,         # sketch above this n (None = always exact)
+    G_min=16384,              # minimum FFT histogram bins (accuracy ↑ with G)
+    use_richardson="auto",    # Richardson on CPU, off on GPU (30% accuracy gain on CPU)
+    direct_n_max=25_000,      # direct-KDE active only when forward_path='auto'/'direct'
+    direct_M=2048,            # direct-KDE evaluation grid size
+    forward_path='smooth',    # 'smooth' (default, strictly differentiable) |
+                              # 'auto' (direct-KDE at n≤25K, surrogate gradient) |
+                              # 'direct' (force direct-KDE, accuracy benchmarks)
+    safe_backward=False,      # clamp IFT denominator near bifurcations
 )
 ```
-## Confirmed Experimental Results
+### `TrainingLayer` (for ML training loops)
+```python
+from dcb import TrainingLayer
+layer = TrainingLayer(
+    warm_start=True,    # cache h_prev; init bracket to [0.95h, 1.05h] → 1.8× CPU speedup
+    compile=False,      # torch.compile opt-in (requires float32, Python ≤ 3.11 on CUDA)
+    warm_margin=0.05,   # bracket half-width around cached h_crit
+    **dcb_kwargs,       # any DCBLayer parameter
+)
+layer.reset_cache()     # call on distribution shift
+```
+### Direct-KDE path (n ≤ 25K)
-All GPU results produced on Kaggle (T4 / P100) — see `experiments/` and `outputs/`.
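+For small samples (n ≤ `direct_n_max`), DCB can evaluate f′_h directly without histogram binning (O(n·M) per evaluation, zero binning bias). This path is active only when `forward_path='auto'` or `forward_path='direct'`; the default `forward_path='smooth'` does not use it. On CPU the direct path is 3–4× slower, but on GPU it runs **80–96× faster than on CPU**.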
+```python
+# Force direct-KDE for all n (accuracy benchmark):
+layer = DCBLayer(direct_n_max=float('inf'))
+# Disable direct-KDE (speed benchmark):
+layer = DCBLayer(direct_n_max=0)
+```
+### Richardson extrapolation
+When Richardson extrapolation is enabled, DCB runs a second bisection at G/2=8192 and combines:
+`h̃ = (4·ĥ(G) − ĥ(G/2)) / 3`, reducing error ~30%. On GPU this adds 38% overhead with
+<0.01% accuracy gain, so the default `use_richardson="auto"` enables it on CPU and disables it on GPU; pass `use_richardson=True` or `use_richardson=False` to override.
+## Known Limitations
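+- **`compile=True` on MPS**: previously blocked by float64 in the `_refine_hcrit` fallback; the fix planned for v0.1.7 makes that fallback honour `fft_dtype` (float32 by default) — verify on your own MPS setup before relying on it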
+- **`compile=True` on CUDA with Python 3.12**: requires torch ≥ 2.4 or Python ≤ 3.11
+- **`gradcheck`**: passes with the default `forward_path='smooth'`, which is strictly differentiable at all n. `forward_path='auto'` uses a surrogate gradient at n≤25K, so opt into it only for forward-only accuracy benchmarks
+- **n > 100M**: requires streaming histogram (not yet public API); use `max_n_exact=1_000_000` sketch as workaround
+## Confirmed Experimental Results
 | Experiment | Result | Criterion |
 |---|---|---|
-| **Accuracy vs R (same data, n=100K)** | **0.004%** | < 0.01% ✓ |
-| **Validation (m≥2, Marron-Wand)** | R²=0.91, MAE=0.07, ρ=0.89 | R²≥0.85 ✓ |
-| **Speedup vs scipy (CUDA T4, n=8192)** | **10.5×** | ≥3× ✓ |
-| **GAN mode preservation** | h_crit=1.232 >> 0.3 | h_crit>0.3 ✓ |
-| **Anomaly AUC (KDDCup99)** | DCB=**0.9982** vs IF=0.9867 | DCB≥IF ✓ |
+| Accuracy vs R (same data, n=100K) | **0.003%** | < 0.01% ✓ |
+| Validation (m≥2, Marron-Wand) | R²=0.91, MAE=0.07, ρ=0.89 | R²≥0.85 ✓ |
+| Speedup vs scipy (CUDA T4, n=8192) | **10.5×** | ≥3× ✓ |
+| GAN mode preservation | h_crit=1.232 >> 0.3 | h_crit>0.3 ✓ |
+| Anomaly AUC (KDDCup99) | DCB=**0.9982** vs IF=0.9867 | DCB≥IF ✓ |
+| GPU speedup (P100, n=50K) | **116×** vs CPU | — |
+| GPU speedup (P100, n=100K) | **43×** vs CPU | — |
+## Changelog
+### v0.1.6 (2026-05-30)
+- `TrainingLayer`: warm-start bracket caching (1.82× CPU speedup in training loops)
+- `direct_mode_count_batch`: direct-KDE path for n ≤ 25K (zero histogram bias; 80–96× GPU speedup)
+- Compile-ready trisection: tensor lo/hi, no `.item()` inside loop, fixed 16-round unroll
+- `mode_count_from_C_batch` returns `Tensor(B,)` (was `list[int]`) — enables torch.compile tracing
+### v0.1.5 (2026-05-29)
+- Richardson extrapolation on h_crit scalar (30% accuracy gain, G=16384+8192)
+- alloc/sync hygiene: removed `nonzero_mask` host sync (4.2× faster at n=10M)
+- Batched trisection bisection (one irfft dispatch per round)
+- Eliminated duplicate O(n) histogram in `_refine_hcrit` (C_external reuse)
+### v0.1.4 (2026-05-29)
+- FFT histogram path: C hoisted out of bisection loop (Worker 1)
+- Device-native histogram: CUDA histc, MPS scatter_add_, CPU bucketize+bincount
+- float32 FFT default; pad_factor 4→2 (halves irfft size)
+- Adaptive bisection early-exit
+### v0.1.1 (2026-05-29)
+- MPS histc OOM bug fixed (bucketize+bincount)
+- Sketch API: max_n_exact=1M, sketch_size=500K
+- Domain consistency and bias warning fixes
 ## Repository Structure
 ```
-dcb/            Core PyTorch package
-  layer.py        DCBLayer nn.Module + DCBFunction autograd
-  solver.py       IFT root-finder and backward pass
-  fft_kde.py      FFT-based mode counter (MPS-safe, float64, G=16384)
-  kde.py          Direct KDE derivatives (small-n path)
-  utils.py        Grid, Silverman bandwidth, sg() stabilizer
-experiments/    Reproduction scripts for all paper figures and tables
-  phase1_*.py     Validation, speedup, ablation (Figures 1–2, S1–S2)
-  phase2_gan.py   GAN mode-collapse prevention (Figure 3)
-  phase3_anomaly.py  Anomaly detection (Table 2, Figure 5)
-  round20_*.py    Large-n R comparison and streaming benchmarks
-  round21_*.py    Accuracy improvement experiments
-tests/          Unit tests (pytest, 45 passed, 1 xfailed)
-outputs/        All generated figures and tables (PDFs, PNGs, CSVs)
+dcb/
+  layer.py         DCBLayer nn.Module + DCBFunction autograd
+  solver.py        IFT root-finder, trisection bisection, Richardson pass
+  fft_kde.py       FFT mode counter, direct_mode_count_batch, precompute_fft
+  training.py      TrainingLayer with warm-start and compile support
+  kde.py           Direct KDE derivatives (IFT backward path)
+  utils.py         Grid, Silverman bandwidth, sg() stabiliser
+experiments/       Reproduction scripts for all benchmarks and paper figures
+tests/             Unit tests (45 passed, 1 xfailed)
 ```
 ## License

diffcb-0.1.8/README.md ADDED Viewed

@@ -0,0 +1,182 @@
+# DCB — Differentiable Critical Bandwidth
+[![PyPI](https://img.shields.io/pypi/v/diffcb.svg)](https://pypi.org/project/diffcb/)
+[![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE)
+[![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/)
+A PyTorch package that makes **Silverman's critical bandwidth test (1981)** fully differentiable, enabling end-to-end gradient-based optimisation over the modal structure of continuous distributions.
+## Overview
+The critical bandwidth `h_crit` is the minimum KDE bandwidth at which a distribution appears to have at most `target_modes` modes (unimodal for the default `target_modes=1`) — a classical nonparametric statistic for modality testing. DCB replaces every non-differentiable step in its computation with a smooth surrogate, then uses the **Implicit Function Theorem** (IFT) to compute exact gradients through the root-finding step at O(1) memory cost.
+```python
+import torch
+from dcb import DCBLayer, TrainingLayer
+X = torch.randn(10_000, requires_grad=True)  # 1D samples, any n from 5K to 1B
+layer = DCBLayer()
+h_crit = layer(X)       # differentiable scalar
+h_crit.backward()       # exact IFT gradients
+# For repeated training-loop use with warm-start bracket caching:
+layer = TrainingLayer(warm_start=True)
+for batch in dataloader:
+    h = layer(batch)    # 1.8× faster after first call on CPU; ~10× on CUDA with compile=True
+```
+## Installation
+```bash
+pip install diffcb
+```
+Or from source:
+```bash
+git clone https://github.com/ryZhangHason/differentiable-critical-bandwidth
+cd differentiable-critical-bandwidth
+pip install -e ".[dev]"
+```
+## Accuracy vs R's `bw.crit`
+Validated against R's `multimode::bw.crit(data, mod0=1)` (Hall & York 2001).
+**Same-sample protocol** (identical data fed to both Python and R):
+| n | DCB error vs R | Notes |
+|---|---------------|-------|
+| 5K–25K | **< 0.005%** | Direct-KDE path, zero histogram bias |
+| 100K | **0.003%** | FFT histogram path, G=16384 |
+| 1M | **0.003%** | FFT path |
+| 10M | **0.003%** | FFT path |
+| 100M+ | **< 0.01%** | Histogram-dominated; sketch available |
+Independent-sample error (~0.2–0.5%) reflects natural sampling variability (Python and R each draw their own sample from separate RNGs), not algorithmic error. The 0.003% same-sample error is of the same order as R's own ~0.001% numerical noise floor.
+## Hardware Performance (v0.1.6)
+| n | CPU (Apple M) | MPS | P100 GPU |
+|---|:---:|:---:|:---:|
+| 10K | 2,300 ms | 1,400 ms | **107 ms** |
+| 50K | 2,900 ms | 1,700 ms | **167 ms** |
+| 100K | 265 ms | 248 ms | **35 ms** |
+| 1M | 269 ms | 189 ms | **36 ms** |
+| 10M | 544 ms | — | **44 ms** |
+P100 speedup: **43–116× vs CPU**. Peak 116× at n=50K (direct-KDE GPU parallelism).
+Cumulative speedup vs v0.1.4 on CPU: 1.1× (100K), 1.7× (1M), **4.2× (10M)**.
+## API Reference
+### `DCBLayer`
+```python
+DCBLayer(
+    target_modes=1,           # target number of modes (default 1)
+    use_fft=True,             # FFT path for n > 50K (default True)
+    max_n_exact=None,         # sketch above this n (None = always exact)
+    G_min=16384,              # minimum FFT histogram bins (accuracy ↑ with G)
+    use_richardson="auto",    # Richardson on CPU, off on GPU (30% accuracy gain on CPU)
+    direct_n_max=25_000,      # direct-KDE active only when forward_path='auto'/'direct'
+    direct_M=2048,            # direct-KDE evaluation grid size
+    forward_path='smooth',    # 'smooth' (default, strictly differentiable) |
+                              # 'auto' (direct-KDE at n≤25K, surrogate gradient) |
+                              # 'direct' (force direct-KDE, accuracy benchmarks)
+    safe_backward=False,      # clamp IFT denominator near bifurcations
+)
+```
+### `TrainingLayer` (for ML training loops)
+```python
+from dcb import TrainingLayer
+layer = TrainingLayer(
+    warm_start=True,    # cache h_prev; init bracket to [0.95h, 1.05h] → 1.8× CPU speedup
+    compile=False,      # torch.compile opt-in (requires float32, Python ≤ 3.11 on CUDA)
+    warm_margin=0.05,   # bracket half-width around cached h_crit
+    **dcb_kwargs,       # any DCBLayer parameter
+)
+layer.reset_cache()     # call on distribution shift
+```
+### Direct-KDE path (n ≤ 25K)
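+For small samples (n ≤ `direct_n_max`), DCB can evaluate f′_h directly without histogram binning (O(n·M) per evaluation, zero binning bias). This path is active only when `forward_path='auto'` or `forward_path='direct'`; the default `forward_path='smooth'` does not use it. On CPU the direct path is 3–4× slower, but on GPU it runs **80–96× faster than on CPU**.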
+```python
+# Force direct-KDE for all n (accuracy benchmark):
+layer = DCBLayer(direct_n_max=float('inf'))
+# Disable direct-KDE (speed benchmark):
+layer = DCBLayer(direct_n_max=0)
+```
+### Richardson extrapolation
+When Richardson extrapolation is enabled, DCB runs a second bisection at G/2=8192 and combines:
+`h̃ = (4·ĥ(G) − ĥ(G/2)) / 3`, reducing error ~30%. On GPU this adds 38% overhead with
+<0.01% accuracy gain, so the default `use_richardson="auto"` enables it on CPU and disables it on GPU; pass `use_richardson=True` or `use_richardson=False` to override.
+## Known Limitations
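+- **`compile=True` on MPS**: previously blocked by float64 in the `_refine_hcrit` fallback; the fix planned for v0.1.7 makes that fallback honour `fft_dtype` (float32 by default) — verify on your own MPS setup before relying on it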
+- **`compile=True` on CUDA with Python 3.12**: requires torch ≥ 2.4 or Python ≤ 3.11
+- **`gradcheck`**: passes with the default `forward_path='smooth'`, which is strictly differentiable at all n. `forward_path='auto'` uses a surrogate gradient at n≤25K, so opt into it only for forward-only accuracy benchmarks
+- **n > 100M**: requires streaming histogram (not yet public API); use `max_n_exact=1_000_000` sketch as workaround
+## Confirmed Experimental Results
+| Experiment | Result | Criterion |
+|---|---|---|
+| Accuracy vs R (same data, n=100K) | **0.003%** | < 0.01% ✓ |
+| Validation (m≥2, Marron-Wand) | R²=0.91, MAE=0.07, ρ=0.89 | R²≥0.85 ✓ |
+| Speedup vs scipy (CUDA T4, n=8192) | **10.5×** | ≥3× ✓ |
+| GAN mode preservation | h_crit=1.232 >> 0.3 | h_crit>0.3 ✓ |
+| Anomaly AUC (KDDCup99) | DCB=**0.9982** vs IF=0.9867 | DCB≥IF ✓ |
+| GPU speedup (P100, n=50K) | **116×** vs CPU | — |
+| GPU speedup (P100, n=100K) | **43×** vs CPU | — |
+## Changelog
+### v0.1.6 (2026-05-30)
+- `TrainingLayer`: warm-start bracket caching (1.82× CPU speedup in training loops)
+- `direct_mode_count_batch`: direct-KDE path for n ≤ 25K (zero histogram bias; 80–96× GPU speedup)
+- Compile-ready trisection: tensor lo/hi, no `.item()` inside loop, fixed 16-round unroll
+- `mode_count_from_C_batch` returns `Tensor(B,)` (was `list[int]`) — enables torch.compile tracing
+### v0.1.5 (2026-05-29)
+- Richardson extrapolation on h_crit scalar (30% accuracy gain, G=16384+8192)
+- alloc/sync hygiene: removed `nonzero_mask` host sync (4.2× faster at n=10M)
+- Batched trisection bisection (one irfft dispatch per round)
+- Eliminated duplicate O(n) histogram in `_refine_hcrit` (C_external reuse)
+### v0.1.4 (2026-05-29)
+- FFT histogram path: C hoisted out of bisection loop (Worker 1)
+- Device-native histogram: CUDA histc, MPS scatter_add_, CPU bucketize+bincount
+- float32 FFT default; pad_factor 4→2 (halves irfft size)
+- Adaptive bisection early-exit
+### v0.1.1 (2026-05-29)
+- MPS histc OOM bug fixed (bucketize+bincount)
+- Sketch API: max_n_exact=1M, sketch_size=500K
+- Domain consistency and bias warning fixes
+## Repository Structure
+```
+dcb/
+  layer.py         DCBLayer nn.Module + DCBFunction autograd
+  solver.py        IFT root-finder, trisection bisection, Richardson pass
+  fft_kde.py       FFT mode counter, direct_mode_count_batch, precompute_fft
+  training.py      TrainingLayer with warm-start and compile support
+  kde.py           Direct KDE derivatives (IFT backward path)
+  utils.py         Grid, Silverman bandwidth, sg() stabiliser
+experiments/       Reproduction scripts for all benchmarks and paper figures
+tests/             Unit tests (45 passed, 1 xfailed)
+```
+## License
+Apache 2.0 — see [LICENSE](LICENSE).

{diffcb-0.1.6 → diffcb-0.1.8}/dcb/__init__.py RENAMED Viewed

@@ -21,4 +21,4 @@ __all__ = [
     "TrainingLayer",
     "anneal_eps_tau", "soft_mode_count_cross", "soft_mode_count",
 ]
-__version__ = "0.1.6"
+__version__ = "0.1.8"

{diffcb-0.1.6 → diffcb-0.1.8}/dcb/fft_kde.py RENAMED Viewed

@@ -248,6 +248,7 @@ def _refine_hcrit(
     pad_factor: int = 2,  # Worker 5: pad_factor=2 (was 4) — safe for h ≤ 3σ, halves irfft size
     C_external: Tensor | None = None,
     omega_external: Tensor | None = None,
+    fft_dtype: torch.dtype = torch.float32,
 ) -> float:
     """Sub-bin quadratic refinement of h_crit after bisection converges.
@@ -298,13 +299,13 @@ def _refine_hcrit(
         C_ref = C_external
         omega_base = omega_external
     else:
-        # Fallback: build float64 histogram + rfft (original behaviour).
+        # Fallback: build histogram + rfft, respecting fft_dtype (avoids float64 on MPS).
         with torch.no_grad():
             counts = _histogram_on_device(X, G, lo_d, hi_d).cpu()
-            counts_padded = torch.zeros(N, dtype=torch.float64)
-            counts_padded[:G] = counts.double()
+            counts_padded = torch.zeros(N, dtype=fft_dtype)
+            counts_padded[:G] = counts.to(fft_dtype)
             C_ref = torch.fft.rfft(counts_padded)
-            k = torch.arange(N // 2 + 1, dtype=torch.float64)
+            k = torch.arange(N // 2 + 1, dtype=fft_dtype)
             omega_base = 2 * math.pi * k / (N * bw)
     def fprime(h: float) -> Tensor:
@@ -454,6 +455,113 @@ def direct_mode_count_batch(
         return counts
+def precompute_fft_batch(
+    xs: list,
+    G: int = 4096,
+    pad_factor: int = 2,
+    fft_dtype: torch.dtype = torch.float32,
+) -> tuple:
+    """Precompute FFT spectra for K independent distributions in one batched rfft.
+    For K distributions sharing G and pad_factor, builds one histogram per
+    distribution, stacks into (K, N), and runs a single batched rfft —
+    amortising per-call Python dispatch overhead.
+    Parameters
+    ----------
+    xs : list of Tensor, each shape (n_k,)
+    G, pad_factor, fft_dtype : same as precompute_fft
+    Returns
+    -------
+    C_batch : Tensor (K, N//2+1), complex
+    omega_batch : Tensor (K, N//2+1), float  — per-k frequency grids
+    domains : list of (lo_k, hi_k)
+    N : int
+    any_degenerate : list of bool, length K
+    """
+    K = len(xs)
+    N = pad_factor * G
+    device = xs[0].device if K > 0 else torch.device('cpu')
+    domains: list = []
+    any_degenerate: list = []
+    counts_list: list = []
+    for x_k in xs:
+        sigma = x_k.std().item()
+        if sigma == 0.0:
+            sigma = 1.0
+        lo_k = x_k.min().item() - 3 * sigma
+        hi_k = x_k.max().item() + 3 * sigma
+        data_range_k = hi_k - lo_k
+        domains.append((lo_k, hi_k))
+        if data_range_k == 0.0:
+            any_degenerate.append(True)
+            counts_list.append(torch.zeros(G, dtype=fft_dtype, device=device))
+        else:
+            any_degenerate.append(False)
+            counts_list.append(_histogram_on_device(x_k, G, lo_k, hi_k).to(fft_dtype))
+    counts_batch = torch.zeros(K, N, dtype=fft_dtype, device=device)
+    for k, c in enumerate(counts_list):
+        counts_batch[k, :G] = c
+    C_batch = torch.fft.rfft(counts_batch, dim=-1)  # (K, N//2+1)
+    M = N // 2 + 1
+    omega_batch = torch.zeros(K, M, dtype=fft_dtype, device=device)
+    k_freq = torch.arange(M, device=device, dtype=fft_dtype)
+    for k, (lo_k, hi_k) in enumerate(domains):
+        data_range_k = hi_k - lo_k
+        if data_range_k > 0.0:
+            bin_width_k = data_range_k / G
+            omega_batch[k] = 2 * math.pi * k_freq / (N * bin_width_k)
+    return C_batch, omega_batch, domains, N, any_degenerate
+def mode_count_K_batch(
+    C_batch: Tensor,
+    omega_batch: Tensor,
+    h1: Tensor,
+    h2: Tensor,
+    G: int,
+    N: int,
+    any_degenerate: list,
+) -> tuple:
+    """Mode counts for K distributions × 2 bandwidths in one batched irfft.
+    Replaces 2K separate irfft calls per trisection round with a single
+    (K, 2, M) dispatch — the core of the forward_batched speedup.
+    Parameters
+    ----------
+    C_batch : Tensor (K, M), complex
+    omega_batch : Tensor (K, M), float  — per-distribution frequency grids
+    h1, h2 : Tensor (K,)  — two trisection interior points per distribution
+    G, N : int
+    any_degenerate : list[bool]
+    Returns
+    -------
+    c1, c2 : Tensor (K,), long
+    """
+    K, M = C_batch.shape
+    h_eval = torch.stack([h1, h2], dim=1)               # (K, 2)
+    omega_h = omega_batch.unsqueeze(1) * h_eval.unsqueeze(-1)   # (K, 2, M)
+    K_deriv = 1j * omega_batch.unsqueeze(1) * torch.exp(-0.5 * omega_h ** 2)  # (K, 2, M)
+    C_exp = C_batch.unsqueeze(1).expand(K, 2, M)         # (K, 2, M)
+    f_prime = torch.fft.irfft(C_exp * K_deriv, n=N, dim=-1)[:, :, :G]  # (K, 2, G)
+    counts = ((f_prime[:, :, :-1] > 0) & (f_prime[:, :, 1:] < 0)).sum(dim=-1)  # (K, 2)
+    c1, c2 = counts[:, 0], counts[:, 1]
+    for k, degen in enumerate(any_degenerate):
+        if degen:
+            c1[k] = 1
+            c2[k] = 1
+    return c1, c2
 def adaptive_fft_G(data_range: float, h_hi: float, G_min: int = 16384) -> int:
     """Choose FFT grid size G so that the derivative kernel is well-resolved.

diffcb 0.1.6__tar.gz → 0.1.8__tar.gz

diffcb 0.1.6tar.gz → 0.1.8tar.gz