diffcb 0.1.6__tar.gz → 0.1.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {diffcb-0.1.6 → diffcb-0.1.8}/PKG-INFO +131 -44
  2. diffcb-0.1.8/README.md +182 -0
  3. {diffcb-0.1.6 → diffcb-0.1.8}/dcb/__init__.py +1 -1
  4. {diffcb-0.1.6 → diffcb-0.1.8}/dcb/fft_kde.py +112 -4
  5. {diffcb-0.1.6 → diffcb-0.1.8}/dcb/layer.py +83 -5
  6. {diffcb-0.1.6 → diffcb-0.1.8}/dcb/solver.py +14 -2
  7. diffcb-0.1.8/dcb/training.py +475 -0
  8. {diffcb-0.1.6 → diffcb-0.1.8}/pyproject.toml +1 -1
  9. diffcb-0.1.8/round24_v016_test.py +469 -0
  10. diffcb-0.1.8/round25_full_range_sweep.py +500 -0
  11. diffcb-0.1.8/round25_write_csv.py +137 -0
  12. diffcb-0.1.8/tests/test_gradcheck.py +104 -0
  13. {diffcb-0.1.6 → diffcb-0.1.8}/tests/test_layer.py +9 -6
  14. diffcb-0.1.6/README.md +0 -95
  15. diffcb-0.1.6/dcb/training.py +0 -231
  16. {diffcb-0.1.6 → diffcb-0.1.8}/.gitignore +0 -0
  17. {diffcb-0.1.6 → diffcb-0.1.8}/.zenodo.json +0 -0
  18. {diffcb-0.1.6 → diffcb-0.1.8}/LICENSE +0 -0
  19. {diffcb-0.1.6 → diffcb-0.1.8}/dcb/diagnostics.py +0 -0
  20. {diffcb-0.1.6 → diffcb-0.1.8}/dcb/kde.py +0 -0
  21. {diffcb-0.1.6 → diffcb-0.1.8}/dcb/utils.py +0 -0
  22. {diffcb-0.1.6 → diffcb-0.1.8}/notebooks/.gitkeep +0 -0
  23. {diffcb-0.1.6 → diffcb-0.1.8}/round24_cumulative_bench.py +0 -0
  24. {diffcb-0.1.6 → diffcb-0.1.8}/tests/test_kde.py +0 -0
  25. {diffcb-0.1.6 → diffcb-0.1.8}/tests/test_r18c_denom_audit.py +0 -0
  26. {diffcb-0.1.6 → diffcb-0.1.8}/tests/test_r18c_deprecation_warn.py +0 -0
  27. {diffcb-0.1.6 → diffcb-0.1.8}/tests/test_r19_default_fft.py +0 -0
  28. {diffcb-0.1.6 → diffcb-0.1.8}/tests/test_r19_diagnostics.py +0 -0
  29. {diffcb-0.1.6 → diffcb-0.1.8}/tests/test_solver.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: diffcb
3
- Version: 0.1.6
3
+ Version: 0.1.8
4
4
  Summary: Differentiable Critical Bandwidth: Silverman's modality test as a differentiable PyTorch layer with IFT backward pass.
5
5
  Project-URL: Homepage, https://github.com/ryZhangHason/differentiable-critical-bandwidth
6
6
  Project-URL: Repository, https://github.com/ryZhangHason/differentiable-critical-bandwidth
@@ -226,20 +226,25 @@ Description-Content-Type: text/markdown
226
226
  [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE)
227
227
  [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/)
228
228
 
229
- A PyTorch package that makes **Silverman's critical bandwidth test (1981)** fully differentiable, enabling end-to-end gradient-based optimization over the modal structure of continuous distributions.
229
+ A PyTorch package that makes **Silverman's critical bandwidth test (1981)** fully differentiable, enabling end-to-end gradient-based optimisation over the modal structure of continuous distributions.
230
230
 
231
231
  ## Overview
232
232
 
233
- The critical bandwidth `h_crit` is the minimum KDE bandwidth at which a distribution appears to have at most `m` modes — a classical nonparametric statistic for modality testing. DCB replaces every non-differentiable operation in its computation with a smooth surrogate, then uses the **Implicit Function Theorem** to compute exact gradients through the root-finding step at O(1) memory cost.
233
+ `h_crit` is the minimum KDE bandwidth at which a distribution appears unimodal — a classical nonparametric statistic for modality testing. DCB replaces every non-differentiable step with a smooth surrogate, then uses the **Implicit Function Theorem** (IFT) to compute exact gradients through the root-finding step at O(1) memory cost.
234
234
 
235
235
  ```python
236
236
  import torch
237
- from dcb import DCBLayer
237
+ from dcb import DCBLayer, TrainingLayer
238
238
 
239
- X = torch.randn(1000, requires_grad=True) # 1D samples
240
- layer = DCBLayer(target_modes=1)
241
- h_crit = layer(X) # differentiable scalar
242
- h_crit.backward() # exact IFT gradients
239
+ X = torch.randn(10_000, requires_grad=True) # 1D samples, any n from 5K to 1B
240
+ layer = DCBLayer()
241
+ h_crit = layer(X) # differentiable scalar
242
+ h_crit.backward() # exact IFT gradients
243
+
244
+ # For repeated training-loop use with warm-start bracket caching:
245
+ layer = TrainingLayer(warm_start=True)
246
+ for batch in dataloader:
247
+ h = layer(batch) # 1.8× faster after first call on CPU; ~10× on CUDA with compile=True
243
248
  ```
244
249
 
245
250
  ## Installation
@@ -249,7 +254,6 @@ pip install diffcb
249
254
  ```
250
255
 
251
256
  Or from source:
252
-
253
257
  ```bash
254
258
  git clone https://github.com/ryZhangHason/differentiable-critical-bandwidth
255
259
  cd differentiable-critical-bandwidth
@@ -258,58 +262,141 @@ pip install -e ".[dev]"
258
262
 
259
263
  ## Accuracy vs R's `bw.crit`
260
264
 
261
- DCB is validated against R's `multimode::bw.crit(data, mod0=1)` — the standard reference implementation of Hall & York (2001). On **identical data**:
265
+ Validated against R's `multimode::bw.crit(data, mod0=1)` (Hall & York 2001).
266
+ **Same-sample protocol** (identical data fed to both Python and R):
262
267
 
263
- | n | DCB vs R (same sample) | DCB vs R (independent samples) |
264
- |---|---|---|
265
- | 100K | **0.004%** | ~0.5% (MC noise from independent RNG) |
266
- | 1M | **0.005%** | ~0.2% |
267
- | 10M | **0.004%** | ~0.1% |
268
+ | n | DCB error vs R | Notes |
269
+ |---|---------------|-------|
270
+ | 5K–25K | **< 0.005%** | Direct-KDE path, zero histogram bias |
271
+ | 100K | **0.003%** | FFT histogram path, G=16384 |
272
+ | 1M | **0.003%** | FFT path |
273
+ | 10M | **0.003%** | FFT path |
274
+ | 100M+ | **< 0.01%** | Histogram-dominated; sketch available |
275
+
276
+ Independent-sample error (~0.2–0.5%) reflects natural sampling variability (two RNGs), not algorithmic error. The 0.003% algorithmic error is within a small factor (~3×) of R's own ~0.001% numerical noise floor.
277
+
278
+ ## Hardware Performance (v0.1.6)
279
+
280
+ | n | CPU (Apple M) | MPS | P100 GPU |
281
+ |---|:---:|:---:|:---:|
282
+ | 10K | 2,300 ms | 1,400 ms | **107 ms** |
283
+ | 50K | 2,900 ms | 1,700 ms | **167 ms** |
284
+ | 100K | 265 ms | 248 ms | **35 ms** |
285
+ | 1M | 269 ms | 189 ms | **36 ms** |
286
+ | 10M | 544 ms | — | **44 ms** |
287
+
288
+ P100 speedup: **43–116× vs CPU**. Peak 116× at n=50K (direct-KDE GPU parallelism).
289
+
290
+ Cumulative speedup vs v0.1.4 on CPU: 1.1× (100K), 1.7× (1M), **4.2× (10M)**.
268
291
 
269
- The independent-sample figures reflect natural sampling variability (two unbiased estimators drawing different data), not algorithmic error. On identical data, DCB agrees with R to within **0.005%** at all tested n. DCB is 43× faster than R at n=100M (1.1 s vs 50 s) and handles n=2B in 24 s while R OOMs.
292
+ ## API Reference
270
293
 
271
- ## Key Parameters
294
+ ### `DCBLayer`
272
295
 
273
296
  ```python
274
297
  DCBLayer(
275
- target_modes=1, # target number of modes
276
- G=512, # IFT evaluation grid points
277
- use_fft=True, # FFT forward (default); eliminates subsampling bias for n>50K
278
- max_n_exact=1_000_000,# sketch to sketch_size when n exceeds this (None = always exact)
279
- sketch_size=500_000, # sketch target; 500K matches full-n accuracy (O(n^{-2/9}) rate)
280
- safe_backward=False, # clamp IFT denominator near bifurcations
298
+ target_modes=1, # target number of modes (default 1)
299
+ use_fft=True, # FFT path for n > 50K (default True)
300
+ max_n_exact=None, # sketch above this n (None = always exact)
301
+ G_min=16384, # minimum FFT histogram bins (accuracy ↑ with G)
302
+ use_richardson="auto", # Richardson on CPU, off on GPU (30% accuracy gain on CPU)
303
+ direct_n_max=25_000, # direct-KDE active only when forward_path='auto'/'direct'
304
+ direct_M=2048, # direct-KDE evaluation grid size
305
+ forward_path='smooth', # 'smooth' (default, strictly differentiable) |
306
+ # 'auto' (direct-KDE at n≤25K, surrogate gradient) |
307
+ # 'direct' (force direct-KDE, accuracy benchmarks)
308
+ safe_backward=False, # clamp IFT denominator near bifurcations
281
309
  )
282
310
  ```
283
311
 
284
- ## Confirmed Experimental Results
312
+ ### `TrainingLayer` (for ML training loops)
313
+
314
+ ```python
315
+ from dcb import TrainingLayer
316
+
317
+ layer = TrainingLayer(
318
+ warm_start=True, # cache h_prev; init bracket to [0.95h, 1.05h] → 1.8× CPU speedup
319
+ compile=False, # torch.compile opt-in (requires float32, Python ≤ 3.11 on CUDA)
320
+ warm_margin=0.05, # bracket half-width around cached h_crit
321
+ **dcb_kwargs, # any DCBLayer parameter
322
+ )
323
+ layer.reset_cache() # call on distribution shift
324
+ ```
325
+
326
+ ### Direct-KDE path (n ≤ 25K)
285
327
 
286
- All GPU results produced on Kaggle (T4 / P100) see `experiments/` and `outputs/`.
328
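+ For small samples, DCB can evaluate f′_h directly without histogram binning (O(n·M) per evaluation, zero binning bias). This path is opt-in: it is used only with `forward_path='auto'` or `forward_path='direct'`, not with the default `forward_path='smooth'`. On CPU it is 3–4× slower, but on GPU it runs **80–96× faster than on CPU**.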
329
+
330
+ ```python
331
+ # Force direct-KDE for all n (accuracy benchmark):
332
+ layer = DCBLayer(forward_path='direct', direct_n_max=float('inf'))
333
+
334
+ # Disable direct-KDE (speed benchmark):
335
+ layer = DCBLayer(direct_n_max=0)
336
+ ```
337
+
338
+ ### Richardson extrapolation
339
+
340
+ By default (`use_richardson="auto"`), Richardson extrapolation is enabled on CPU and disabled on GPU. When enabled, DCB runs a second bisection at G/2=8192 and combines:
341
+ `h̃ = (4·ĥ(G) − ĥ(G/2)) / 3`, reducing error ~30%. On GPU this adds 38% overhead with
342
+ <0.01% accuracy gain — consider `use_richardson=False` for GPU training loops.
343
+
344
+ ## Known Limitations
345
+
346
+ - **`compile=True` on MPS**: blocked by float64 in `_refine_hcrit` fallback (fix in v0.1.7)
347
+ - **`compile=True` on CUDA with Python 3.12**: requires torch ≥ 2.4 or Python ≤ 3.11
348
+ - **`gradcheck`**: passes with the default `forward_path='smooth'`; the default is strictly differentiable at all n. Opt into `forward_path='auto'` only for forward-only accuracy benchmarks (surrogate gradient at n≤25K)
349
+ - **n > 100M**: requires streaming histogram (not yet public API); use `max_n_exact=1_000_000` sketch as workaround
350
+
351
+ ## Confirmed Experimental Results
287
352
 
288
353
  | Experiment | Result | Criterion |
289
354
  |---|---|---|
290
- | **Accuracy vs R (same data, n=100K)** | **0.004%** | < 0.01% ✓ |
291
- | **Validation (m≥2, Marron-Wand)** | R²=0.91, MAE=0.07, ρ=0.89 | R²≥0.85 ✓ |
292
- | **Speedup vs scipy (CUDA T4, n=8192)** | **10.5×** | ≥3× ✓ |
293
- | **GAN mode preservation** | h_crit=1.232 >> 0.3 | h_crit>0.3 ✓ |
294
- | **Anomaly AUC (KDDCup99)** | DCB=**0.9982** vs IF=0.9867 | DCB≥IF ✓ |
355
+ | Accuracy vs R (same data, n=100K) | **0.003%** | < 0.01% ✓ |
356
+ | Validation (m≥2, Marron-Wand) | R²=0.91, MAE=0.07, ρ=0.89 | R²≥0.85 ✓ |
357
+ | Speedup vs scipy (CUDA T4, n=8192) | **10.5×** | ≥3× ✓ |
358
+ | GAN mode preservation | h_crit=1.232 >> 0.3 | h_crit>0.3 ✓ |
359
+ | Anomaly AUC (KDDCup99) | DCB=**0.9982** vs IF=0.9867 | DCB≥IF ✓ |
360
+ | GPU speedup (P100, n=50K) | **116×** vs CPU | — |
361
+ | GPU speedup (P100, n=100K) | **43×** vs CPU | — |
362
+
363
+ ## Changelog
364
+
365
+ ### v0.1.6 (2026-05-30)
366
+ - `TrainingLayer`: warm-start bracket caching (1.82× CPU speedup in training loops)
367
+ - `direct_mode_count_batch`: direct-KDE path for n ≤ 25K (zero histogram bias; 80–96× GPU speedup)
368
+ - Compile-ready trisection: tensor lo/hi, no `.item()` inside loop, fixed 16-round unroll
369
+ - `mode_count_from_C_batch` returns `Tensor(B,)` (was `list[int]`) — enables torch.compile tracing
370
+
371
+ ### v0.1.5 (2026-05-29)
372
+ - Richardson extrapolation on h_crit scalar (30% accuracy gain, G=16384+8192)
373
+ - alloc/sync hygiene: removed `nonzero_mask` host sync (4.2× faster at n=10M)
374
+ - Batched trisection bisection (one irfft dispatch per round)
375
+ - Eliminated duplicate O(n) histogram in `_refine_hcrit` (C_external reuse)
376
+
377
+ ### v0.1.4 (2026-05-29)
378
+ - FFT histogram path: C hoisted out of bisection loop (Worker 1)
379
+ - Device-native histogram: CUDA histc, MPS scatter_add_, CPU bucketize+bincount
380
+ - float32 FFT default; pad_factor 4→2 (halves irfft size)
381
+ - Adaptive bisection early-exit
382
+
383
+ ### v0.1.1 (2026-05-29)
384
+ - MPS histc OOM bug fixed (bucketize+bincount)
385
+ - Sketch API: max_n_exact=1M, sketch_size=500K
386
+ - Domain consistency and bias warning fixes
295
387
 
296
388
  ## Repository Structure
297
389
 
298
390
  ```
299
- dcb/ Core PyTorch package
300
- layer.py DCBLayer nn.Module + DCBFunction autograd
301
- solver.py IFT root-finder and backward pass
302
- fft_kde.py FFT-based mode counter (MPS-safe, float64, G=16384)
303
- kde.py Direct KDE derivatives (small-n path)
304
- utils.py Grid, Silverman bandwidth, sg() stabilizer
305
- experiments/ Reproduction scripts for all paper figures and tables
306
- phase1_*.py Validation, speedup, ablation (Figures 1–2, S1–S2)
307
- phase2_gan.py GAN mode-collapse prevention (Figure 3)
308
- phase3_anomaly.py Anomaly detection (Table 2, Figure 5)
309
- round20_*.py Large-n R comparison and streaming benchmarks
310
- round21_*.py Accuracy improvement experiments
311
- tests/ Unit tests (pytest, 45 passed, 1 xfailed)
312
- outputs/ All generated figures and tables (PDFs, PNGs, CSVs)
391
+ dcb/
392
+ layer.py DCBLayer nn.Module + DCBFunction autograd
393
+ solver.py IFT root-finder, trisection bisection, Richardson pass
394
+ fft_kde.py FFT mode counter, direct_mode_count_batch, precompute_fft
395
+ training.py TrainingLayer with warm-start and compile support
396
+ kde.py Direct KDE derivatives (IFT backward path)
397
+ utils.py Grid, Silverman bandwidth, sg() stabiliser
398
+ experiments/ Reproduction scripts for all benchmarks and paper figures
399
+ tests/ Unit tests (45 passed, 1 xfailed)
313
400
  ```
314
401
 
315
402
  ## License
diffcb-0.1.8/README.md ADDED
@@ -0,0 +1,182 @@
1
+ # DCB — Differentiable Critical Bandwidth
2
+
3
+ [![PyPI](https://img.shields.io/pypi/v/diffcb.svg)](https://pypi.org/project/diffcb/)
4
+ [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE)
5
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/)
6
+
7
+ A PyTorch package that makes **Silverman's critical bandwidth test (1981)** fully differentiable, enabling end-to-end gradient-based optimisation over the modal structure of continuous distributions.
8
+
9
+ ## Overview
10
+
11
+ `h_crit` is the minimum KDE bandwidth at which a distribution appears unimodal — a classical nonparametric statistic for modality testing. DCB replaces every non-differentiable step with a smooth surrogate, then uses the **Implicit Function Theorem** (IFT) to compute exact gradients through the root-finding step at O(1) memory cost.
12
+
13
+ ```python
14
+ import torch
15
+ from dcb import DCBLayer, TrainingLayer
16
+
17
+ X = torch.randn(10_000, requires_grad=True) # 1D samples, any n from 5K to 1B
18
+ layer = DCBLayer()
19
+ h_crit = layer(X) # differentiable scalar
20
+ h_crit.backward() # exact IFT gradients
21
+
22
+ # For repeated training-loop use with warm-start bracket caching:
23
+ layer = TrainingLayer(warm_start=True)
24
+ for batch in dataloader:
25
+ h = layer(batch) # 1.8× faster after first call on CPU; ~10× on CUDA with compile=True
26
+ ```
27
+
28
+ ## Installation
29
+
30
+ ```bash
31
+ pip install diffcb
32
+ ```
33
+
34
+ Or from source:
35
+ ```bash
36
+ git clone https://github.com/ryZhangHason/differentiable-critical-bandwidth
37
+ cd differentiable-critical-bandwidth
38
+ pip install -e ".[dev]"
39
+ ```
40
+
41
+ ## Accuracy vs R's `bw.crit`
42
+
43
+ Validated against R's `multimode::bw.crit(data, mod0=1)` (Hall & York 2001).
44
+ **Same-sample protocol** (identical data fed to both Python and R):
45
+
46
+ | n | DCB error vs R | Notes |
47
+ |---|---------------|-------|
48
+ | 5K–25K | **< 0.005%** | Direct-KDE path, zero histogram bias |
49
+ | 100K | **0.003%** | FFT histogram path, G=16384 |
50
+ | 1M | **0.003%** | FFT path |
51
+ | 10M | **0.003%** | FFT path |
52
+ | 100M+ | **< 0.01%** | Histogram-dominated; sketch available |
53
+
54
+ Independent-sample error (~0.2–0.5%) reflects natural sampling variability (two RNGs), not algorithmic error. The 0.003% algorithmic error is within a small factor (~3×) of R's own ~0.001% numerical noise floor.
55
+
56
+ ## Hardware Performance (v0.1.6)
57
+
58
+ | n | CPU (Apple M) | MPS | P100 GPU |
59
+ |---|:---:|:---:|:---:|
60
+ | 10K | 2,300 ms | 1,400 ms | **107 ms** |
61
+ | 50K | 2,900 ms | 1,700 ms | **167 ms** |
62
+ | 100K | 265 ms | 248 ms | **35 ms** |
63
+ | 1M | 269 ms | 189 ms | **36 ms** |
64
+ | 10M | 544 ms | — | **44 ms** |
65
+
66
+ P100 speedup: **43–116× vs CPU**. Peak 116× at n=50K (direct-KDE GPU parallelism).
67
+
68
+ Cumulative speedup vs v0.1.4 on CPU: 1.1× (100K), 1.7× (1M), **4.2× (10M)**.
69
+
70
+ ## API Reference
71
+
72
+ ### `DCBLayer`
73
+
74
+ ```python
75
+ DCBLayer(
76
+ target_modes=1, # target number of modes (default 1)
77
+ use_fft=True, # FFT path for n > 50K (default True)
78
+ max_n_exact=None, # sketch above this n (None = always exact)
79
+ G_min=16384, # minimum FFT histogram bins (accuracy ↑ with G)
80
+ use_richardson="auto", # Richardson on CPU, off on GPU (30% accuracy gain on CPU)
81
+ direct_n_max=25_000, # direct-KDE active only when forward_path='auto'/'direct'
82
+ direct_M=2048, # direct-KDE evaluation grid size
83
+ forward_path='smooth', # 'smooth' (default, strictly differentiable) |
84
+ # 'auto' (direct-KDE at n≤25K, surrogate gradient) |
85
+ # 'direct' (force direct-KDE, accuracy benchmarks)
86
+ safe_backward=False, # clamp IFT denominator near bifurcations
87
+ )
88
+ ```
89
+
90
+ ### `TrainingLayer` (for ML training loops)
91
+
92
+ ```python
93
+ from dcb import TrainingLayer
94
+
95
+ layer = TrainingLayer(
96
+ warm_start=True, # cache h_prev; init bracket to [0.95h, 1.05h] → 1.8× CPU speedup
97
+ compile=False, # torch.compile opt-in (requires float32, Python ≤ 3.11 on CUDA)
98
+ warm_margin=0.05, # bracket half-width around cached h_crit
99
+ **dcb_kwargs, # any DCBLayer parameter
100
+ )
101
+ layer.reset_cache() # call on distribution shift
102
+ ```
103
+
104
+ ### Direct-KDE path (n ≤ 25K)
105
+
106
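+ For small samples, DCB can evaluate f′_h directly without histogram binning (O(n·M) per evaluation, zero binning bias). This path is opt-in: it is used only with `forward_path='auto'` or `forward_path='direct'`, not with the default `forward_path='smooth'`. On CPU it is 3–4× slower, but on GPU it runs **80–96× faster than on CPU**.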
107
+
108
+ ```python
109
+ # Force direct-KDE for all n (accuracy benchmark):
110
+ layer = DCBLayer(forward_path='direct', direct_n_max=float('inf'))
111
+
112
+ # Disable direct-KDE (speed benchmark):
113
+ layer = DCBLayer(direct_n_max=0)
114
+ ```
115
+
116
+ ### Richardson extrapolation
117
+
118
+ By default (`use_richardson="auto"`), Richardson extrapolation is enabled on CPU and disabled on GPU. When enabled, DCB runs a second bisection at G/2=8192 and combines:
119
+ `h̃ = (4·ĥ(G) − ĥ(G/2)) / 3`, reducing error ~30%. On GPU this adds 38% overhead with
120
+ <0.01% accuracy gain — consider `use_richardson=False` for GPU training loops.
121
+
122
+ ## Known Limitations
123
+
124
+ - **`compile=True` on MPS**: blocked by float64 in `_refine_hcrit` fallback (fix in v0.1.7)
125
+ - **`compile=True` on CUDA with Python 3.12**: requires torch ≥ 2.4 or Python ≤ 3.11
126
+ - **`gradcheck`**: passes with the default `forward_path='smooth'`; the default is strictly differentiable at all n. Opt into `forward_path='auto'` only for forward-only accuracy benchmarks (surrogate gradient at n≤25K)
127
+ - **n > 100M**: requires streaming histogram (not yet public API); use `max_n_exact=1_000_000` sketch as workaround
128
+
129
+ ## Confirmed Experimental Results
130
+
131
+ | Experiment | Result | Criterion |
132
+ |---|---|---|
133
+ | Accuracy vs R (same data, n=100K) | **0.003%** | < 0.01% ✓ |
134
+ | Validation (m≥2, Marron-Wand) | R²=0.91, MAE=0.07, ρ=0.89 | R²≥0.85 ✓ |
135
+ | Speedup vs scipy (CUDA T4, n=8192) | **10.5×** | ≥3× ✓ |
136
+ | GAN mode preservation | h_crit=1.232 >> 0.3 | h_crit>0.3 ✓ |
137
+ | Anomaly AUC (KDDCup99) | DCB=**0.9982** vs IF=0.9867 | DCB≥IF ✓ |
138
+ | GPU speedup (P100, n=50K) | **116×** vs CPU | — |
139
+ | GPU speedup (P100, n=100K) | **43×** vs CPU | — |
140
+
141
+ ## Changelog
142
+
143
+ ### v0.1.6 (2026-05-30)
144
+ - `TrainingLayer`: warm-start bracket caching (1.82× CPU speedup in training loops)
145
+ - `direct_mode_count_batch`: direct-KDE path for n ≤ 25K (zero histogram bias; 80–96× GPU speedup)
146
+ - Compile-ready trisection: tensor lo/hi, no `.item()` inside loop, fixed 16-round unroll
147
+ - `mode_count_from_C_batch` returns `Tensor(B,)` (was `list[int]`) — enables torch.compile tracing
148
+
149
+ ### v0.1.5 (2026-05-29)
150
+ - Richardson extrapolation on h_crit scalar (30% accuracy gain, G=16384+8192)
151
+ - alloc/sync hygiene: removed `nonzero_mask` host sync (4.2× faster at n=10M)
152
+ - Batched trisection bisection (one irfft dispatch per round)
153
+ - Eliminated duplicate O(n) histogram in `_refine_hcrit` (C_external reuse)
154
+
155
+ ### v0.1.4 (2026-05-29)
156
+ - FFT histogram path: C hoisted out of bisection loop (Worker 1)
157
+ - Device-native histogram: CUDA histc, MPS scatter_add_, CPU bucketize+bincount
158
+ - float32 FFT default; pad_factor 4→2 (halves irfft size)
159
+ - Adaptive bisection early-exit
160
+
161
+ ### v0.1.1 (2026-05-29)
162
+ - MPS histc OOM bug fixed (bucketize+bincount)
163
+ - Sketch API: max_n_exact=1M, sketch_size=500K
164
+ - Domain consistency and bias warning fixes
165
+
166
+ ## Repository Structure
167
+
168
+ ```
169
+ dcb/
170
+ layer.py DCBLayer nn.Module + DCBFunction autograd
171
+ solver.py IFT root-finder, trisection bisection, Richardson pass
172
+ fft_kde.py FFT mode counter, direct_mode_count_batch, precompute_fft
173
+ training.py TrainingLayer with warm-start and compile support
174
+ kde.py Direct KDE derivatives (IFT backward path)
175
+ utils.py Grid, Silverman bandwidth, sg() stabiliser
176
+ experiments/ Reproduction scripts for all benchmarks and paper figures
177
+ tests/ Unit tests (45 passed, 1 xfailed)
178
+ ```
179
+
180
+ ## License
181
+
182
+ Apache 2.0 — see [LICENSE](LICENSE).
@@ -21,4 +21,4 @@ __all__ = [
21
21
  "TrainingLayer",
22
22
  "anneal_eps_tau", "soft_mode_count_cross", "soft_mode_count",
23
23
  ]
24
- __version__ = "0.1.6"
24
+ __version__ = "0.1.8"
@@ -248,6 +248,7 @@ def _refine_hcrit(
248
248
  pad_factor: int = 2, # Worker 5: pad_factor=2 (was 4) — safe for h ≤ 3σ, halves irfft size
249
249
  C_external: Tensor | None = None,
250
250
  omega_external: Tensor | None = None,
251
+ fft_dtype: torch.dtype = torch.float32,
251
252
  ) -> float:
252
253
  """Sub-bin quadratic refinement of h_crit after bisection converges.
253
254
 
@@ -298,13 +299,13 @@ def _refine_hcrit(
298
299
  C_ref = C_external
299
300
  omega_base = omega_external
300
301
  else:
301
- # Fallback: build float64 histogram + rfft (original behaviour).
302
+ # Fallback: build histogram + rfft, respecting fft_dtype (avoids float64 on MPS).
302
303
  with torch.no_grad():
303
304
  counts = _histogram_on_device(X, G, lo_d, hi_d).cpu()
304
- counts_padded = torch.zeros(N, dtype=torch.float64)
305
- counts_padded[:G] = counts.double()
305
+ counts_padded = torch.zeros(N, dtype=fft_dtype)
306
+ counts_padded[:G] = counts.to(fft_dtype)
306
307
  C_ref = torch.fft.rfft(counts_padded)
307
- k = torch.arange(N // 2 + 1, dtype=torch.float64)
308
+ k = torch.arange(N // 2 + 1, dtype=fft_dtype)
308
309
  omega_base = 2 * math.pi * k / (N * bw)
309
310
 
310
311
  def fprime(h: float) -> Tensor:
@@ -454,6 +455,113 @@ def direct_mode_count_batch(
454
455
  return counts
455
456
 
456
457
 
458
+ def precompute_fft_batch(
459
+ xs: list,
460
+ G: int = 4096,
461
+ pad_factor: int = 2,
462
+ fft_dtype: torch.dtype = torch.float32,
463
+ ) -> tuple:
464
+ """Precompute FFT spectra for K independent distributions in one batched rfft.
465
+
466
+ For K distributions sharing G and pad_factor, builds one histogram per
467
+ distribution, stacks into (K, N), and runs a single batched rfft —
468
+ amortising per-call Python dispatch overhead.
469
+
470
+ Parameters
471
+ ----------
472
+ xs : list of Tensor, each shape (n_k,)
473
+ G, pad_factor, fft_dtype : same as precompute_fft
474
+
475
+ Returns
476
+ -------
477
+ C_batch : Tensor (K, N//2+1), complex
478
+ omega_batch : Tensor (K, N//2+1), float — per-k frequency grids
479
+ domains : list of (lo_k, hi_k)
480
+ N : int
481
+ any_degenerate : list of bool, length K
482
+ """
483
+ K = len(xs)
484
+ N = pad_factor * G
485
+ device = xs[0].device if K > 0 else torch.device('cpu')
486
+
487
+ domains: list = []
488
+ any_degenerate: list = []
489
+ counts_list: list = []
490
+
491
+ for x_k in xs:
492
+ sigma = x_k.std().item()
493
+ if sigma == 0.0:
494
+ sigma = 1.0
495
+ lo_k = x_k.min().item() - 3 * sigma
496
+ hi_k = x_k.max().item() + 3 * sigma
497
+ data_range_k = hi_k - lo_k
498
+ domains.append((lo_k, hi_k))
499
+ if data_range_k == 0.0:
500
+ any_degenerate.append(True)
501
+ counts_list.append(torch.zeros(G, dtype=fft_dtype, device=device))
502
+ else:
503
+ any_degenerate.append(False)
504
+ counts_list.append(_histogram_on_device(x_k, G, lo_k, hi_k).to(fft_dtype))
505
+
506
+ counts_batch = torch.zeros(K, N, dtype=fft_dtype, device=device)
507
+ for k, c in enumerate(counts_list):
508
+ counts_batch[k, :G] = c
509
+
510
+ C_batch = torch.fft.rfft(counts_batch, dim=-1) # (K, N//2+1)
511
+
512
+ M = N // 2 + 1
513
+ omega_batch = torch.zeros(K, M, dtype=fft_dtype, device=device)
514
+ k_freq = torch.arange(M, device=device, dtype=fft_dtype)
515
+ for k, (lo_k, hi_k) in enumerate(domains):
516
+ data_range_k = hi_k - lo_k
517
+ if data_range_k > 0.0:
518
+ bin_width_k = data_range_k / G
519
+ omega_batch[k] = 2 * math.pi * k_freq / (N * bin_width_k)
520
+
521
+ return C_batch, omega_batch, domains, N, any_degenerate
522
+
523
+
524
+ def mode_count_K_batch(
525
+ C_batch: Tensor,
526
+ omega_batch: Tensor,
527
+ h1: Tensor,
528
+ h2: Tensor,
529
+ G: int,
530
+ N: int,
531
+ any_degenerate: list,
532
+ ) -> tuple:
533
+ """Mode counts for K distributions × 2 bandwidths in one batched irfft.
534
+
535
+ Replaces 2K separate irfft calls per trisection round with a single
536
+ (K, 2, M) dispatch — the core of the forward_batched speedup.
537
+
538
+ Parameters
539
+ ----------
540
+ C_batch : Tensor (K, M), complex
541
+ omega_batch : Tensor (K, M), float — per-distribution frequency grids
542
+ h1, h2 : Tensor (K,) — two trisection interior points per distribution
543
+ G, N : int
544
+ any_degenerate : list[bool]
545
+
546
+ Returns
547
+ -------
548
+ c1, c2 : Tensor (K,), long
549
+ """
550
+ K, M = C_batch.shape
551
+ h_eval = torch.stack([h1, h2], dim=1) # (K, 2)
552
+ omega_h = omega_batch.unsqueeze(1) * h_eval.unsqueeze(-1) # (K, 2, M)
553
+ K_deriv = 1j * omega_batch.unsqueeze(1) * torch.exp(-0.5 * omega_h ** 2) # (K, 2, M)
554
+ C_exp = C_batch.unsqueeze(1).expand(K, 2, M) # (K, 2, M)
555
+ f_prime = torch.fft.irfft(C_exp * K_deriv, n=N, dim=-1)[:, :, :G] # (K, 2, G)
556
+ counts = ((f_prime[:, :, :-1] > 0) & (f_prime[:, :, 1:] < 0)).sum(dim=-1) # (K, 2)
557
+ c1, c2 = counts[:, 0], counts[:, 1]
558
+ for k, degen in enumerate(any_degenerate):
559
+ if degen:
560
+ c1[k] = 1
561
+ c2[k] = 1
562
+ return c1, c2
563
+
564
+
457
565
  def adaptive_fft_G(data_range: float, h_hi: float, G_min: int = 16384) -> int:
458
566
  """Choose FFT grid size G so that the derivative kernel is well-resolved.
459
567