diffcb 0.1.6__tar.gz → 0.1.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diffcb-0.1.6 → diffcb-0.1.8}/PKG-INFO +131 -44
- diffcb-0.1.8/README.md +182 -0
- {diffcb-0.1.6 → diffcb-0.1.8}/dcb/__init__.py +1 -1
- {diffcb-0.1.6 → diffcb-0.1.8}/dcb/fft_kde.py +112 -4
- {diffcb-0.1.6 → diffcb-0.1.8}/dcb/layer.py +83 -5
- {diffcb-0.1.6 → diffcb-0.1.8}/dcb/solver.py +14 -2
- diffcb-0.1.8/dcb/training.py +475 -0
- {diffcb-0.1.6 → diffcb-0.1.8}/pyproject.toml +1 -1
- diffcb-0.1.8/round24_v016_test.py +469 -0
- diffcb-0.1.8/round25_full_range_sweep.py +500 -0
- diffcb-0.1.8/round25_write_csv.py +137 -0
- diffcb-0.1.8/tests/test_gradcheck.py +104 -0
- {diffcb-0.1.6 → diffcb-0.1.8}/tests/test_layer.py +9 -6
- diffcb-0.1.6/README.md +0 -95
- diffcb-0.1.6/dcb/training.py +0 -231
- {diffcb-0.1.6 → diffcb-0.1.8}/.gitignore +0 -0
- {diffcb-0.1.6 → diffcb-0.1.8}/.zenodo.json +0 -0
- {diffcb-0.1.6 → diffcb-0.1.8}/LICENSE +0 -0
- {diffcb-0.1.6 → diffcb-0.1.8}/dcb/diagnostics.py +0 -0
- {diffcb-0.1.6 → diffcb-0.1.8}/dcb/kde.py +0 -0
- {diffcb-0.1.6 → diffcb-0.1.8}/dcb/utils.py +0 -0
- {diffcb-0.1.6 → diffcb-0.1.8}/notebooks/.gitkeep +0 -0
- {diffcb-0.1.6 → diffcb-0.1.8}/round24_cumulative_bench.py +0 -0
- {diffcb-0.1.6 → diffcb-0.1.8}/tests/test_kde.py +0 -0
- {diffcb-0.1.6 → diffcb-0.1.8}/tests/test_r18c_denom_audit.py +0 -0
- {diffcb-0.1.6 → diffcb-0.1.8}/tests/test_r18c_deprecation_warn.py +0 -0
- {diffcb-0.1.6 → diffcb-0.1.8}/tests/test_r19_default_fft.py +0 -0
- {diffcb-0.1.6 → diffcb-0.1.8}/tests/test_r19_diagnostics.py +0 -0
- {diffcb-0.1.6 → diffcb-0.1.8}/tests/test_solver.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: diffcb
|
|
3
|
-
Version: 0.1.6
|
|
3
|
+
Version: 0.1.8
|
|
4
4
|
Summary: Differentiable Critical Bandwidth: Silverman's modality test as a differentiable PyTorch layer with IFT backward pass.
|
|
5
5
|
Project-URL: Homepage, https://github.com/ryZhangHason/differentiable-critical-bandwidth
|
|
6
6
|
Project-URL: Repository, https://github.com/ryZhangHason/differentiable-critical-bandwidth
|
|
@@ -226,20 +226,25 @@ Description-Content-Type: text/markdown
|
|
|
226
226
|
[](LICENSE)
|
|
227
227
|
[](https://www.python.org/)
|
|
228
228
|
|
|
229
|
-
A PyTorch package that makes **Silverman's critical bandwidth test (1981)** fully differentiable, enabling end-to-end gradient-based
|
|
229
|
+
A PyTorch package that makes **Silverman's critical bandwidth test (1981)** fully differentiable, enabling end-to-end gradient-based optimisation over the modal structure of continuous distributions.
|
|
230
230
|
|
|
231
231
|
## Overview
|
|
232
232
|
|
|
233
|
-
|
|
233
|
+
`h_crit` is the minimum KDE bandwidth at which a distribution appears unimodal — a classical nonparametric statistic for modality testing. DCB replaces every non-differentiable step with a smooth surrogate, then uses the **Implicit Function Theorem** (IFT) to compute exact gradients through the root-finding step at O(1) memory cost.
|
|
234
234
|
|
|
235
235
|
```python
|
|
236
236
|
import torch
|
|
237
|
-
from dcb import DCBLayer
|
|
237
|
+
from dcb import DCBLayer, TrainingLayer
|
|
238
238
|
|
|
239
|
-
X = torch.randn(
|
|
240
|
-
layer = DCBLayer(
|
|
241
|
-
h_crit = layer(X)
|
|
242
|
-
h_crit.backward()
|
|
239
|
+
X = torch.randn(10_000, requires_grad=True) # 1D samples, any n from 5K to 1B
|
|
240
|
+
layer = DCBLayer()
|
|
241
|
+
h_crit = layer(X) # differentiable scalar
|
|
242
|
+
h_crit.backward() # exact IFT gradients
|
|
243
|
+
|
|
244
|
+
# For repeated training-loop use with warm-start bracket caching:
|
|
245
|
+
layer = TrainingLayer(warm_start=True)
|
|
246
|
+
for batch in dataloader:
|
|
247
|
+
h = layer(batch) # 1.8× faster after first call on CPU; ~10× on CUDA with compile=True
|
|
243
248
|
```
|
|
244
249
|
|
|
245
250
|
## Installation
|
|
@@ -249,7 +254,6 @@ pip install diffcb
|
|
|
249
254
|
```
|
|
250
255
|
|
|
251
256
|
Or from source:
|
|
252
|
-
|
|
253
257
|
```bash
|
|
254
258
|
git clone https://github.com/ryZhangHason/differentiable-critical-bandwidth
|
|
255
259
|
cd differentiable-critical-bandwidth
|
|
@@ -258,58 +262,141 @@ pip install -e ".[dev]"
|
|
|
258
262
|
|
|
259
263
|
## Accuracy vs R's `bw.crit`
|
|
260
264
|
|
|
261
|
-
|
|
265
|
+
Validated against R's `multimode::bw.crit(data, mod0=1)` (Hall & York 2001).
|
|
266
|
+
**Same-sample protocol** (identical data fed to both Python and R):
|
|
262
267
|
|
|
263
|
-
| n | DCB vs R
|
|
264
|
-
|
|
265
|
-
|
|
|
266
|
-
|
|
|
267
|
-
|
|
|
268
|
+
| n | DCB error vs R | Notes |
|
|
269
|
+
|---|---------------|-------|
|
|
270
|
+
| 5K–25K | **< 0.005%** | Direct-KDE path, zero histogram bias |
|
|
271
|
+
| 100K | **0.003%** | FFT histogram path, G=16384 |
|
|
272
|
+
| 1M | **0.003%** | FFT path |
|
|
273
|
+
| 10M | **0.003%** | FFT path |
|
|
274
|
+
| 100M+ | **< 0.01%** | Histogram-dominated; sketch available |
|
|
275
|
+
|
|
276
|
+
Independent-sample error (~0.2–0.5%) reflects natural sampling variability (two RNGs), not algorithmic error. The 0.003% algorithmic error is of the same order as R's own ~0.001% numerical noise floor.
|
|
277
|
+
|
|
278
|
+
## Hardware Performance (v0.1.6)
|
|
279
|
+
|
|
280
|
+
| n | CPU (Apple M) | MPS | P100 GPU |
|
|
281
|
+
|---|:---:|:---:|:---:|
|
|
282
|
+
| 10K | 2,300 ms | 1,400 ms | **107 ms** |
|
|
283
|
+
| 50K | 2,900 ms | 1,700 ms | **167 ms** |
|
|
284
|
+
| 100K | 265 ms | 248 ms | **35 ms** |
|
|
285
|
+
| 1M | 269 ms | 189 ms | **36 ms** |
|
|
286
|
+
| 10M | 544 ms | — | **44 ms** |
|
|
287
|
+
|
|
288
|
+
P100 speedup: **43–116× vs CPU**. Peak 116× at n=50K (direct-KDE GPU parallelism).
|
|
289
|
+
|
|
290
|
+
Cumulative speedup vs v0.1.4 on CPU: 1.1× (100K), 1.7× (1M), **4.2× (10M)**.
|
|
268
291
|
|
|
269
|
-
|
|
292
|
+
## API Reference
|
|
270
293
|
|
|
271
|
-
|
|
294
|
+
### `DCBLayer`
|
|
272
295
|
|
|
273
296
|
```python
|
|
274
297
|
DCBLayer(
|
|
275
|
-
target_modes=1,
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
298
|
+
target_modes=1, # target number of modes (default 1)
|
|
299
|
+
use_fft=True, # FFT path for n > 50K (default True)
|
|
300
|
+
max_n_exact=None, # sketch above this n (None = always exact)
|
|
301
|
+
G_min=16384, # minimum FFT histogram bins (accuracy ↑ with G)
|
|
302
|
+
use_richardson="auto", # Richardson on CPU, off on GPU (30% accuracy gain on CPU)
|
|
303
|
+
direct_n_max=25_000, # direct-KDE active only when forward_path='auto'/'direct'
|
|
304
|
+
direct_M=2048, # direct-KDE evaluation grid size
|
|
305
|
+
forward_path='smooth', # 'smooth' (default, strictly differentiable) |
|
|
306
|
+
# 'auto' (direct-KDE at n≤25K, surrogate gradient) |
|
|
307
|
+
# 'direct' (force direct-KDE, accuracy benchmarks)
|
|
308
|
+
safe_backward=False, # clamp IFT denominator near bifurcations
|
|
281
309
|
)
|
|
282
310
|
```
|
|
283
311
|
|
|
284
|
-
|
|
312
|
+
### `TrainingLayer` (for ML training loops)
|
|
313
|
+
|
|
314
|
+
```python
|
|
315
|
+
from dcb import TrainingLayer
|
|
316
|
+
|
|
317
|
+
layer = TrainingLayer(
|
|
318
|
+
warm_start=True, # cache h_prev; init bracket to [0.95h, 1.05h] → 1.8× CPU speedup
|
|
319
|
+
compile=False, # torch.compile opt-in (requires float32, Python ≤ 3.11 on CUDA)
|
|
320
|
+
warm_margin=0.05, # bracket half-width around cached h_crit
|
|
321
|
+
**dcb_kwargs, # any DCBLayer parameter
|
|
322
|
+
)
|
|
323
|
+
layer.reset_cache() # call on distribution shift
|
|
324
|
+
```
|
|
325
|
+
|
|
326
|
+
### Direct-KDE path (n ≤ 25K)
|
|
285
327
|
|
|
286
|
-
|
|
328
|
+
For small samples, DCB evaluates f′_h directly without histogram binning (O(n·M) per evaluation, zero binning bias). On CPU this is 3–4× slower, but on GPU it runs **80–96× faster than on CPU**.
|
|
329
|
+
|
|
330
|
+
```python
|
|
331
|
+
# Force direct-KDE for all n (accuracy benchmark):
|
|
332
|
+
layer = DCBLayer(direct_n_max=float('inf'))
|
|
333
|
+
|
|
334
|
+
# Disable direct-KDE (speed benchmark):
|
|
335
|
+
layer = DCBLayer(direct_n_max=0)
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
### Richardson extrapolation
|
|
339
|
+
|
|
340
|
+
When Richardson extrapolation is enabled (`use_richardson=True`, or `"auto"` on CPU), DCB runs a second bisection at G/2=8192 and combines:
|
|
341
|
+
`h̃ = (4·ĥ(G) − ĥ(G/2)) / 3`, reducing error ~30%. On GPU this adds 38% overhead with
|
|
342
|
+
<0.01% accuracy gain — consider `use_richardson=False` for GPU training loops.
|
|
343
|
+
|
|
344
|
+
## Known Limitations
|
|
345
|
+
|
|
346
|
+
- **`compile=True` on MPS**: blocked by float64 in `_refine_hcrit` fallback (fix in v0.1.7)
|
|
347
|
+
- **`compile=True` on CUDA with Python 3.12**: requires torch ≥ 2.4 or Python ≤ 3.11
|
|
348
|
+
- **`gradcheck`**: passes with the default `forward_path='smooth'`; the default is strictly differentiable at all n. Opt into `forward_path='auto'` only for forward-only accuracy benchmarks (surrogate gradient at n≤25K)
|
|
349
|
+
- **n > 100M**: requires streaming histogram (not yet public API); use `max_n_exact=1_000_000` sketch as workaround
|
|
350
|
+
|
|
351
|
+
## Confirmed Experimental Results
|
|
287
352
|
|
|
288
353
|
| Experiment | Result | Criterion |
|
|
289
354
|
|---|---|---|
|
|
290
|
-
|
|
|
291
|
-
|
|
|
292
|
-
|
|
|
293
|
-
|
|
|
294
|
-
|
|
|
355
|
+
| Accuracy vs R (same data, n=100K) | **0.003%** | < 0.01% ✓ |
|
|
356
|
+
| Validation (m≥2, Marron-Wand) | R²=0.91, MAE=0.07, ρ=0.89 | R²≥0.85 ✓ |
|
|
357
|
+
| Speedup vs scipy (CUDA T4, n=8192) | **10.5×** | ≥3× ✓ |
|
|
358
|
+
| GAN mode preservation | h_crit=1.232 >> 0.3 | h_crit>0.3 ✓ |
|
|
359
|
+
| Anomaly AUC (KDDCup99) | DCB=**0.9982** vs IF=0.9867 | DCB≥IF ✓ |
|
|
360
|
+
| GPU speedup (P100, n=50K) | **116×** vs CPU | — |
|
|
361
|
+
| GPU speedup (P100, n=100K) | **43×** vs CPU | — |
|
|
362
|
+
|
|
363
|
+
## Changelog
|
|
364
|
+
|
|
365
|
+
### v0.1.6 (2026-05-30)
|
|
366
|
+
- `TrainingLayer`: warm-start bracket caching (1.82× CPU speedup in training loops)
|
|
367
|
+
- `direct_mode_count_batch`: direct-KDE path for n ≤ 25K (zero histogram bias; 80–96× GPU speedup)
|
|
368
|
+
- Compile-ready trisection: tensor lo/hi, no `.item()` inside loop, fixed 16-round unroll
|
|
369
|
+
- `mode_count_from_C_batch` returns `Tensor(B,)` (was `list[int]`) — enables torch.compile tracing
|
|
370
|
+
|
|
371
|
+
### v0.1.5 (2026-05-29)
|
|
372
|
+
- Richardson extrapolation on h_crit scalar (30% accuracy gain, G=16384+8192)
|
|
373
|
+
- alloc/sync hygiene: removed `nonzero_mask` host sync (4.2× faster at n=10M)
|
|
374
|
+
- Batched trisection bisection (one irfft dispatch per round)
|
|
375
|
+
- Eliminated duplicate O(n) histogram in `_refine_hcrit` (C_external reuse)
|
|
376
|
+
|
|
377
|
+
### v0.1.4 (2026-05-29)
|
|
378
|
+
- FFT histogram path: C hoisted out of bisection loop (Worker 1)
|
|
379
|
+
- Device-native histogram: CUDA histc, MPS scatter_add_, CPU bucketize+bincount
|
|
380
|
+
- float32 FFT default; pad_factor 4→2 (halves irfft size)
|
|
381
|
+
- Adaptive bisection early-exit
|
|
382
|
+
|
|
383
|
+
### v0.1.1 (2026-05-29)
|
|
384
|
+
- MPS histc OOM bug fixed (bucketize+bincount)
|
|
385
|
+
- Sketch API: max_n_exact=1M, sketch_size=500K
|
|
386
|
+
- Domain consistency and bias warning fixes
|
|
295
387
|
|
|
296
388
|
## Repository Structure
|
|
297
389
|
|
|
298
390
|
```
|
|
299
|
-
dcb/
|
|
300
|
-
layer.py
|
|
301
|
-
solver.py
|
|
302
|
-
fft_kde.py
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
phase3_anomaly.py Anomaly detection (Table 2, Figure 5)
|
|
309
|
-
round20_*.py Large-n R comparison and streaming benchmarks
|
|
310
|
-
round21_*.py Accuracy improvement experiments
|
|
311
|
-
tests/ Unit tests (pytest, 45 passed, 1 xfailed)
|
|
312
|
-
outputs/ All generated figures and tables (PDFs, PNGs, CSVs)
|
|
391
|
+
dcb/
|
|
392
|
+
layer.py DCBLayer nn.Module + DCBFunction autograd
|
|
393
|
+
solver.py IFT root-finder, trisection bisection, Richardson pass
|
|
394
|
+
fft_kde.py FFT mode counter, direct_mode_count_batch, precompute_fft
|
|
395
|
+
training.py TrainingLayer with warm-start and compile support
|
|
396
|
+
kde.py Direct KDE derivatives (IFT backward path)
|
|
397
|
+
utils.py Grid, Silverman bandwidth, sg() stabiliser
|
|
398
|
+
experiments/ Reproduction scripts for all benchmarks and paper figures
|
|
399
|
+
tests/ Unit tests (45 passed, 1 xfailed)
|
|
313
400
|
```
|
|
314
401
|
|
|
315
402
|
## License
|
diffcb-0.1.8/README.md
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
# DCB — Differentiable Critical Bandwidth
|
|
2
|
+
|
|
3
|
+
[](https://pypi.org/project/diffcb/)
|
|
4
|
+
[](LICENSE)
|
|
5
|
+
[](https://www.python.org/)
|
|
6
|
+
|
|
7
|
+
A PyTorch package that makes **Silverman's critical bandwidth test (1981)** fully differentiable, enabling end-to-end gradient-based optimisation over the modal structure of continuous distributions.
|
|
8
|
+
|
|
9
|
+
## Overview
|
|
10
|
+
|
|
11
|
+
`h_crit` is the minimum KDE bandwidth at which a distribution appears unimodal — a classical nonparametric statistic for modality testing. DCB replaces every non-differentiable step with a smooth surrogate, then uses the **Implicit Function Theorem** (IFT) to compute exact gradients through the root-finding step at O(1) memory cost.
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
import torch
|
|
15
|
+
from dcb import DCBLayer, TrainingLayer
|
|
16
|
+
|
|
17
|
+
X = torch.randn(10_000, requires_grad=True) # 1D samples, any n from 5K to 1B
|
|
18
|
+
layer = DCBLayer()
|
|
19
|
+
h_crit = layer(X) # differentiable scalar
|
|
20
|
+
h_crit.backward() # exact IFT gradients
|
|
21
|
+
|
|
22
|
+
# For repeated training-loop use with warm-start bracket caching:
|
|
23
|
+
layer = TrainingLayer(warm_start=True)
|
|
24
|
+
for batch in dataloader:
|
|
25
|
+
h = layer(batch) # 1.8× faster after first call on CPU; ~10× on CUDA with compile=True
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install diffcb
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Or from source:
|
|
35
|
+
```bash
|
|
36
|
+
git clone https://github.com/ryZhangHason/differentiable-critical-bandwidth
|
|
37
|
+
cd differentiable-critical-bandwidth
|
|
38
|
+
pip install -e ".[dev]"
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Accuracy vs R's `bw.crit`
|
|
42
|
+
|
|
43
|
+
Validated against R's `multimode::bw.crit(data, mod0=1)` (Hall & York 2001).
|
|
44
|
+
**Same-sample protocol** (identical data fed to both Python and R):
|
|
45
|
+
|
|
46
|
+
| n | DCB error vs R | Notes |
|
|
47
|
+
|---|---------------|-------|
|
|
48
|
+
| 5K–25K | **< 0.005%** | Direct-KDE path, zero histogram bias |
|
|
49
|
+
| 100K | **0.003%** | FFT histogram path, G=16384 |
|
|
50
|
+
| 1M | **0.003%** | FFT path |
|
|
51
|
+
| 10M | **0.003%** | FFT path |
|
|
52
|
+
| 100M+ | **< 0.01%** | Histogram-dominated; sketch available |
|
|
53
|
+
|
|
54
|
+
Independent-sample error (~0.2–0.5%) reflects natural sampling variability (two RNGs), not algorithmic error. The 0.003% algorithmic error is of the same order as R's own ~0.001% numerical noise floor.
|
|
55
|
+
|
|
56
|
+
## Hardware Performance (v0.1.6)
|
|
57
|
+
|
|
58
|
+
| n | CPU (Apple M) | MPS | P100 GPU |
|
|
59
|
+
|---|:---:|:---:|:---:|
|
|
60
|
+
| 10K | 2,300 ms | 1,400 ms | **107 ms** |
|
|
61
|
+
| 50K | 2,900 ms | 1,700 ms | **167 ms** |
|
|
62
|
+
| 100K | 265 ms | 248 ms | **35 ms** |
|
|
63
|
+
| 1M | 269 ms | 189 ms | **36 ms** |
|
|
64
|
+
| 10M | 544 ms | — | **44 ms** |
|
|
65
|
+
|
|
66
|
+
P100 speedup: **43–116× vs CPU**. Peak 116× at n=50K (direct-KDE GPU parallelism).
|
|
67
|
+
|
|
68
|
+
Cumulative speedup vs v0.1.4 on CPU: 1.1× (100K), 1.7× (1M), **4.2× (10M)**.
|
|
69
|
+
|
|
70
|
+
## API Reference
|
|
71
|
+
|
|
72
|
+
### `DCBLayer`
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
DCBLayer(
|
|
76
|
+
target_modes=1, # target number of modes (default 1)
|
|
77
|
+
use_fft=True, # FFT path for n > 50K (default True)
|
|
78
|
+
max_n_exact=None, # sketch above this n (None = always exact)
|
|
79
|
+
G_min=16384, # minimum FFT histogram bins (accuracy ↑ with G)
|
|
80
|
+
use_richardson="auto", # Richardson on CPU, off on GPU (30% accuracy gain on CPU)
|
|
81
|
+
direct_n_max=25_000, # direct-KDE active only when forward_path='auto'/'direct'
|
|
82
|
+
direct_M=2048, # direct-KDE evaluation grid size
|
|
83
|
+
forward_path='smooth', # 'smooth' (default, strictly differentiable) |
|
|
84
|
+
# 'auto' (direct-KDE at n≤25K, surrogate gradient) |
|
|
85
|
+
# 'direct' (force direct-KDE, accuracy benchmarks)
|
|
86
|
+
safe_backward=False, # clamp IFT denominator near bifurcations
|
|
87
|
+
)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### `TrainingLayer` (for ML training loops)
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from dcb import TrainingLayer
|
|
94
|
+
|
|
95
|
+
layer = TrainingLayer(
|
|
96
|
+
warm_start=True, # cache h_prev; init bracket to [0.95h, 1.05h] → 1.8× CPU speedup
|
|
97
|
+
compile=False, # torch.compile opt-in (requires float32, Python ≤ 3.11 on CUDA)
|
|
98
|
+
warm_margin=0.05, # bracket half-width around cached h_crit
|
|
99
|
+
**dcb_kwargs, # any DCBLayer parameter
|
|
100
|
+
)
|
|
101
|
+
layer.reset_cache() # call on distribution shift
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Direct-KDE path (n ≤ 25K)
|
|
105
|
+
|
|
106
|
+
For small samples, DCB evaluates f′_h directly without histogram binning (O(n·M) per evaluation, zero binning bias). On CPU this is 3–4× slower, but on GPU it runs **80–96× faster than on CPU**.
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
# Force direct-KDE for all n (accuracy benchmark):
|
|
110
|
+
layer = DCBLayer(direct_n_max=float('inf'))
|
|
111
|
+
|
|
112
|
+
# Disable direct-KDE (speed benchmark):
|
|
113
|
+
layer = DCBLayer(direct_n_max=0)
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### Richardson extrapolation
|
|
117
|
+
|
|
118
|
+
When Richardson extrapolation is enabled (`use_richardson=True`, or `"auto"` on CPU), DCB runs a second bisection at G/2=8192 and combines:
|
|
119
|
+
`h̃ = (4·ĥ(G) − ĥ(G/2)) / 3`, reducing error ~30%. On GPU this adds 38% overhead with
|
|
120
|
+
<0.01% accuracy gain — consider `use_richardson=False` for GPU training loops.
|
|
121
|
+
|
|
122
|
+
## Known Limitations
|
|
123
|
+
|
|
124
|
+
- **`compile=True` on MPS**: blocked by float64 in `_refine_hcrit` fallback (fix in v0.1.7)
|
|
125
|
+
- **`compile=True` on CUDA with Python 3.12**: requires torch ≥ 2.4 or Python ≤ 3.11
|
|
126
|
+
- **`gradcheck`**: passes with the default `forward_path='smooth'`; the default is strictly differentiable at all n. Opt into `forward_path='auto'` only for forward-only accuracy benchmarks (surrogate gradient at n≤25K)
|
|
127
|
+
- **n > 100M**: requires streaming histogram (not yet public API); use `max_n_exact=1_000_000` sketch as workaround
|
|
128
|
+
|
|
129
|
+
## Confirmed Experimental Results
|
|
130
|
+
|
|
131
|
+
| Experiment | Result | Criterion |
|
|
132
|
+
|---|---|---|
|
|
133
|
+
| Accuracy vs R (same data, n=100K) | **0.003%** | < 0.01% ✓ |
|
|
134
|
+
| Validation (m≥2, Marron-Wand) | R²=0.91, MAE=0.07, ρ=0.89 | R²≥0.85 ✓ |
|
|
135
|
+
| Speedup vs scipy (CUDA T4, n=8192) | **10.5×** | ≥3× ✓ |
|
|
136
|
+
| GAN mode preservation | h_crit=1.232 >> 0.3 | h_crit>0.3 ✓ |
|
|
137
|
+
| Anomaly AUC (KDDCup99) | DCB=**0.9982** vs IF=0.9867 | DCB≥IF ✓ |
|
|
138
|
+
| GPU speedup (P100, n=50K) | **116×** vs CPU | — |
|
|
139
|
+
| GPU speedup (P100, n=100K) | **43×** vs CPU | — |
|
|
140
|
+
|
|
141
|
+
## Changelog
|
|
142
|
+
|
|
143
|
+
### v0.1.6 (2026-05-30)
|
|
144
|
+
- `TrainingLayer`: warm-start bracket caching (1.82× CPU speedup in training loops)
|
|
145
|
+
- `direct_mode_count_batch`: direct-KDE path for n ≤ 25K (zero histogram bias; 80–96× GPU speedup)
|
|
146
|
+
- Compile-ready trisection: tensor lo/hi, no `.item()` inside loop, fixed 16-round unroll
|
|
147
|
+
- `mode_count_from_C_batch` returns `Tensor(B,)` (was `list[int]`) — enables torch.compile tracing
|
|
148
|
+
|
|
149
|
+
### v0.1.5 (2026-05-29)
|
|
150
|
+
- Richardson extrapolation on h_crit scalar (30% accuracy gain, G=16384+8192)
|
|
151
|
+
- alloc/sync hygiene: removed `nonzero_mask` host sync (4.2× faster at n=10M)
|
|
152
|
+
- Batched trisection bisection (one irfft dispatch per round)
|
|
153
|
+
- Eliminated duplicate O(n) histogram in `_refine_hcrit` (C_external reuse)
|
|
154
|
+
|
|
155
|
+
### v0.1.4 (2026-05-29)
|
|
156
|
+
- FFT histogram path: C hoisted out of bisection loop (Worker 1)
|
|
157
|
+
- Device-native histogram: CUDA histc, MPS scatter_add_, CPU bucketize+bincount
|
|
158
|
+
- float32 FFT default; pad_factor 4→2 (halves irfft size)
|
|
159
|
+
- Adaptive bisection early-exit
|
|
160
|
+
|
|
161
|
+
### v0.1.1 (2026-05-29)
|
|
162
|
+
- MPS histc OOM bug fixed (bucketize+bincount)
|
|
163
|
+
- Sketch API: max_n_exact=1M, sketch_size=500K
|
|
164
|
+
- Domain consistency and bias warning fixes
|
|
165
|
+
|
|
166
|
+
## Repository Structure
|
|
167
|
+
|
|
168
|
+
```
|
|
169
|
+
dcb/
|
|
170
|
+
layer.py DCBLayer nn.Module + DCBFunction autograd
|
|
171
|
+
solver.py IFT root-finder, trisection bisection, Richardson pass
|
|
172
|
+
fft_kde.py FFT mode counter, direct_mode_count_batch, precompute_fft
|
|
173
|
+
training.py TrainingLayer with warm-start and compile support
|
|
174
|
+
kde.py Direct KDE derivatives (IFT backward path)
|
|
175
|
+
utils.py Grid, Silverman bandwidth, sg() stabiliser
|
|
176
|
+
experiments/ Reproduction scripts for all benchmarks and paper figures
|
|
177
|
+
tests/ Unit tests (45 passed, 1 xfailed)
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## License
|
|
181
|
+
|
|
182
|
+
Apache 2.0 — see [LICENSE](LICENSE).
|
|
@@ -248,6 +248,7 @@ def _refine_hcrit(
|
|
|
248
248
|
pad_factor: int = 2, # Worker 5: pad_factor=2 (was 4) — safe for h ≤ 3σ, halves irfft size
|
|
249
249
|
C_external: Tensor | None = None,
|
|
250
250
|
omega_external: Tensor | None = None,
|
|
251
|
+
fft_dtype: torch.dtype = torch.float32,
|
|
251
252
|
) -> float:
|
|
252
253
|
"""Sub-bin quadratic refinement of h_crit after bisection converges.
|
|
253
254
|
|
|
@@ -298,13 +299,13 @@ def _refine_hcrit(
|
|
|
298
299
|
C_ref = C_external
|
|
299
300
|
omega_base = omega_external
|
|
300
301
|
else:
|
|
301
|
-
# Fallback: build
|
|
302
|
+
# Fallback: build histogram + rfft, respecting fft_dtype (avoids float64 on MPS).
|
|
302
303
|
with torch.no_grad():
|
|
303
304
|
counts = _histogram_on_device(X, G, lo_d, hi_d).cpu()
|
|
304
|
-
counts_padded = torch.zeros(N, dtype=
|
|
305
|
-
counts_padded[:G] = counts.
|
|
305
|
+
counts_padded = torch.zeros(N, dtype=fft_dtype)
|
|
306
|
+
counts_padded[:G] = counts.to(fft_dtype)
|
|
306
307
|
C_ref = torch.fft.rfft(counts_padded)
|
|
307
|
-
k = torch.arange(N // 2 + 1, dtype=
|
|
308
|
+
k = torch.arange(N // 2 + 1, dtype=fft_dtype)
|
|
308
309
|
omega_base = 2 * math.pi * k / (N * bw)
|
|
309
310
|
|
|
310
311
|
def fprime(h: float) -> Tensor:
|
|
@@ -454,6 +455,113 @@ def direct_mode_count_batch(
|
|
|
454
455
|
return counts
|
|
455
456
|
|
|
456
457
|
|
|
458
|
+
def precompute_fft_batch(
|
|
459
|
+
xs: list,
|
|
460
|
+
G: int = 4096,
|
|
461
|
+
pad_factor: int = 2,
|
|
462
|
+
fft_dtype: torch.dtype = torch.float32,
|
|
463
|
+
) -> tuple:
|
|
464
|
+
"""Precompute FFT spectra for K independent distributions in one batched rfft.
|
|
465
|
+
|
|
466
|
+
For K distributions sharing G and pad_factor, builds one histogram per
|
|
467
|
+
distribution, stacks into (K, N), and runs a single batched rfft —
|
|
468
|
+
amortising per-call Python dispatch overhead.
|
|
469
|
+
|
|
470
|
+
Parameters
|
|
471
|
+
----------
|
|
472
|
+
xs : list of Tensor, each shape (n_k,)
|
|
473
|
+
G, pad_factor, fft_dtype : same as precompute_fft
|
|
474
|
+
|
|
475
|
+
Returns
|
|
476
|
+
-------
|
|
477
|
+
C_batch : Tensor (K, N//2+1), complex
|
|
478
|
+
omega_batch : Tensor (K, N//2+1), float — per-k frequency grids
|
|
479
|
+
domains : list of (lo_k, hi_k)
|
|
480
|
+
N : int
|
|
481
|
+
any_degenerate : list of bool, length K
|
|
482
|
+
"""
|
|
483
|
+
K = len(xs)
|
|
484
|
+
N = pad_factor * G
|
|
485
|
+
device = xs[0].device if K > 0 else torch.device('cpu')
|
|
486
|
+
|
|
487
|
+
domains: list = []
|
|
488
|
+
any_degenerate: list = []
|
|
489
|
+
counts_list: list = []
|
|
490
|
+
|
|
491
|
+
for x_k in xs:
|
|
492
|
+
sigma = x_k.std().item()
|
|
493
|
+
if sigma == 0.0:
|
|
494
|
+
sigma = 1.0
|
|
495
|
+
lo_k = x_k.min().item() - 3 * sigma
|
|
496
|
+
hi_k = x_k.max().item() + 3 * sigma
|
|
497
|
+
data_range_k = hi_k - lo_k
|
|
498
|
+
domains.append((lo_k, hi_k))
|
|
499
|
+
if data_range_k == 0.0:
|
|
500
|
+
any_degenerate.append(True)
|
|
501
|
+
counts_list.append(torch.zeros(G, dtype=fft_dtype, device=device))
|
|
502
|
+
else:
|
|
503
|
+
any_degenerate.append(False)
|
|
504
|
+
counts_list.append(_histogram_on_device(x_k, G, lo_k, hi_k).to(fft_dtype))
|
|
505
|
+
|
|
506
|
+
counts_batch = torch.zeros(K, N, dtype=fft_dtype, device=device)
|
|
507
|
+
for k, c in enumerate(counts_list):
|
|
508
|
+
counts_batch[k, :G] = c
|
|
509
|
+
|
|
510
|
+
C_batch = torch.fft.rfft(counts_batch, dim=-1) # (K, N//2+1)
|
|
511
|
+
|
|
512
|
+
M = N // 2 + 1
|
|
513
|
+
omega_batch = torch.zeros(K, M, dtype=fft_dtype, device=device)
|
|
514
|
+
k_freq = torch.arange(M, device=device, dtype=fft_dtype)
|
|
515
|
+
for k, (lo_k, hi_k) in enumerate(domains):
|
|
516
|
+
data_range_k = hi_k - lo_k
|
|
517
|
+
if data_range_k > 0.0:
|
|
518
|
+
bin_width_k = data_range_k / G
|
|
519
|
+
omega_batch[k] = 2 * math.pi * k_freq / (N * bin_width_k)
|
|
520
|
+
|
|
521
|
+
return C_batch, omega_batch, domains, N, any_degenerate
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
def mode_count_K_batch(
|
|
525
|
+
C_batch: Tensor,
|
|
526
|
+
omega_batch: Tensor,
|
|
527
|
+
h1: Tensor,
|
|
528
|
+
h2: Tensor,
|
|
529
|
+
G: int,
|
|
530
|
+
N: int,
|
|
531
|
+
any_degenerate: list,
|
|
532
|
+
) -> tuple:
|
|
533
|
+
"""Mode counts for K distributions × 2 bandwidths in one batched irfft.
|
|
534
|
+
|
|
535
|
+
Replaces 2K separate irfft calls per trisection round with a single
|
|
536
|
+
(K, 2, M) dispatch — the core of the forward_batched speedup.
|
|
537
|
+
|
|
538
|
+
Parameters
|
|
539
|
+
----------
|
|
540
|
+
C_batch : Tensor (K, M), complex
|
|
541
|
+
omega_batch : Tensor (K, M), float — per-distribution frequency grids
|
|
542
|
+
h1, h2 : Tensor (K,) — two trisection interior points per distribution
|
|
543
|
+
G, N : int
|
|
544
|
+
any_degenerate : list[bool]
|
|
545
|
+
|
|
546
|
+
Returns
|
|
547
|
+
-------
|
|
548
|
+
c1, c2 : Tensor (K,), long
|
|
549
|
+
"""
|
|
550
|
+
K, M = C_batch.shape
|
|
551
|
+
h_eval = torch.stack([h1, h2], dim=1) # (K, 2)
|
|
552
|
+
omega_h = omega_batch.unsqueeze(1) * h_eval.unsqueeze(-1) # (K, 2, M)
|
|
553
|
+
K_deriv = 1j * omega_batch.unsqueeze(1) * torch.exp(-0.5 * omega_h ** 2) # (K, 2, M)
|
|
554
|
+
C_exp = C_batch.unsqueeze(1).expand(K, 2, M) # (K, 2, M)
|
|
555
|
+
f_prime = torch.fft.irfft(C_exp * K_deriv, n=N, dim=-1)[:, :, :G] # (K, 2, G)
|
|
556
|
+
counts = ((f_prime[:, :, :-1] > 0) & (f_prime[:, :, 1:] < 0)).sum(dim=-1) # (K, 2)
|
|
557
|
+
c1, c2 = counts[:, 0], counts[:, 1]
|
|
558
|
+
for k, degen in enumerate(any_degenerate):
|
|
559
|
+
if degen:
|
|
560
|
+
c1[k] = 1
|
|
561
|
+
c2[k] = 1
|
|
562
|
+
return c1, c2
|
|
563
|
+
|
|
564
|
+
|
|
457
565
|
def adaptive_fft_G(data_range: float, h_hi: float, G_min: int = 16384) -> int:
|
|
458
566
|
"""Choose FFT grid size G so that the derivative kernel is well-resolved.
|
|
459
567
|
|