diffcb 0.1.5__tar.gz → 0.1.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diffcb-0.1.5 → diffcb-0.1.7}/PKG-INFO +129 -44
- diffcb-0.1.7/README.md +180 -0
- {diffcb-0.1.5 → diffcb-0.1.7}/dcb/__init__.py +3 -1
- {diffcb-0.1.5 → diffcb-0.1.7}/dcb/fft_kde.py +213 -14
- {diffcb-0.1.5 → diffcb-0.1.7}/dcb/layer.py +118 -9
- {diffcb-0.1.5 → diffcb-0.1.7}/dcb/solver.py +160 -51
- diffcb-0.1.7/dcb/training.py +475 -0
- {diffcb-0.1.5 → diffcb-0.1.7}/pyproject.toml +1 -1
- diffcb-0.1.7/round24_cumulative_bench.py +110 -0
- diffcb-0.1.7/round24_v016_test.py +469 -0
- diffcb-0.1.7/round25_full_range_sweep.py +500 -0
- diffcb-0.1.7/round25_write_csv.py +137 -0
- diffcb-0.1.7/tests/test_gradcheck.py +112 -0
- {diffcb-0.1.5 → diffcb-0.1.7}/tests/test_r19_default_fft.py +10 -2
- {diffcb-0.1.5 → diffcb-0.1.7}/tests/test_solver.py +10 -4
- diffcb-0.1.5/README.md +0 -95
- {diffcb-0.1.5 → diffcb-0.1.7}/.gitignore +0 -0
- {diffcb-0.1.5 → diffcb-0.1.7}/.zenodo.json +0 -0
- {diffcb-0.1.5 → diffcb-0.1.7}/LICENSE +0 -0
- {diffcb-0.1.5 → diffcb-0.1.7}/dcb/diagnostics.py +0 -0
- {diffcb-0.1.5 → diffcb-0.1.7}/dcb/kde.py +0 -0
- {diffcb-0.1.5 → diffcb-0.1.7}/dcb/utils.py +0 -0
- {diffcb-0.1.5 → diffcb-0.1.7}/notebooks/.gitkeep +0 -0
- {diffcb-0.1.5 → diffcb-0.1.7}/tests/test_kde.py +0 -0
- {diffcb-0.1.5 → diffcb-0.1.7}/tests/test_layer.py +0 -0
- {diffcb-0.1.5 → diffcb-0.1.7}/tests/test_r18c_denom_audit.py +0 -0
- {diffcb-0.1.5 → diffcb-0.1.7}/tests/test_r18c_deprecation_warn.py +0 -0
- {diffcb-0.1.5 → diffcb-0.1.7}/tests/test_r19_diagnostics.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: diffcb
|
|
3
|
-
Version: 0.1.5
|
|
3
|
+
Version: 0.1.7
|
|
4
4
|
Summary: Differentiable Critical Bandwidth: Silverman's modality test as a differentiable PyTorch layer with IFT backward pass.
|
|
5
5
|
Project-URL: Homepage, https://github.com/ryZhangHason/differentiable-critical-bandwidth
|
|
6
6
|
Project-URL: Repository, https://github.com/ryZhangHason/differentiable-critical-bandwidth
|
|
@@ -226,20 +226,25 @@ Description-Content-Type: text/markdown
|
|
|
226
226
|
[License: Apache 2.0](LICENSE)
|
|
227
227
|
[Python](https://www.python.org/)
|
|
228
228
|
|
|
229
|
-
A PyTorch package that makes **Silverman's critical bandwidth test (1981)** fully differentiable, enabling end-to-end gradient-based
|
|
229
|
+
A PyTorch package that makes **Silverman's critical bandwidth test (1981)** fully differentiable, enabling end-to-end gradient-based optimisation over the modal structure of continuous distributions.
|
|
230
230
|
|
|
231
231
|
## Overview
|
|
232
232
|
|
|
233
|
-
|
|
233
|
+
`h_crit` is the minimum KDE bandwidth at which a distribution appears unimodal — a classical nonparametric statistic for modality testing. DCB replaces every non-differentiable step with a smooth surrogate, then uses the **Implicit Function Theorem** (IFT) to compute exact gradients through the root-finding step at O(1) memory cost.
|
|
234
234
|
|
|
235
235
|
```python
|
|
236
236
|
import torch
|
|
237
|
-
from dcb import DCBLayer
|
|
237
|
+
from dcb import DCBLayer, TrainingLayer
|
|
238
238
|
|
|
239
|
-
X = torch.randn(
|
|
240
|
-
layer = DCBLayer(
|
|
241
|
-
h_crit = layer(X)
|
|
242
|
-
h_crit.backward()
|
|
239
|
+
X = torch.randn(10_000, requires_grad=True) # 1D samples, any n from 5K to 1B
|
|
240
|
+
layer = DCBLayer()
|
|
241
|
+
h_crit = layer(X) # differentiable scalar
|
|
242
|
+
h_crit.backward() # exact IFT gradients
|
|
243
|
+
|
|
244
|
+
# For repeated training-loop use with warm-start bracket caching:
|
|
245
|
+
layer = TrainingLayer(warm_start=True)
|
|
246
|
+
for batch in dataloader:
|
|
247
|
+
    h = layer(batch)  # 1.8× faster after first call on CPU; ~10× on CUDA with compile=True
|
|
243
248
|
```
|
|
244
249
|
|
|
245
250
|
## Installation
|
|
@@ -249,7 +254,6 @@ pip install diffcb
|
|
|
249
254
|
```
|
|
250
255
|
|
|
251
256
|
Or from source:
|
|
252
|
-
|
|
253
257
|
```bash
|
|
254
258
|
git clone https://github.com/ryZhangHason/differentiable-critical-bandwidth
|
|
255
259
|
cd differentiable-critical-bandwidth
|
|
@@ -258,58 +262,139 @@ pip install -e ".[dev]"
|
|
|
258
262
|
|
|
259
263
|
## Accuracy vs R's `bw.crit`
|
|
260
264
|
|
|
261
|
-
|
|
265
|
+
Validated against R's `multimode::bw.crit(data, mod0=1)` (Hall & York 2001).
|
|
266
|
+
**Same-sample protocol** (identical data fed to both Python and R):
|
|
262
267
|
|
|
263
|
-
| n | DCB vs R
|
|
264
|
-
|
|
265
|
-
|
|
|
266
|
-
|
|
|
267
|
-
|
|
|
268
|
+
| n | DCB error vs R | Notes |
|
|
269
|
+
|---|---------------|-------|
|
|
270
|
+
| 5K–25K | **< 0.005%** | Direct-KDE path, zero histogram bias |
|
|
271
|
+
| 100K | **0.003%** | FFT histogram path, G=16384 |
|
|
272
|
+
| 1M | **0.003%** | FFT path |
|
|
273
|
+
| 10M | **0.003%** | FFT path |
|
|
274
|
+
| 100M+ | **< 0.01%** | Histogram-dominated; sketch available |
|
|
275
|
+
|
|
276
|
+
Independent-sample error (~0.2–0.5%) reflects natural sampling variability (samples drawn from two different RNGs), not algorithmic error. The 0.003% algorithmic error is of the same order as R's own ~0.001% numerical noise floor.
|
|
277
|
+
|
|
278
|
+
## Hardware Performance (v0.1.6)
|
|
279
|
+
|
|
280
|
+
| n | CPU (Apple M) | MPS | P100 GPU |
|
|
281
|
+
|---|:---:|:---:|:---:|
|
|
282
|
+
| 10K | 2,300 ms | 1,400 ms | **107 ms** |
|
|
283
|
+
| 50K | 2,900 ms | 1,700 ms | **167 ms** |
|
|
284
|
+
| 100K | 265 ms | 248 ms | **35 ms** |
|
|
285
|
+
| 1M | 269 ms | 189 ms | **36 ms** |
|
|
286
|
+
| 10M | 544 ms | — | **44 ms** |
|
|
287
|
+
|
|
288
|
+
P100 speedup: **43–116× vs CPU**. Peak 116× at n=50K (direct-KDE GPU parallelism).
|
|
289
|
+
|
|
290
|
+
Cumulative speedup vs v0.1.4 on CPU: 1.1× (100K), 1.7× (1M), **4.2× (10M)**.
|
|
268
291
|
|
|
269
|
-
|
|
292
|
+
## API Reference
|
|
270
293
|
|
|
271
|
-
|
|
294
|
+
### `DCBLayer`
|
|
272
295
|
|
|
273
296
|
```python
|
|
274
297
|
DCBLayer(
|
|
275
|
-
target_modes=1,
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
298
|
+
target_modes=1, # target number of modes (default 1)
|
|
299
|
+
use_fft=True, # FFT path for n > 50K (default True)
|
|
300
|
+
max_n_exact=None, # sketch above this n (None = always exact)
|
|
301
|
+
G_min=16384, # minimum FFT histogram bins (accuracy ↑ with G)
|
|
302
|
+
use_richardson=True, # Richardson extrapolation on h_crit (30% accuracy gain)
|
|
303
|
+
direct_n_max=25_000, # use direct-KDE (no histogram) for n ≤ this
|
|
304
|
+
direct_M=2048, # direct-KDE evaluation grid size
|
|
305
|
+
use_compile=False, # infrastructure flag; use TrainingLayer for compile
|
|
306
|
+
safe_backward=False, # clamp IFT denominator near bifurcations
|
|
281
307
|
)
|
|
282
308
|
```
|
|
283
309
|
|
|
284
|
-
|
|
310
|
+
### `TrainingLayer` (for ML training loops)
|
|
311
|
+
|
|
312
|
+
```python
|
|
313
|
+
from dcb import TrainingLayer
|
|
314
|
+
|
|
315
|
+
layer = TrainingLayer(
|
|
316
|
+
warm_start=True, # cache h_prev; init bracket to [0.95h, 1.05h] → 1.8× CPU speedup
|
|
317
|
+
compile=False, # torch.compile opt-in (requires float32, Python ≤ 3.11 on CUDA)
|
|
318
|
+
warm_margin=0.05, # bracket half-width around cached h_crit
|
|
319
|
+
**dcb_kwargs, # any DCBLayer parameter
|
|
320
|
+
)
|
|
321
|
+
layer.reset_cache() # call on distribution shift
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
### Direct-KDE path (n ≤ 25K)
|
|
285
325
|
|
|
286
|
-
|
|
326
|
+
For small samples, DCB evaluates f′_h directly without histogram binning (O(n·M) per evaluation, zero binning bias). This is 3–4× slower on CPU, but on GPU it runs **80–96× faster than on CPU**.
|
|
327
|
+
|
|
328
|
+
```python
|
|
329
|
+
# Force direct-KDE for all n (accuracy benchmark):
|
|
330
|
+
layer = DCBLayer(direct_n_max=float('inf'))
|
|
331
|
+
|
|
332
|
+
# Disable direct-KDE (speed benchmark):
|
|
333
|
+
layer = DCBLayer(direct_n_max=0)
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
### Richardson extrapolation
|
|
337
|
+
|
|
338
|
+
By default (`use_richardson=True`), DCB runs a second bisection at G/2=8192 and combines:
|
|
339
|
+
`h̃ = (4·ĥ(G) − ĥ(G/2)) / 3`, reducing error ~30%. On GPU this adds 38% overhead with
|
|
340
|
+
<0.01% accuracy gain — consider `use_richardson=False` for GPU training loops.
|
|
341
|
+
|
|
342
|
+
## Known Limitations
|
|
343
|
+
|
|
344
|
+
- **`compile=True` on MPS**: blocked by float64 in `_refine_hcrit` fallback (fix in v0.1.7)
|
|
345
|
+
- **`compile=True` on CUDA with Python 3.12**: requires torch ≥ 2.4 or Python ≤ 3.11
|
|
346
|
+
- **`gradcheck` with direct-KDE forward**: forward (n ≤ 25K) uses exact KDE; backward uses smooth IFT surrogate — gradcheck will fail by design; use `DCBLayer(direct_n_max=0)` for gradcheck
|
|
347
|
+
- **n > 100M**: requires streaming histogram (not yet public API); use `max_n_exact=1_000_000` sketch as workaround
|
|
348
|
+
|
|
349
|
+
## Confirmed Experimental Results
|
|
287
350
|
|
|
288
351
|
| Experiment | Result | Criterion |
|
|
289
352
|
|---|---|---|
|
|
290
|
-
|
|
|
291
|
-
|
|
|
292
|
-
|
|
|
293
|
-
|
|
|
294
|
-
|
|
|
353
|
+
| Accuracy vs R (same data, n=100K) | **0.003%** | < 0.01% ✓ |
|
|
354
|
+
| Validation (m≥2, Marron-Wand) | R²=0.91, MAE=0.07, ρ=0.89 | R²≥0.85 ✓ |
|
|
355
|
+
| Speedup vs scipy (CUDA T4, n=8192) | **10.5×** | ≥3× ✓ |
|
|
356
|
+
| GAN mode preservation | h_crit=1.232 >> 0.3 | h_crit>0.3 ✓ |
|
|
357
|
+
| Anomaly AUC (KDDCup99) | DCB=**0.9982** vs IF=0.9867 | DCB≥IF ✓ |
|
|
358
|
+
| GPU speedup (P100, n=50K) | **116×** vs CPU | — |
|
|
359
|
+
| GPU speedup (P100, n=100K) | **43×** vs CPU | — |
|
|
360
|
+
|
|
361
|
+
## Changelog
|
|
362
|
+
|
|
363
|
+
### v0.1.6 (2026-05-30)
|
|
364
|
+
- `TrainingLayer`: warm-start bracket caching (1.82× CPU speedup in training loops)
|
|
365
|
+
- `direct_mode_count_batch`: direct-KDE path for n ≤ 25K (zero histogram bias; 80–96× GPU speedup)
|
|
366
|
+
- Compile-ready trisection: tensor lo/hi, no `.item()` inside loop, fixed 16-round unroll
|
|
367
|
+
- `mode_count_from_C_batch` returns `Tensor(B,)` (was `list[int]`) — enables torch.compile tracing
|
|
368
|
+
|
|
369
|
+
### v0.1.5 (2026-05-29)
|
|
370
|
+
- Richardson extrapolation on h_crit scalar (30% accuracy gain, G=16384+8192)
|
|
371
|
+
- alloc/sync hygiene: removed `nonzero_mask` host sync (4.2× faster at n=10M)
|
|
372
|
+
- Batched trisection bisection (one irfft dispatch per round)
|
|
373
|
+
- Eliminated duplicate O(n) histogram in `_refine_hcrit` (C_external reuse)
|
|
374
|
+
|
|
375
|
+
### v0.1.4 (2026-05-29)
|
|
376
|
+
- FFT histogram path: C hoisted out of bisection loop (Worker 1)
|
|
377
|
+
- Device-native histogram: CUDA histc, MPS scatter_add_, CPU bucketize+bincount
|
|
378
|
+
- float32 FFT default; pad_factor 4→2 (halves irfft size)
|
|
379
|
+
- Adaptive bisection early-exit
|
|
380
|
+
|
|
381
|
+
### v0.1.1 (2026-05-29)
|
|
382
|
+
- MPS histc OOM bug fixed (bucketize+bincount)
|
|
383
|
+
- Sketch API: max_n_exact=1M, sketch_size=500K
|
|
384
|
+
- Domain consistency and bias warning fixes
|
|
295
385
|
|
|
296
386
|
## Repository Structure
|
|
297
387
|
|
|
298
388
|
```
|
|
299
|
-
dcb/
|
|
300
|
-
layer.py
|
|
301
|
-
solver.py
|
|
302
|
-
fft_kde.py
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
phase3_anomaly.py Anomaly detection (Table 2, Figure 5)
|
|
309
|
-
round20_*.py Large-n R comparison and streaming benchmarks
|
|
310
|
-
round21_*.py Accuracy improvement experiments
|
|
311
|
-
tests/ Unit tests (pytest, 45 passed, 1 xfailed)
|
|
312
|
-
outputs/ All generated figures and tables (PDFs, PNGs, CSVs)
|
|
389
|
+
dcb/
|
|
390
|
+
layer.py DCBLayer nn.Module + DCBFunction autograd
|
|
391
|
+
solver.py IFT root-finder, trisection bisection, Richardson pass
|
|
392
|
+
fft_kde.py FFT mode counter, direct_mode_count_batch, precompute_fft
|
|
393
|
+
training.py TrainingLayer with warm-start and compile support
|
|
394
|
+
kde.py Direct KDE derivatives (IFT backward path)
|
|
395
|
+
utils.py Grid, Silverman bandwidth, sg() stabiliser
|
|
396
|
+
experiments/ Reproduction scripts for all benchmarks and paper figures
|
|
397
|
+
tests/ Unit tests (45 passed, 1 xfailed)
|
|
313
398
|
```
|
|
314
399
|
|
|
315
400
|
## License
|
diffcb-0.1.7/README.md
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
# DCB — Differentiable Critical Bandwidth
|
|
2
|
+
|
|
3
|
+
[PyPI](https://pypi.org/project/diffcb/)
|
|
4
|
+
[License: Apache 2.0](LICENSE)
|
|
5
|
+
[Python](https://www.python.org/)
|
|
6
|
+
|
|
7
|
+
A PyTorch package that makes **Silverman's critical bandwidth test (1981)** fully differentiable, enabling end-to-end gradient-based optimisation over the modal structure of continuous distributions.
|
|
8
|
+
|
|
9
|
+
## Overview
|
|
10
|
+
|
|
11
|
+
`h_crit` is the minimum KDE bandwidth at which a distribution appears unimodal — a classical nonparametric statistic for modality testing. DCB replaces every non-differentiable step with a smooth surrogate, then uses the **Implicit Function Theorem** (IFT) to compute exact gradients through the root-finding step at O(1) memory cost.
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
import torch
|
|
15
|
+
from dcb import DCBLayer, TrainingLayer
|
|
16
|
+
|
|
17
|
+
X = torch.randn(10_000, requires_grad=True) # 1D samples, any n from 5K to 1B
|
|
18
|
+
layer = DCBLayer()
|
|
19
|
+
h_crit = layer(X) # differentiable scalar
|
|
20
|
+
h_crit.backward() # exact IFT gradients
|
|
21
|
+
|
|
22
|
+
# For repeated training-loop use with warm-start bracket caching:
|
|
23
|
+
layer = TrainingLayer(warm_start=True)
|
|
24
|
+
for batch in dataloader:
|
|
25
|
+
    h = layer(batch)  # 1.8× faster after first call on CPU; ~10× on CUDA with compile=True
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install diffcb
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Or from source:
|
|
35
|
+
```bash
|
|
36
|
+
git clone https://github.com/ryZhangHason/differentiable-critical-bandwidth
|
|
37
|
+
cd differentiable-critical-bandwidth
|
|
38
|
+
pip install -e ".[dev]"
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Accuracy vs R's `bw.crit`
|
|
42
|
+
|
|
43
|
+
Validated against R's `multimode::bw.crit(data, mod0=1)` (Hall & York 2001).
|
|
44
|
+
**Same-sample protocol** (identical data fed to both Python and R):
|
|
45
|
+
|
|
46
|
+
| n | DCB error vs R | Notes |
|
|
47
|
+
|---|---------------|-------|
|
|
48
|
+
| 5K–25K | **< 0.005%** | Direct-KDE path, zero histogram bias |
|
|
49
|
+
| 100K | **0.003%** | FFT histogram path, G=16384 |
|
|
50
|
+
| 1M | **0.003%** | FFT path |
|
|
51
|
+
| 10M | **0.003%** | FFT path |
|
|
52
|
+
| 100M+ | **< 0.01%** | Histogram-dominated; sketch available |
|
|
53
|
+
|
|
54
|
+
Independent-sample error (~0.2–0.5%) reflects natural sampling variability (samples drawn from two different RNGs), not algorithmic error. The 0.003% algorithmic error is of the same order as R's own ~0.001% numerical noise floor.
|
|
55
|
+
|
|
56
|
+
## Hardware Performance (v0.1.6)
|
|
57
|
+
|
|
58
|
+
| n | CPU (Apple M) | MPS | P100 GPU |
|
|
59
|
+
|---|:---:|:---:|:---:|
|
|
60
|
+
| 10K | 2,300 ms | 1,400 ms | **107 ms** |
|
|
61
|
+
| 50K | 2,900 ms | 1,700 ms | **167 ms** |
|
|
62
|
+
| 100K | 265 ms | 248 ms | **35 ms** |
|
|
63
|
+
| 1M | 269 ms | 189 ms | **36 ms** |
|
|
64
|
+
| 10M | 544 ms | — | **44 ms** |
|
|
65
|
+
|
|
66
|
+
P100 speedup: **43–116× vs CPU**. Peak 116× at n=50K (direct-KDE GPU parallelism).
|
|
67
|
+
|
|
68
|
+
Cumulative speedup vs v0.1.4 on CPU: 1.1× (100K), 1.7× (1M), **4.2× (10M)**.
|
|
69
|
+
|
|
70
|
+
## API Reference
|
|
71
|
+
|
|
72
|
+
### `DCBLayer`
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
DCBLayer(
|
|
76
|
+
target_modes=1, # target number of modes (default 1)
|
|
77
|
+
use_fft=True, # FFT path for n > 50K (default True)
|
|
78
|
+
max_n_exact=None, # sketch above this n (None = always exact)
|
|
79
|
+
G_min=16384, # minimum FFT histogram bins (accuracy ↑ with G)
|
|
80
|
+
use_richardson=True, # Richardson extrapolation on h_crit (30% accuracy gain)
|
|
81
|
+
direct_n_max=25_000, # use direct-KDE (no histogram) for n ≤ this
|
|
82
|
+
direct_M=2048, # direct-KDE evaluation grid size
|
|
83
|
+
use_compile=False, # infrastructure flag; use TrainingLayer for compile
|
|
84
|
+
safe_backward=False, # clamp IFT denominator near bifurcations
|
|
85
|
+
)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### `TrainingLayer` (for ML training loops)
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
from dcb import TrainingLayer
|
|
92
|
+
|
|
93
|
+
layer = TrainingLayer(
|
|
94
|
+
warm_start=True, # cache h_prev; init bracket to [0.95h, 1.05h] → 1.8× CPU speedup
|
|
95
|
+
compile=False, # torch.compile opt-in (requires float32, Python ≤ 3.11 on CUDA)
|
|
96
|
+
warm_margin=0.05, # bracket half-width around cached h_crit
|
|
97
|
+
**dcb_kwargs, # any DCBLayer parameter
|
|
98
|
+
)
|
|
99
|
+
layer.reset_cache() # call on distribution shift
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Direct-KDE path (n ≤ 25K)
|
|
103
|
+
|
|
104
|
+
For small samples, DCB evaluates f′_h directly without histogram binning (O(n·M) per evaluation, zero binning bias). This is 3–4× slower on CPU, but on GPU it runs **80–96× faster than on CPU**.
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
# Force direct-KDE for all n (accuracy benchmark):
|
|
108
|
+
layer = DCBLayer(direct_n_max=float('inf'))
|
|
109
|
+
|
|
110
|
+
# Disable direct-KDE (speed benchmark):
|
|
111
|
+
layer = DCBLayer(direct_n_max=0)
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Richardson extrapolation
|
|
115
|
+
|
|
116
|
+
By default (`use_richardson=True`), DCB runs a second bisection at G/2=8192 and combines:
|
|
117
|
+
`h̃ = (4·ĥ(G) − ĥ(G/2)) / 3`, reducing error ~30%. On GPU this adds 38% overhead with
|
|
118
|
+
<0.01% accuracy gain — consider `use_richardson=False` for GPU training loops.
|
|
119
|
+
|
|
120
|
+
## Known Limitations
|
|
121
|
+
|
|
122
|
+
- **`compile=True` on MPS**: blocked by float64 in `_refine_hcrit` fallback (fix in v0.1.7)
|
|
123
|
+
- **`compile=True` on CUDA with Python 3.12**: requires torch ≥ 2.4 or Python ≤ 3.11
|
|
124
|
+
- **`gradcheck` with direct-KDE forward**: forward (n ≤ 25K) uses exact KDE; backward uses smooth IFT surrogate — gradcheck will fail by design; use `DCBLayer(direct_n_max=0)` for gradcheck
|
|
125
|
+
- **n > 100M**: requires streaming histogram (not yet public API); use `max_n_exact=1_000_000` sketch as workaround
|
|
126
|
+
|
|
127
|
+
## Confirmed Experimental Results
|
|
128
|
+
|
|
129
|
+
| Experiment | Result | Criterion |
|
|
130
|
+
|---|---|---|
|
|
131
|
+
| Accuracy vs R (same data, n=100K) | **0.003%** | < 0.01% ✓ |
|
|
132
|
+
| Validation (m≥2, Marron-Wand) | R²=0.91, MAE=0.07, ρ=0.89 | R²≥0.85 ✓ |
|
|
133
|
+
| Speedup vs scipy (CUDA T4, n=8192) | **10.5×** | ≥3× ✓ |
|
|
134
|
+
| GAN mode preservation | h_crit=1.232 >> 0.3 | h_crit>0.3 ✓ |
|
|
135
|
+
| Anomaly AUC (KDDCup99) | DCB=**0.9982** vs IF=0.9867 | DCB≥IF ✓ |
|
|
136
|
+
| GPU speedup (P100, n=50K) | **116×** vs CPU | — |
|
|
137
|
+
| GPU speedup (P100, n=100K) | **43×** vs CPU | — |
|
|
138
|
+
|
|
139
|
+
## Changelog
|
|
140
|
+
|
|
141
|
+
### v0.1.6 (2026-05-30)
|
|
142
|
+
- `TrainingLayer`: warm-start bracket caching (1.82× CPU speedup in training loops)
|
|
143
|
+
- `direct_mode_count_batch`: direct-KDE path for n ≤ 25K (zero histogram bias; 80–96× GPU speedup)
|
|
144
|
+
- Compile-ready trisection: tensor lo/hi, no `.item()` inside loop, fixed 16-round unroll
|
|
145
|
+
- `mode_count_from_C_batch` returns `Tensor(B,)` (was `list[int]`) — enables torch.compile tracing
|
|
146
|
+
|
|
147
|
+
### v0.1.5 (2026-05-29)
|
|
148
|
+
- Richardson extrapolation on h_crit scalar (30% accuracy gain, G=16384+8192)
|
|
149
|
+
- alloc/sync hygiene: removed `nonzero_mask` host sync (4.2× faster at n=10M)
|
|
150
|
+
- Batched trisection bisection (one irfft dispatch per round)
|
|
151
|
+
- Eliminated duplicate O(n) histogram in `_refine_hcrit` (C_external reuse)
|
|
152
|
+
|
|
153
|
+
### v0.1.4 (2026-05-29)
|
|
154
|
+
- FFT histogram path: C hoisted out of bisection loop (Worker 1)
|
|
155
|
+
- Device-native histogram: CUDA histc, MPS scatter_add_, CPU bucketize+bincount
|
|
156
|
+
- float32 FFT default; pad_factor 4→2 (halves irfft size)
|
|
157
|
+
- Adaptive bisection early-exit
|
|
158
|
+
|
|
159
|
+
### v0.1.1 (2026-05-29)
|
|
160
|
+
- MPS histc OOM bug fixed (bucketize+bincount)
|
|
161
|
+
- Sketch API: max_n_exact=1M, sketch_size=500K
|
|
162
|
+
- Domain consistency and bias warning fixes
|
|
163
|
+
|
|
164
|
+
## Repository Structure
|
|
165
|
+
|
|
166
|
+
```
|
|
167
|
+
dcb/
|
|
168
|
+
layer.py DCBLayer nn.Module + DCBFunction autograd
|
|
169
|
+
solver.py IFT root-finder, trisection bisection, Richardson pass
|
|
170
|
+
fft_kde.py FFT mode counter, direct_mode_count_batch, precompute_fft
|
|
171
|
+
training.py TrainingLayer with warm-start and compile support
|
|
172
|
+
kde.py Direct KDE derivatives (IFT backward path)
|
|
173
|
+
utils.py Grid, Silverman bandwidth, sg() stabiliser
|
|
174
|
+
experiments/ Reproduction scripts for all benchmarks and paper figures
|
|
175
|
+
tests/ Unit tests (45 passed, 1 xfailed)
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
## License
|
|
179
|
+
|
|
180
|
+
Apache 2.0 — see [LICENSE](LICENSE).
|
|
@@ -12,11 +12,13 @@ utilities. Requires PyTorch >= 2.0, NumPy >= 1.24, and SciPy >= 1.10.
|
|
|
12
12
|
"""
|
|
13
13
|
|
|
14
14
|
from dcb.layer import DCBLayer, DifferentiableCriticalBandwidth
|
|
15
|
+
from dcb.training import TrainingLayer
|
|
15
16
|
from dcb.utils import anneal_eps_tau
|
|
16
17
|
from dcb.kde import soft_mode_count_cross, soft_mode_count
|
|
17
18
|
|
|
18
19
|
__all__ = [
|
|
19
20
|
"DCBLayer", "DifferentiableCriticalBandwidth",
|
|
21
|
+
"TrainingLayer",
|
|
20
22
|
"anneal_eps_tau", "soft_mode_count_cross", "soft_mode_count",
|
|
21
23
|
]
|
|
22
|
-
__version__ = "0.1.5"
|
|
24
|
+
__version__ = "0.1.7"
|