gpuicalcc 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpuicalcc-0.1.3/LICENSE +19 -0
- gpuicalcc-0.1.3/PKG-INFO +137 -0
- gpuicalcc-0.1.3/README.md +109 -0
- gpuicalcc-0.1.3/gpuicalcc.egg-info/PKG-INFO +137 -0
- gpuicalcc-0.1.3/gpuicalcc.egg-info/SOURCES.txt +9 -0
- gpuicalcc-0.1.3/gpuicalcc.egg-info/dependency_links.txt +1 -0
- gpuicalcc-0.1.3/gpuicalcc.egg-info/requires.txt +4 -0
- gpuicalcc-0.1.3/gpuicalcc.egg-info/top_level.txt +1 -0
- gpuicalcc-0.1.3/gpuicalcc.py +300 -0
- gpuicalcc-0.1.3/pyproject.toml +44 -0
- gpuicalcc-0.1.3/setup.cfg +4 -0
gpuicalcc-0.1.3/LICENSE
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Creative Commons Attribution 4.0 International (CC BY 4.0)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Kleinverse AI, Inc.
|
|
4
|
+
|
|
5
|
+
You are free to:
|
|
6
|
+
Share — copy and redistribute the material in any medium or format
|
|
7
|
+
Adapt — remix, transform, and build upon the material for any purpose,
|
|
8
|
+
even commercially.
|
|
9
|
+
|
|
10
|
+
Under the following terms:
|
|
11
|
+
Attribution — You must give appropriate credit, provide a link to the
|
|
12
|
+
license, and indicate if changes were made. You may do so
|
|
13
|
+
in any reasonable manner, but not in any way that suggests
|
|
14
|
+
the licensor endorses you or your use.
|
|
15
|
+
|
|
16
|
+
No additional restrictions — You may not apply legal terms or technological
|
|
17
|
+
measures that legally restrict others from doing anything the license permits.
|
|
18
|
+
|
|
19
|
+
Full license text: https://creativecommons.org/licenses/by/4.0/legalcode
|
gpuicalcc-0.1.3/PKG-INFO
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gpuicalcc
|
|
3
|
+
Version: 0.1.3
|
|
4
|
+
Summary: GPU-accelerated locally centered contrast functions for FastICA
|
|
5
|
+
Author-email: Tetsuya Saito <ted@kleinverse.io>
|
|
6
|
+
License: CC BY 4.0
|
|
7
|
+
Project-URL: Repository, https://github.com/Kleinverse/gpuicalcc
|
|
8
|
+
Project-URL: CPU Package, https://github.com/Kleinverse/icalcc
|
|
9
|
+
Project-URL: Research, https://github.com/Kleinverse/research
|
|
10
|
+
Keywords: ICA,independent component analysis,blind source separation,locally centered contrast,cumulants,higher-order statistics,Renyi entropy,FastICA,scikit-learn,GPU,PyTorch,CUDA
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: License :: Other/Proprietary License
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: numpy>=1.24
|
|
24
|
+
Requires-Dist: scikit-learn>=1.3
|
|
25
|
+
Requires-Dist: icalcc>=0.1.3
|
|
26
|
+
Requires-Dist: torch>=2.0
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# gpuicalcc
|
|
30
|
+
GPU-accelerated locally centered contrast functions for FastICA.
|
|
31
|
+
PyTorch extension of [icalcc](https://github.com/Kleinverse/icalcc).
|
|
32
|
+
Same API, drop-in replacement with CUDA acceleration for bounded
|
|
33
|
+
and polynomial LCC contrasts.
|
|
34
|
+
```python
|
|
35
|
+
from gpuicalcc import GPUICALCC
|
|
36
|
+
ica = GPUICALCC(n_components=4, K='ltanh', device='cuda', random_state=0)
|
|
37
|
+
S_hat = ica.fit_transform(X)
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
```bash
|
|
42
|
+
pip install gpuicalcc
|
|
43
|
+
```
|
|
44
|
+
Requires PyTorch with CUDA. See [pytorch.org](https://pytorch.org)
|
|
45
|
+
for installation instructions.
|
|
46
|
+
|
|
47
|
+
## Supported K Values
|
|
48
|
+
| K | Description |
|
|
49
|
+
|---|---|
|
|
50
|
+
| `'ltanh'` | Bounded LCC-tanh — 40–48× GPU speedup |
|
|
51
|
+
| `'lexp'` | Bounded LCC-exp — 40–48× GPU speedup |
|
|
52
|
+
| `4` | Polynomial LCC order 4 |
|
|
53
|
+
| `6` | Polynomial LCC order 6, couples m₃, m₄, m₆ |
|
|
54
|
+
| `8` | Polynomial LCC order 8, up to 2.4× GPU speedup at N≥500k |
|
|
55
|
+
| `'tanh'` | Classical logcosh contrast (CPU fallback) |
|
|
56
|
+
| `'exp'` | Classical Gaussian contrast (CPU fallback) |
|
|
57
|
+
| `'skew'` | Classical cube contrast (CPU fallback) |
|
|
58
|
+
|
|
59
|
+
Bounded contrasts (`'ltanh'`, `'lexp'`) achieve 40–48× speedup
|
|
60
|
+
across all dataset sizes due to the O(NB) pairwise computation
|
|
61
|
+
being embarrassingly parallel. Polynomial contrasts benefit from
|
|
62
|
+
GPU acceleration at N ≥ 500,000. Classical contrasts fall back
|
|
63
|
+
to the CPU implementation in `icalcc`.
|
|
64
|
+
|
|
65
|
+
## Usage
|
|
66
|
+
```python
|
|
67
|
+
from gpuicalcc import GPUICALCC
|
|
68
|
+
|
|
69
|
+
# Bounded LCC-tanh (recommended for heavy-tailed or skewed sources)
|
|
70
|
+
ica = GPUICALCC(n_components=4, K='ltanh', device='cuda', random_state=0)
|
|
71
|
+
S_hat = ica.fit_transform(X)
|
|
72
|
+
|
|
73
|
+
# Bounded LCC-tanh with memory limit
|
|
74
|
+
ica = GPUICALCC(n_components=4, K='ltanh', device='cuda',
|
|
75
|
+
batch_size=500, gpu_mem_limit=8, random_state=0)
|
|
76
|
+
S_hat = ica.fit_transform(X)
|
|
77
|
+
|
|
78
|
+
# Polynomial LCC order 8 (near-Gaussian sources)
|
|
79
|
+
ica = GPUICALCC(n_components=4, K=8, device='cuda', random_state=0)
|
|
80
|
+
S_hat = ica.fit_transform(X)
|
|
81
|
+
|
|
82
|
+
# Falls back to CPU automatically if CUDA unavailable
|
|
83
|
+
ica = GPUICALCC(n_components=4, K='ltanh', device='cuda', random_state=0)
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Parameters
|
|
87
|
+
| Parameter | Default | Description |
|
|
88
|
+
|---|---|---|
|
|
89
|
+
| `device` | `'cuda'` | PyTorch device. Falls back to `'cpu'` if CUDA unavailable |
|
|
90
|
+
| `batch_size` | `500` | Subsample size B for bounded pairwise computation |
|
|
91
|
+
| `gpu_mem_limit` | `None` | GPU memory limit in GB. Auto-detected if None |
|
|
92
|
+
| `clear_gpu` | `True` | Clear GPU cache after fit |
|
|
93
|
+
|
|
94
|
+
## Benchmark
|
|
95
|
+

|
|
96
|
+
|
|
97
|
+
Bounded contrasts (`ltanh`, `lexp`) achieve 40–48× speedup across
|
|
98
|
+
all dataset sizes. Polynomial contrasts (K=4,6,8) benefit from GPU
|
|
99
|
+
acceleration at N ≥ 500,000, reaching up to 2.4× at K=8. All runs
|
|
100
|
+
use batch_size=500. Full benchmark table:
|
|
101
|
+
[Experiment code](https://github.com/Kleinverse/research/tree/main/icalcc).
|
|
102
|
+
|
|
103
|
+
## Requirements
|
|
104
|
+
- Python ≥ 3.9
|
|
105
|
+
- numpy ≥ 1.24
|
|
106
|
+
- scikit-learn ≥ 1.3
|
|
107
|
+
- icalcc ≥ 0.1.3
|
|
108
|
+
- torch ≥ 2.0
|
|
109
|
+
|
|
110
|
+
## See Also
|
|
111
|
+
- [icalcc](https://github.com/Kleinverse/icalcc) — CPU version
|
|
112
|
+
- [Experiment code](https://github.com/Kleinverse/research/tree/main/icalcc)
|
|
113
|
+
|
|
114
|
+
## Citation
|
|
115
|
+
|
|
116
|
+
If you use this package, please cite both the software paper and
|
|
117
|
+
the underlying LCC kernel paper:
|
|
118
|
+
```bibtex
|
|
119
|
+
@article{saito2026icalcc,
|
|
120
|
+
author = {Saito, Tetsuya},
|
|
121
|
+
title = {{ICALCC}: Locally Centered Contrast Functions for
|
|
122
|
+
{FastICA} with {GPU} Acceleration},
|
|
123
|
+
journal = {TechRxiv},
|
|
124
|
+
year = {2026}
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
@article{saito2026lcc,
|
|
128
|
+
author = {Saito, Tetsuya},
|
|
129
|
+
title = {Locally Centered Cyclic Kernels for Higher-Order
|
|
130
|
+
Independent Component Analysis},
|
|
131
|
+
journal = {TechRxiv},
|
|
132
|
+
year = {2026}
|
|
133
|
+
}
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## License
|
|
137
|
+
[CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# gpuicalcc
|
|
2
|
+
GPU-accelerated locally centered contrast functions for FastICA.
|
|
3
|
+
PyTorch extension of [icalcc](https://github.com/Kleinverse/icalcc).
|
|
4
|
+
Same API, drop-in replacement with CUDA acceleration for bounded
|
|
5
|
+
and polynomial LCC contrasts.
|
|
6
|
+
```python
|
|
7
|
+
from gpuicalcc import GPUICALCC
|
|
8
|
+
ica = GPUICALCC(n_components=4, K='ltanh', device='cuda', random_state=0)
|
|
9
|
+
S_hat = ica.fit_transform(X)
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
```bash
|
|
14
|
+
pip install gpuicalcc
|
|
15
|
+
```
|
|
16
|
+
Requires PyTorch with CUDA. See [pytorch.org](https://pytorch.org)
|
|
17
|
+
for installation instructions.
|
|
18
|
+
|
|
19
|
+
## Supported K Values
|
|
20
|
+
| K | Description |
|
|
21
|
+
|---|---|
|
|
22
|
+
| `'ltanh'` | Bounded LCC-tanh — 40–48× GPU speedup |
|
|
23
|
+
| `'lexp'` | Bounded LCC-exp — 40–48× GPU speedup |
|
|
24
|
+
| `4` | Polynomial LCC order 4 |
|
|
25
|
+
| `6` | Polynomial LCC order 6, couples m₃, m₄, m₆ |
|
|
26
|
+
| `8` | Polynomial LCC order 8, up to 2.4× GPU speedup at N≥500k |
|
|
27
|
+
| `'tanh'` | Classical logcosh contrast (CPU fallback) |
|
|
28
|
+
| `'exp'` | Classical Gaussian contrast (CPU fallback) |
|
|
29
|
+
| `'skew'` | Classical cube contrast (CPU fallback) |
|
|
30
|
+
|
|
31
|
+
Bounded contrasts (`'ltanh'`, `'lexp'`) achieve 40–48× speedup
|
|
32
|
+
across all dataset sizes due to the O(NB) pairwise computation
|
|
33
|
+
being embarrassingly parallel. Polynomial contrasts benefit from
|
|
34
|
+
GPU acceleration at N ≥ 500,000. Classical contrasts fall back
|
|
35
|
+
to the CPU implementation in `icalcc`.
|
|
36
|
+
|
|
37
|
+
## Usage
|
|
38
|
+
```python
|
|
39
|
+
from gpuicalcc import GPUICALCC
|
|
40
|
+
|
|
41
|
+
# Bounded LCC-tanh (recommended for heavy-tailed or skewed sources)
|
|
42
|
+
ica = GPUICALCC(n_components=4, K='ltanh', device='cuda', random_state=0)
|
|
43
|
+
S_hat = ica.fit_transform(X)
|
|
44
|
+
|
|
45
|
+
# Bounded LCC-tanh with memory limit
|
|
46
|
+
ica = GPUICALCC(n_components=4, K='ltanh', device='cuda',
|
|
47
|
+
batch_size=500, gpu_mem_limit=8, random_state=0)
|
|
48
|
+
S_hat = ica.fit_transform(X)
|
|
49
|
+
|
|
50
|
+
# Polynomial LCC order 8 (near-Gaussian sources)
|
|
51
|
+
ica = GPUICALCC(n_components=4, K=8, device='cuda', random_state=0)
|
|
52
|
+
S_hat = ica.fit_transform(X)
|
|
53
|
+
|
|
54
|
+
# Falls back to CPU automatically if CUDA unavailable
|
|
55
|
+
ica = GPUICALCC(n_components=4, K='ltanh', device='cuda', random_state=0)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Parameters
|
|
59
|
+
| Parameter | Default | Description |
|
|
60
|
+
|---|---|---|
|
|
61
|
+
| `device` | `'cuda'` | PyTorch device. Falls back to `'cpu'` if CUDA unavailable |
|
|
62
|
+
| `batch_size` | `500` | Subsample size B for bounded pairwise computation |
|
|
63
|
+
| `gpu_mem_limit` | `None` | GPU memory limit in GB. Auto-detected if None |
|
|
64
|
+
| `clear_gpu` | `True` | Clear GPU cache after fit |
|
|
65
|
+
|
|
66
|
+
## Benchmark
|
|
67
|
+

|
|
68
|
+
|
|
69
|
+
Bounded contrasts (`ltanh`, `lexp`) achieve 40–48× speedup across
|
|
70
|
+
all dataset sizes. Polynomial contrasts (K=4,6,8) benefit from GPU
|
|
71
|
+
acceleration at N ≥ 500,000, reaching up to 2.4× at K=8. All runs
|
|
72
|
+
use batch_size=500. Full benchmark table:
|
|
73
|
+
[Experiment code](https://github.com/Kleinverse/research/tree/main/icalcc).
|
|
74
|
+
|
|
75
|
+
## Requirements
|
|
76
|
+
- Python ≥ 3.9
|
|
77
|
+
- numpy ≥ 1.24
|
|
78
|
+
- scikit-learn ≥ 1.3
|
|
79
|
+
- icalcc ≥ 0.1.3
|
|
80
|
+
- torch ≥ 2.0
|
|
81
|
+
|
|
82
|
+
## See Also
|
|
83
|
+
- [icalcc](https://github.com/Kleinverse/icalcc) — CPU version
|
|
84
|
+
- [Experiment code](https://github.com/Kleinverse/research/tree/main/icalcc)
|
|
85
|
+
|
|
86
|
+
## Citation
|
|
87
|
+
|
|
88
|
+
If you use this package, please cite both the software paper and
|
|
89
|
+
the underlying LCC kernel paper:
|
|
90
|
+
```bibtex
|
|
91
|
+
@article{saito2026icalcc,
|
|
92
|
+
author = {Saito, Tetsuya},
|
|
93
|
+
title = {{ICALCC}: Locally Centered Contrast Functions for
|
|
94
|
+
{FastICA} with {GPU} Acceleration},
|
|
95
|
+
journal = {TechRxiv},
|
|
96
|
+
year = {2026}
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
@article{saito2026lcc,
|
|
100
|
+
author = {Saito, Tetsuya},
|
|
101
|
+
title = {Locally Centered Cyclic Kernels for Higher-Order
|
|
102
|
+
Independent Component Analysis},
|
|
103
|
+
journal = {TechRxiv},
|
|
104
|
+
year = {2026}
|
|
105
|
+
}
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## License
|
|
109
|
+
[CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gpuicalcc
|
|
3
|
+
Version: 0.1.3
|
|
4
|
+
Summary: GPU-accelerated locally centered contrast functions for FastICA
|
|
5
|
+
Author-email: Tetsuya Saito <ted@kleinverse.io>
|
|
6
|
+
License: CC BY 4.0
|
|
7
|
+
Project-URL: Repository, https://github.com/Kleinverse/gpuicalcc
|
|
8
|
+
Project-URL: CPU Package, https://github.com/Kleinverse/icalcc
|
|
9
|
+
Project-URL: Research, https://github.com/Kleinverse/research
|
|
10
|
+
Keywords: ICA,independent component analysis,blind source separation,locally centered contrast,cumulants,higher-order statistics,Renyi entropy,FastICA,scikit-learn,GPU,PyTorch,CUDA
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: License :: Other/Proprietary License
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: numpy>=1.24
|
|
24
|
+
Requires-Dist: scikit-learn>=1.3
|
|
25
|
+
Requires-Dist: icalcc>=0.1.3
|
|
26
|
+
Requires-Dist: torch>=2.0
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# gpuicalcc
|
|
30
|
+
GPU-accelerated locally centered contrast functions for FastICA.
|
|
31
|
+
PyTorch extension of [icalcc](https://github.com/Kleinverse/icalcc).
|
|
32
|
+
Same API, drop-in replacement with CUDA acceleration for bounded
|
|
33
|
+
and polynomial LCC contrasts.
|
|
34
|
+
```python
|
|
35
|
+
from gpuicalcc import GPUICALCC
|
|
36
|
+
ica = GPUICALCC(n_components=4, K='ltanh', device='cuda', random_state=0)
|
|
37
|
+
S_hat = ica.fit_transform(X)
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
```bash
|
|
42
|
+
pip install gpuicalcc
|
|
43
|
+
```
|
|
44
|
+
Requires PyTorch with CUDA. See [pytorch.org](https://pytorch.org)
|
|
45
|
+
for installation instructions.
|
|
46
|
+
|
|
47
|
+
## Supported K Values
|
|
48
|
+
| K | Description |
|
|
49
|
+
|---|---|
|
|
50
|
+
| `'ltanh'` | Bounded LCC-tanh — 40–48× GPU speedup |
|
|
51
|
+
| `'lexp'` | Bounded LCC-exp — 40–48× GPU speedup |
|
|
52
|
+
| `4` | Polynomial LCC order 4 |
|
|
53
|
+
| `6` | Polynomial LCC order 6, couples m₃, m₄, m₆ |
|
|
54
|
+
| `8` | Polynomial LCC order 8, up to 2.4× GPU speedup at N≥500k |
|
|
55
|
+
| `'tanh'` | Classical logcosh contrast (CPU fallback) |
|
|
56
|
+
| `'exp'` | Classical Gaussian contrast (CPU fallback) |
|
|
57
|
+
| `'skew'` | Classical cube contrast (CPU fallback) |
|
|
58
|
+
|
|
59
|
+
Bounded contrasts (`'ltanh'`, `'lexp'`) achieve 40–48× speedup
|
|
60
|
+
across all dataset sizes due to the O(NB) pairwise computation
|
|
61
|
+
being embarrassingly parallel. Polynomial contrasts benefit from
|
|
62
|
+
GPU acceleration at N ≥ 500,000. Classical contrasts fall back
|
|
63
|
+
to the CPU implementation in `icalcc`.
|
|
64
|
+
|
|
65
|
+
## Usage
|
|
66
|
+
```python
|
|
67
|
+
from gpuicalcc import GPUICALCC
|
|
68
|
+
|
|
69
|
+
# Bounded LCC-tanh (recommended for heavy-tailed or skewed sources)
|
|
70
|
+
ica = GPUICALCC(n_components=4, K='ltanh', device='cuda', random_state=0)
|
|
71
|
+
S_hat = ica.fit_transform(X)
|
|
72
|
+
|
|
73
|
+
# Bounded LCC-tanh with memory limit
|
|
74
|
+
ica = GPUICALCC(n_components=4, K='ltanh', device='cuda',
|
|
75
|
+
batch_size=500, gpu_mem_limit=8, random_state=0)
|
|
76
|
+
S_hat = ica.fit_transform(X)
|
|
77
|
+
|
|
78
|
+
# Polynomial LCC order 8 (near-Gaussian sources)
|
|
79
|
+
ica = GPUICALCC(n_components=4, K=8, device='cuda', random_state=0)
|
|
80
|
+
S_hat = ica.fit_transform(X)
|
|
81
|
+
|
|
82
|
+
# Falls back to CPU automatically if CUDA unavailable
|
|
83
|
+
ica = GPUICALCC(n_components=4, K='ltanh', device='cuda', random_state=0)
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Parameters
|
|
87
|
+
| Parameter | Default | Description |
|
|
88
|
+
|---|---|---|
|
|
89
|
+
| `device` | `'cuda'` | PyTorch device. Falls back to `'cpu'` if CUDA unavailable |
|
|
90
|
+
| `batch_size` | `500` | Subsample size B for bounded pairwise computation |
|
|
91
|
+
| `gpu_mem_limit` | `None` | GPU memory limit in GB. Auto-detected if None |
|
|
92
|
+
| `clear_gpu` | `True` | Clear GPU cache after fit |
|
|
93
|
+
|
|
94
|
+
## Benchmark
|
|
95
|
+

|
|
96
|
+
|
|
97
|
+
Bounded contrasts (`ltanh`, `lexp`) achieve 40–48× speedup across
|
|
98
|
+
all dataset sizes. Polynomial contrasts (K=4,6,8) benefit from GPU
|
|
99
|
+
acceleration at N ≥ 500,000, reaching up to 2.4× at K=8. All runs
|
|
100
|
+
use batch_size=500. Full benchmark table:
|
|
101
|
+
[Experiment code](https://github.com/Kleinverse/research/tree/main/icalcc).
|
|
102
|
+
|
|
103
|
+
## Requirements
|
|
104
|
+
- Python ≥ 3.9
|
|
105
|
+
- numpy ≥ 1.24
|
|
106
|
+
- scikit-learn ≥ 1.3
|
|
107
|
+
- icalcc ≥ 0.1.3
|
|
108
|
+
- torch ≥ 2.0
|
|
109
|
+
|
|
110
|
+
## See Also
|
|
111
|
+
- [icalcc](https://github.com/Kleinverse/icalcc) — CPU version
|
|
112
|
+
- [Experiment code](https://github.com/Kleinverse/research/tree/main/icalcc)
|
|
113
|
+
|
|
114
|
+
## Citation
|
|
115
|
+
|
|
116
|
+
If you use this package, please cite both the software paper and
|
|
117
|
+
the underlying LCC kernel paper:
|
|
118
|
+
```bibtex
|
|
119
|
+
@article{saito2026icalcc,
|
|
120
|
+
author = {Saito, Tetsuya},
|
|
121
|
+
title = {{ICALCC}: Locally Centered Contrast Functions for
|
|
122
|
+
{FastICA} with {GPU} Acceleration},
|
|
123
|
+
journal = {TechRxiv},
|
|
124
|
+
year = {2026}
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
@article{saito2026lcc,
|
|
128
|
+
author = {Saito, Tetsuya},
|
|
129
|
+
title = {Locally Centered Cyclic Kernels for Higher-Order
|
|
130
|
+
Independent Component Analysis},
|
|
131
|
+
journal = {TechRxiv},
|
|
132
|
+
year = {2026}
|
|
133
|
+
}
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## License
|
|
137
|
+
[CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
gpuicalcc
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
"""gpuicalcc: PyTorch-accelerated locally centered cyclic contrasts.
|
|
2
|
+
|
|
3
|
+
GPU extension for ICALCC. Same API as sklearn FastICA:
|
|
4
|
+
just replace the import and set K.
|
|
5
|
+
|
|
6
|
+
from gpuicalcc import GPUICALCC
|
|
7
|
+
est = GPUICALCC(K='ltanh', device='cuda')
|
|
8
|
+
est.fit(X)
|
|
9
|
+
|
|
10
|
+
Supported K values: 4, 6, 8, 'ltanh', 'lexp', 'tanh', 'exp', 'skew'.
|
|
11
|
+
|
|
12
|
+
Requirements: icalcc, torch
|
|
13
|
+
|
|
14
|
+
Reference: T. Saito, "Locally Centered Cyclic Kernels for Higher-Order
|
|
15
|
+
Independent Component Analysis," TechRxiv, 2026.
|
|
16
|
+
https://doi.org/10.36227/techrxiv.XXXXXXX
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import numpy as np
|
|
20
|
+
import warnings
|
|
21
|
+
import torch
|
|
22
|
+
from sklearn.exceptions import ConvergenceWarning
|
|
23
|
+
from icalcc import ICALCC
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# -------------------------------------------------------------------
|
|
27
|
+
# Polynomial LCC on GPU (K = 4, 6, 8)
|
|
28
|
+
# -------------------------------------------------------------------
|
|
29
|
+
|
|
30
|
+
def _gpu_lcc_h_gprime(y_np, k, device="cuda"):
|
|
31
|
+
"""Polynomial LCC nonlinearity on GPU."""
|
|
32
|
+
y = torch.as_tensor(y_np, dtype=torch.float64, device=device)
|
|
33
|
+
y2 = y * y
|
|
34
|
+
y3 = y2 * y
|
|
35
|
+
|
|
36
|
+
if k == 4:
|
|
37
|
+
gy = (-3.0 / 16) * y3
|
|
38
|
+
gpy = (-9.0 / 16) * y2
|
|
39
|
+
return gy.cpu().numpy(), gpy.cpu().numpy()
|
|
40
|
+
|
|
41
|
+
m3 = y3.mean().item()
|
|
42
|
+
m4 = (y2 * y2).mean().item()
|
|
43
|
+
|
|
44
|
+
if k == 6:
|
|
45
|
+
dJ3 = 145 * m3 / 1944.0
|
|
46
|
+
dJ4 = 115.0 / 2592
|
|
47
|
+
dJ6 = -5.0 / 7776
|
|
48
|
+
y4 = y2 * y2
|
|
49
|
+
gy = dJ3 * 3 * y2 + dJ4 * 4 * y3 + dJ6 * 6 * (y4 * y)
|
|
50
|
+
gpy = dJ3 * 6 * y + dJ4 * 12 * y2 + dJ6 * 30 * y4
|
|
51
|
+
return gy.cpu().numpy(), gpy.cpu().numpy()
|
|
52
|
+
|
|
53
|
+
# k == 8
|
|
54
|
+
m5 = (y2 * y3).mean().item()
|
|
55
|
+
m6 = (y3 * y3).mean().item()
|
|
56
|
+
dJ3 = -7665 * m3 / 65536.0 + 497 * m5 / 262144.0
|
|
57
|
+
dJ4 = 2765 * m4 / 1048576.0 - 18795.0 / 524288
|
|
58
|
+
dJ5 = 497 * m3 / 262144.0
|
|
59
|
+
dJ6 = 329.0 / 524288
|
|
60
|
+
dJ8 = -7.0 / 2097152
|
|
61
|
+
y4 = y2 * y2
|
|
62
|
+
gy = (dJ3 * 3 * y2 + dJ4 * 4 * y3
|
|
63
|
+
+ dJ5 * 5 * y4 + dJ6 * 6 * (y4 * y)
|
|
64
|
+
+ dJ8 * 8 * (y4 * y3))
|
|
65
|
+
gpy = (dJ3 * 6 * y + dJ4 * 12 * y2
|
|
66
|
+
+ dJ5 * 20 * y3 + dJ6 * 30 * y4
|
|
67
|
+
+ dJ8 * 56 * (y3 * y3))
|
|
68
|
+
return gy.cpu().numpy(), gpy.cpu().numpy()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _gpu_lcc_contrast(x, K=6, device="cuda"):
    """Polynomial LCC contrast adapter for sklearn's ``fun`` hook.

    Accepts either a single projection (1-D) or a stack of components
    with shape ``(p, N)``. For 2-D input, returns g(Y) elementwise
    plus the per-component mean of g'(Y), as FastICA expects.
    """
    if x.ndim == 1:
        return _gpu_lcc_h_gprime(x, K, device=device)

    n_comp = x.shape[0]
    g_vals = np.empty_like(x)
    gprime_means = np.empty(n_comp, dtype=x.dtype)
    for row, component in enumerate(x):
        g_row, gprime_row = _gpu_lcc_h_gprime(component, K, device=device)
        g_vals[row] = g_row
        gprime_means[row] = gprime_row.mean()
    return g_vals, gprime_means
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# -------------------------------------------------------------------
|
|
86
|
+
# Bounded LCC on GPU (K = 'ltanh', 'lexp')
|
|
87
|
+
# -------------------------------------------------------------------
|
|
88
|
+
|
|
89
|
+
def _get_gpu_mem_bytes(device, gpu_mem_limit):
|
|
90
|
+
"""Resolve effective GPU memory limit in bytes.
|
|
91
|
+
|
|
92
|
+
Auto-detects free memory. If gpu_mem_limit is set and exceeds
|
|
93
|
+
97% of total, warns and caps at 97%.
|
|
94
|
+
"""
|
|
95
|
+
if not device.startswith("cuda"):
|
|
96
|
+
return 4 * 1024**3 # 4GB fallback for CPU
|
|
97
|
+
free, total = torch.cuda.mem_get_info(device)
|
|
98
|
+
safe = int(total * 0.97)
|
|
99
|
+
if gpu_mem_limit is None:
|
|
100
|
+
return min(free, safe)
|
|
101
|
+
requested = int(gpu_mem_limit * 1024**3)
|
|
102
|
+
if requested > safe:
|
|
103
|
+
warnings.warn(
|
|
104
|
+
f"gpu_mem_limit={gpu_mem_limit}GB exceeds 97% of "
|
|
105
|
+
f"GPU memory ({total/1024**3:.1f}GB). "
|
|
106
|
+
f"Capping at {safe/1024**3:.1f}GB.",
|
|
107
|
+
RuntimeWarning)
|
|
108
|
+
return safe
|
|
109
|
+
return requested
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _gpu_bounded_contrast(x, G="tanh", batch_size=500,
                          device="cuda", gpu_mem_limit=None):
    """Pairwise bounded LCC contrast on GPU.

    Computes g and g' as means of a bounded kernel over pairwise
    differences between every sample and an evenly strided B-point
    subsample. Handles both deflation (1-D) and parallel (2-D) input.
    Auto-detects GPU memory; user can override via ``gpu_mem_limit``
    (GB). Halves the chunk size on CUDA OOM and retries.

    Parameters
    ----------
    x : ndarray, shape (N,) or (p, N)
    G : {'tanh', any other value}, default='tanh'
        Kernel family; anything but 'tanh' selects the Gaussian kernel.
    batch_size : int
        Subsample size B.
    device : str
    gpu_mem_limit : float or None
        Budget in GB (see ``_get_gpu_mem_bytes``).

    Returns
    -------
    gY : ndarray, same shape as ``x``
    gpY_mean : ndarray, shape (p,) — per-component mean of g'.
    """
    if x.ndim == 1:
        return _gpu_bounded_1d(x, G, batch_size, device)

    p, N = x.shape
    B = min(batch_size, N)
    # Deterministic, evenly strided subsample of B columns.
    step = max(1, N // B)
    idx = np.arange(0, N, step)[:B]

    mem = _get_gpu_mem_bytes(device, gpu_mem_limit)
    # Rough budget: ~6 float64 temporaries of shape (p, chunk, B).
    max_chunk = max(1, mem // (p * B * 8 * 6))

    X_batch = torch.as_tensor(x[:, idx], dtype=torch.float64,
                              device=device)
    gY = np.empty_like(x)
    gpY = np.empty_like(x)

    start = 0
    while start < N:
        end = min(start + max_chunk, N)
        try:
            X_chunk = torch.as_tensor(
                x[:, start:end], dtype=torch.float64, device=device)
            # (p, chunk, B) pairwise differences against the subsample.
            diff = X_chunk.unsqueeze(2) - X_batch.unsqueeze(1)

            if G == "tanh":
                t = torch.tanh(diff)
                gY[:, start:end] = t.mean(dim=2).cpu().numpy()
                gpY[:, start:end] = (1.0 - t * t).mean(
                    dim=2).cpu().numpy()
            else:  # exp (Gaussian) kernel
                e = torch.exp(-0.5 * diff * diff)
                gY[:, start:end] = (diff * e).mean(
                    dim=2).cpu().numpy()
                gpY[:, start:end] = ((1.0 - diff * diff) * e).mean(
                    dim=2).cpu().numpy()
        except torch.cuda.OutOfMemoryError:
            # BUGFIX: the original did `del X_chunk` here, which raises
            # NameError when the allocation of X_chunk itself OOM'd.
            # Rebinding to None drops any partial references instead.
            X_chunk = diff = None  # noqa: F841
            torch.cuda.empty_cache()
            if max_chunk == 1:
                raise  # cannot shrink further; avoid an infinite loop
            max_chunk //= 2
            continue

        del diff, X_chunk
        torch.cuda.empty_cache()
        start = end

    del X_batch
    return gY, gpY.mean(axis=1)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _gpu_bounded_1d(y_np, G, batch_size, device):
|
|
170
|
+
"""1-D bounded LCC for deflation algorithm."""
|
|
171
|
+
N = len(y_np)
|
|
172
|
+
B = min(batch_size, N)
|
|
173
|
+
step = max(1, N // B)
|
|
174
|
+
idx = np.arange(0, N, step)[:B]
|
|
175
|
+
|
|
176
|
+
y = torch.as_tensor(y_np, dtype=torch.float64, device=device)
|
|
177
|
+
y_batch = y[idx]
|
|
178
|
+
diff = y.unsqueeze(1) - y_batch.unsqueeze(0)
|
|
179
|
+
|
|
180
|
+
if G == "tanh":
|
|
181
|
+
t = torch.tanh(diff)
|
|
182
|
+
gy = t.mean(dim=1)
|
|
183
|
+
gpy = (1.0 - t * t).mean(dim=1)
|
|
184
|
+
else:
|
|
185
|
+
e = torch.exp(-0.5 * diff * diff)
|
|
186
|
+
gy = (diff * e).mean(dim=1)
|
|
187
|
+
gpy = ((1.0 - diff * diff) * e).mean(dim=1)
|
|
188
|
+
|
|
189
|
+
return gy.cpu().numpy(), gpy.cpu().numpy()
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
# -------------------------------------------------------------------
|
|
193
|
+
# Main class
|
|
194
|
+
# -------------------------------------------------------------------
|
|
195
|
+
|
|
196
|
+
class GPUICALCC(ICALCC):
    """GPU-accelerated ICALCC.

    Accelerates all LCC contrasts (polynomial and bounded) via
    PyTorch. Classical contrasts ('tanh', 'exp', 'skew') and
    FastICA(k) contrasts fall back to the parent class.

    Parameters
    ----------
    K : contrast selector (same as ICALCC)
    device : str, default='cuda'
        PyTorch device. Falls back to 'cpu' if CUDA unavailable.
    batch_size : int, default=500
        Subsample size for bounded pairwise computation.
    gpu_mem_limit : float or None, default=None
        GPU memory budget in GB; auto-detected when None.
    clear_gpu : bool, default=True
        Release cached CUDA memory after each fit.
    **kwargs : passed to ICALCC

    Examples
    --------
    >>> from gpuicalcc import GPUICALCC
    >>> est = GPUICALCC(n_components=4, K='ltanh', device='cuda')
    >>> S_hat = est.fit_transform(X)
    >>> est6 = GPUICALCC(n_components=4, K=6, device='cuda')
    >>> S6 = est6.fit_transform(X)
    """

    # Constructor parameters owned by this subclass; everything else
    # is forwarded to ICALCC.
    _GPU_PARAMS = ("device", "batch_size", "gpu_mem_limit", "clear_gpu")

    def __init__(self, n_components=None, *, K=6, device="cuda",
                 batch_size=500, gpu_mem_limit=None,
                 algorithm="parallel",
                 whiten="unit-variance", max_iter=200, tol=1e-4,
                 w_init=None, whiten_solver="svd",
                 random_state=None, clear_gpu=True):
        # Degrade gracefully when no GPU is present.
        if device.startswith("cuda") and not torch.cuda.is_available():
            warnings.warn(
                "CUDA not available, falling back to device='cpu'.",
                RuntimeWarning)
            device = "cpu"
        self.device = device
        self.batch_size = batch_size
        self.gpu_mem_limit = gpu_mem_limit
        self.clear_gpu = clear_gpu

        super().__init__(
            n_components=n_components, K=K,
            algorithm=algorithm, whiten=whiten,
            max_iter=max_iter, tol=tol, w_init=w_init,
            whiten_solver=whiten_solver, random_state=random_state)

        self._install_gpu_fun()

    def _install_gpu_fun(self):
        """Point ``self.fun`` at the GPU contrast matching ``self.K``.

        Factored out of ``__init__`` and ``set_params``, which
        previously duplicated this logic verbatim.
        """
        K = self.K
        if K in self._LCC_BOUNDED_MAP:
            self.fun = _gpu_bounded_contrast
            self.fun_args = dict(G=self._LCC_BOUNDED_MAP[K],
                                 batch_size=self.batch_size,
                                 device=self.device,
                                 gpu_mem_limit=self.gpu_mem_limit)
        elif isinstance(K, int) and K in (4, 6, 8):
            self.fun = _gpu_lcc_contrast
            self.fun_args = dict(K=K, device=self.device)
        # NOTE(review): other K values keep whatever fun the parent
        # configured; switching from a GPU K to a classical K via
        # set_params relies on ICALCC.set_params resetting fun —
        # confirm against the parent implementation.

    def get_params(self, deep=True):
        """Return estimator parameters, including the GPU extras."""
        params = super().get_params(deep=deep)
        for name in self._GPU_PARAMS:
            params[name] = getattr(self, name)
        return params

    def set_params(self, **params):
        """Set parameters, consuming the GPU-specific ones locally.

        BUGFIX: the original used ``pop(..., None)`` plus ``is not
        None`` checks, which made it impossible to reset
        ``gpu_mem_limit`` back to None and never handled ``clear_gpu``
        explicitly. Membership tests fix both.
        """
        for name in self._GPU_PARAMS:
            if name in params:
                setattr(self, name, params.pop(name))
        super().set_params(**params)
        self._install_gpu_fun()
        return self

    def _clear(self):
        """Release cached CUDA memory after a fit, if requested."""
        if self.clear_gpu and self.device.startswith("cuda"):
            import gc
            gc.collect()
            torch.cuda.empty_cache()

    def fit(self, X, y=None):
        """Fit the model, then release GPU memory."""
        result = super().fit(X, y)
        self._clear()
        return result

    def fit_transform(self, X, y=None):
        """Fit the model, recover sources, then release GPU memory."""
        result = super().fit_transform(X, y)
        self._clear()
        return result
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "gpuicalcc"
|
|
7
|
+
version = "0.1.3"
|
|
8
|
+
description = "GPU-accelerated locally centered contrast functions for FastICA"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "CC BY 4.0" }
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "Tetsuya Saito", email = "ted@kleinverse.io" }
|
|
13
|
+
]
|
|
14
|
+
keywords = [
|
|
15
|
+
"ICA", "independent component analysis", "blind source separation",
|
|
16
|
+
"locally centered contrast", "cumulants", "higher-order statistics",
|
|
17
|
+
"Renyi entropy", "FastICA", "scikit-learn", "GPU", "PyTorch", "CUDA"
|
|
18
|
+
]
|
|
19
|
+
classifiers = [
|
|
20
|
+
"Development Status :: 4 - Beta",
|
|
21
|
+
"Intended Audience :: Science/Research",
|
|
22
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
23
|
+
"Programming Language :: Python :: 3",
|
|
24
|
+
"Programming Language :: Python :: 3.9",
|
|
25
|
+
"Programming Language :: Python :: 3.10",
|
|
26
|
+
"Programming Language :: Python :: 3.11",
|
|
27
|
+
"Programming Language :: Python :: 3.12",
|
|
28
|
+
"License :: Other/Proprietary License",
|
|
29
|
+
]
|
|
30
|
+
requires-python = ">=3.9"
|
|
31
|
+
dependencies = [
|
|
32
|
+
"numpy>=1.24",
|
|
33
|
+
"scikit-learn>=1.3",
|
|
34
|
+
"icalcc>=0.1.3",
|
|
35
|
+
"torch>=2.0",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
[project.urls]
|
|
39
|
+
Repository = "https://github.com/Kleinverse/gpuicalcc"
|
|
40
|
+
"CPU Package" = "https://github.com/Kleinverse/icalcc"
|
|
41
|
+
Research = "https://github.com/Kleinverse/research"
|
|
42
|
+
|
|
43
|
+
[tool.setuptools]
|
|
44
|
+
py-modules = ["gpuicalcc"]
|