gpuicalcc 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ Creative Commons Attribution 4.0 International (CC BY 4.0)
2
+
3
+ Copyright (c) 2026 Kleinverse AI, Inc.
4
+
5
+ You are free to:
6
+ Share — copy and redistribute the material in any medium or format
7
+ Adapt — remix, transform, and build upon the material for any purpose,
8
+ even commercially.
9
+
10
+ Under the following terms:
11
+ Attribution — You must give appropriate credit, provide a link to the
12
+ license, and indicate if changes were made. You may do so
13
+ in any reasonable manner, but not in any way that suggests
14
+ the licensor endorses you or your use.
15
+
16
+ No additional restrictions — You may not apply legal terms or technological
17
+ measures that legally restrict others from doing anything the license permits.
18
+
19
+ Full license text: https://creativecommons.org/licenses/by/4.0/legalcode
@@ -0,0 +1,137 @@
1
+ Metadata-Version: 2.4
2
+ Name: gpuicalcc
3
+ Version: 0.1.3
4
+ Summary: GPU-accelerated locally centered contrast functions for FastICA
5
+ Author-email: Tetsuya Saito <ted@kleinverse.io>
6
+ License: CC BY 4.0
7
+ Project-URL: Repository, https://github.com/Kleinverse/gpuicalcc
8
+ Project-URL: CPU Package, https://github.com/Kleinverse/icalcc
9
+ Project-URL: Research, https://github.com/Kleinverse/research
10
+ Keywords: ICA,independent component analysis,blind source separation,locally centered contrast,cumulants,higher-order statistics,Renyi entropy,FastICA,scikit-learn,GPU,PyTorch,CUDA
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: License :: Other/Proprietary License
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: numpy>=1.24
24
+ Requires-Dist: scikit-learn>=1.3
25
+ Requires-Dist: icalcc>=0.1.3
26
+ Requires-Dist: torch>=2.0
27
+ Dynamic: license-file
28
+
29
+ # gpuicalcc
30
+ GPU-accelerated locally centered contrast functions for FastICA.
31
+ PyTorch extension of [icalcc](https://github.com/Kleinverse/icalcc).
32
+ Same API, drop-in replacement with CUDA acceleration for bounded
33
+ and polynomial LCC contrasts.
34
+ ```python
35
+ from gpuicalcc import GPUICALCC
36
+ ica = GPUICALCC(n_components=4, K='ltanh', device='cuda', random_state=0)
37
+ S_hat = ica.fit_transform(X)
38
+ ```
39
+
40
+ ## Installation
41
+ ```bash
42
+ pip install gpuicalcc
43
+ ```
44
+ Requires PyTorch with CUDA. See [pytorch.org](https://pytorch.org)
45
+ for installation instructions.
46
+
47
+ ## Supported K Values
48
+ | K | Description |
49
+ |---|---|
50
+ | `'ltanh'` | Bounded LCC-tanh — 40–48× GPU speedup |
51
+ | `'lexp'` | Bounded LCC-exp — 40–48× GPU speedup |
52
+ | `4` | Polynomial LCC order 4 |
53
+ | `6` | Polynomial LCC order 6, couples m₃, m₄, m₆ |
54
+ | `8` | Polynomial LCC order 8, up to 2.4× GPU speedup at N≥500k |
55
+ | `'tanh'` | Classical logcosh contrast (CPU fallback) |
56
+ | `'exp'` | Classical Gaussian contrast (CPU fallback) |
57
+ | `'skew'` | Classical cube contrast (CPU fallback) |
58
+
59
+ Bounded contrasts (`'ltanh'`, `'lexp'`) achieve 40–48× speedup
60
+ across all dataset sizes due to the O(NB) pairwise computation
61
+ being embarrassingly parallel. Polynomial contrasts benefit from
62
+ GPU acceleration at N ≥ 500,000. Classical contrasts fall back
63
+ to the CPU implementation in `icalcc`.
64
+
65
+ ## Usage
66
+ ```python
67
+ from gpuicalcc import GPUICALCC
68
+
69
+ # Bounded LCC-tanh (recommended for heavy-tailed or skewed sources)
70
+ ica = GPUICALCC(n_components=4, K='ltanh', device='cuda', random_state=0)
71
+ S_hat = ica.fit_transform(X)
72
+
73
+ # Bounded LCC-tanh with memory limit
74
+ ica = GPUICALCC(n_components=4, K='ltanh', device='cuda',
75
+ batch_size=500, gpu_mem_limit=8, random_state=0)
76
+ S_hat = ica.fit_transform(X)
77
+
78
+ # Polynomial LCC order 8 (near-Gaussian sources)
79
+ ica = GPUICALCC(n_components=4, K=8, device='cuda', random_state=0)
80
+ S_hat = ica.fit_transform(X)
81
+
82
+ # Falls back to CPU automatically if CUDA unavailable
83
+ ica = GPUICALCC(n_components=4, K='ltanh', device='cuda', random_state=0)
84
+ ```
85
+
86
+ ## Parameters
87
+ | Parameter | Default | Description |
88
+ |---|---|---|
89
+ | `device` | `'cuda'` | PyTorch device. Falls back to `'cpu'` if CUDA unavailable |
90
+ | `batch_size` | `500` | Subsample size B for bounded pairwise computation |
91
+ | `gpu_mem_limit` | `None` | GPU memory limit in GB. Auto-detected if None |
92
+ | `clear_gpu` | `True` | Clear GPU cache after fit |
93
+
94
+ ## Benchmark
95
+ ![GPU Benchmark](https://github.com/Kleinverse/research/blob/main/icalcc/img/benchmark.png)
96
+
97
+ Bounded contrasts (`ltanh`, `lexp`) achieve 40–48× speedup across
98
+ all dataset sizes. Polynomial contrasts (K=4,6,8) benefit from GPU
99
+ acceleration at N ≥ 500,000, reaching up to 2.4× at K=8. All runs
100
+ use batch_size=500. Full benchmark table:
101
+ [Experiment code](https://github.com/Kleinverse/research/tree/main/icalcc).
102
+
103
+ ## Requirements
104
+ - Python ≥ 3.9
105
+ - numpy ≥ 1.24
106
+ - scikit-learn ≥ 1.3
107
+ - icalcc ≥ 0.1.3
108
+ - torch ≥ 2.0
109
+
110
+ ## See Also
111
+ - [icalcc](https://github.com/Kleinverse/icalcc) — CPU version
112
+ - [Experiment code](https://github.com/Kleinverse/research/tree/main/icalcc)
113
+
114
+ ## Citation
115
+
116
+ If you use this package, please cite both the software paper and
117
+ the underlying LCC kernel paper:
118
+ ```bibtex
119
+ @article{saito2026icalcc,
120
+ author = {Saito, Tetsuya},
121
+ title = {{ICALCC}: Locally Centered Contrast Functions for
122
+ {FastICA} with {GPU} Acceleration},
123
+ journal = {TechRxiv},
124
+ year = {2026}
125
+ }
126
+
127
+ @article{saito2026lcc,
128
+ author = {Saito, Tetsuya},
129
+ title = {Locally Centered Cyclic Kernels for Higher-Order
130
+ Independent Component Analysis},
131
+ journal = {TechRxiv},
132
+ year = {2026}
133
+ }
134
+ ```
135
+
136
+ ## License
137
+ [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)
@@ -0,0 +1,109 @@
1
+ # gpuicalcc
2
+ GPU-accelerated locally centered contrast functions for FastICA.
3
+ PyTorch extension of [icalcc](https://github.com/Kleinverse/icalcc).
4
+ Same API, drop-in replacement with CUDA acceleration for bounded
5
+ and polynomial LCC contrasts.
6
+ ```python
7
+ from gpuicalcc import GPUICALCC
8
+ ica = GPUICALCC(n_components=4, K='ltanh', device='cuda', random_state=0)
9
+ S_hat = ica.fit_transform(X)
10
+ ```
11
+
12
+ ## Installation
13
+ ```bash
14
+ pip install gpuicalcc
15
+ ```
16
+ Requires PyTorch with CUDA. See [pytorch.org](https://pytorch.org)
17
+ for installation instructions.
18
+
19
+ ## Supported K Values
20
+ | K | Description |
21
+ |---|---|
22
+ | `'ltanh'` | Bounded LCC-tanh — 40–48× GPU speedup |
23
+ | `'lexp'` | Bounded LCC-exp — 40–48× GPU speedup |
24
+ | `4` | Polynomial LCC order 4 |
25
+ | `6` | Polynomial LCC order 6, couples m₃, m₄, m₆ |
26
+ | `8` | Polynomial LCC order 8, up to 2.4× GPU speedup at N≥500k |
27
+ | `'tanh'` | Classical logcosh contrast (CPU fallback) |
28
+ | `'exp'` | Classical Gaussian contrast (CPU fallback) |
29
+ | `'skew'` | Classical cube contrast (CPU fallback) |
30
+
31
+ Bounded contrasts (`'ltanh'`, `'lexp'`) achieve 40–48× speedup
32
+ across all dataset sizes due to the O(NB) pairwise computation
33
+ being embarrassingly parallel. Polynomial contrasts benefit from
34
+ GPU acceleration at N ≥ 500,000. Classical contrasts fall back
35
+ to the CPU implementation in `icalcc`.
36
+
37
+ ## Usage
38
+ ```python
39
+ from gpuicalcc import GPUICALCC
40
+
41
+ # Bounded LCC-tanh (recommended for heavy-tailed or skewed sources)
42
+ ica = GPUICALCC(n_components=4, K='ltanh', device='cuda', random_state=0)
43
+ S_hat = ica.fit_transform(X)
44
+
45
+ # Bounded LCC-tanh with memory limit
46
+ ica = GPUICALCC(n_components=4, K='ltanh', device='cuda',
47
+ batch_size=500, gpu_mem_limit=8, random_state=0)
48
+ S_hat = ica.fit_transform(X)
49
+
50
+ # Polynomial LCC order 8 (near-Gaussian sources)
51
+ ica = GPUICALCC(n_components=4, K=8, device='cuda', random_state=0)
52
+ S_hat = ica.fit_transform(X)
53
+
54
+ # Falls back to CPU automatically if CUDA unavailable
55
+ ica = GPUICALCC(n_components=4, K='ltanh', device='cuda', random_state=0)
56
+ ```
57
+
58
+ ## Parameters
59
+ | Parameter | Default | Description |
60
+ |---|---|---|
61
+ | `device` | `'cuda'` | PyTorch device. Falls back to `'cpu'` if CUDA unavailable |
62
+ | `batch_size` | `500` | Subsample size B for bounded pairwise computation |
63
+ | `gpu_mem_limit` | `None` | GPU memory limit in GB. Auto-detected if None |
64
+ | `clear_gpu` | `True` | Clear GPU cache after fit |
65
+
66
+ ## Benchmark
67
+ ![GPU Benchmark](https://github.com/Kleinverse/research/blob/main/icalcc/img/benchmark.png)
68
+
69
+ Bounded contrasts (`ltanh`, `lexp`) achieve 40–48× speedup across
70
+ all dataset sizes. Polynomial contrasts (K=4,6,8) benefit from GPU
71
+ acceleration at N ≥ 500,000, reaching up to 2.4× at K=8. All runs
72
+ use batch_size=500. Full benchmark table:
73
+ [Experiment code](https://github.com/Kleinverse/research/tree/main/icalcc).
74
+
75
+ ## Requirements
76
+ - Python ≥ 3.9
77
+ - numpy ≥ 1.24
78
+ - scikit-learn ≥ 1.3
79
+ - icalcc ≥ 0.1.3
80
+ - torch ≥ 2.0
81
+
82
+ ## See Also
83
+ - [icalcc](https://github.com/Kleinverse/icalcc) — CPU version
84
+ - [Experiment code](https://github.com/Kleinverse/research/tree/main/icalcc)
85
+
86
+ ## Citation
87
+
88
+ If you use this package, please cite both the software paper and
89
+ the underlying LCC kernel paper:
90
+ ```bibtex
91
+ @article{saito2026icalcc,
92
+ author = {Saito, Tetsuya},
93
+ title = {{ICALCC}: Locally Centered Contrast Functions for
94
+ {FastICA} with {GPU} Acceleration},
95
+ journal = {TechRxiv},
96
+ year = {2026}
97
+ }
98
+
99
+ @article{saito2026lcc,
100
+ author = {Saito, Tetsuya},
101
+ title = {Locally Centered Cyclic Kernels for Higher-Order
102
+ Independent Component Analysis},
103
+ journal = {TechRxiv},
104
+ year = {2026}
105
+ }
106
+ ```
107
+
108
+ ## License
109
+ [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)
@@ -0,0 +1,137 @@
1
+ Metadata-Version: 2.4
2
+ Name: gpuicalcc
3
+ Version: 0.1.3
4
+ Summary: GPU-accelerated locally centered contrast functions for FastICA
5
+ Author-email: Tetsuya Saito <ted@kleinverse.io>
6
+ License: CC BY 4.0
7
+ Project-URL: Repository, https://github.com/Kleinverse/gpuicalcc
8
+ Project-URL: CPU Package, https://github.com/Kleinverse/icalcc
9
+ Project-URL: Research, https://github.com/Kleinverse/research
10
+ Keywords: ICA,independent component analysis,blind source separation,locally centered contrast,cumulants,higher-order statistics,Renyi entropy,FastICA,scikit-learn,GPU,PyTorch,CUDA
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: License :: Other/Proprietary License
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: numpy>=1.24
24
+ Requires-Dist: scikit-learn>=1.3
25
+ Requires-Dist: icalcc>=0.1.3
26
+ Requires-Dist: torch>=2.0
27
+ Dynamic: license-file
28
+
29
+ # gpuicalcc
30
+ GPU-accelerated locally centered contrast functions for FastICA.
31
+ PyTorch extension of [icalcc](https://github.com/Kleinverse/icalcc).
32
+ Same API, drop-in replacement with CUDA acceleration for bounded
33
+ and polynomial LCC contrasts.
34
+ ```python
35
+ from gpuicalcc import GPUICALCC
36
+ ica = GPUICALCC(n_components=4, K='ltanh', device='cuda', random_state=0)
37
+ S_hat = ica.fit_transform(X)
38
+ ```
39
+
40
+ ## Installation
41
+ ```bash
42
+ pip install gpuicalcc
43
+ ```
44
+ Requires PyTorch with CUDA. See [pytorch.org](https://pytorch.org)
45
+ for installation instructions.
46
+
47
+ ## Supported K Values
48
+ | K | Description |
49
+ |---|---|
50
+ | `'ltanh'` | Bounded LCC-tanh — 40–48× GPU speedup |
51
+ | `'lexp'` | Bounded LCC-exp — 40–48× GPU speedup |
52
+ | `4` | Polynomial LCC order 4 |
53
+ | `6` | Polynomial LCC order 6, couples m₃, m₄, m₆ |
54
+ | `8` | Polynomial LCC order 8, up to 2.4× GPU speedup at N≥500k |
55
+ | `'tanh'` | Classical logcosh contrast (CPU fallback) |
56
+ | `'exp'` | Classical Gaussian contrast (CPU fallback) |
57
+ | `'skew'` | Classical cube contrast (CPU fallback) |
58
+
59
+ Bounded contrasts (`'ltanh'`, `'lexp'`) achieve 40–48× speedup
60
+ across all dataset sizes due to the O(NB) pairwise computation
61
+ being embarrassingly parallel. Polynomial contrasts benefit from
62
+ GPU acceleration at N ≥ 500,000. Classical contrasts fall back
63
+ to the CPU implementation in `icalcc`.
64
+
65
+ ## Usage
66
+ ```python
67
+ from gpuicalcc import GPUICALCC
68
+
69
+ # Bounded LCC-tanh (recommended for heavy-tailed or skewed sources)
70
+ ica = GPUICALCC(n_components=4, K='ltanh', device='cuda', random_state=0)
71
+ S_hat = ica.fit_transform(X)
72
+
73
+ # Bounded LCC-tanh with memory limit
74
+ ica = GPUICALCC(n_components=4, K='ltanh', device='cuda',
75
+ batch_size=500, gpu_mem_limit=8, random_state=0)
76
+ S_hat = ica.fit_transform(X)
77
+
78
+ # Polynomial LCC order 8 (near-Gaussian sources)
79
+ ica = GPUICALCC(n_components=4, K=8, device='cuda', random_state=0)
80
+ S_hat = ica.fit_transform(X)
81
+
82
+ # Falls back to CPU automatically if CUDA unavailable
83
+ ica = GPUICALCC(n_components=4, K='ltanh', device='cuda', random_state=0)
84
+ ```
85
+
86
+ ## Parameters
87
+ | Parameter | Default | Description |
88
+ |---|---|---|
89
+ | `device` | `'cuda'` | PyTorch device. Falls back to `'cpu'` if CUDA unavailable |
90
+ | `batch_size` | `500` | Subsample size B for bounded pairwise computation |
91
+ | `gpu_mem_limit` | `None` | GPU memory limit in GB. Auto-detected if None |
92
+ | `clear_gpu` | `True` | Clear GPU cache after fit |
93
+
94
+ ## Benchmark
95
+ ![GPU Benchmark](https://github.com/Kleinverse/research/blob/main/icalcc/img/benchmark.png)
96
+
97
+ Bounded contrasts (`ltanh`, `lexp`) achieve 40–48× speedup across
98
+ all dataset sizes. Polynomial contrasts (K=4,6,8) benefit from GPU
99
+ acceleration at N ≥ 500,000, reaching up to 2.4× at K=8. All runs
100
+ use batch_size=500. Full benchmark table:
101
+ [Experiment code](https://github.com/Kleinverse/research/tree/main/icalcc).
102
+
103
+ ## Requirements
104
+ - Python ≥ 3.9
105
+ - numpy ≥ 1.24
106
+ - scikit-learn ≥ 1.3
107
+ - icalcc ≥ 0.1.3
108
+ - torch ≥ 2.0
109
+
110
+ ## See Also
111
+ - [icalcc](https://github.com/Kleinverse/icalcc) — CPU version
112
+ - [Experiment code](https://github.com/Kleinverse/research/tree/main/icalcc)
113
+
114
+ ## Citation
115
+
116
+ If you use this package, please cite both the software paper and
117
+ the underlying LCC kernel paper:
118
+ ```bibtex
119
+ @article{saito2026icalcc,
120
+ author = {Saito, Tetsuya},
121
+ title = {{ICALCC}: Locally Centered Contrast Functions for
122
+ {FastICA} with {GPU} Acceleration},
123
+ journal = {TechRxiv},
124
+ year = {2026}
125
+ }
126
+
127
+ @article{saito2026lcc,
128
+ author = {Saito, Tetsuya},
129
+ title = {Locally Centered Cyclic Kernels for Higher-Order
130
+ Independent Component Analysis},
131
+ journal = {TechRxiv},
132
+ year = {2026}
133
+ }
134
+ ```
135
+
136
+ ## License
137
+ [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)
@@ -0,0 +1,9 @@
1
+ LICENSE
2
+ README.md
3
+ gpuicalcc.py
4
+ pyproject.toml
5
+ gpuicalcc.egg-info/PKG-INFO
6
+ gpuicalcc.egg-info/SOURCES.txt
7
+ gpuicalcc.egg-info/dependency_links.txt
8
+ gpuicalcc.egg-info/requires.txt
9
+ gpuicalcc.egg-info/top_level.txt
@@ -0,0 +1,4 @@
1
+ numpy>=1.24
2
+ scikit-learn>=1.3
3
+ icalcc>=0.1.3
4
+ torch>=2.0
@@ -0,0 +1 @@
1
+ gpuicalcc
@@ -0,0 +1,300 @@
1
+ """gpuicalcc: PyTorch-accelerated locally centered cyclic contrasts.
2
+
3
+ GPU extension for ICALCC. Same API as sklearn FastICA:
4
+ just replace the import and set K.
5
+
6
+ from gpuicalcc import GPUICALCC
7
+ est = GPUICALCC(K='ltanh', device='cuda')
8
+ est.fit(X)
9
+
10
+ Supported K values: 4, 6, 8, 'ltanh', 'lexp', 'tanh', 'exp', 'skew'.
11
+
12
+ Requirements: icalcc, torch
13
+
14
+ Reference: T. Saito, "Locally Centered Cyclic Kernels for Higher-Order
15
+ Independent Component Analysis," TechRxiv, 2026.
16
+ https://doi.org/10.36227/techrxiv.XXXXXXX
17
+ """
18
+
19
+ import numpy as np
20
+ import warnings
21
+ import torch
22
+ from sklearn.exceptions import ConvergenceWarning
23
+ from icalcc import ICALCC
24
+
25
+
26
+ # -------------------------------------------------------------------
27
+ # Polynomial LCC on GPU (K = 4, 6, 8)
28
+ # -------------------------------------------------------------------
29
+
30
+ def _gpu_lcc_h_gprime(y_np, k, device="cuda"):
31
+ """Polynomial LCC nonlinearity on GPU."""
32
+ y = torch.as_tensor(y_np, dtype=torch.float64, device=device)
33
+ y2 = y * y
34
+ y3 = y2 * y
35
+
36
+ if k == 4:
37
+ gy = (-3.0 / 16) * y3
38
+ gpy = (-9.0 / 16) * y2
39
+ return gy.cpu().numpy(), gpy.cpu().numpy()
40
+
41
+ m3 = y3.mean().item()
42
+ m4 = (y2 * y2).mean().item()
43
+
44
+ if k == 6:
45
+ dJ3 = 145 * m3 / 1944.0
46
+ dJ4 = 115.0 / 2592
47
+ dJ6 = -5.0 / 7776
48
+ y4 = y2 * y2
49
+ gy = dJ3 * 3 * y2 + dJ4 * 4 * y3 + dJ6 * 6 * (y4 * y)
50
+ gpy = dJ3 * 6 * y + dJ4 * 12 * y2 + dJ6 * 30 * y4
51
+ return gy.cpu().numpy(), gpy.cpu().numpy()
52
+
53
+ # k == 8
54
+ m5 = (y2 * y3).mean().item()
55
+ m6 = (y3 * y3).mean().item()
56
+ dJ3 = -7665 * m3 / 65536.0 + 497 * m5 / 262144.0
57
+ dJ4 = 2765 * m4 / 1048576.0 - 18795.0 / 524288
58
+ dJ5 = 497 * m3 / 262144.0
59
+ dJ6 = 329.0 / 524288
60
+ dJ8 = -7.0 / 2097152
61
+ y4 = y2 * y2
62
+ gy = (dJ3 * 3 * y2 + dJ4 * 4 * y3
63
+ + dJ5 * 5 * y4 + dJ6 * 6 * (y4 * y)
64
+ + dJ8 * 8 * (y4 * y3))
65
+ gpy = (dJ3 * 6 * y + dJ4 * 12 * y2
66
+ + dJ5 * 20 * y3 + dJ6 * 30 * y4
67
+ + dJ8 * 56 * (y3 * y3))
68
+ return gy.cpu().numpy(), gpy.cpu().numpy()
69
+
70
+
71
def _gpu_lcc_contrast(x, K=6, device="cuda"):
    """Polynomial LCC contrast adapter for the sklearn ``fun`` parameter.

    A 1-D projection (deflation path) returns the raw (g, g') pair.
    A 2-D array of stacked projections (parallel path) evaluates g row
    by row and reduces g' to its per-row mean.
    """
    if x.ndim == 1:
        return _gpu_lcc_h_gprime(x, K, device=device)

    n_rows = x.shape[0]
    gY = np.empty_like(x)
    gp_means = np.empty(n_rows, dtype=x.dtype)
    for row, proj in enumerate(x):
        g_row, gp_row = _gpu_lcc_h_gprime(proj, K, device=device)
        gY[row] = g_row
        gp_means[row] = gp_row.mean()
    return gY, gp_means
83
+
84
+
85
+ # -------------------------------------------------------------------
86
+ # Bounded LCC on GPU (K = 'ltanh', 'lexp')
87
+ # -------------------------------------------------------------------
88
+
89
+ def _get_gpu_mem_bytes(device, gpu_mem_limit):
90
+ """Resolve effective GPU memory limit in bytes.
91
+
92
+ Auto-detects free memory. If gpu_mem_limit is set and exceeds
93
+ 97% of total, warns and caps at 97%.
94
+ """
95
+ if not device.startswith("cuda"):
96
+ return 4 * 1024**3 # 4GB fallback for CPU
97
+ free, total = torch.cuda.mem_get_info(device)
98
+ safe = int(total * 0.97)
99
+ if gpu_mem_limit is None:
100
+ return min(free, safe)
101
+ requested = int(gpu_mem_limit * 1024**3)
102
+ if requested > safe:
103
+ warnings.warn(
104
+ f"gpu_mem_limit={gpu_mem_limit}GB exceeds 97% of "
105
+ f"GPU memory ({total/1024**3:.1f}GB). "
106
+ f"Capping at {safe/1024**3:.1f}GB.",
107
+ RuntimeWarning)
108
+ return safe
109
+ return requested
110
+
111
+
112
def _gpu_bounded_contrast(x, G="tanh", batch_size=500,
                          device="cuda", gpu_mem_limit=None):
    """Pairwise bounded LCC contrast (g, mean g') on GPU.

    Handles both the deflation path (1-D ``x``) and the parallel path
    (2-D ``x`` of shape (p, N)). For each sample the bounded kernel is
    averaged over a fixed, evenly strided subsample of B reference
    points — the O(N*B) locally centered contrast. Samples are
    processed in chunks sized from the memory budget; on CUDA OOM the
    chunk size is halved and the chunk retried.

    Fixes over the naive handler: ``X_chunk``/``diff`` are always
    defined when the OOM handler runs (an OOM inside ``torch.as_tensor``
    previously raised UnboundLocalError), the intermediate ``diff`` is
    released too, and a persistent OOM at chunk size 1 re-raises instead
    of looping forever.

    Parameters
    ----------
    x : ndarray, 1-D or 2-D
        Projected data.
    G : {'tanh', 'exp'}, default='tanh'
        Bounded kernel.
    batch_size : int, default=500
        Reference subsample size B.
    device : str, default='cuda'
        PyTorch device.
    gpu_mem_limit : float or None
        Memory budget in GB; auto-detected when None.

    Returns
    -------
    gY : ndarray, same shape as x
        g evaluated at each sample.
    gp : ndarray
        Per-component mean of g' (2-D path); the 1-D path returns the
        full g' vector via ``_gpu_bounded_1d``.
    """
    if x.ndim == 1:
        return _gpu_bounded_1d(x, G, batch_size, device)

    p, N = x.shape
    B = min(batch_size, N)
    step = max(1, N // B)
    idx = np.arange(0, N, step)[:B]  # evenly strided reference subsample

    mem = _get_gpu_mem_bytes(device, gpu_mem_limit)
    # ~6 float64 temporaries of shape (p, chunk, B) are live at once.
    max_chunk = max(1, mem // (p * B * 8 * 6))

    X_batch = torch.as_tensor(x[:, idx], dtype=torch.float64,
                              device=device)
    gY = np.empty_like(x)
    gpY = np.empty_like(x)

    start = 0
    while start < N:
        end = min(start + max_chunk, N)
        X_chunk = diff = None  # ensure the OOM handler can release both
        try:
            X_chunk = torch.as_tensor(
                x[:, start:end], dtype=torch.float64, device=device)
            # (p, chunk, 1) - (p, 1, B): all pairwise differences.
            diff = X_chunk.unsqueeze(2) - X_batch.unsqueeze(1)

            if G == "tanh":
                t = torch.tanh(diff)
                gY[:, start:end] = t.mean(dim=2).cpu().numpy()
                gpY[:, start:end] = (1.0 - t * t).mean(
                    dim=2).cpu().numpy()
            else:  # exp
                e = torch.exp(-0.5 * diff * diff)
                gY[:, start:end] = (diff * e).mean(
                    dim=2).cpu().numpy()
                gpY[:, start:end] = ((1.0 - diff * diff) * e).mean(
                    dim=2).cpu().numpy()

            X_chunk = diff = None  # drop references before freeing cache
            torch.cuda.empty_cache()
            start = end

        except torch.cuda.OutOfMemoryError:
            # Release whatever was allocated before the failure point.
            X_chunk = diff = None
            torch.cuda.empty_cache()
            if max_chunk == 1:
                # Cannot shrink further; re-raise rather than spin.
                raise
            max_chunk //= 2

    del X_batch
    return gY, gpY.mean(axis=1)
167
+
168
+
169
+ def _gpu_bounded_1d(y_np, G, batch_size, device):
170
+ """1-D bounded LCC for deflation algorithm."""
171
+ N = len(y_np)
172
+ B = min(batch_size, N)
173
+ step = max(1, N // B)
174
+ idx = np.arange(0, N, step)[:B]
175
+
176
+ y = torch.as_tensor(y_np, dtype=torch.float64, device=device)
177
+ y_batch = y[idx]
178
+ diff = y.unsqueeze(1) - y_batch.unsqueeze(0)
179
+
180
+ if G == "tanh":
181
+ t = torch.tanh(diff)
182
+ gy = t.mean(dim=1)
183
+ gpy = (1.0 - t * t).mean(dim=1)
184
+ else:
185
+ e = torch.exp(-0.5 * diff * diff)
186
+ gy = (diff * e).mean(dim=1)
187
+ gpy = ((1.0 - diff * diff) * e).mean(dim=1)
188
+
189
+ return gy.cpu().numpy(), gpy.cpu().numpy()
190
+
191
+
192
+ # -------------------------------------------------------------------
193
+ # Main class
194
+ # -------------------------------------------------------------------
195
+
196
class GPUICALCC(ICALCC):
    """GPU-accelerated ICALCC.

    Accelerates all LCC contrasts (polynomial and bounded) via
    PyTorch. Classical contrasts ('tanh', 'exp', 'skew') and
    FastICA(k) contrasts fall back to the parent class.

    Parameters
    ----------
    K : contrast selector (same as ICALCC)
    device : str, default='cuda'
        PyTorch device. Falls back to 'cpu' if CUDA unavailable.
    batch_size : int, default=500
        Subsample size for bounded pairwise computation.
    gpu_mem_limit : float or None, default=None
        GPU memory budget in GB for bounded contrasts; auto-detected
        when None.
    clear_gpu : bool, default=True
        Release the CUDA cache after fit / fit_transform.
    **kwargs : passed to ICALCC

    Examples
    --------
    >>> from gpuicalcc import GPUICALCC
    >>> est = GPUICALCC(n_components=4, K='ltanh', device='cuda')
    >>> S_hat = est.fit_transform(X)
    >>> est6 = GPUICALCC(n_components=4, K=6, device='cuda')
    >>> S6 = est6.fit_transform(X)
    """

    def __init__(self, n_components=None, *, K=6, device="cuda",
                 batch_size=500, gpu_mem_limit=None,
                 algorithm="parallel",
                 whiten="unit-variance", max_iter=200, tol=1e-4,
                 w_init=None, whiten_solver="svd",
                 random_state=None, clear_gpu=True):

        if device.startswith("cuda") and not torch.cuda.is_available():
            warnings.warn(
                "CUDA not available, falling back to device='cpu'.",
                RuntimeWarning)
            device = "cpu"
        self.device = device
        self.batch_size = batch_size
        self.gpu_mem_limit = gpu_mem_limit
        self.clear_gpu = clear_gpu

        super().__init__(
            n_components=n_components, K=K,
            algorithm=algorithm, whiten=whiten,
            max_iter=max_iter, tol=tol, w_init=w_init,
            whiten_solver=whiten_solver, random_state=random_state)

        # Swap in the GPU contrast implementations where available.
        self._install_gpu_fun()

    def _install_gpu_fun(self):
        """Point ``self.fun``/``self.fun_args`` at the GPU kernels.

        Bounded LCC ('ltanh'/'lexp') and polynomial LCC (K in 4, 6, 8)
        get GPU implementations; any other K keeps the parent's setup.
        Shared by ``__init__`` and ``set_params`` so the wiring cannot
        drift between the two.
        """
        K = self.K
        if K in self._LCC_BOUNDED_MAP:
            G = self._LCC_BOUNDED_MAP[K]
            self.fun = _gpu_bounded_contrast
            self.fun_args = dict(G=G, batch_size=self.batch_size,
                                 device=self.device,
                                 gpu_mem_limit=self.gpu_mem_limit)
        elif isinstance(K, int) and K in (4, 6, 8):
            self.fun = _gpu_lcc_contrast
            self.fun_args = dict(K=K, device=self.device)

    def get_params(self, deep=True):
        """Return estimator parameters, including the GPU-specific ones."""
        params = super().get_params(deep=deep)
        params["device"] = self.device
        params["batch_size"] = self.batch_size
        params["gpu_mem_limit"] = self.gpu_mem_limit
        params["clear_gpu"] = self.clear_gpu
        return params

    def set_params(self, **params):
        """Set parameters and refresh the GPU contrast wiring.

        GPU-specific parameters are handled locally; the rest are
        forwarded to the parent. A sentinel distinguishes "not
        provided" from an explicit ``None``, so ``gpu_mem_limit`` can
        be reset to None (the old ``is not None`` check could not), and
        ``clear_gpu`` is now handled alongside the other GPU parameters
        instead of leaking to the parent.
        """
        _missing = object()  # sentinel: None is a valid gpu_mem_limit
        for name in ("device", "batch_size", "gpu_mem_limit", "clear_gpu"):
            value = params.pop(name, _missing)
            if value is not _missing:
                setattr(self, name, value)
        super().set_params(**params)
        # K or a GPU parameter may have changed; rebuild fun/fun_args.
        self._install_gpu_fun()
        return self

    def _clear(self):
        """Release cached CUDA memory after a fit, if requested."""
        if self.clear_gpu and self.device.startswith("cuda"):
            import gc
            gc.collect()
            torch.cuda.empty_cache()

    def fit(self, X, y=None):
        """Fit the model to X, then optionally clear the GPU cache."""
        result = super().fit(X, y)
        self._clear()
        return result

    def fit_transform(self, X, y=None):
        """Fit the model, return the estimated sources, then optionally
        clear the GPU cache."""
        result = super().fit_transform(X, y)
        self._clear()
        return result
@@ -0,0 +1,44 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "gpuicalcc"
7
+ version = "0.1.3"
8
+ description = "GPU-accelerated locally centered contrast functions for FastICA"
9
+ readme = "README.md"
10
+ license = { text = "CC BY 4.0" }
11
+ authors = [
12
+ { name = "Tetsuya Saito", email = "ted@kleinverse.io" }
13
+ ]
14
+ keywords = [
15
+ "ICA", "independent component analysis", "blind source separation",
16
+ "locally centered contrast", "cumulants", "higher-order statistics",
17
+ "Renyi entropy", "FastICA", "scikit-learn", "GPU", "PyTorch", "CUDA"
18
+ ]
19
+ classifiers = [
20
+ "Development Status :: 4 - Beta",
21
+ "Intended Audience :: Science/Research",
22
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
23
+ "Programming Language :: Python :: 3",
24
+ "Programming Language :: Python :: 3.9",
25
+ "Programming Language :: Python :: 3.10",
26
+ "Programming Language :: Python :: 3.11",
27
+ "Programming Language :: Python :: 3.12",
28
+ "License :: Other/Proprietary License",
29
+ ]
30
+ requires-python = ">=3.9"
31
+ dependencies = [
32
+ "numpy>=1.24",
33
+ "scikit-learn>=1.3",
34
+ "icalcc>=0.1.3",
35
+ "torch>=2.0",
36
+ ]
37
+
38
+ [project.urls]
39
+ Repository = "https://github.com/Kleinverse/gpuicalcc"
40
+ "CPU Package" = "https://github.com/Kleinverse/icalcc"
41
+ Research = "https://github.com/Kleinverse/research"
42
+
43
+ [tool.setuptools]
44
+ py-modules = ["gpuicalcc"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+