skmetal 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skmetal-0.2.0/PKG-INFO +292 -0
- skmetal-0.2.0/README.md +262 -0
- skmetal-0.2.0/pyproject.toml +63 -0
- skmetal-0.2.0/setup.cfg +4 -0
- skmetal-0.2.0/skmetal/__init__.py +52 -0
- skmetal-0.2.0/skmetal/_about.py +2 -0
- skmetal-0.2.0/skmetal/_bridge.py +512 -0
- skmetal-0.2.0/skmetal/_config.py +114 -0
- skmetal-0.2.0/skmetal/_dispatch.py +49 -0
- skmetal-0.2.0/skmetal/accelerate.py +130 -0
- skmetal-0.2.0/skmetal/estimators/__init__.py +35 -0
- skmetal-0.2.0/skmetal/estimators/_base.py +72 -0
- skmetal-0.2.0/skmetal/estimators/_registry.py +44 -0
- skmetal-0.2.0/skmetal/estimators/cluster.py +227 -0
- skmetal-0.2.0/skmetal/estimators/decomposition.py +68 -0
- skmetal-0.2.0/skmetal/estimators/ensemble.py +171 -0
- skmetal-0.2.0/skmetal/estimators/linear_model.py +303 -0
- skmetal-0.2.0/skmetal/estimators/naive_bayes.py +83 -0
- skmetal-0.2.0/skmetal/estimators/neighbors.py +165 -0
- skmetal-0.2.0/skmetal/estimators/preprocessing.py +103 -0
- skmetal-0.2.0/skmetal.egg-info/PKG-INFO +292 -0
- skmetal-0.2.0/skmetal.egg-info/SOURCES.txt +23 -0
- skmetal-0.2.0/skmetal.egg-info/dependency_links.txt +1 -0
- skmetal-0.2.0/skmetal.egg-info/requires.txt +10 -0
- skmetal-0.2.0/skmetal.egg-info/top_level.txt +1 -0
skmetal-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: skmetal
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: GPU-accelerated scikit-learn via Apple Metal
|
|
5
|
+
Author: Ainouche Abderahmane
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/abderahmane-ai/skmetal
|
|
8
|
+
Project-URL: Repository, https://github.com/abderahmane-ai/skmetal
|
|
9
|
+
Project-URL: Bug Tracker, https://github.com/abderahmane-ai/skmetal/issues
|
|
10
|
+
Keywords: scikit-learn,metal,gpu,apple-silicon,machine-learning
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
Requires-Dist: numpy>=1.24
|
|
22
|
+
Requires-Dist: scikit-learn<2.0,>=1.5
|
|
23
|
+
Requires-Dist: scipy>=1.10
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
26
|
+
Requires-Dist: pytest-timeout>=2.3; extra == "dev"
|
|
27
|
+
Requires-Dist: pytest-benchmark>=4.0; extra == "dev"
|
|
28
|
+
Requires-Dist: matplotlib>=3.7; extra == "dev"
|
|
29
|
+
Requires-Dist: pandas>=2.0; extra == "dev"
|
|
30
|
+
|
|
31
|
+
# skmetal
|
|
32
|
+
|
|
33
|
+
**Apple Silicon GPU acceleration for scikit-learn**
|
|
34
|
+
|
|
35
|
+
[](https://github.com/abderahmane-ai/skmetal)
|
|
36
|
+
[](https://github.com/abderahmane-ai/skmetal)
|
|
37
|
+
[](https://github.com/abderahmane-ai/skmetal)
|
|
38
|
+
[](https://github.com/abderahmane-ai/skmetal/actions/workflows/ci.yml)
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
import skmetal
|
|
42
|
+
from sklearn.linear_model import LinearRegression
|
|
43
|
+
|
|
44
|
+
@skmetal.accelerate
|
|
45
|
+
def model():
|
|
46
|
+
return LinearRegression()
|
|
47
|
+
|
|
48
|
+
m = model()
|
|
49
|
+
m.fit(X_train, y_train)
|
|
50
|
+
m.predict(X_test)
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Overview
|
|
56
|
+
|
|
57
|
+
skmetal executes scikit-learn estimators on Apple Silicon GPUs via Metal Performance Shaders and custom Metal compute kernels. Decorate any function that returns a supported estimator with `@skmetal.accelerate`, and `fit()`/`predict()` run on the GPU — no other code changes are required.
|
|
58
|
+
|
|
59
|
+
Apple Silicon's unified memory architecture enables zero-copy data sharing: numpy arrays are passed directly to Metal via `bytesNoCopy`, eliminating data transfer overhead.
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Requirements
|
|
64
|
+
|
|
65
|
+
- macOS 14+
|
|
66
|
+
- Apple Silicon (M1-M5)
|
|
67
|
+
- Python 3.10-3.12
|
|
68
|
+
- Swift 6.1 (only when building from source; install with `xcode-select --install`)
|
|
69
|
+
- scikit-learn >= 1.5
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## Installation
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pip install skmetal
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
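> **Note**: macOS 14+ and Apple Silicon are required for GPU acceleration; on other machines skmetal imports cleanly and falls back to scikit-learn's CPU implementations. Xcode is not needed for the pip package.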
|
|
80
|
+
|
|
81
|
+
### From source
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
git clone https://github.com/abderahmane-ai/skmetal.git
|
|
85
|
+
cd skmetal
|
|
86
|
+
bash build.sh
|
|
87
|
+
pip install -e .
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
---
|
|
91
|
+
|
|
92
|
+
## Usage
|
|
93
|
+
|
|
94
|
+
### Decorator (recommended)
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
import skmetal
|
|
98
|
+
from sklearn.linear_model import LinearRegression
|
|
99
|
+
|
|
100
|
+
@skmetal.accelerate
|
|
101
|
+
def model():
|
|
102
|
+
return LinearRegression()
|
|
103
|
+
|
|
104
|
+
m = model()
|
|
105
|
+
m.fit(X, y)
|
|
106
|
+
m.predict(X_test)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
The decorator also works with pipelines:
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
@skmetal.accelerate
|
|
113
|
+
def pipeline():
|
|
114
|
+
return Pipeline([
|
|
115
|
+
("scaler", StandardScaler()),
|
|
116
|
+
("clf", LogisticRegression()),
|
|
117
|
+
])
|
|
118
|
+
|
|
119
|
+
pipe = pipeline()
|
|
120
|
+
pipe.fit(X, y)
|
|
121
|
+
pipe.predict(X_test)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Function call
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
model = skmetal.accelerate(LinearRegression())
|
|
128
|
+
model.fit(X, y)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Context manager
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
with skmetal.accelerate_context():
|
|
135
|
+
model = LinearRegression()
|
|
136
|
+
model.fit(X, y)
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
## Supported Estimators
|
|
142
|
+
|
|
143
|
+
| Estimator | GPU Strategy | Speedup |
|
|
144
|
+
|-----------|-------------|---------|
|
|
145
|
+
| `LinearRegression` | Normal equations via MPS GEMM | **5.93x** |
|
|
146
|
+
| `Ridge` | Fused centering + XTX + XTy (1 dispatch) | **1.16x** |
|
|
147
|
+
| `LogisticRegression` | IRLS (3-5 Newton iterations, fused) | 0.91x |
|
|
148
|
+
| `Lasso` | Coordinate descent + GPU residual updates | -- |
|
|
149
|
+
| `ElasticNet` | Coordinate descent + GPU residual updates | -- |
|
|
150
|
+
| `TruncatedSVD` | Randomized SVD, no centering (all BLAS-3) | **2.53x** |
|
|
151
|
+
| `KMeans` | Single fused command buffer (all iterations on GPU) | 0.69x |
|
|
152
|
+
| `DBSCAN` | GPU pairwise distance + per-point neighbor counting | -- |
|
|
153
|
+
| `GaussianNB` | GPU mean/var per class | -- |
|
|
154
|
+
| `StandardScaler` | Fused Welford (1 dispatch) | **8.27x** |
|
|
155
|
+
| `MinMaxScaler` | Column min/max with threadgroup tree reduction | -- |
|
|
156
|
+
| `RobustScaler` | GPU quantile approximation | -- |
|
|
157
|
+
| `KNeighborsClassifier` | GPU pairwise distance + fused voting | -- |
|
|
158
|
+
| `KNeighborsRegressor` | GPU pairwise distance + fused averaging | -- |
|
|
159
|
+
| `NearestNeighbors` | GPU pairwise distance + index | -- |
|
|
160
|
+
| `HistGradientBoostingRegressor` | C++ HGBT from sklearn (no custom GPU) | -- |
|
|
161
|
+
| `HistGradientBoostingClassifier` | C++ HGBT from sklearn (no custom GPU) | -- |
|
|
162
|
+
|
|
163
|
+
Estimators with a speedup below 1.0x are dispatch-limited at n <= 50K; their speedup improves to 2-5x at n >= 500K, where compute dominates dispatch overhead. `--` means no speedup figure is reported for that estimator.
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
## Architecture
|
|
168
|
+
|
|
169
|
+
### Zero-copy pipeline
|
|
170
|
+
|
|
171
|
+
```
|
|
172
|
+
numpy array -> np.ctypes.data -> UnsafeMutableRawPointer -> MTLBuffer(bytesNoCopy:) -> GPU
|
|
173
|
+
| |
|
|
174
|
+
+--------- same physical memory ---------------------+
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Metal kernels (14 files)
|
|
178
|
+
|
|
179
|
+
| Kernel file | Operations |
|
|
180
|
+
|-------------|------------|
|
|
181
|
+
| `ReductionKernels.metal` | `reduce_sum`, `reduce_mean_var` (Welford) |
|
|
182
|
+
| `ArgminKernels.metal` | `argmin_rows` |
|
|
183
|
+
| `KMeansKernels.metal` | assign, partial_sum, combine, normalize, batch fused |
|
|
184
|
+
| `KNNKernels.metal` | tile top-k, merge, fused vote classify/regress |
|
|
185
|
+
| `PairwiseDistKernels.metal` | `pairwise_distance_squared`, `pairwise_distance_direct` |
|
|
186
|
+
| `DistanceKernels.metal` | `row_norm_sq`, `compute_mindists`, `distance_correct` |
|
|
187
|
+
| `IrlsKernels.metal` | `irls_weight`, `scale_rows`, `compute_linear_irls`, `compute_error_scale`, `l2_reg_irls`, `multinomial_hessians` |
|
|
188
|
+
| `CenterColumns.metal` | `column_means`, `center_columns` |
|
|
189
|
+
| `ElementWiseKernels.metal` | sigmoid, subtract, add_scalar, axpy, norm_sq, transpose_f32, row_max, row_sum, softmax, negate |
|
|
190
|
+
| `ExtraKernels.metal` | `soft_threshold`, `column_transform`, `scale_f32`, `sv_init`, `sv_hook`, `sv_shortcut` |
|
|
191
|
+
| `StandardScalerKernels.metal` | `scaler_fit` (fused Welford) |
|
|
192
|
+
| `GemmKernels.metal` | `gemm_simple` (fallback) |
|
|
193
|
+
| `MinMaxKernels.metal` | `column_minmax` (threadgroup tree reduction) |
|
|
194
|
+
| `TreeKernels.metal` | `tree_predict`, `tree_predict_all` |
|
|
195
|
+
|
|
196
|
+
### Swift bridge (47 C-callable functions)
|
|
197
|
+
|
|
198
|
+
All `skmetal_*` functions use `@_cdecl` for direct ctypes export. Every function accepts raw pointers.
|
|
199
|
+
|
|
200
|
+
### Project structure
|
|
201
|
+
|
|
202
|
+
```
|
|
203
|
+
skmetal/
|
|
204
|
+
skmetal/
|
|
205
|
+
__init__.py
|
|
206
|
+
_bridge.py ctypes -> Swift (47 functions)
|
|
207
|
+
_config.py Config dataclass
|
|
208
|
+
_dispatch.py estimator registry + wrapping
|
|
209
|
+
accelerate.py @accelerate decorator + accelerate_context
|
|
210
|
+
estimators/
|
|
211
|
+
_base.py BaseGPUEstimator abstract class
|
|
212
|
+
_registry.py Estimator registry (17 estimators)
|
|
213
|
+
linear_model.py LinearRegression, Ridge, LogisticRegression, Lasso, ElasticNet
|
|
214
|
+
cluster.py KMeans, DBSCAN
|
|
215
|
+
decomposition.py TruncatedSVD
|
|
216
|
+
ensemble.py HistGradientBoostingRegressor, HistGradientBoostingClassifier
|
|
217
|
+
naive_bayes.py GaussianNB
|
|
218
|
+
neighbors.py KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors
|
|
219
|
+
preprocessing.py StandardScaler, MinMaxScaler, RobustScaler
|
|
220
|
+
utils.py
|
|
221
|
+
skmetal_bridge/ Swift + Metal
|
|
222
|
+
Sources/SkMetalBridge/
|
|
223
|
+
Bridge.swift 47 @_cdecl exports
|
|
224
|
+
MetalContext.swift
|
|
225
|
+
Kernels/*.metal 14 Metal kernel files
|
|
226
|
+
benchmarks/
|
|
227
|
+
run_compare.py benchmark runner
|
|
228
|
+
benchmark_suite.py full suite
|
|
229
|
+
baseline.json
|
|
230
|
+
tests/
|
|
231
|
+
test_correctness.py 18 estimator correctness tests
|
|
232
|
+
test_dispatch.py 7 dispatch logic tests
|
|
233
|
+
test_kernels.py 55 kernel unit tests
|
|
234
|
+
test_config.py 13 config API tests
|
|
235
|
+
test_accelerate.py 20 accelerate edge-case tests
|
|
236
|
+
test_stress.py 27 edge-case/stress tests
|
|
237
|
+
conftest.py shared fixtures
|
|
238
|
+
build.sh
|
|
239
|
+
pyproject.toml
|
|
240
|
+
.github/workflows/
|
|
241
|
+
ci.yml build + test + benchmark regression
|
|
242
|
+
release.yml PyPI publish on tag
|
|
243
|
+
LICENSE MIT
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
---
|
|
247
|
+
|
|
248
|
+
## Tests
|
|
249
|
+
|
|
250
|
+
```
|
|
251
|
+
test_correctness 18/18 pass — estimator parametrizations + pipeline + device info
|
|
252
|
+
test_dispatch 7/7 pass — registry, wrapping, pipeline, decorator
|
|
253
|
+
test_kernels 55/55 pass — all 14 kernel files covered
|
|
254
|
+
test_config 13/13 pass — config API, dtype rejection, threshold, reset
|
|
255
|
+
test_accelerate 20/20 pass — decorator, context manager, edge cases
|
|
256
|
+
test_stress 27/27 pass — contiguity, shape mismatch, extremes, CPU fallback
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
---
|
|
260
|
+
|
|
261
|
+
## Benchmarks
|
|
262
|
+
|
|
263
|
+
```bash
|
|
264
|
+
python benchmarks/run_compare.py
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
---
|
|
268
|
+
|
|
269
|
+
## Development
|
|
270
|
+
|
|
271
|
+
```bash
|
|
272
|
+
git clone https://github.com/abderahmane-ai/skmetal.git
|
|
273
|
+
cd skmetal
|
|
274
|
+
bash build.sh
|
|
275
|
+
pip install -e ".[dev]"
|
|
276
|
+
pytest tests/
|
|
277
|
+
python benchmarks/run_compare.py
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
### Adding a new estimator
|
|
281
|
+
|
|
282
|
+
1. Create `skmetal/estimators/my_model.py` with `MetalMyModel(BaseGPUEstimator)`
|
|
283
|
+
2. Register in `_registry.py` (`GPU_ESTIMATORS` + `PIPELINE_PATTERNS`)
|
|
284
|
+
3. Add module path in `_dispatch.py` (`module_map`)
|
|
285
|
+
4. Write a Metal kernel if needed
|
|
286
|
+
5. Add a parametrized test case in `test_correctness.py`
|
|
287
|
+
|
|
288
|
+
---
|
|
289
|
+
|
|
290
|
+
## License
|
|
291
|
+
|
|
292
|
+
MIT License. See `LICENSE`.
|
skmetal-0.2.0/README.md
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
# skmetal
|
|
2
|
+
|
|
3
|
+
**Apple Silicon GPU acceleration for scikit-learn**
|
|
4
|
+
|
|
5
|
+
[](https://github.com/abderahmane-ai/skmetal)
|
|
6
|
+
[](https://github.com/abderahmane-ai/skmetal)
|
|
7
|
+
[](https://github.com/abderahmane-ai/skmetal)
|
|
8
|
+
[](https://github.com/abderahmane-ai/skmetal/actions/workflows/ci.yml)
|
|
9
|
+
|
|
10
|
+
```python
|
|
11
|
+
import skmetal
|
|
12
|
+
from sklearn.linear_model import LinearRegression
|
|
13
|
+
|
|
14
|
+
@skmetal.accelerate
|
|
15
|
+
def model():
|
|
16
|
+
return LinearRegression()
|
|
17
|
+
|
|
18
|
+
m = model()
|
|
19
|
+
m.fit(X_train, y_train)
|
|
20
|
+
m.predict(X_test)
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Overview
|
|
26
|
+
|
|
27
|
+
skmetal executes scikit-learn estimators on Apple Silicon GPUs via Metal Performance Shaders and custom Metal compute kernels. Decorate any function that returns a supported estimator with `@skmetal.accelerate`, and `fit()`/`predict()` run on the GPU — no other code changes are required.
|
|
28
|
+
|
|
29
|
+
Apple Silicon's unified memory architecture enables zero-copy data sharing: numpy arrays are passed directly to Metal via `bytesNoCopy`, eliminating data transfer overhead.
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Requirements
|
|
34
|
+
|
|
35
|
+
- macOS 14+
|
|
36
|
+
- Apple Silicon (M1-M5)
|
|
37
|
+
- Python 3.10-3.12
|
|
38
|
+
- Swift 6.1 (only when building from source; install with `xcode-select --install`)
|
|
39
|
+
- scikit-learn >= 1.5
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Installation
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install skmetal
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
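> **Note**: macOS 14+ and Apple Silicon are required for GPU acceleration; on other machines skmetal imports cleanly and falls back to scikit-learn's CPU implementations. Xcode is not needed for the pip package.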
|
|
50
|
+
|
|
51
|
+
### From source
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
git clone https://github.com/abderahmane-ai/skmetal.git
|
|
55
|
+
cd skmetal
|
|
56
|
+
bash build.sh
|
|
57
|
+
pip install -e .
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Usage
|
|
63
|
+
|
|
64
|
+
### Decorator (recommended)
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
import skmetal
|
|
68
|
+
from sklearn.linear_model import LinearRegression
|
|
69
|
+
|
|
70
|
+
@skmetal.accelerate
|
|
71
|
+
def model():
|
|
72
|
+
return LinearRegression()
|
|
73
|
+
|
|
74
|
+
m = model()
|
|
75
|
+
m.fit(X, y)
|
|
76
|
+
m.predict(X_test)
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
The decorator also works with pipelines:
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
@skmetal.accelerate
|
|
83
|
+
def pipeline():
|
|
84
|
+
return Pipeline([
|
|
85
|
+
("scaler", StandardScaler()),
|
|
86
|
+
("clf", LogisticRegression()),
|
|
87
|
+
])
|
|
88
|
+
|
|
89
|
+
pipe = pipeline()
|
|
90
|
+
pipe.fit(X, y)
|
|
91
|
+
pipe.predict(X_test)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Function call
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
model = skmetal.accelerate(LinearRegression())
|
|
98
|
+
model.fit(X, y)
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Context manager
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
with skmetal.accelerate_context():
|
|
105
|
+
model = LinearRegression()
|
|
106
|
+
model.fit(X, y)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## Supported Estimators
|
|
112
|
+
|
|
113
|
+
| Estimator | GPU Strategy | Speedup |
|
|
114
|
+
|-----------|-------------|---------|
|
|
115
|
+
| `LinearRegression` | Normal equations via MPS GEMM | **5.93x** |
|
|
116
|
+
| `Ridge` | Fused centering + XTX + XTy (1 dispatch) | **1.16x** |
|
|
117
|
+
| `LogisticRegression` | IRLS (3-5 Newton iterations, fused) | 0.91x |
|
|
118
|
+
| `Lasso` | Coordinate descent + GPU residual updates | -- |
|
|
119
|
+
| `ElasticNet` | Coordinate descent + GPU residual updates | -- |
|
|
120
|
+
| `TruncatedSVD` | Randomized SVD, no centering (all BLAS-3) | **2.53x** |
|
|
121
|
+
| `KMeans` | Single fused command buffer (all iterations on GPU) | 0.69x |
|
|
122
|
+
| `DBSCAN` | GPU pairwise distance + per-point neighbor counting | -- |
|
|
123
|
+
| `GaussianNB` | GPU mean/var per class | -- |
|
|
124
|
+
| `StandardScaler` | Fused Welford (1 dispatch) | **8.27x** |
|
|
125
|
+
| `MinMaxScaler` | Column min/max with threadgroup tree reduction | -- |
|
|
126
|
+
| `RobustScaler` | GPU quantile approximation | -- |
|
|
127
|
+
| `KNeighborsClassifier` | GPU pairwise distance + fused voting | -- |
|
|
128
|
+
| `KNeighborsRegressor` | GPU pairwise distance + fused averaging | -- |
|
|
129
|
+
| `NearestNeighbors` | GPU pairwise distance + index | -- |
|
|
130
|
+
| `HistGradientBoostingRegressor` | C++ HGBT from sklearn (no custom GPU) | -- |
|
|
131
|
+
| `HistGradientBoostingClassifier` | C++ HGBT from sklearn (no custom GPU) | -- |
|
|
132
|
+
|
|
133
|
+
Estimators with a speedup below 1.0x are dispatch-limited at n <= 50K; their speedup improves to 2-5x at n >= 500K, where compute dominates dispatch overhead. `--` means no speedup figure is reported for that estimator.
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## Architecture
|
|
138
|
+
|
|
139
|
+
### Zero-copy pipeline
|
|
140
|
+
|
|
141
|
+
```
|
|
142
|
+
numpy array -> np.ctypes.data -> UnsafeMutableRawPointer -> MTLBuffer(bytesNoCopy:) -> GPU
|
|
143
|
+
| |
|
|
144
|
+
+--------- same physical memory ---------------------+
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### Metal kernels (14 files)
|
|
148
|
+
|
|
149
|
+
| Kernel file | Operations |
|
|
150
|
+
|-------------|------------|
|
|
151
|
+
| `ReductionKernels.metal` | `reduce_sum`, `reduce_mean_var` (Welford) |
|
|
152
|
+
| `ArgminKernels.metal` | `argmin_rows` |
|
|
153
|
+
| `KMeansKernels.metal` | assign, partial_sum, combine, normalize, batch fused |
|
|
154
|
+
| `KNNKernels.metal` | tile top-k, merge, fused vote classify/regress |
|
|
155
|
+
| `PairwiseDistKernels.metal` | `pairwise_distance_squared`, `pairwise_distance_direct` |
|
|
156
|
+
| `DistanceKernels.metal` | `row_norm_sq`, `compute_mindists`, `distance_correct` |
|
|
157
|
+
| `IrlsKernels.metal` | `irls_weight`, `scale_rows`, `compute_linear_irls`, `compute_error_scale`, `l2_reg_irls`, `multinomial_hessians` |
|
|
158
|
+
| `CenterColumns.metal` | `column_means`, `center_columns` |
|
|
159
|
+
| `ElementWiseKernels.metal` | sigmoid, subtract, add_scalar, axpy, norm_sq, transpose_f32, row_max, row_sum, softmax, negate |
|
|
160
|
+
| `ExtraKernels.metal` | `soft_threshold`, `column_transform`, `scale_f32`, `sv_init`, `sv_hook`, `sv_shortcut` |
|
|
161
|
+
| `StandardScalerKernels.metal` | `scaler_fit` (fused Welford) |
|
|
162
|
+
| `GemmKernels.metal` | `gemm_simple` (fallback) |
|
|
163
|
+
| `MinMaxKernels.metal` | `column_minmax` (threadgroup tree reduction) |
|
|
164
|
+
| `TreeKernels.metal` | `tree_predict`, `tree_predict_all` |
|
|
165
|
+
|
|
166
|
+
### Swift bridge (47 C-callable functions)
|
|
167
|
+
|
|
168
|
+
All `skmetal_*` functions use `@_cdecl` for direct ctypes export. Every function accepts raw pointers.
|
|
169
|
+
|
|
170
|
+
### Project structure
|
|
171
|
+
|
|
172
|
+
```
|
|
173
|
+
skmetal/
|
|
174
|
+
skmetal/
|
|
175
|
+
__init__.py
|
|
176
|
+
_bridge.py ctypes -> Swift (47 functions)
|
|
177
|
+
_config.py Config dataclass
|
|
178
|
+
_dispatch.py estimator registry + wrapping
|
|
179
|
+
accelerate.py @accelerate decorator + accelerate_context
|
|
180
|
+
estimators/
|
|
181
|
+
_base.py BaseGPUEstimator abstract class
|
|
182
|
+
_registry.py Estimator registry (17 estimators)
|
|
183
|
+
linear_model.py LinearRegression, Ridge, LogisticRegression, Lasso, ElasticNet
|
|
184
|
+
cluster.py KMeans, DBSCAN
|
|
185
|
+
decomposition.py TruncatedSVD
|
|
186
|
+
ensemble.py HistGradientBoostingRegressor, HistGradientBoostingClassifier
|
|
187
|
+
naive_bayes.py GaussianNB
|
|
188
|
+
neighbors.py KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors
|
|
189
|
+
preprocessing.py StandardScaler, MinMaxScaler, RobustScaler
|
|
190
|
+
utils.py
|
|
191
|
+
skmetal_bridge/ Swift + Metal
|
|
192
|
+
Sources/SkMetalBridge/
|
|
193
|
+
Bridge.swift 47 @_cdecl exports
|
|
194
|
+
MetalContext.swift
|
|
195
|
+
Kernels/*.metal 14 Metal kernel files
|
|
196
|
+
benchmarks/
|
|
197
|
+
run_compare.py benchmark runner
|
|
198
|
+
benchmark_suite.py full suite
|
|
199
|
+
baseline.json
|
|
200
|
+
tests/
|
|
201
|
+
test_correctness.py 18 estimator correctness tests
|
|
202
|
+
test_dispatch.py 7 dispatch logic tests
|
|
203
|
+
test_kernels.py 55 kernel unit tests
|
|
204
|
+
test_config.py 13 config API tests
|
|
205
|
+
test_accelerate.py 20 accelerate edge-case tests
|
|
206
|
+
test_stress.py 27 edge-case/stress tests
|
|
207
|
+
conftest.py shared fixtures
|
|
208
|
+
build.sh
|
|
209
|
+
pyproject.toml
|
|
210
|
+
.github/workflows/
|
|
211
|
+
ci.yml build + test + benchmark regression
|
|
212
|
+
release.yml PyPI publish on tag
|
|
213
|
+
LICENSE MIT
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
---
|
|
217
|
+
|
|
218
|
+
## Tests
|
|
219
|
+
|
|
220
|
+
```
|
|
221
|
+
test_correctness 18/18 pass — estimator parametrizations + pipeline + device info
|
|
222
|
+
test_dispatch 7/7 pass — registry, wrapping, pipeline, decorator
|
|
223
|
+
test_kernels 55/55 pass — all 14 kernel files covered
|
|
224
|
+
test_config 13/13 pass — config API, dtype rejection, threshold, reset
|
|
225
|
+
test_accelerate 20/20 pass — decorator, context manager, edge cases
|
|
226
|
+
test_stress 27/27 pass — contiguity, shape mismatch, extremes, CPU fallback
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
---
|
|
230
|
+
|
|
231
|
+
## Benchmarks
|
|
232
|
+
|
|
233
|
+
```bash
|
|
234
|
+
python benchmarks/run_compare.py
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
---
|
|
238
|
+
|
|
239
|
+
## Development
|
|
240
|
+
|
|
241
|
+
```bash
|
|
242
|
+
git clone https://github.com/abderahmane-ai/skmetal.git
|
|
243
|
+
cd skmetal
|
|
244
|
+
bash build.sh
|
|
245
|
+
pip install -e ".[dev]"
|
|
246
|
+
pytest tests/
|
|
247
|
+
python benchmarks/run_compare.py
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
### Adding a new estimator
|
|
251
|
+
|
|
252
|
+
1. Create `skmetal/estimators/my_model.py` with `MetalMyModel(BaseGPUEstimator)`
|
|
253
|
+
2. Register in `_registry.py` (`GPU_ESTIMATORS` + `PIPELINE_PATTERNS`)
|
|
254
|
+
3. Add module path in `_dispatch.py` (`module_map`)
|
|
255
|
+
4. Write a Metal kernel if needed
|
|
256
|
+
5. Add a parametrized test case in `test_correctness.py`
|
|
257
|
+
|
|
258
|
+
---
|
|
259
|
+
|
|
260
|
+
## License
|
|
261
|
+
|
|
262
|
+
MIT License. See `LICENSE`.
|
skmetal-0.2.0/pyproject.toml
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "skmetal"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "GPU-accelerated scikit-learn via Apple Metal"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
authors = [
|
|
11
|
+
{name = "Ainouche Abderahmane"},
|
|
12
|
+
]
|
|
13
|
+
requires-python = ">=3.10"
|
|
14
|
+
license = "MIT"
|
|
15
|
+
keywords = ["scikit-learn", "metal", "gpu", "apple-silicon", "machine-learning"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Intended Audience :: Science/Research",
|
|
20
|
+
"Operating System :: MacOS :: MacOS X",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"numpy>=1.24",
|
|
28
|
+
"scikit-learn>=1.5,<2.0",
|
|
29
|
+
"scipy>=1.10",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.optional-dependencies]
|
|
33
|
+
dev = [
|
|
34
|
+
"pytest>=7.0",
|
|
35
|
+
"pytest-timeout>=2.3",
|
|
36
|
+
"pytest-benchmark>=4.0",
|
|
37
|
+
"matplotlib>=3.7",
|
|
38
|
+
"pandas>=2.0",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
[project.urls]
|
|
42
|
+
Homepage = "https://github.com/abderahmane-ai/skmetal"
|
|
43
|
+
Repository = "https://github.com/abderahmane-ai/skmetal"
|
|
44
|
+
"Bug Tracker" = "https://github.com/abderahmane-ai/skmetal/issues"
|
|
45
|
+
|
|
46
|
+
[tool.setuptools.packages.find]
|
|
47
|
+
where = ["."]
|
|
48
|
+
namespaces = false
|
|
49
|
+
|
|
50
|
+
[tool.setuptools.package-data]
|
|
51
|
+
skmetal = ["*.dylib", "SkMetalBridge_SkMetalBridge.bundle/**/*"]
|
|
52
|
+
|
|
53
|
+
[tool.ruff]
|
|
54
|
+
target-version = "py310"
|
|
55
|
+
line-length = 120
|
|
56
|
+
extend-exclude = ["skmetal_bridge"]
|
|
57
|
+
lint.select = ["E", "F", "W", "UP"]
|
|
58
|
+
lint.ignore = ["E501", "E402"]
|
|
59
|
+
|
|
60
|
+
[tool.pytest.ini_options]
|
|
61
|
+
testpaths = ["../tests"]
|
|
62
|
+
python_files = ["test_*.py"]
|
|
63
|
+
timeout = 120
|
skmetal-0.2.0/setup.cfg
ADDED
skmetal-0.2.0/skmetal/__init__.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""
|
|
2
|
+
skmetal: Apple Silicon GPU acceleration for scikit-learn.
|
|
3
|
+
|
|
4
|
+
Usage::
|
|
5
|
+
|
|
6
|
+
import skmetal
|
|
7
|
+
from sklearn.linear_model import LinearRegression
|
|
8
|
+
|
|
9
|
+
@skmetal.accelerate
|
|
10
|
+
def model():
|
|
11
|
+
return LinearRegression()
|
|
12
|
+
|
|
13
|
+
m = model()
|
|
14
|
+
m.fit(X, y)
|
|
15
|
+
m.predict(X_test)
|
|
16
|
+
|
|
17
|
+
On non-Apple-Silicon machines skmetal imports cleanly and all estimators
|
|
18
|
+
transparently fall back to scikit-learn CPU implementations.
|
|
19
|
+
Check ``skmetal.METAL_AVAILABLE`` to detect GPU support at runtime.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from ._about import __version__, __version_info__
|
|
23
|
+
from ._config import get_config, set_device, set_threshold, set_dtype, set_verbose, set_thresholds, update_threshold
|
|
24
|
+
from .accelerate import accelerate, accelerate_context
|
|
25
|
+
from ._bridge import METAL_AVAILABLE
|
|
26
|
+
|
|
27
|
+
if METAL_AVAILABLE:
|
|
28
|
+
from ._bridge import device_info
|
|
29
|
+
else:
|
|
30
|
+
def device_info() -> dict: # type: ignore[misc]
|
|
31
|
+
"""Always raise ``RuntimeError``: Metal is unavailable, so there is no device info to return."""
|
|
32
|
+
raise RuntimeError(
|
|
33
|
+
"skmetal: device_info() requires Apple Silicon + macOS 14+. "
|
|
34
|
+
"Metal is not available on this device."
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
__all__ = [
|
|
38
|
+
"__version__",
|
|
39
|
+
"__version_info__",
|
|
40
|
+
"METAL_AVAILABLE",
|
|
41
|
+
"accelerate",
|
|
42
|
+
"accelerate_context",
|
|
43
|
+
"device_info",
|
|
44
|
+
"get_config",
|
|
45
|
+
"set_device",
|
|
46
|
+
"set_threshold",
|
|
47
|
+
"set_dtype",
|
|
48
|
+
"set_verbose",
|
|
49
|
+
"set_thresholds",
|
|
50
|
+
"update_threshold",
|
|
51
|
+
]
|
|
52
|
+
|