macmetalpy 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- macmetalpy-0.1.0/LICENSE +21 -0
- macmetalpy-0.1.0/MANIFEST.in +6 -0
- macmetalpy-0.1.0/PKG-INFO +360 -0
- macmetalpy-0.1.0/README.md +311 -0
- macmetalpy-0.1.0/pyproject.toml +62 -0
- macmetalpy-0.1.0/setup.cfg +4 -0
- macmetalpy-0.1.0/setup.py +10 -0
- macmetalpy-0.1.0/src/macmetalpy/__init__.py +306 -0
- macmetalpy-0.1.0/src/macmetalpy/_accelerator.c +868 -0
- macmetalpy-0.1.0/src/macmetalpy/_broadcasting.py +76 -0
- macmetalpy-0.1.0/src/macmetalpy/_config.py +60 -0
- macmetalpy-0.1.0/src/macmetalpy/_dtypes.py +146 -0
- macmetalpy-0.1.0/src/macmetalpy/_kernel_cache.py +223 -0
- macmetalpy-0.1.0/src/macmetalpy/_kernels.py +1232 -0
- macmetalpy-0.1.0/src/macmetalpy/_metal_backend.py +372 -0
- macmetalpy-0.1.0/src/macmetalpy/bitwise_ops.py +133 -0
- macmetalpy-0.1.0/src/macmetalpy/complex_ops.py +62 -0
- macmetalpy-0.1.0/src/macmetalpy/config_ops.py +94 -0
- macmetalpy-0.1.0/src/macmetalpy/creation.py +321 -0
- macmetalpy-0.1.0/src/macmetalpy/dtype_utils.py +158 -0
- macmetalpy-0.1.0/src/macmetalpy/fft.py +145 -0
- macmetalpy-0.1.0/src/macmetalpy/format_ops.py +112 -0
- macmetalpy-0.1.0/src/macmetalpy/functional.py +60 -0
- macmetalpy-0.1.0/src/macmetalpy/index_tricks.py +93 -0
- macmetalpy-0.1.0/src/macmetalpy/indexing.py +408 -0
- macmetalpy-0.1.0/src/macmetalpy/io.py +86 -0
- macmetalpy-0.1.0/src/macmetalpy/linalg.py +316 -0
- macmetalpy-0.1.0/src/macmetalpy/linalg_top.py +153 -0
- macmetalpy-0.1.0/src/macmetalpy/logic_ops.py +166 -0
- macmetalpy-0.1.0/src/macmetalpy/manipulation.py +671 -0
- macmetalpy-0.1.0/src/macmetalpy/math_ext.py +115 -0
- macmetalpy-0.1.0/src/macmetalpy/math_ops.py +741 -0
- macmetalpy-0.1.0/src/macmetalpy/nan_ops.py +748 -0
- macmetalpy-0.1.0/src/macmetalpy/ndarray.py +2477 -0
- macmetalpy-0.1.0/src/macmetalpy/poly_ops.py +81 -0
- macmetalpy-0.1.0/src/macmetalpy/random.py +376 -0
- macmetalpy-0.1.0/src/macmetalpy/raw_kernel.py +62 -0
- macmetalpy-0.1.0/src/macmetalpy/reductions.py +845 -0
- macmetalpy-0.1.0/src/macmetalpy/set_ops.py +85 -0
- macmetalpy-0.1.0/src/macmetalpy/sorting.py +366 -0
- macmetalpy-0.1.0/src/macmetalpy/ufunc.py +44 -0
- macmetalpy-0.1.0/src/macmetalpy/ufunc_ops.py +295 -0
- macmetalpy-0.1.0/src/macmetalpy/utils.py +80 -0
- macmetalpy-0.1.0/src/macmetalpy/window.py +29 -0
- macmetalpy-0.1.0/src/macmetalpy.egg-info/PKG-INFO +360 -0
- macmetalpy-0.1.0/src/macmetalpy.egg-info/SOURCES.txt +130 -0
- macmetalpy-0.1.0/src/macmetalpy.egg-info/dependency_links.txt +1 -0
- macmetalpy-0.1.0/src/macmetalpy.egg-info/requires.txt +6 -0
- macmetalpy-0.1.0/src/macmetalpy.egg-info/top_level.txt +1 -0
- macmetalpy-0.1.0/tests/conftest.py +202 -0
- macmetalpy-0.1.0/tests/test_broadcasting.py +109 -0
- macmetalpy-0.1.0/tests/test_comparison.py +172 -0
- macmetalpy-0.1.0/tests/test_comparison_funcs.py +341 -0
- macmetalpy-0.1.0/tests/test_complex_ops.py +170 -0
- macmetalpy-0.1.0/tests/test_config.py +84 -0
- macmetalpy-0.1.0/tests/test_config_integration.py +135 -0
- macmetalpy-0.1.0/tests/test_creation.py +710 -0
- macmetalpy-0.1.0/tests/test_creation_gaps.py +150 -0
- macmetalpy-0.1.0/tests/test_creation_linalg.py +362 -0
- macmetalpy-0.1.0/tests/test_creation_linalg_ext.py +287 -0
- macmetalpy-0.1.0/tests/test_creation_params_final.py +280 -0
- macmetalpy-0.1.0/tests/test_dtype_constants.py +120 -0
- macmetalpy-0.1.0/tests/test_dtype_format_final.py +410 -0
- macmetalpy-0.1.0/tests/test_dtype_system.py +205 -0
- macmetalpy-0.1.0/tests/test_dtype_utils.py +341 -0
- macmetalpy-0.1.0/tests/test_edge_shapes.py +392 -0
- macmetalpy-0.1.0/tests/test_elementwise.py +191 -0
- macmetalpy-0.1.0/tests/test_error_handling.py +366 -0
- macmetalpy-0.1.0/tests/test_extra.py +173 -0
- macmetalpy-0.1.0/tests/test_fft.py +389 -0
- macmetalpy-0.1.0/tests/test_fft_sort_idx_params.py +413 -0
- macmetalpy-0.1.0/tests/test_final_param_gaps.py +170 -0
- macmetalpy-0.1.0/tests/test_functional.py +177 -0
- macmetalpy-0.1.0/tests/test_index_tricks.py +213 -0
- macmetalpy-0.1.0/tests/test_indexing.py +606 -0
- macmetalpy-0.1.0/tests/test_indexing_funcs.py +413 -0
- macmetalpy-0.1.0/tests/test_inplace_ops.py +281 -0
- macmetalpy-0.1.0/tests/test_interop.py +131 -0
- macmetalpy-0.1.0/tests/test_io_gaps.py +150 -0
- macmetalpy-0.1.0/tests/test_linalg.py +613 -0
- macmetalpy-0.1.0/tests/test_linalg_gaps.py +136 -0
- macmetalpy-0.1.0/tests/test_linalg_logic_params.py +271 -0
- macmetalpy-0.1.0/tests/test_logic_bitwise.py +439 -0
- macmetalpy-0.1.0/tests/test_logic_gaps.py +121 -0
- macmetalpy-0.1.0/tests/test_manip_ext.py +262 -0
- macmetalpy-0.1.0/tests/test_manipulation.py +846 -0
- macmetalpy-0.1.0/tests/test_manipulation_gaps.py +191 -0
- macmetalpy-0.1.0/tests/test_manipulation_gpu.py +85 -0
- macmetalpy-0.1.0/tests/test_math_ext.py +316 -0
- macmetalpy-0.1.0/tests/test_math_ops.py +919 -0
- macmetalpy-0.1.0/tests/test_mathops_gpu.py +81 -0
- macmetalpy-0.1.0/tests/test_misc_params.py +493 -0
- macmetalpy-0.1.0/tests/test_missing_apis.py +696 -0
- macmetalpy-0.1.0/tests/test_nan_ops.py +480 -0
- macmetalpy-0.1.0/tests/test_nan_ops_gaps.py +181 -0
- macmetalpy-0.1.0/tests/test_nan_stats.py +356 -0
- macmetalpy-0.1.0/tests/test_nanops_gpu.py +136 -0
- macmetalpy-0.1.0/tests/test_nanops_params.py +583 -0
- macmetalpy-0.1.0/tests/test_ndarray.py +1214 -0
- macmetalpy-0.1.0/tests/test_ndarray_final.py +434 -0
- macmetalpy-0.1.0/tests/test_ndarray_gaps.py +787 -0
- macmetalpy-0.1.0/tests/test_numeric_edges.py +608 -0
- macmetalpy-0.1.0/tests/test_params_final.py +252 -0
- macmetalpy-0.1.0/tests/test_random.py +690 -0
- macmetalpy-0.1.0/tests/test_random_gaps.py +210 -0
- macmetalpy-0.1.0/tests/test_random_mod.py +719 -0
- macmetalpy-0.1.0/tests/test_rawkernel.py +220 -0
- macmetalpy-0.1.0/tests/test_reduction_gpu.py +73 -0
- macmetalpy-0.1.0/tests/test_reduction_params.py +827 -0
- macmetalpy-0.1.0/tests/test_reductions.py +697 -0
- macmetalpy-0.1.0/tests/test_reductions_gaps.py +364 -0
- macmetalpy-0.1.0/tests/test_search.py +82 -0
- macmetalpy-0.1.0/tests/test_set_ops.py +246 -0
- macmetalpy-0.1.0/tests/test_sort_manip.py +440 -0
- macmetalpy-0.1.0/tests/test_sort_set.py +362 -0
- macmetalpy-0.1.0/tests/test_sorting.py +266 -0
- macmetalpy-0.1.0/tests/test_sorting_gaps.py +230 -0
- macmetalpy-0.1.0/tests/test_sorting_gpu.py +72 -0
- macmetalpy-0.1.0/tests/test_stats_cum.py +487 -0
- macmetalpy-0.1.0/tests/test_strides_views.py +317 -0
- macmetalpy-0.1.0/tests/test_synchronize.py +92 -0
- macmetalpy-0.1.0/tests/test_ufunc.py +102 -0
- macmetalpy-0.1.0/tests/test_ufunc_ops.py +750 -0
- macmetalpy-0.1.0/tests/test_untested_creation.py +1024 -0
- macmetalpy-0.1.0/tests/test_untested_fft.py +214 -0
- macmetalpy-0.1.0/tests/test_untested_linalg.py +321 -0
- macmetalpy-0.1.0/tests/test_untested_math.py +710 -0
- macmetalpy-0.1.0/tests/test_untested_misc.py +747 -0
- macmetalpy-0.1.0/tests/test_untested_random.py +881 -0
- macmetalpy-0.1.0/tests/test_untested_reductions.py +910 -0
- macmetalpy-0.1.0/tests/test_window.py +109 -0
- macmetalpy-0.1.0/tests/test_window_misc.py +197 -0
macmetalpy-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Grant Klepzig
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: macmetalpy
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: CuPy-compatible GPU array library for Apple Silicon using MetalGPU
|
|
5
|
+
Author: Grant Klepzig
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Grant Klepzig
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/grantkl/MacMetalPy
|
|
29
|
+
Project-URL: Repository, https://github.com/grantkl/MacMetalPy
|
|
30
|
+
Project-URL: Bug Tracker, https://github.com/grantkl/MacMetalPy/issues
|
|
31
|
+
Classifier: Development Status :: 3 - Alpha
|
|
32
|
+
Classifier: Intended Audience :: Science/Research
|
|
33
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
34
|
+
Classifier: Operating System :: MacOS
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
38
|
+
Classifier: Programming Language :: C
|
|
39
|
+
Classifier: Topic :: Scientific/Engineering
|
|
40
|
+
Requires-Python: >=3.10
|
|
41
|
+
Description-Content-Type: text/markdown
|
|
42
|
+
License-File: LICENSE
|
|
43
|
+
Requires-Dist: numpy>=1.24
|
|
44
|
+
Requires-Dist: metalgpu>=0.0.5
|
|
45
|
+
Provides-Extra: dev
|
|
46
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
47
|
+
Requires-Dist: ruff>=0.4.0; extra == "dev"
|
|
48
|
+
Dynamic: license-file
|
|
49
|
+
|
|
50
|
+
```
|
|
51
|
+
__ __ __ __ _ _ ____
|
|
52
|
+
| \/ | __ _ ___| \/ | ___| |_ __ _| | _ \ _ _
|
|
53
|
+
| |\/| |/ _` |/ __| |\/| |/ _ \ __/ _` | | |_) | | | |
|
|
54
|
+
| | | | (_| | (__| | | | __/ || (_| | | __/| |_| |
|
|
55
|
+
|_| |_|\__,_|\___|_| |_|\___|\__\__,_|_|_| \__, |
|
|
56
|
+
|___/
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
# MacMetalPy
|
|
60
|
+
|
|
61
|
+
### Shred data on Apple Silicon. No CUDA required.
|
|
62
|
+
|
|
63
|
+
A **CuPy-compatible** GPU array library that rips through computation on Apple Silicon using the **Metal** backend. Drop it into your existing CuPy code, swap the import, and let your M-series chip absolutely shred.
|
|
64
|
+
|
|
65
|
+
> **Heads up:** Metal GPUs operate in **float32** — there is no hardware float64. MacMetalPy auto-downcasts float64 → float32 by default (with warnings), or can fall back to CPU. See [Float Precision](#float-precision--the-float64-question) for details.
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
import macmetalpy as cp
|
|
69
|
+
|
|
70
|
+
a = cp.random.randn(4096, 4096, dtype=cp.float32)
|
|
71
|
+
b = cp.random.randn(4096, 4096, dtype=cp.float32)
|
|
72
|
+
c = a @ b # 🔥 Metal GPU goes brrr
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## The Setlist
|
|
78
|
+
|
|
79
|
+
- **Drop-in CuPy replacement** — `import macmetalpy as cp` and your existing code just works
|
|
80
|
+
- **200+ NumPy-compatible functions** — creation, math, linalg, FFT, random, indexing, sorting, reductions, and more
|
|
81
|
+
- **Async Metal dispatch** — operations fire off to the GPU and don't wait around
|
|
82
|
+
- **RawKernel** — write your own Metal Shading Language kernels when the built-in riffs aren't enough
|
|
83
|
+
- **17,000+ passing tests** — battle-tested across 10 dtypes and every edge case we could throw at it
|
|
84
|
+
- **Zero CUDA dependency** — pure Apple Silicon, pure Metal
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## Plug In & Play
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
pip install macmetalpy
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
**Requirements:**
|
|
95
|
+
- macOS (Apple Silicon — M1/M2/M3/M4)
|
|
96
|
+
- Python >= 3.10
|
|
97
|
+
- numpy >= 1.24
|
|
98
|
+
- metalgpu >= 0.0.5
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Soundcheck
|
|
103
|
+
|
|
104
|
+
**Create arrays on the GPU:**
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
import macmetalpy as cp
|
|
108
|
+
|
|
109
|
+
a = cp.zeros((1000, 1000), dtype=cp.float32)
|
|
110
|
+
b = cp.ones((1000, 1000), dtype=cp.float32)
|
|
111
|
+
c = cp.arange(0, 100, dtype=cp.int32)
|
|
112
|
+
d = cp.linspace(0, 1, 256, dtype=cp.float16)
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
**Rip through math:**
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
import macmetalpy as cp
|
|
119
|
+
|
|
120
|
+
x = cp.random.randn(10000, dtype=cp.float32)
|
|
121
|
+
|
|
122
|
+
# Elementwise operations — all on the GPU
|
|
123
|
+
y = cp.sqrt(cp.abs(x)) + cp.exp(-x ** 2)
|
|
124
|
+
|
|
125
|
+
# Reductions
|
|
126
|
+
total = cp.sum(y)
|
|
127
|
+
avg = cp.mean(y)
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
**Linear algebra:**
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
import macmetalpy as cp
|
|
134
|
+
|
|
135
|
+
A = cp.random.randn(512, 512, dtype=cp.float32)
|
|
136
|
+
b = cp.random.randn(512, dtype=cp.float32)
|
|
137
|
+
|
|
138
|
+
x = cp.linalg.solve(A, b) # Solve Ax = b
|
|
139
|
+
U, S, Vt = cp.linalg.svd(A) # SVD
|
|
140
|
+
eigenvalues = cp.linalg.eigvalsh(A @ A.T) # Eigenvalues
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
**Pull results back to CPU:**
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
gpu_result = cp.sum(cp.random.randn(1000000, dtype=cp.float32))
|
|
147
|
+
numpy_array = gpu_result.get() # Transfer to NumPy
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## Benchmarks — When Does the GPU Shred?
|
|
153
|
+
|
|
154
|
+
MacMetalPy vs NumPy on an **M4 Mac Mini**, float32. The GPU's advantage grows with array size — small arrays have fixed dispatch overhead, but once you're past ~100K elements, Metal starts winning, and at 10M+ it absolutely rips.
|
|
155
|
+
|
|
156
|
+
### The Scaling Story
|
|
157
|
+
|
|
158
|
+
| Operation | 1K | 100K | 1M | 10M |
|
|
159
|
+
|---|---|---|---|---|
|
|
160
|
+
| `a + b` | 0.29x | 0.96x | 1.07x | — |
|
|
161
|
+
| `sin(a)` | 0.73x | 1.03x | 3.42x | **3.71x** |
|
|
162
|
+
| `exp(a)` | 0.76x | 1.09x | 3.68x | **4.45x** |
|
|
163
|
+
| `tan(a)` | 0.81x | 1.08x | 8.64x | **14.0x** |
|
|
164
|
+
| `arcsin(a)` | 0.82x | 1.04x | 12.7x | **16.5x** |
|
|
165
|
+
| `power(a, b)` | 0.77x | 1.05x | 6.83x | **7.71x** |
|
|
166
|
+
| `floor_divide` | 0.80x | 1.04x | 17.2x | **26.7x** |
|
|
167
|
+
| `mod(a, b)` | 0.83x | 1.05x | 7.02x | **12.1x** |
|
|
168
|
+
| `cumsum(a)` | 0.66x | 0.96x | 2.43x | **2.81x** |
|
|
169
|
+
| `nanprod(a)` | 0.68x | 0.95x | 3.95x | **5.57x** |
|
|
170
|
+
| `sort(a)` | 0.91x | 1.07x | 9.16x | — |
|
|
171
|
+
|
|
172
|
+
> Values are speedup vs NumPy (higher = GPU faster). **Bold** = GPU wins by 2x+ at 10M elements.
|
|
173
|
+
|
|
174
|
+
### By Category at 10M Elements
|
|
175
|
+
|
|
176
|
+
| Category | Avg Speedup | GPU Wins | Highlights |
|
|
177
|
+
|---|---|---|---|
|
|
178
|
+
| **Trig** | **8.0x** | 15/15 | Every trig op faster on GPU |
|
|
179
|
+
| **Math** | **5.8x** | 8/14 | Transcendentals dominate |
|
|
180
|
+
| **Ufuncs** | **5.3x** | 16/34 | `fmod` 17x, `heaviside` 20x |
|
|
181
|
+
| **NaN ops** | **3.2x** | 8/9 | `nancumprod` 5.2x, `nanprod` 5.6x |
|
|
182
|
+
| **Reductions** | **1.3x** | 5/13 | `prod` 4.2x, `cumsum` 2.8x |
|
|
183
|
+
| **Comparisons** | **1.2x** | 3/4 | `less` 1.4x, `equal` 1.2x |
|
|
184
|
+
| **Stats** | **1.3x** | 3/6 | `digitize` 2.1x |
|
|
185
|
+
|
|
186
|
+
### The Rule of Thumb
|
|
187
|
+
|
|
188
|
+
| Array Size | Who Wins | Why |
|
|
189
|
+
|---|---|---|
|
|
190
|
+
| **< 10K** | NumPy | GPU dispatch overhead dominates |
|
|
191
|
+
| **10K – 100K** | Roughly even | Overhead amortized, GPU warming up |
|
|
192
|
+
| **100K – 1M** | GPU pulls ahead | Parallel compute outpaces CPU SIMD |
|
|
193
|
+
| **1M+** | **GPU shreds** | 3-27x on compute-heavy ops |
|
|
194
|
+
|
|
195
|
+
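> Run the benchmarks yourself from a clone of the [repository](https://github.com/grantkl/MacMetalPy) (the `benchmarks/` directory is not shipped in the PyPI package): `python benchmarks/bench_vs_numpy.py --sizes small,medium,large,xlarge --serial`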
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
199
|
+
## The Lineup
|
|
200
|
+
|
|
201
|
+
| Module | Functions | What it shreds |
|
|
202
|
+
|---|---|---|
|
|
203
|
+
| **Creation** | 25 | `zeros`, `ones`, `arange`, `linspace`, `eye`, `meshgrid`, ... |
|
|
204
|
+
| **Math** | 94 | `sqrt`, `exp`, `log`, `sin`, `cos`, `dot`, `where`, `clip`, ... |
|
|
205
|
+
| **Reductions** | 21 | `sum`, `mean`, `std`, `var`, `argmax`, `cumsum`, `median`, ... |
|
|
206
|
+
| **Linalg** | 25 | `solve`, `inv`, `svd`, `eigh`, `qr`, `det`, `norm`, `einsum`, ... |
|
|
207
|
+
| **Manipulation** | 33 | `reshape`, `transpose`, `concatenate`, `stack`, `pad`, `tile`, ... |
|
|
208
|
+
| **Indexing** | 23 | `take`, `put`, `nonzero`, `argwhere`, `fill_diagonal`, ... |
|
|
209
|
+
| **Sorting** | 9 | `sort`, `argsort`, `unique`, `searchsorted`, `partition`, ... |
|
|
210
|
+
| **FFT** | 19 | `fft`, `ifft`, `rfft`, `fft2`, `fftn`, `fftfreq`, ... |
|
|
211
|
+
| **Random** | 40+ | `randn`, `uniform`, `normal`, `poisson`, `choice`, `shuffle`, ... |
|
|
212
|
+
| **Logic & Bitwise** | 30 | `logical_and`, `greater`, `bitwise_xor`, `gcd`, `lcm`, ... |
|
|
213
|
+
| **NaN Ops** | 27 | `nansum`, `nanmean`, `histogram`, `corrcoef`, `gradient`, ... |
|
|
214
|
+
| **Set Ops** | 7 | `union1d`, `intersect1d`, `setdiff1d`, `isin`, ... |
|
|
215
|
+
|
|
216
|
+
---
|
|
217
|
+
|
|
218
|
+
## Custom Riffs
|
|
219
|
+
|
|
220
|
+
When the built-in operations don't cut it, write your own Metal Shading Language kernels with `RawKernel`:
|
|
221
|
+
|
|
222
|
+
```python
|
|
223
|
+
from macmetalpy import RawKernel
|
|
224
|
+
import macmetalpy as cp
|
|
225
|
+
import numpy as np
|
|
226
|
+
|
|
227
|
+
# Write a custom Metal kernel
|
|
228
|
+
kernel_source = """
|
|
229
|
+
#include <metal_stdlib>
|
|
230
|
+
using namespace metal;
|
|
231
|
+
|
|
232
|
+
kernel void saxpy(device float *x [[buffer(0)]],
|
|
233
|
+
device float *y [[buffer(1)]],
|
|
234
|
+
device float *out [[buffer(2)]],
|
|
235
|
+
uint id [[thread_position_in_grid]]) {
|
|
236
|
+
float alpha = 2.5f;
|
|
237
|
+
out[id] = alpha * x[id] + y[id];
|
|
238
|
+
}
|
|
239
|
+
"""
|
|
240
|
+
|
|
241
|
+
saxpy = RawKernel(kernel_source, 'saxpy')
|
|
242
|
+
|
|
243
|
+
N = 1_000_000
|
|
244
|
+
x = cp.random.randn(N, dtype=np.float32)
|
|
245
|
+
y = cp.random.randn(N, dtype=np.float32)
|
|
246
|
+
out = cp.empty(N, dtype=np.float32)
|
|
247
|
+
|
|
248
|
+
saxpy(N, (x, y, out)) # Launch N GPU threads
|
|
249
|
+
|
|
250
|
+
result = out.get()
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
Grid sizes can be 1D, 2D, or 3D:
|
|
254
|
+
|
|
255
|
+
```python
|
|
256
|
+
kernel(N, args) # 1D — N threads
|
|
257
|
+
kernel((W, H), args) # 2D grid
|
|
258
|
+
kernel((W, H, D), args) # 3D grid
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
---
|
|
262
|
+
|
|
263
|
+
## Float Precision & The float64 Question
|
|
264
|
+
|
|
265
|
+
**This is the biggest difference between MacMetalPy and NumPy/CuPy.**
|
|
266
|
+
|
|
267
|
+
Apple's Metal GPU has **no native float64 (double) support**. All GPU computation runs in **float32** (single precision) or **float16** (half precision). This is a hardware limitation — not a software one.
|
|
268
|
+
|
|
269
|
+
### What this means in practice
|
|
270
|
+
|
|
271
|
+
| Scenario | What happens |
|
|
272
|
+
|---|---|
|
|
273
|
+
| `cp.array([1.0, 2.0])` | Created as **float32** (NumPy would default to float64) |
|
|
274
|
+
| `cp.zeros(10, dtype=np.float64)` | **Downcast to float32** with a warning (by default) |
|
|
275
|
+
| `cp.linalg.solve(A, b)` | Runs in float32 — ~7 decimal digits of precision |
|
|
276
|
+
| `cp.sum(x, dtype=np.float64)` | Accumulates in float32 |
|
|
277
|
+
| `complex128` input | **Downcast to complex64** (two float32 values) |
|
|
278
|
+
|
|
279
|
+
### When float32 is fine (most cases)
|
|
280
|
+
|
|
281
|
+
- Machine learning / deep learning (models train in float16/float32 anyway)
|
|
282
|
+
- Image and signal processing
|
|
283
|
+
- General scientific computing where ~7 digits of precision is sufficient
|
|
284
|
+
- Data analysis and statistics on reasonably-scaled data
|
|
285
|
+
- FFT, random number generation, sorting, indexing
|
|
286
|
+
|
|
287
|
+
### When you might need float64
|
|
288
|
+
|
|
289
|
+
- Numerical methods sensitive to rounding (e.g., ill-conditioned linear systems)
|
|
290
|
+
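- Financial calculations that need more significant digits than float32's ~7 (for exact decimal arithmetic, use Python's `decimal` module rather than any binary float)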
|
|
291
|
+
- Accumulating very large sums (billions of elements) where error compounds
|
|
292
|
+
- Algorithms that rely on the full 15-16 digits of float64 precision
|
|
293
|
+
|
|
294
|
+
### Configuring float64 behavior
|
|
295
|
+
|
|
296
|
+
```python
|
|
297
|
+
from macmetalpy import set_config
|
|
298
|
+
|
|
299
|
+
# DEFAULT: Downcast float64 → float32, emit a warning
|
|
300
|
+
set_config(float64_behavior="downcast", warn_on_downcast=True)
|
|
301
|
+
|
|
302
|
+
# Silence the warnings if you know what you're doing
|
|
303
|
+
set_config(float64_behavior="downcast", warn_on_downcast=False)
|
|
304
|
+
|
|
305
|
+
# Fall back to CPU (NumPy) for any float64 operation
|
|
306
|
+
set_config(float64_behavior="cpu_fallback")
|
|
307
|
+
|
|
308
|
+
# Set the default float dtype for creation functions
|
|
309
|
+
set_config(default_float_dtype="float32")
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
### Comparison with NumPy and CuPy
|
|
313
|
+
|
|
314
|
+
| | NumPy (CPU) | CuPy (CUDA) | MacMetalPy (Metal) |
|
|
315
|
+
|---|---|---|---|
|
|
316
|
+
| Default float | float64 | float64 | **float32** |
|
|
317
|
+
| float64 support | Native | Native | Downcast or CPU fallback |
|
|
318
|
+
| float16 support | Software | Native | Native |
|
|
319
|
+
| complex128 | Native | Native | Downcast to complex64 |
|
|
320
|
+
| int8 / uint8 | Native | Native | **Not supported** |
|
|
321
|
+
| Precision digits | ~15-16 | ~15-16 | **~7** (float32) |
|
|
322
|
+
|
|
323
|
+
---
|
|
324
|
+
|
|
325
|
+
## Supported Amps
|
|
326
|
+
|
|
327
|
+
| Dtype | Metal Type | Notes |
|
|
328
|
+
|---|---|---|
|
|
329
|
+
| `float32` | `float` | Default float — full GPU support |
|
|
330
|
+
| `float16` | `half` | Half precision — fastest for large arrays |
|
|
331
|
+
| `int32` | `int` | Default int type |
|
|
332
|
+
| `int64` | `long` | 64-bit integer |
|
|
333
|
+
| `int16` | `short` | 16-bit integer |
|
|
334
|
+
| `uint32` | `uint` | Unsigned 32-bit |
|
|
335
|
+
| `uint64` | `uint64_t` | Unsigned 64-bit |
|
|
336
|
+
| `uint16` | `uint16_t` | Unsigned 16-bit |
|
|
337
|
+
| `bool` | `bool` | Boolean |
|
|
338
|
+
| `complex64` | float32 pairs | Stored as real/imag float32 |
|
|
339
|
+
|
|
340
|
+
**Not supported by MacMetalPy's GPU backend:** `float64`, `complex128`, `int8`, `uint8`, `longdouble`, `str_`, `bytes_`, `object_`
|
|
341
|
+
|
|
342
|
+
---
|
|
343
|
+
|
|
344
|
+
## Acknowledgments
|
|
345
|
+
|
|
346
|
+
MacMetalPy stands on the shoulders of giants:
|
|
347
|
+
|
|
348
|
+
- **[NumPy](https://numpy.org/)** — The foundation. MacMetalPy's API is modeled after NumPy's, because they got it right the first time.
|
|
349
|
+
- **[CuPy](https://cupy.dev/)** — The blueprint for GPU array libraries. CuPy proved that a drop-in NumPy replacement on the GPU is both possible and practical.
|
|
350
|
+
- **[metalgpu](https://github.com/MK-ek11/metalgpu)** — The engine under the hood. Without metalgpu's Python-to-Metal bridge, MacMetalPy wouldn't exist.
|
|
351
|
+
|
|
352
|
+
---
|
|
353
|
+
|
|
354
|
+
## The Crew
|
|
355
|
+
|
|
356
|
+
**License:** MIT
|
|
357
|
+
|
|
358
|
+
**Contributing:** Issues and PRs welcome. If you find a bug or want to add a new function, open an issue or submit a pull request.
|
|
359
|
+
|
|
360
|
+
**Built by** [@grantkl](https://github.com/grantkl)
|
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
```
|
|
2
|
+
__ __ __ __ _ _ ____
|
|
3
|
+
| \/ | __ _ ___| \/ | ___| |_ __ _| | _ \ _ _
|
|
4
|
+
| |\/| |/ _` |/ __| |\/| |/ _ \ __/ _` | | |_) | | | |
|
|
5
|
+
| | | | (_| | (__| | | | __/ || (_| | | __/| |_| |
|
|
6
|
+
|_| |_|\__,_|\___|_| |_|\___|\__\__,_|_|_| \__, |
|
|
7
|
+
|___/
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
# MacMetalPy
|
|
11
|
+
|
|
12
|
+
### Shred data on Apple Silicon. No CUDA required.
|
|
13
|
+
|
|
14
|
+
A **CuPy-compatible** GPU array library that rips through computation on Apple Silicon using the **Metal** backend. Drop it into your existing CuPy code, swap the import, and let your M-series chip absolutely shred.
|
|
15
|
+
|
|
16
|
+
> **Heads up:** Metal GPUs operate in **float32** — there is no hardware float64. MacMetalPy auto-downcasts float64 → float32 by default (with warnings), or can fall back to CPU. See [Float Precision](#float-precision--the-float64-question) for details.
|
|
17
|
+
|
|
18
|
+
```python
|
|
19
|
+
import macmetalpy as cp
|
|
20
|
+
|
|
21
|
+
a = cp.random.randn(4096, 4096, dtype=cp.float32)
|
|
22
|
+
b = cp.random.randn(4096, 4096, dtype=cp.float32)
|
|
23
|
+
c = a @ b # 🔥 Metal GPU goes brrr
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
28
|
+
## The Setlist
|
|
29
|
+
|
|
30
|
+
- **Drop-in CuPy replacement** — `import macmetalpy as cp` and your existing code just works
|
|
31
|
+
- **200+ NumPy-compatible functions** — creation, math, linalg, FFT, random, indexing, sorting, reductions, and more
|
|
32
|
+
- **Async Metal dispatch** — operations fire off to the GPU and don't wait around
|
|
33
|
+
- **RawKernel** — write your own Metal Shading Language kernels when the built-in riffs aren't enough
|
|
34
|
+
- **17,000+ passing tests** — battle-tested across 10 dtypes and every edge case we could throw at it
|
|
35
|
+
- **Zero CUDA dependency** — pure Apple Silicon, pure Metal
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## Plug In & Play
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install macmetalpy
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
**Requirements:**
|
|
46
|
+
- macOS (Apple Silicon — M1/M2/M3/M4)
|
|
47
|
+
- Python >= 3.10
|
|
48
|
+
- numpy >= 1.24
|
|
49
|
+
- metalgpu >= 0.0.5
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Soundcheck
|
|
54
|
+
|
|
55
|
+
**Create arrays on the GPU:**
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
import macmetalpy as cp
|
|
59
|
+
|
|
60
|
+
a = cp.zeros((1000, 1000), dtype=cp.float32)
|
|
61
|
+
b = cp.ones((1000, 1000), dtype=cp.float32)
|
|
62
|
+
c = cp.arange(0, 100, dtype=cp.int32)
|
|
63
|
+
d = cp.linspace(0, 1, 256, dtype=cp.float16)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
**Rip through math:**
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
import macmetalpy as cp
|
|
70
|
+
|
|
71
|
+
x = cp.random.randn(10000, dtype=cp.float32)
|
|
72
|
+
|
|
73
|
+
# Elementwise operations — all on the GPU
|
|
74
|
+
y = cp.sqrt(cp.abs(x)) + cp.exp(-x ** 2)
|
|
75
|
+
|
|
76
|
+
# Reductions
|
|
77
|
+
total = cp.sum(y)
|
|
78
|
+
avg = cp.mean(y)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
**Linear algebra:**
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
import macmetalpy as cp
|
|
85
|
+
|
|
86
|
+
A = cp.random.randn(512, 512, dtype=cp.float32)
|
|
87
|
+
b = cp.random.randn(512, dtype=cp.float32)
|
|
88
|
+
|
|
89
|
+
x = cp.linalg.solve(A, b) # Solve Ax = b
|
|
90
|
+
U, S, Vt = cp.linalg.svd(A) # SVD
|
|
91
|
+
eigenvalues = cp.linalg.eigvalsh(A @ A.T) # Eigenvalues
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
**Pull results back to CPU:**
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
gpu_result = cp.sum(cp.random.randn(1000000, dtype=cp.float32))
|
|
98
|
+
numpy_array = gpu_result.get() # Transfer to NumPy
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## Benchmarks — When Does the GPU Shred?
|
|
104
|
+
|
|
105
|
+
MacMetalPy vs NumPy on an **M4 Mac Mini**, float32. The GPU's advantage grows with array size — small arrays have fixed dispatch overhead, but once you're past ~100K elements, Metal starts winning, and at 10M+ it absolutely rips.
|
|
106
|
+
|
|
107
|
+
### The Scaling Story
|
|
108
|
+
|
|
109
|
+
| Operation | 1K | 100K | 1M | 10M |
|
|
110
|
+
|---|---|---|---|---|
|
|
111
|
+
| `a + b` | 0.29x | 0.96x | 1.07x | — |
|
|
112
|
+
| `sin(a)` | 0.73x | 1.03x | 3.42x | **3.71x** |
|
|
113
|
+
| `exp(a)` | 0.76x | 1.09x | 3.68x | **4.45x** |
|
|
114
|
+
| `tan(a)` | 0.81x | 1.08x | 8.64x | **14.0x** |
|
|
115
|
+
| `arcsin(a)` | 0.82x | 1.04x | 12.7x | **16.5x** |
|
|
116
|
+
| `power(a, b)` | 0.77x | 1.05x | 6.83x | **7.71x** |
|
|
117
|
+
| `floor_divide` | 0.80x | 1.04x | 17.2x | **26.7x** |
|
|
118
|
+
| `mod(a, b)` | 0.83x | 1.05x | 7.02x | **12.1x** |
|
|
119
|
+
| `cumsum(a)` | 0.66x | 0.96x | 2.43x | **2.81x** |
|
|
120
|
+
| `nanprod(a)` | 0.68x | 0.95x | 3.95x | **5.57x** |
|
|
121
|
+
| `sort(a)` | 0.91x | 1.07x | 9.16x | — |
|
|
122
|
+
|
|
123
|
+
> Values are speedup vs NumPy (higher = GPU faster). **Bold** = GPU wins by 2x+ at 10M elements.
|
|
124
|
+
|
|
125
|
+
### By Category at 10M Elements
|
|
126
|
+
|
|
127
|
+
| Category | Avg Speedup | GPU Wins | Highlights |
|
|
128
|
+
|---|---|---|---|
|
|
129
|
+
| **Trig** | **8.0x** | 15/15 | Every trig op faster on GPU |
|
|
130
|
+
| **Math** | **5.8x** | 8/14 | Transcendentals dominate |
|
|
131
|
+
| **Ufuncs** | **5.3x** | 16/34 | `fmod` 17x, `heaviside` 20x |
|
|
132
|
+
| **NaN ops** | **3.2x** | 8/9 | `nancumprod` 5.2x, `nanprod` 5.6x |
|
|
133
|
+
| **Reductions** | **1.3x** | 5/13 | `prod` 4.2x, `cumsum` 2.8x |
|
|
134
|
+
| **Comparisons** | **1.2x** | 3/4 | `less` 1.4x, `equal` 1.2x |
|
|
135
|
+
| **Stats** | **1.3x** | 3/6 | `digitize` 2.1x |
|
|
136
|
+
|
|
137
|
+
### The Rule of Thumb
|
|
138
|
+
|
|
139
|
+
| Array Size | Who Wins | Why |
|
|
140
|
+
|---|---|---|
|
|
141
|
+
| **< 10K** | NumPy | GPU dispatch overhead dominates |
|
|
142
|
+
| **10K – 100K** | Roughly even | Overhead amortized, GPU warming up |
|
|
143
|
+
| **100K – 1M** | GPU pulls ahead | Parallel compute outpaces CPU SIMD |
|
|
144
|
+
| **1M+** | **GPU shreds** | 3-27x on compute-heavy ops |
|
|
145
|
+
|
|
146
|
+
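> Run the benchmarks yourself from a clone of the [repository](https://github.com/grantkl/MacMetalPy) (the `benchmarks/` directory is not shipped in the PyPI package): `python benchmarks/bench_vs_numpy.py --sizes small,medium,large,xlarge --serial`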
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## The Lineup
|
|
151
|
+
|
|
152
|
+
| Module | Functions | What it shreds |
|
|
153
|
+
|---|---|---|
|
|
154
|
+
| **Creation** | 25 | `zeros`, `ones`, `arange`, `linspace`, `eye`, `meshgrid`, ... |
|
|
155
|
+
| **Math** | 94 | `sqrt`, `exp`, `log`, `sin`, `cos`, `dot`, `where`, `clip`, ... |
|
|
156
|
+
| **Reductions** | 21 | `sum`, `mean`, `std`, `var`, `argmax`, `cumsum`, `median`, ... |
|
|
157
|
+
| **Linalg** | 25 | `solve`, `inv`, `svd`, `eigh`, `qr`, `det`, `norm`, `einsum`, ... |
|
|
158
|
+
| **Manipulation** | 33 | `reshape`, `transpose`, `concatenate`, `stack`, `pad`, `tile`, ... |
|
|
159
|
+
| **Indexing** | 23 | `take`, `put`, `nonzero`, `argwhere`, `fill_diagonal`, ... |
|
|
160
|
+
| **Sorting** | 9 | `sort`, `argsort`, `unique`, `searchsorted`, `partition`, ... |
|
|
161
|
+
| **FFT** | 19 | `fft`, `ifft`, `rfft`, `fft2`, `fftn`, `fftfreq`, ... |
|
|
162
|
+
| **Random** | 40+ | `randn`, `uniform`, `normal`, `poisson`, `choice`, `shuffle`, ... |
|
|
163
|
+
| **Logic & Bitwise** | 30 | `logical_and`, `greater`, `bitwise_xor`, `gcd`, `lcm`, ... |
|
|
164
|
+
| **NaN Ops** | 27 | `nansum`, `nanmean`, `histogram`, `corrcoef`, `gradient`, ... |
|
|
165
|
+
| **Set Ops** | 7 | `union1d`, `intersect1d`, `setdiff1d`, `isin`, ... |
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
## Custom Riffs
|
|
170
|
+
|
|
171
|
+
When the built-in operations don't cut it, write your own Metal Shading Language kernels with `RawKernel`:
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
from macmetalpy import RawKernel
|
|
175
|
+
import macmetalpy as cp
|
|
176
|
+
import numpy as np
|
|
177
|
+
|
|
178
|
+
# Write a custom Metal kernel
|
|
179
|
+
kernel_source = """
|
|
180
|
+
#include <metal_stdlib>
|
|
181
|
+
using namespace metal;
|
|
182
|
+
|
|
183
|
+
kernel void saxpy(device float *x [[buffer(0)]],
|
|
184
|
+
device float *y [[buffer(1)]],
|
|
185
|
+
device float *out [[buffer(2)]],
|
|
186
|
+
uint id [[thread_position_in_grid]]) {
|
|
187
|
+
float alpha = 2.5f;
|
|
188
|
+
out[id] = alpha * x[id] + y[id];
|
|
189
|
+
}
|
|
190
|
+
"""
|
|
191
|
+
|
|
192
|
+
saxpy = RawKernel(kernel_source, 'saxpy')
|
|
193
|
+
|
|
194
|
+
N = 1_000_000
|
|
195
|
+
x = cp.random.randn(N, dtype=np.float32)
|
|
196
|
+
y = cp.random.randn(N, dtype=np.float32)
|
|
197
|
+
out = cp.empty(N, dtype=np.float32)
|
|
198
|
+
|
|
199
|
+
saxpy(N, (x, y, out)) # Launch N GPU threads
|
|
200
|
+
|
|
201
|
+
result = out.get()
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
Grid sizes can be 1D, 2D, or 3D:
|
|
205
|
+
|
|
206
|
+
```python
|
|
207
|
+
kernel(N, args) # 1D — N threads
|
|
208
|
+
kernel((W, H), args) # 2D grid
|
|
209
|
+
kernel((W, H, D), args) # 3D grid
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
---
|
|
213
|
+
|
|
214
|
+
## Float Precision & The float64 Question
|
|
215
|
+
|
|
216
|
+
**This is the biggest difference between MacMetalPy and NumPy/CuPy.**
|
|
217
|
+
|
|
218
|
+
Apple's Metal GPU has **no native float64 (double) support**. All GPU computation runs in **float32** (single precision) or **float16** (half precision). This is a hardware limitation — not a software one.
|
|
219
|
+
|
|
220
|
+
### What this means in practice
|
|
221
|
+
|
|
222
|
+
| Scenario | What happens |
|
|
223
|
+
|---|---|
|
|
224
|
+
| `cp.array([1.0, 2.0])` | Created as **float32** (NumPy would default to float64) |
|
|
225
|
+
| `cp.zeros(10, dtype=np.float64)` | **Downcast to float32** with a warning (by default) |
|
|
226
|
+
| `cp.linalg.solve(A, b)` | Runs in float32 — ~7 decimal digits of precision |
|
|
227
|
+
| `cp.sum(x, dtype=np.float64)` | Accumulates in float32 |
|
|
228
|
+
| `complex128` input | **Downcast to complex64** (two float32 values) |
|
|
229
|
+
|
|
230
|
+
### When float32 is fine (most cases)
|
|
231
|
+
|
|
232
|
+
- Machine learning / deep learning (models train in float16/float32 anyway)
|
|
233
|
+
- Image and signal processing
|
|
234
|
+
- General scientific computing where ~7 digits of precision is sufficient
|
|
235
|
+
- Data analysis and statistics on reasonably-scaled data
|
|
236
|
+
- FFT, random number generation, sorting, indexing
|
|
237
|
+
|
|
238
|
+
### When you might need float64
|
|
239
|
+
|
|
240
|
+
- Numerical methods sensitive to rounding (e.g., ill-conditioned linear systems)
|
|
241
|
+
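- Financial calculations where float32 rounding error is unacceptable (note that float64 is still binary floating point — use `decimal.Decimal` when exact decimal arithmetic is required)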
|
|
242
|
+
- Accumulating very large sums (billions of elements) where error compounds
|
|
243
|
+
- Algorithms that rely on the full 15-16 digits of float64 precision
|
|
244
|
+
|
|
245
|
+
### Configuring float64 behavior
|
|
246
|
+
|
|
247
|
+
```python
|
|
248
|
+
from macmetalpy import set_config
|
|
249
|
+
|
|
250
|
+
# DEFAULT: Downcast float64 → float32, emit a warning
|
|
251
|
+
set_config(float64_behavior="downcast", warn_on_downcast=True)
|
|
252
|
+
|
|
253
|
+
# Silence the warnings if you know what you're doing
|
|
254
|
+
set_config(float64_behavior="downcast", warn_on_downcast=False)
|
|
255
|
+
|
|
256
|
+
# Fall back to CPU (NumPy) for any float64 operation
|
|
257
|
+
set_config(float64_behavior="cpu_fallback")
|
|
258
|
+
|
|
259
|
+
# Set the default float dtype for creation functions
|
|
260
|
+
set_config(default_float_dtype="float32")
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
### Comparison with NumPy and CuPy
|
|
264
|
+
|
|
265
|
+
| | NumPy (CPU) | CuPy (CUDA) | MacMetalPy (Metal) |
|
|
266
|
+
|---|---|---|---|
|
|
267
|
+
| Default float | float64 | float64 | **float32** |
|
|
268
|
+
| float64 support | Native | Native | Downcast or CPU fallback |
|
|
269
|
+
| float16 support | Software | Native | Native |
|
|
270
|
+
| complex128 | Native | Native | Downcast to complex64 |
|
|
271
|
+
| int8 / uint8 | Native | Native | **Not supported** |
|
|
272
|
+
| Precision digits | ~15-16 | ~15-16 | **~7** (float32) |
|
|
273
|
+
|
|
274
|
+
---
|
|
275
|
+
|
|
276
|
+
## Supported Amps
|
|
277
|
+
|
|
278
|
+
| Dtype | Metal Type | Notes |
|
|
279
|
+
|---|---|---|
|
|
280
|
+
| `float32` | `float` | Default float — full GPU support |
|
|
281
|
+
| `float16` | `half` | Half precision — fastest for large arrays |
|
|
282
|
+
| `int32` | `int` | Default int type |
|
|
283
|
+
| `int64` | `long` | 64-bit integer |
|
|
284
|
+
| `int16` | `short` | 16-bit integer |
|
|
285
|
+
| `uint32` | `uint` | Unsigned 32-bit |
|
|
286
|
+
| `uint64` | `uint64_t` | Unsigned 64-bit |
|
|
287
|
+
| `uint16` | `uint16_t` | Unsigned 16-bit |
|
|
288
|
+
| `bool` | `bool` | Boolean |
|
|
289
|
+
| `complex64` | float32 pairs | Stored as real/imag float32 |
|
|
290
|
+
|
|
291
|
+
**Not supported as GPU dtypes:** `float64`, `complex128`, `int8`, `uint8`, `longdouble`, `str_`, `bytes_`, `object_`
|
|
292
|
+
|
|
293
|
+
---
|
|
294
|
+
|
|
295
|
+
## Acknowledgments
|
|
296
|
+
|
|
297
|
+
MacMetalPy stands on the shoulders of giants:
|
|
298
|
+
|
|
299
|
+
- **[NumPy](https://numpy.org/)** — The foundation. MacMetalPy's API is modeled after NumPy's, because they got it right the first time.
|
|
300
|
+
- **[CuPy](https://cupy.dev/)** — The blueprint for GPU array libraries. CuPy proved that a drop-in NumPy replacement on the GPU is both possible and practical.
|
|
301
|
+
- **[metalgpu](https://github.com/MK-ek11/metalgpu)** — The engine under the hood. Without metalgpu's Python-to-Metal bridge, MacMetalPy wouldn't exist.
|
|
302
|
+
|
|
303
|
+
---
|
|
304
|
+
|
|
305
|
+
## The Crew
|
|
306
|
+
|
|
307
|
+
**License:** MIT
|
|
308
|
+
|
|
309
|
+
**Contributing:** Issues and PRs welcome. If you find a bug or want to add a new function, open an issue or submit a pull request.
|
|
310
|
+
|
|
311
|
+
**Built by** [@grantkl](https://github.com/grantkl)
|