macmetalpy 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. macmetalpy-0.1.0/LICENSE +21 -0
  2. macmetalpy-0.1.0/MANIFEST.in +6 -0
  3. macmetalpy-0.1.0/PKG-INFO +360 -0
  4. macmetalpy-0.1.0/README.md +311 -0
  5. macmetalpy-0.1.0/pyproject.toml +62 -0
  6. macmetalpy-0.1.0/setup.cfg +4 -0
  7. macmetalpy-0.1.0/setup.py +10 -0
  8. macmetalpy-0.1.0/src/macmetalpy/__init__.py +306 -0
  9. macmetalpy-0.1.0/src/macmetalpy/_accelerator.c +868 -0
  10. macmetalpy-0.1.0/src/macmetalpy/_broadcasting.py +76 -0
  11. macmetalpy-0.1.0/src/macmetalpy/_config.py +60 -0
  12. macmetalpy-0.1.0/src/macmetalpy/_dtypes.py +146 -0
  13. macmetalpy-0.1.0/src/macmetalpy/_kernel_cache.py +223 -0
  14. macmetalpy-0.1.0/src/macmetalpy/_kernels.py +1232 -0
  15. macmetalpy-0.1.0/src/macmetalpy/_metal_backend.py +372 -0
  16. macmetalpy-0.1.0/src/macmetalpy/bitwise_ops.py +133 -0
  17. macmetalpy-0.1.0/src/macmetalpy/complex_ops.py +62 -0
  18. macmetalpy-0.1.0/src/macmetalpy/config_ops.py +94 -0
  19. macmetalpy-0.1.0/src/macmetalpy/creation.py +321 -0
  20. macmetalpy-0.1.0/src/macmetalpy/dtype_utils.py +158 -0
  21. macmetalpy-0.1.0/src/macmetalpy/fft.py +145 -0
  22. macmetalpy-0.1.0/src/macmetalpy/format_ops.py +112 -0
  23. macmetalpy-0.1.0/src/macmetalpy/functional.py +60 -0
  24. macmetalpy-0.1.0/src/macmetalpy/index_tricks.py +93 -0
  25. macmetalpy-0.1.0/src/macmetalpy/indexing.py +408 -0
  26. macmetalpy-0.1.0/src/macmetalpy/io.py +86 -0
  27. macmetalpy-0.1.0/src/macmetalpy/linalg.py +316 -0
  28. macmetalpy-0.1.0/src/macmetalpy/linalg_top.py +153 -0
  29. macmetalpy-0.1.0/src/macmetalpy/logic_ops.py +166 -0
  30. macmetalpy-0.1.0/src/macmetalpy/manipulation.py +671 -0
  31. macmetalpy-0.1.0/src/macmetalpy/math_ext.py +115 -0
  32. macmetalpy-0.1.0/src/macmetalpy/math_ops.py +741 -0
  33. macmetalpy-0.1.0/src/macmetalpy/nan_ops.py +748 -0
  34. macmetalpy-0.1.0/src/macmetalpy/ndarray.py +2477 -0
  35. macmetalpy-0.1.0/src/macmetalpy/poly_ops.py +81 -0
  36. macmetalpy-0.1.0/src/macmetalpy/random.py +376 -0
  37. macmetalpy-0.1.0/src/macmetalpy/raw_kernel.py +62 -0
  38. macmetalpy-0.1.0/src/macmetalpy/reductions.py +845 -0
  39. macmetalpy-0.1.0/src/macmetalpy/set_ops.py +85 -0
  40. macmetalpy-0.1.0/src/macmetalpy/sorting.py +366 -0
  41. macmetalpy-0.1.0/src/macmetalpy/ufunc.py +44 -0
  42. macmetalpy-0.1.0/src/macmetalpy/ufunc_ops.py +295 -0
  43. macmetalpy-0.1.0/src/macmetalpy/utils.py +80 -0
  44. macmetalpy-0.1.0/src/macmetalpy/window.py +29 -0
  45. macmetalpy-0.1.0/src/macmetalpy.egg-info/PKG-INFO +360 -0
  46. macmetalpy-0.1.0/src/macmetalpy.egg-info/SOURCES.txt +130 -0
  47. macmetalpy-0.1.0/src/macmetalpy.egg-info/dependency_links.txt +1 -0
  48. macmetalpy-0.1.0/src/macmetalpy.egg-info/requires.txt +6 -0
  49. macmetalpy-0.1.0/src/macmetalpy.egg-info/top_level.txt +1 -0
  50. macmetalpy-0.1.0/tests/conftest.py +202 -0
  51. macmetalpy-0.1.0/tests/test_broadcasting.py +109 -0
  52. macmetalpy-0.1.0/tests/test_comparison.py +172 -0
  53. macmetalpy-0.1.0/tests/test_comparison_funcs.py +341 -0
  54. macmetalpy-0.1.0/tests/test_complex_ops.py +170 -0
  55. macmetalpy-0.1.0/tests/test_config.py +84 -0
  56. macmetalpy-0.1.0/tests/test_config_integration.py +135 -0
  57. macmetalpy-0.1.0/tests/test_creation.py +710 -0
  58. macmetalpy-0.1.0/tests/test_creation_gaps.py +150 -0
  59. macmetalpy-0.1.0/tests/test_creation_linalg.py +362 -0
  60. macmetalpy-0.1.0/tests/test_creation_linalg_ext.py +287 -0
  61. macmetalpy-0.1.0/tests/test_creation_params_final.py +280 -0
  62. macmetalpy-0.1.0/tests/test_dtype_constants.py +120 -0
  63. macmetalpy-0.1.0/tests/test_dtype_format_final.py +410 -0
  64. macmetalpy-0.1.0/tests/test_dtype_system.py +205 -0
  65. macmetalpy-0.1.0/tests/test_dtype_utils.py +341 -0
  66. macmetalpy-0.1.0/tests/test_edge_shapes.py +392 -0
  67. macmetalpy-0.1.0/tests/test_elementwise.py +191 -0
  68. macmetalpy-0.1.0/tests/test_error_handling.py +366 -0
  69. macmetalpy-0.1.0/tests/test_extra.py +173 -0
  70. macmetalpy-0.1.0/tests/test_fft.py +389 -0
  71. macmetalpy-0.1.0/tests/test_fft_sort_idx_params.py +413 -0
  72. macmetalpy-0.1.0/tests/test_final_param_gaps.py +170 -0
  73. macmetalpy-0.1.0/tests/test_functional.py +177 -0
  74. macmetalpy-0.1.0/tests/test_index_tricks.py +213 -0
  75. macmetalpy-0.1.0/tests/test_indexing.py +606 -0
  76. macmetalpy-0.1.0/tests/test_indexing_funcs.py +413 -0
  77. macmetalpy-0.1.0/tests/test_inplace_ops.py +281 -0
  78. macmetalpy-0.1.0/tests/test_interop.py +131 -0
  79. macmetalpy-0.1.0/tests/test_io_gaps.py +150 -0
  80. macmetalpy-0.1.0/tests/test_linalg.py +613 -0
  81. macmetalpy-0.1.0/tests/test_linalg_gaps.py +136 -0
  82. macmetalpy-0.1.0/tests/test_linalg_logic_params.py +271 -0
  83. macmetalpy-0.1.0/tests/test_logic_bitwise.py +439 -0
  84. macmetalpy-0.1.0/tests/test_logic_gaps.py +121 -0
  85. macmetalpy-0.1.0/tests/test_manip_ext.py +262 -0
  86. macmetalpy-0.1.0/tests/test_manipulation.py +846 -0
  87. macmetalpy-0.1.0/tests/test_manipulation_gaps.py +191 -0
  88. macmetalpy-0.1.0/tests/test_manipulation_gpu.py +85 -0
  89. macmetalpy-0.1.0/tests/test_math_ext.py +316 -0
  90. macmetalpy-0.1.0/tests/test_math_ops.py +919 -0
  91. macmetalpy-0.1.0/tests/test_mathops_gpu.py +81 -0
  92. macmetalpy-0.1.0/tests/test_misc_params.py +493 -0
  93. macmetalpy-0.1.0/tests/test_missing_apis.py +696 -0
  94. macmetalpy-0.1.0/tests/test_nan_ops.py +480 -0
  95. macmetalpy-0.1.0/tests/test_nan_ops_gaps.py +181 -0
  96. macmetalpy-0.1.0/tests/test_nan_stats.py +356 -0
  97. macmetalpy-0.1.0/tests/test_nanops_gpu.py +136 -0
  98. macmetalpy-0.1.0/tests/test_nanops_params.py +583 -0
  99. macmetalpy-0.1.0/tests/test_ndarray.py +1214 -0
  100. macmetalpy-0.1.0/tests/test_ndarray_final.py +434 -0
  101. macmetalpy-0.1.0/tests/test_ndarray_gaps.py +787 -0
  102. macmetalpy-0.1.0/tests/test_numeric_edges.py +608 -0
  103. macmetalpy-0.1.0/tests/test_params_final.py +252 -0
  104. macmetalpy-0.1.0/tests/test_random.py +690 -0
  105. macmetalpy-0.1.0/tests/test_random_gaps.py +210 -0
  106. macmetalpy-0.1.0/tests/test_random_mod.py +719 -0
  107. macmetalpy-0.1.0/tests/test_rawkernel.py +220 -0
  108. macmetalpy-0.1.0/tests/test_reduction_gpu.py +73 -0
  109. macmetalpy-0.1.0/tests/test_reduction_params.py +827 -0
  110. macmetalpy-0.1.0/tests/test_reductions.py +697 -0
  111. macmetalpy-0.1.0/tests/test_reductions_gaps.py +364 -0
  112. macmetalpy-0.1.0/tests/test_search.py +82 -0
  113. macmetalpy-0.1.0/tests/test_set_ops.py +246 -0
  114. macmetalpy-0.1.0/tests/test_sort_manip.py +440 -0
  115. macmetalpy-0.1.0/tests/test_sort_set.py +362 -0
  116. macmetalpy-0.1.0/tests/test_sorting.py +266 -0
  117. macmetalpy-0.1.0/tests/test_sorting_gaps.py +230 -0
  118. macmetalpy-0.1.0/tests/test_sorting_gpu.py +72 -0
  119. macmetalpy-0.1.0/tests/test_stats_cum.py +487 -0
  120. macmetalpy-0.1.0/tests/test_strides_views.py +317 -0
  121. macmetalpy-0.1.0/tests/test_synchronize.py +92 -0
  122. macmetalpy-0.1.0/tests/test_ufunc.py +102 -0
  123. macmetalpy-0.1.0/tests/test_ufunc_ops.py +750 -0
  124. macmetalpy-0.1.0/tests/test_untested_creation.py +1024 -0
  125. macmetalpy-0.1.0/tests/test_untested_fft.py +214 -0
  126. macmetalpy-0.1.0/tests/test_untested_linalg.py +321 -0
  127. macmetalpy-0.1.0/tests/test_untested_math.py +710 -0
  128. macmetalpy-0.1.0/tests/test_untested_misc.py +747 -0
  129. macmetalpy-0.1.0/tests/test_untested_random.py +881 -0
  130. macmetalpy-0.1.0/tests/test_untested_reductions.py +910 -0
  131. macmetalpy-0.1.0/tests/test_window.py +109 -0
  132. macmetalpy-0.1.0/tests/test_window_misc.py +197 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Grant Klepzig
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,6 @@
1
+ include LICENSE
2
+ include README.md
3
+ include pyproject.toml
4
+ include setup.py
5
+ recursive-include src *.py *.c
6
+ recursive-include tests *.py
@@ -0,0 +1,360 @@
1
+ Metadata-Version: 2.4
2
+ Name: macmetalpy
3
+ Version: 0.1.0
4
+ Summary: CuPy-compatible GPU array library for Apple Silicon using MetalGPU
5
+ Author: Grant Klepzig
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 Grant Klepzig
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/grantkl/MacMetalPy
29
+ Project-URL: Repository, https://github.com/grantkl/MacMetalPy
30
+ Project-URL: Bug Tracker, https://github.com/grantkl/MacMetalPy/issues
31
+ Classifier: Development Status :: 3 - Alpha
32
+ Classifier: Intended Audience :: Science/Research
33
+ Classifier: License :: OSI Approved :: MIT License
34
+ Classifier: Operating System :: MacOS
35
+ Classifier: Programming Language :: Python :: 3.10
36
+ Classifier: Programming Language :: Python :: 3.11
37
+ Classifier: Programming Language :: Python :: 3.12
38
+ Classifier: Programming Language :: C
39
+ Classifier: Topic :: Scientific/Engineering
40
+ Requires-Python: >=3.10
41
+ Description-Content-Type: text/markdown
42
+ License-File: LICENSE
43
+ Requires-Dist: numpy>=1.24
44
+ Requires-Dist: metalgpu>=0.0.5
45
+ Provides-Extra: dev
46
+ Requires-Dist: pytest>=7.0; extra == "dev"
47
+ Requires-Dist: ruff>=0.4.0; extra == "dev"
48
+ Dynamic: license-file
49
+
50
+ ```
51
+ __ __ __ __ _ _ ____
52
+ | \/ | __ _ ___| \/ | ___| |_ __ _| | _ \ _ _
53
+ | |\/| |/ _` |/ __| |\/| |/ _ \ __/ _` | | |_) | | | |
54
+ | | | | (_| | (__| | | | __/ || (_| | | __/| |_| |
55
+ |_| |_|\__,_|\___|_| |_|\___|\__\__,_|_|_| \__, |
56
+ |___/
57
+ ```
58
+
59
+ # MacMetalPy
60
+
61
+ ### Shred data on Apple Silicon. No CUDA required.
62
+
63
+ A **CuPy-compatible** GPU array library that rips through computation on Apple Silicon using the **Metal** backend. Drop it into your existing CuPy code, swap the import, and let your M-series chip absolutely shred.
64
+
65
+ > **Heads up:** Metal GPUs operate in **float32** — there is no hardware float64. MacMetalPy auto-downcasts float64 → float32 by default (with warnings), or can fall back to CPU. See [Float Precision](#float-precision--the-float64-question) for details.
66
+
67
+ ```python
68
+ import macmetalpy as cp
69
+
70
+ a = cp.random.randn(4096, 4096, dtype=cp.float32)
71
+ b = cp.random.randn(4096, 4096, dtype=cp.float32)
72
+ c = a @ b # 🔥 Metal GPU goes brrr
73
+ ```
74
+
75
+ ---
76
+
77
+ ## The Setlist
78
+
79
+ - **Drop-in CuPy replacement** — `import macmetalpy as cp` and your existing code just works
80
+ - **200+ NumPy-compatible functions** — creation, math, linalg, FFT, random, indexing, sorting, reductions, and more
81
+ - **Async Metal dispatch** — operations fire off to the GPU and don't wait around
82
+ - **RawKernel** — write your own Metal Shading Language kernels when the built-in riffs aren't enough
83
+ - **17,000+ passing tests** — battle-tested across 10 dtypes and every edge case we could throw at it
84
+ - **Zero CUDA dependency** — pure Apple Silicon, pure Metal
85
+
86
+ ---
87
+
88
+ ## Plug In & Play
89
+
90
+ ```bash
91
+ pip install macmetalpy
92
+ ```
93
+
94
+ **Requirements:**
95
+ - macOS (Apple Silicon — M1/M2/M3/M4)
96
+ - Python >= 3.10
97
+ - numpy >= 1.24
98
+ - metalgpu >= 0.0.5
99
+
100
+ ---
101
+
102
+ ## Soundcheck
103
+
104
+ **Create arrays on the GPU:**
105
+
106
+ ```python
107
+ import macmetalpy as cp
108
+
109
+ a = cp.zeros((1000, 1000), dtype=cp.float32)
110
+ b = cp.ones((1000, 1000), dtype=cp.float32)
111
+ c = cp.arange(0, 100, dtype=cp.int32)
112
+ d = cp.linspace(0, 1, 256, dtype=cp.float16)
113
+ ```
114
+
115
+ **Rip through math:**
116
+
117
+ ```python
118
+ import macmetalpy as cp
119
+
120
+ x = cp.random.randn(10000, dtype=cp.float32)
121
+
122
+ # Elementwise operations — all on the GPU
123
+ y = cp.sqrt(cp.abs(x)) + cp.exp(-x ** 2)
124
+
125
+ # Reductions
126
+ total = cp.sum(y)
127
+ avg = cp.mean(y)
128
+ ```
129
+
130
+ **Linear algebra:**
131
+
132
+ ```python
133
+ import macmetalpy as cp
134
+
135
+ A = cp.random.randn(512, 512, dtype=cp.float32)
136
+ b = cp.random.randn(512, dtype=cp.float32)
137
+
138
+ x = cp.linalg.solve(A, b) # Solve Ax = b
139
+ U, S, Vt = cp.linalg.svd(A) # SVD
140
+ eigenvalues = cp.linalg.eigvalsh(A @ A.T) # Eigenvalues
141
+ ```
142
+
143
+ **Pull results back to CPU:**
144
+
145
+ ```python
146
+ gpu_result = cp.sum(cp.random.randn(1000000, dtype=cp.float32))
147
+ numpy_array = gpu_result.get() # Transfer to NumPy
148
+ ```
149
+
150
+ ---
151
+
152
+ ## Benchmarks — When Does the GPU Shred?
153
+
154
+ MacMetalPy vs NumPy on an **M4 Mac Mini**, float32. The GPU's advantage grows with array size — on small arrays the fixed dispatch overhead dominates, but once you're past ~100K elements, Metal starts winning, and at 10M+ it absolutely rips.
155
+
156
+ ### The Scaling Story
157
+
158
+ | Operation | 1K | 100K | 1M | 10M |
159
+ |---|---|---|---|---|
160
+ | `a + b` | 0.29x | 0.96x | 1.07x | — |
161
+ | `sin(a)` | 0.73x | 1.03x | 3.42x | **3.71x** |
162
+ | `exp(a)` | 0.76x | 1.09x | 3.68x | **4.45x** |
163
+ | `tan(a)` | 0.81x | 1.08x | 8.64x | **14.0x** |
164
+ | `arcsin(a)` | 0.82x | 1.04x | 12.7x | **16.5x** |
165
+ | `power(a, b)` | 0.77x | 1.05x | 6.83x | **7.71x** |
166
+ | `floor_divide` | 0.80x | 1.04x | 17.2x | **26.7x** |
167
+ | `mod(a, b)` | 0.83x | 1.05x | 7.02x | **12.1x** |
168
+ | `cumsum(a)` | 0.66x | 0.96x | 2.43x | **2.81x** |
169
+ | `nanprod(a)` | 0.68x | 0.95x | 3.95x | **5.57x** |
170
+ | `sort(a)` | 0.91x | 1.07x | 9.16x | — |
171
+
172
+ > Values are speedup vs NumPy (above 1.0x = GPU faster; higher is better). **Bold** highlights 10M-element results where the GPU wins by 2x+. — = no result reported for that size.
173
+
174
+ ### By Category at 10M Elements
175
+
176
+ | Category | Avg Speedup | GPU Wins | Highlights |
177
+ |---|---|---|---|
178
+ | **Trig** | **8.0x** | 15/15 | Every trig op faster on GPU |
179
+ | **Math** | **5.8x** | 8/14 | Transcendentals dominate |
180
+ | **Ufuncs** | **5.3x** | 16/34 | `fmod` 17x, `heaviside` 20x |
181
+ | **NaN ops** | **3.2x** | 8/9 | `nancumprod` 5.2x, `nanprod` 5.6x |
182
+ | **Reductions** | **1.3x** | 5/13 | `prod` 4.2x, `cumsum` 2.8x |
183
+ | **Comparisons** | **1.2x** | 3/4 | `less` 1.4x, `equal` 1.2x |
184
+ | **Stats** | **1.3x** | 3/6 | `digitize` 2.1x |
185
+
186
+ ### The Rule of Thumb
187
+
188
+ | Array Size | Who Wins | Why |
189
+ |---|---|---|
190
+ | **< 10K** | NumPy | GPU dispatch overhead dominates |
191
+ | **10K – 100K** | Roughly even | Overhead amortized, GPU warming up |
192
+ | **100K – 1M** | GPU pulls ahead | Parallel compute outpaces CPU SIMD |
193
+ | **1M+** | **GPU shreds** | 3-27x on compute-heavy ops |
194
+
195
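+ > Run the benchmarks yourself from a checkout of the [repository](https://github.com/grantkl/MacMetalPy) (the `benchmarks/` directory is not shipped in the source distribution): `python benchmarks/bench_vs_numpy.py --sizes small,medium,large,xlarge --serial`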
196
+
197
+ ---
198
+
199
+ ## The Lineup
200
+
201
+ | Module | Functions | What it shreds |
202
+ |---|---|---|
203
+ | **Creation** | 25 | `zeros`, `ones`, `arange`, `linspace`, `eye`, `meshgrid`, ... |
204
+ | **Math** | 94 | `sqrt`, `exp`, `log`, `sin`, `cos`, `dot`, `where`, `clip`, ... |
205
+ | **Reductions** | 21 | `sum`, `mean`, `std`, `var`, `argmax`, `cumsum`, `median`, ... |
206
+ | **Linalg** | 25 | `solve`, `inv`, `svd`, `eigh`, `qr`, `det`, `norm`, `einsum`, ... |
207
+ | **Manipulation** | 33 | `reshape`, `transpose`, `concatenate`, `stack`, `pad`, `tile`, ... |
208
+ | **Indexing** | 23 | `take`, `put`, `nonzero`, `argwhere`, `fill_diagonal`, ... |
209
+ | **Sorting** | 9 | `sort`, `argsort`, `unique`, `searchsorted`, `partition`, ... |
210
+ | **FFT** | 19 | `fft`, `ifft`, `rfft`, `fft2`, `fftn`, `fftfreq`, ... |
211
+ | **Random** | 40+ | `randn`, `uniform`, `normal`, `poisson`, `choice`, `shuffle`, ... |
212
+ | **Logic & Bitwise** | 30 | `logical_and`, `greater`, `bitwise_xor`, `gcd`, `lcm`, ... |
213
+ | **NaN Ops** | 27 | `nansum`, `nanmean`, `histogram`, `corrcoef`, `gradient`, ... |
214
+ | **Set Ops** | 7 | `union1d`, `intersect1d`, `setdiff1d`, `isin`, ... |
215
+
216
+ ---
217
+
218
+ ## Custom Riffs
219
+
220
+ When the built-in operations don't cut it, write your own Metal Shading Language kernels with `RawKernel`:
221
+
222
+ ```python
223
+ from macmetalpy import RawKernel
224
+ import macmetalpy as cp
225
+ import numpy as np
226
+
227
+ # Write a custom Metal kernel
228
+ kernel_source = """
229
+ #include <metal_stdlib>
230
+ using namespace metal;
231
+
232
+ kernel void saxpy(device float *x [[buffer(0)]],
233
+ device float *y [[buffer(1)]],
234
+ device float *out [[buffer(2)]],
235
+ uint id [[thread_position_in_grid]]) {
236
+ float alpha = 2.5f;
237
+ out[id] = alpha * x[id] + y[id];
238
+ }
239
+ """
240
+
241
+ saxpy = RawKernel(kernel_source, 'saxpy')
242
+
243
+ N = 1_000_000
244
+ x = cp.random.randn(N, dtype=np.float32)
245
+ y = cp.random.randn(N, dtype=np.float32)
246
+ out = cp.empty(N, dtype=np.float32)
247
+
248
+ saxpy(N, (x, y, out)) # Launch N GPU threads
249
+
250
+ result = out.get()
251
+ ```
252
+
253
+ Grid sizes can be 1D, 2D, or 3D:
254
+
255
+ ```python
256
+ kernel(N, args) # 1D — N threads
257
+ kernel((W, H), args) # 2D grid
258
+ kernel((W, H, D), args) # 3D grid
259
+ ```
260
+
261
+ ---
262
+
263
+ ## Float Precision & The float64 Question
264
+
265
+ **This is the biggest difference between MacMetalPy and NumPy/CuPy.**
266
+
267
+ Apple's Metal GPU has **no native float64 (double) support**. All GPU floating-point computation runs in **float32** (single precision) or **float16** (half precision). This is a hardware limitation — not a software one.
268
+
269
+ ### What this means in practice
270
+
271
+ | Scenario | What happens |
272
+ |---|---|
273
+ | `cp.array([1.0, 2.0])` | Created as **float32** (NumPy would default to float64) |
274
+ | `cp.zeros(10, dtype=np.float64)` | **Downcast to float32** with a warning (by default) |
275
+ | `cp.linalg.solve(A, b)` | Runs in float32 — ~7 decimal digits of precision |
276
+ | `cp.sum(x, dtype=np.float64)` | Accumulates in float32 |
277
+ | `complex128` input | **Downcast to complex64** (two float32 values) |
278
+
279
+ ### When float32 is fine (most cases)
280
+
281
+ - Machine learning / deep learning (models train in float16/float32 anyway)
282
+ - Image and signal processing
283
+ - General scientific computing where ~7 digits of precision is sufficient
284
+ - Data analysis and statistics on reasonably-scaled data
285
+ - FFT, random number generation, sorting, indexing
286
+
287
+ ### When you might need float64
288
+
289
+ - Numerical methods sensitive to rounding (e.g., ill-conditioned linear systems)
290
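+ - Financial calculations that need more than ~7 significant digits (note that float64 is still binary floating point — for exact decimal arithmetic use a decimal type on the CPU)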
291
+ - Accumulating very large sums (billions of elements) where error compounds
292
+ - Algorithms that rely on the full 15-16 digits of float64 precision
293
+
294
+ ### Configuring float64 behavior
295
+
296
+ ```python
297
+ from macmetalpy import set_config
298
+
299
+ # DEFAULT: Downcast float64 → float32, emit a warning
300
+ set_config(float64_behavior="downcast", warn_on_downcast=True)
301
+
302
+ # Silence the warnings if you know what you're doing
303
+ set_config(float64_behavior="downcast", warn_on_downcast=False)
304
+
305
+ # Fall back to CPU (NumPy) for any float64 operation
306
+ set_config(float64_behavior="cpu_fallback")
307
+
308
+ # Set the default float dtype for creation functions
309
+ set_config(default_float_dtype="float32")
310
+ ```
311
+
312
+ ### Comparison with NumPy and CuPy
313
+
314
+ | | NumPy (CPU) | CuPy (CUDA) | MacMetalPy (Metal) |
315
+ |---|---|---|---|
316
+ | Default float | float64 | float64 | **float32** |
317
+ | float64 support | Native | Native | Downcast or CPU fallback |
318
+ | float16 support | Software | Native | Native |
319
+ | complex128 | Native | Native | Downcast to complex64 |
320
+ | int8 / uint8 | Native | Native | **Not supported** |
321
+ | Precision digits | ~15-16 | ~15-16 | **~7** (float32) |
322
+
323
+ ---
324
+
325
+ ## Supported Amps
326
+
327
+ | Dtype | Metal Type | Notes |
328
+ |---|---|---|
329
+ | `float32` | `float` | Default float — full GPU support |
330
+ | `float16` | `half` | Half precision — fastest for large arrays |
331
+ | `int32` | `int` | Default int type |
332
+ | `int64` | `long` | 64-bit integer |
333
+ | `int16` | `short` | 16-bit integer |
334
+ | `uint32` | `uint` | Unsigned 32-bit |
335
+ | `uint64` | `uint64_t` | Unsigned 64-bit |
336
+ | `uint16` | `uint16_t` | Unsigned 16-bit |
337
+ | `bool` | `bool` | Boolean |
338
+ | `complex64` | float32 pairs | Stored as real/imag float32 |
339
+
340
+ **Not supported on the GPU by MacMetalPy:** `float64`, `complex128`, `int8`, `uint8`, `longdouble`, `str_`, `bytes_`, `object_`
341
+
342
+ ---
343
+
344
+ ## Acknowledgments
345
+
346
+ MacMetalPy stands on the shoulders of giants:
347
+
348
+ - **[NumPy](https://numpy.org/)** — The foundation. MacMetalPy's API is modeled after NumPy's, because they got it right the first time.
349
+ - **[CuPy](https://cupy.dev/)** — The blueprint for GPU array libraries. CuPy proved that a drop-in NumPy replacement on the GPU is both possible and practical.
350
+ - **[metalgpu](https://github.com/MK-ek11/metalgpu)** — The engine under the hood. Without metalgpu's Python-to-Metal bridge, MacMetalPy wouldn't exist.
351
+
352
+ ---
353
+
354
+ ## The Crew
355
+
356
+ **License:** MIT
357
+
358
+ **Contributing:** Issues and PRs welcome. If you find a bug or want to add a new function, open an issue or submit a pull request.
359
+
360
+ **Built by** [@grantkl](https://github.com/grantkl)
@@ -0,0 +1,311 @@
1
+ ```
2
+ __ __ __ __ _ _ ____
3
+ | \/ | __ _ ___| \/ | ___| |_ __ _| | _ \ _ _
4
+ | |\/| |/ _` |/ __| |\/| |/ _ \ __/ _` | | |_) | | | |
5
+ | | | | (_| | (__| | | | __/ || (_| | | __/| |_| |
6
+ |_| |_|\__,_|\___|_| |_|\___|\__\__,_|_|_| \__, |
7
+ |___/
8
+ ```
9
+
10
+ # MacMetalPy
11
+
12
+ ### Shred data on Apple Silicon. No CUDA required.
13
+
14
+ A **CuPy-compatible** GPU array library that rips through computation on Apple Silicon using the **Metal** backend. Drop it into your existing CuPy code, swap the import, and let your M-series chip absolutely shred.
15
+
16
+ > **Heads up:** Metal GPUs operate in **float32** — there is no hardware float64. MacMetalPy auto-downcasts float64 → float32 by default (with warnings), or can fall back to CPU. See [Float Precision](#float-precision--the-float64-question) for details.
17
+
18
+ ```python
19
+ import macmetalpy as cp
20
+
21
+ a = cp.random.randn(4096, 4096, dtype=cp.float32)
22
+ b = cp.random.randn(4096, 4096, dtype=cp.float32)
23
+ c = a @ b # 🔥 Metal GPU goes brrr
24
+ ```
25
+
26
+ ---
27
+
28
+ ## The Setlist
29
+
30
+ - **Drop-in CuPy replacement** — `import macmetalpy as cp` and your existing code just works
31
+ - **200+ NumPy-compatible functions** — creation, math, linalg, FFT, random, indexing, sorting, reductions, and more
32
+ - **Async Metal dispatch** — operations fire off to the GPU and don't wait around
33
+ - **RawKernel** — write your own Metal Shading Language kernels when the built-in riffs aren't enough
34
+ - **17,000+ passing tests** — battle-tested across 10 dtypes and every edge case we could throw at it
35
+ - **Zero CUDA dependency** — pure Apple Silicon, pure Metal
36
+
37
+ ---
38
+
39
+ ## Plug In & Play
40
+
41
+ ```bash
42
+ pip install macmetalpy
43
+ ```
44
+
45
+ **Requirements:**
46
+ - macOS (Apple Silicon — M1/M2/M3/M4)
47
+ - Python >= 3.10
48
+ - numpy >= 1.24
49
+ - metalgpu >= 0.0.5
50
+
51
+ ---
52
+
53
+ ## Soundcheck
54
+
55
+ **Create arrays on the GPU:**
56
+
57
+ ```python
58
+ import macmetalpy as cp
59
+
60
+ a = cp.zeros((1000, 1000), dtype=cp.float32)
61
+ b = cp.ones((1000, 1000), dtype=cp.float32)
62
+ c = cp.arange(0, 100, dtype=cp.int32)
63
+ d = cp.linspace(0, 1, 256, dtype=cp.float16)
64
+ ```
65
+
66
+ **Rip through math:**
67
+
68
+ ```python
69
+ import macmetalpy as cp
70
+
71
+ x = cp.random.randn(10000, dtype=cp.float32)
72
+
73
+ # Elementwise operations — all on the GPU
74
+ y = cp.sqrt(cp.abs(x)) + cp.exp(-x ** 2)
75
+
76
+ # Reductions
77
+ total = cp.sum(y)
78
+ avg = cp.mean(y)
79
+ ```
80
+
81
+ **Linear algebra:**
82
+
83
+ ```python
84
+ import macmetalpy as cp
85
+
86
+ A = cp.random.randn(512, 512, dtype=cp.float32)
87
+ b = cp.random.randn(512, dtype=cp.float32)
88
+
89
+ x = cp.linalg.solve(A, b) # Solve Ax = b
90
+ U, S, Vt = cp.linalg.svd(A) # SVD
91
+ eigenvalues = cp.linalg.eigvalsh(A @ A.T) # Eigenvalues
92
+ ```
93
+
94
+ **Pull results back to CPU:**
95
+
96
+ ```python
97
+ gpu_result = cp.sum(cp.random.randn(1000000, dtype=cp.float32))
98
+ numpy_array = gpu_result.get() # Transfer to NumPy
99
+ ```
100
+
101
+ ---
102
+
103
+ ## Benchmarks — When Does the GPU Shred?
104
+
105
+ MacMetalPy vs NumPy on an **M4 Mac Mini**, float32. The GPU's advantage grows with array size — on small arrays the fixed dispatch overhead dominates, but once you're past ~100K elements, Metal starts winning, and at 10M+ it absolutely rips.
106
+
107
+ ### The Scaling Story
108
+
109
+ | Operation | 1K | 100K | 1M | 10M |
110
+ |---|---|---|---|---|
111
+ | `a + b` | 0.29x | 0.96x | 1.07x | — |
112
+ | `sin(a)` | 0.73x | 1.03x | 3.42x | **3.71x** |
113
+ | `exp(a)` | 0.76x | 1.09x | 3.68x | **4.45x** |
114
+ | `tan(a)` | 0.81x | 1.08x | 8.64x | **14.0x** |
115
+ | `arcsin(a)` | 0.82x | 1.04x | 12.7x | **16.5x** |
116
+ | `power(a, b)` | 0.77x | 1.05x | 6.83x | **7.71x** |
117
+ | `floor_divide` | 0.80x | 1.04x | 17.2x | **26.7x** |
118
+ | `mod(a, b)` | 0.83x | 1.05x | 7.02x | **12.1x** |
119
+ | `cumsum(a)` | 0.66x | 0.96x | 2.43x | **2.81x** |
120
+ | `nanprod(a)` | 0.68x | 0.95x | 3.95x | **5.57x** |
121
+ | `sort(a)` | 0.91x | 1.07x | 9.16x | — |
122
+
123
+ > Values are speedup vs NumPy (above 1.0x = GPU faster; higher is better). **Bold** highlights 10M-element results where the GPU wins by 2x+. — = no result reported for that size.
124
+
125
+ ### By Category at 10M Elements
126
+
127
+ | Category | Avg Speedup | GPU Wins | Highlights |
128
+ |---|---|---|---|
129
+ | **Trig** | **8.0x** | 15/15 | Every trig op faster on GPU |
130
+ | **Math** | **5.8x** | 8/14 | Transcendentals dominate |
131
+ | **Ufuncs** | **5.3x** | 16/34 | `fmod` 17x, `heaviside` 20x |
132
+ | **NaN ops** | **3.2x** | 8/9 | `nancumprod` 5.2x, `nanprod` 5.6x |
133
+ | **Reductions** | **1.3x** | 5/13 | `prod` 4.2x, `cumsum` 2.8x |
134
+ | **Comparisons** | **1.2x** | 3/4 | `less` 1.4x, `equal` 1.2x |
135
+ | **Stats** | **1.3x** | 3/6 | `digitize` 2.1x |
136
+
137
+ ### The Rule of Thumb
138
+
139
+ | Array Size | Who Wins | Why |
140
+ |---|---|---|
141
+ | **< 10K** | NumPy | GPU dispatch overhead dominates |
142
+ | **10K – 100K** | Roughly even | Overhead amortized, GPU warming up |
143
+ | **100K – 1M** | GPU pulls ahead | Parallel compute outpaces CPU SIMD |
144
+ | **1M+** | **GPU shreds** | 3-27x on compute-heavy ops |
145
+
146
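+ > Run the benchmarks yourself from a checkout of the [repository](https://github.com/grantkl/MacMetalPy) (the `benchmarks/` directory is not shipped in the source distribution): `python benchmarks/bench_vs_numpy.py --sizes small,medium,large,xlarge --serial`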
147
+
148
+ ---
149
+
150
+ ## The Lineup
151
+
152
+ | Module | Functions | What it shreds |
153
+ |---|---|---|
154
+ | **Creation** | 25 | `zeros`, `ones`, `arange`, `linspace`, `eye`, `meshgrid`, ... |
155
+ | **Math** | 94 | `sqrt`, `exp`, `log`, `sin`, `cos`, `dot`, `where`, `clip`, ... |
156
+ | **Reductions** | 21 | `sum`, `mean`, `std`, `var`, `argmax`, `cumsum`, `median`, ... |
157
+ | **Linalg** | 25 | `solve`, `inv`, `svd`, `eigh`, `qr`, `det`, `norm`, `einsum`, ... |
158
+ | **Manipulation** | 33 | `reshape`, `transpose`, `concatenate`, `stack`, `pad`, `tile`, ... |
159
+ | **Indexing** | 23 | `take`, `put`, `nonzero`, `argwhere`, `fill_diagonal`, ... |
160
+ | **Sorting** | 9 | `sort`, `argsort`, `unique`, `searchsorted`, `partition`, ... |
161
+ | **FFT** | 19 | `fft`, `ifft`, `rfft`, `fft2`, `fftn`, `fftfreq`, ... |
162
+ | **Random** | 40+ | `randn`, `uniform`, `normal`, `poisson`, `choice`, `shuffle`, ... |
163
+ | **Logic & Bitwise** | 30 | `logical_and`, `greater`, `bitwise_xor`, `gcd`, `lcm`, ... |
164
+ | **NaN Ops** | 27 | `nansum`, `nanmean`, `histogram`, `corrcoef`, `gradient`, ... |
165
+ | **Set Ops** | 7 | `union1d`, `intersect1d`, `setdiff1d`, `isin`, ... |
166
+
167
+ ---
168
+
169
+ ## Custom Riffs
170
+
171
+ When the built-in operations don't cut it, write your own Metal Shading Language kernels with `RawKernel`:
172
+
173
+ ```python
174
+ from macmetalpy import RawKernel
175
+ import macmetalpy as cp
176
+ import numpy as np
177
+
178
+ # Write a custom Metal kernel
179
+ kernel_source = """
180
+ #include <metal_stdlib>
181
+ using namespace metal;
182
+
183
+ kernel void saxpy(device float *x [[buffer(0)]],
184
+ device float *y [[buffer(1)]],
185
+ device float *out [[buffer(2)]],
186
+ uint id [[thread_position_in_grid]]) {
187
+ float alpha = 2.5f;
188
+ out[id] = alpha * x[id] + y[id];
189
+ }
190
+ """
191
+
192
+ saxpy = RawKernel(kernel_source, 'saxpy')
193
+
194
+ N = 1_000_000
195
+ x = cp.random.randn(N, dtype=np.float32)
196
+ y = cp.random.randn(N, dtype=np.float32)
197
+ out = cp.empty(N, dtype=np.float32)
198
+
199
+ saxpy(N, (x, y, out)) # Launch N GPU threads
200
+
201
+ result = out.get()
202
+ ```
203
+
204
+ Grid sizes can be 1D, 2D, or 3D:
205
+
206
+ ```python
207
+ kernel(N, args) # 1D — N threads
208
+ kernel((W, H), args) # 2D grid
209
+ kernel((W, H, D), args) # 3D grid
210
+ ```
211
+
212
+ ---
213
+
214
+ ## Float Precision & The float64 Question
215
+
216
+ **This is the biggest difference between MacMetalPy and NumPy/CuPy.**
217
+
218
+ Apple's Metal GPU has **no native float64 (double) support**. All GPU floating-point computation runs in **float32** (single precision) or **float16** (half precision). This is a hardware limitation — not a software one.
219
+
220
+ ### What this means in practice
221
+
222
+ | Scenario | What happens |
223
+ |---|---|
224
+ | `cp.array([1.0, 2.0])` | Created as **float32** (NumPy would default to float64) |
225
+ | `cp.zeros(10, dtype=np.float64)` | **Downcast to float32** with a warning (by default) |
226
+ | `cp.linalg.solve(A, b)` | Runs in float32 — ~7 decimal digits of precision |
227
+ | `cp.sum(x, dtype=np.float64)` | Accumulates in float32 |
228
+ | `complex128` input | **Downcast to complex64** (two float32 values) |
229
+
230
+ ### When float32 is fine (most cases)
231
+
232
+ - Machine learning / deep learning (models train in float16/float32 anyway)
233
+ - Image and signal processing
234
+ - General scientific computing where ~7 digits of precision is sufficient
235
+ - Data analysis and statistics on reasonably-scaled data
236
+ - FFT, random number generation, sorting, indexing
237
+
238
+ ### When you might need float64
239
+
240
+ - Numerical methods sensitive to rounding (e.g., ill-conditioned linear systems)
241
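+ - Financial calculations that need more than ~7 significant digits (note that float64 is still binary floating point — for exact decimal arithmetic use a decimal type on the CPU)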
242
+ - Accumulating very large sums (billions of elements) where error compounds
243
+ - Algorithms that rely on the full 15-16 digits of float64 precision
244
+
245
+ ### Configuring float64 behavior
246
+
247
+ ```python
248
+ from macmetalpy import set_config
249
+
250
+ # DEFAULT: Downcast float64 → float32, emit a warning
251
+ set_config(float64_behavior="downcast", warn_on_downcast=True)
252
+
253
+ # Silence the warnings if you know what you're doing
254
+ set_config(float64_behavior="downcast", warn_on_downcast=False)
255
+
256
+ # Fall back to CPU (NumPy) for any float64 operation
257
+ set_config(float64_behavior="cpu_fallback")
258
+
259
+ # Set the default float dtype for creation functions
260
+ set_config(default_float_dtype="float32")
261
+ ```
262
+
263
+ ### Comparison with NumPy and CuPy
264
+
265
+ | | NumPy (CPU) | CuPy (CUDA) | MacMetalPy (Metal) |
266
+ |---|---|---|---|
267
+ | Default float | float64 | float64 | **float32** |
268
+ | float64 support | Native | Native | Downcast or CPU fallback |
269
+ | float16 support | Software | Native | Native |
270
+ | complex128 | Native | Native | Downcast to complex64 |
271
+ | int8 / uint8 | Native | Native | **Not supported** |
272
+ | Precision digits | ~15-16 | ~15-16 | **~7** (float32) |
273
+
274
+ ---
275
+
276
+ ## Supported Amps
277
+
278
+ | Dtype | Metal Type | Notes |
279
+ |---|---|---|
280
+ | `float32` | `float` | Default float — full GPU support |
281
+ | `float16` | `half` | Half precision — fastest for large arrays |
282
+ | `int32` | `int` | Default int type |
283
+ | `int64` | `long` | 64-bit integer |
284
+ | `int16` | `short` | 16-bit integer |
285
+ | `uint32` | `uint` | Unsigned 32-bit |
286
+ | `uint64` | `uint64_t` | Unsigned 64-bit |
287
+ | `uint16` | `uint16_t` | Unsigned 16-bit |
288
+ | `bool` | `bool` | Boolean |
289
+ | `complex64` | float32 pairs | Stored as real/imag float32 |
290
+
291
+ **Not supported on the GPU by MacMetalPy:** `float64`, `complex128`, `int8`, `uint8`, `longdouble`, `str_`, `bytes_`, `object_`
292
+
293
+ ---
294
+
295
+ ## Acknowledgments
296
+
297
+ MacMetalPy stands on the shoulders of giants:
298
+
299
+ - **[NumPy](https://numpy.org/)** — The foundation. MacMetalPy's API is modeled after NumPy's, because they got it right the first time.
300
+ - **[CuPy](https://cupy.dev/)** — The blueprint for GPU array libraries. CuPy proved that a drop-in NumPy replacement on the GPU is both possible and practical.
301
+ - **[metalgpu](https://github.com/MK-ek11/metalgpu)** — The engine under the hood. Without metalgpu's Python-to-Metal bridge, MacMetalPy wouldn't exist.
302
+
303
+ ---
304
+
305
+ ## The Crew
306
+
307
+ **License:** MIT
308
+
309
+ **Contributing:** Issues and PRs welcome. If you find a bug or want to add a new function, open an issue or submit a pull request.
310
+
311
+ **Built by** [@grantkl](https://github.com/grantkl)