capnhook-ml 0.1.0__cp312-cp312-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
simd/binary.hpp ADDED
@@ -0,0 +1,84 @@
1
+ #pragma once
2
+
3
+ #include <cstddef>
4
+ #include <cstdlib>
5
+ #include <stdexcept>
6
+ #include <nanobind/nanobind.h>
7
+ #include <nanobind/ndarray.h>
8
+ #include <hwy/highway.h>
9
+
10
+ #include "../alloc.hpp"
11
+
12
+ namespace nb = nanobind;
13
+
14
+ HWY_BEFORE_NAMESPACE();
15
+ namespace hwy {
16
+ namespace HWY_NAMESPACE {
17
+ namespace capnhook {
18
+
19
+ template <typename T, typename Op>
20
+ nb::ndarray<nb::numpy, T, nb::ndim<1>> binary(nb::ndarray<T, nb::c_contig> a,
21
+ nb::ndarray<T, nb::c_contig> b) {
22
+ const size_t N = a.shape(0);
23
+ if (b.shape(0) != N) throw std::runtime_error("shape mismatch");
24
+ const T* A = a.data();
25
+ const T* B = b.data();
26
+
27
+ const size_t bytes = N * sizeof(T);
28
+ void* raw = aligned_alloc64(bytes);
29
+ T* C = static_cast<T*>(raw);
30
+
31
+ #if defined(_MSC_VER)
32
+ nb::capsule deleter(C, [](void* p) noexcept { _aligned_free(p); });
33
+ #else
34
+ nb::capsule deleter(C, [](void* p) noexcept { free(p); });
35
+ #endif
36
+
37
+ const ScalableTag<T> d;
38
+ const size_t L = Lanes(d);
39
+ Op op;
40
+ size_t i = 0;
41
+
42
+ for (; i + L <= N; i += L) {
43
+ auto va = Load(d, A + i);
44
+ auto vb = Load(d, B + i);
45
+ auto vc = op(va, vb);
46
+ Store(vc, d, C + i);
47
+ }
48
+ for (; i < N; ++i) {
49
+ C[i] = op(A[i], B[i]);
50
+ }
51
+
52
+ return nb::ndarray<nb::numpy, T, nb::ndim<1>>(C, { N }, deleter);
53
+ }
54
+
55
+ #define DEFINE_SIMD_BINARY_OP(Symbol, expr_scalar, expr_simd) \
56
+ struct Symbol##Op { \
57
+ template <typename V> HWY_INLINE V operator()(V a, V b) const { \
58
+ return (expr_simd); \
59
+ } \
60
+ HWY_INLINE float operator()(float a, float b) const { return (expr_scalar); } \
61
+ HWY_INLINE double operator()(double a, double b) const { return (expr_scalar); } \
62
+ }; \
63
+ inline nb::ndarray<nb::numpy, float, nb::ndim<1>> \
64
+ Symbol(nb::ndarray<float, nb::c_contig> a, \
65
+ nb::ndarray<float, nb::c_contig> b) { \
66
+ return binary<float, Symbol##Op>(a, b); \
67
+ } \
68
+ inline nb::ndarray<nb::numpy, double, nb::ndim<1>> \
69
+ Symbol(nb::ndarray<double, nb::c_contig> a, \
70
+ nb::ndarray<double, nb::c_contig> b) { \
71
+ return binary<double, Symbol##Op>(a, b); \
72
+ }
73
+
74
+ DEFINE_SIMD_BINARY_OP(add, a + b, Add(a, b))
75
+ DEFINE_SIMD_BINARY_OP(sub, a - b, Sub(a, b))
76
+ DEFINE_SIMD_BINARY_OP(mul, a * b, Mul(a, b))
77
+ DEFINE_SIMD_BINARY_OP(div, a / b, Div(a, b))
78
+
79
+ } // capnhook
80
+ } // HWY_NAMESPACE
81
+ } // hwy
82
+ HWY_AFTER_NAMESPACE();
83
+
84
+ namespace capnhook = hwy::HWY_NAMESPACE::capnhook;
simd/linalg.hpp ADDED
@@ -0,0 +1,109 @@
1
+ #pragma once
2
+
3
+ #include <cstddef>
4
+ #include <cstdlib>
5
+ #include <cstring>
6
+ #include <stdexcept>
7
+ #include <vector>
8
+ #include <cmath>
9
+ #include <algorithm>
10
+ #include <type_traits>
11
+ #include <nanobind/nanobind.h>
12
+ #include <nanobind/ndarray.h>
13
+
14
+ #include "../alloc.hpp"
15
+
16
+ #ifdef USE_ACCELERATE
17
+ #include <Accelerate/Accelerate.h>
18
+ #else
19
+ #include <cblas.h>
20
+ #endif
21
+
22
+ namespace nb = nanobind;
23
+
24
+ HWY_BEFORE_NAMESPACE();
25
+ namespace hwy {
26
+ namespace HWY_NAMESPACE {
27
+ namespace capnhook {
28
+
29
+ template <typename T>
30
+ T dot(nb::ndarray<T, nb::c_contig> a, nb::ndarray<T, nb::c_contig> b) {
31
+ const size_t N = a.shape(0);
32
+
33
+ if (b.shape(0) != N) {
34
+ throw std::runtime_error("dot: vectors must have the same length");
35
+ }
36
+
37
+ const T* A = a.data();
38
+ const T* B = b.data();
39
+
40
+ T result = 0;
41
+
42
+ if constexpr (std::is_same_v<T, float>) {
43
+ result = cblas_sdot(N, A, 1, B, 1);
44
+ } else if constexpr (std::is_same_v<T, double>) {
45
+ result = cblas_ddot(N, A, 1, B, 1);
46
+ }
47
+
48
+ return result;
49
+ }
50
+
51
+ template <typename T>
52
+ nb::ndarray<nb::numpy, T, nb::ndim<2>> matmul(nb::ndarray<T, nb::c_contig, nb::ndim<2>> A,
53
+ nb::ndarray<T, nb::c_contig, nb::ndim<2>> B) {
54
+ size_t M = A.shape(0), K = A.shape(1),
55
+ K2 = B.shape(0), N = B.shape(1);
56
+ if (K2 != K) throw std::runtime_error("matmul: inner dims must match");
57
+
58
+ size_t bytes = M * N * sizeof(T);
59
+ void* raw = aligned_alloc64(bytes);
60
+ T* C = static_cast<T*>(raw);
61
+
62
+ T alpha = T(1), beta = T(0);
63
+ // row‑major
64
+ if constexpr (std::is_same_v<T, float>) {
65
+ cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
66
+ M, N, K, alpha,
67
+ A.data(), K,
68
+ B.data(), N,
69
+ beta,
70
+ C, N);
71
+ } else {
72
+ cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
73
+ M, N, K, alpha,
74
+ A.data(), K,
75
+ B.data(), N,
76
+ beta,
77
+ C, N);
78
+ }
79
+
80
+ #if defined(_MSC_VER)
81
+ nb::capsule deleter(C, [](void* p) noexcept { _aligned_free(p); });
82
+ #else
83
+ nb::capsule deleter(C, [](void* p) noexcept { free(p); });
84
+ #endif
85
+ return { C, { M, N }, deleter };
86
+ }
87
+
88
+ template <typename T>
89
+ T trace(nb::ndarray<T, nb::c_contig, nb::ndim<2>> A) {
90
+ size_t M = A.shape(0), N = A.shape(1);
91
+ size_t n = std::min(M, N);
92
+ const T* data = A.data();
93
+ T sum = T(0);
94
+ for (size_t i = 0; i < n; ++i)
95
+ sum += data[i * N + i];
96
+ return sum;
97
+ }
98
+
99
+ template <typename T>
100
+ T norm(nb::ndarray<T, nb::c_contig> a) {
101
+ return std::sqrt(dot(a, a));
102
+ }
103
+
104
+ } // capnhook
105
+ } // HWY_NAMESPACE
106
+ } // hwy
107
+ HWY_AFTER_NAMESPACE();
108
+
109
+ namespace capnhook = hwy::HWY_NAMESPACE::capnhook;
simd/reduce.hpp ADDED
@@ -0,0 +1,261 @@
1
+ #pragma once
2
+
3
+ #include <cstddef>
4
+ #include <cstdlib>
5
+ #include <algorithm>
6
+ #include <cmath>
7
+ #include <stdexcept>
8
+ #include <nanobind/nanobind.h>
9
+ #include <nanobind/ndarray.h>
10
+ #include <hwy/highway.h>
11
+
12
+ #include "../alloc.hpp"
13
+ #include "binary.hpp"
14
+
15
+ namespace nb = nanobind;
16
+
17
+ HWY_BEFORE_NAMESPACE();
18
+ namespace hwy {
19
+ namespace HWY_NAMESPACE {
20
+ namespace capnhook {
21
+
22
+ template <typename T>
23
+ T reduce_sum(nb::ndarray<T, nb::c_contig> a) {
24
+ const T* A = a.data();
25
+ size_t N = a.shape(0);
26
+
27
+ if (N == 0) throw std::runtime_error("reduce_sum: zero-length input");
28
+ if (N == 1) return A[0];
29
+
30
+ const ScalableTag<T> d;
31
+ auto acc = Zero(d);
32
+ size_t i = 0, L = Lanes(d);
33
+
34
+ if (N < L) {
35
+ T sum = T(0);
36
+ for (size_t j = 0; j < N; j++) {
37
+ sum += A[j];
38
+ }
39
+ return sum;
40
+ }
41
+
42
+ for (; i + L <= N; i += L) {
43
+ acc = Add(acc, Load(d, A + i));
44
+ }
45
+
46
+ T total = GetLane(SumOfLanes(d, acc));
47
+ for (; i < N; ++i) total += A[i];
48
+ return total;
49
+ }
50
+
51
+ template <typename T>
52
+ T reduce_min(nb::ndarray<T, nb::c_contig> a) {
53
+ const T* A = a.data();
54
+ size_t N = a.shape(0);
55
+
56
+ if (N == 0) throw std::runtime_error("reduce_min: zero-length input");
57
+ if (N == 1) return A[0];
58
+
59
+ const ScalableTag<T> d;
60
+ size_t L = Lanes(d);
61
+
62
+ if (N < L) {
63
+ T min_val = A[0];
64
+ for (size_t j = 1; j < N; j++) {
65
+ min_val = std::min(min_val, A[j]);
66
+ }
67
+ return min_val;
68
+ }
69
+
70
+ auto acc = Load(d, A);
71
+ size_t i = L;
72
+
73
+ for (; i + L <= N; i += L) {
74
+ acc = Min(acc, Load(d, A + i));
75
+ }
76
+
77
+ T m = GetLane(MinOfLanes(d, acc));
78
+ for (; i < N; ++i) m = std::min(m, A[i]);
79
+ return m;
80
+ }
81
+
82
+ template <typename T>
83
+ T reduce_max(nb::ndarray<T, nb::c_contig> a) {
84
+ const T* A = a.data();
85
+ size_t N = a.shape(0);
86
+
87
+ if (N == 0) throw std::runtime_error("reduce_max: zero-length input");
88
+ if (N == 1) return A[0];
89
+
90
+ const ScalableTag<T> d;
91
+ size_t L = Lanes(d);
92
+
93
+ if (N < L) {
94
+ T max_val = A[0];
95
+ for (size_t j = 1; j < N; j++) {
96
+ max_val = std::max(max_val, A[j]);
97
+ }
98
+ return max_val;
99
+ }
100
+
101
+ auto acc = Load(d, A);
102
+ size_t i = L;
103
+
104
+ for (; i + L <= N; i += L) {
105
+ acc = Max(acc, Load(d, A + i));
106
+ }
107
+
108
+ T m = GetLane(MaxOfLanes(d, acc));
109
+ for (; i < N; ++i) m = std::max(m, A[i]);
110
+ return m;
111
+ }
112
+
113
+ template <typename T>
114
+ T reduce_prod(nb::ndarray<T, nb::c_contig> a) {
115
+ const T* A = a.data();
116
+ size_t N = a.shape(0);
117
+
118
+ if (N == 0) throw std::runtime_error("reduce_prod: zero-length input");
119
+ if (N == 1) return A[0];
120
+
121
+ const ScalableTag<T> d;
122
+ size_t L = Lanes(d);
123
+
124
+ if (N < L) {
125
+ T prod = T(1);
126
+ for (size_t j = 0; j < N; j++) {
127
+ prod *= A[j];
128
+ }
129
+ return prod;
130
+ }
131
+
132
+ auto acc = Set(d, T(1));
133
+ size_t i = 0;
134
+
135
+ for (; i + L <= N; i += L) {
136
+ acc = Mul(acc, Load(d, A + i));
137
+ }
138
+
139
+ T product = GetLane(SumOfLanes(d, acc));
140
+ for (; i < N; ++i) product *= A[i];
141
+ return product;
142
+ }
143
+
144
+
145
+ template <typename T>
146
+ T reduce_mean(nb::ndarray<T, nb::c_contig> a) {
147
+ size_t N = a.shape(0);
148
+ return reduce_sum<T>(a) / T(N);
149
+ }
150
+
151
+ template <typename T>
152
+ T reduce_var(nb::ndarray<T, nb::c_contig> a) {
153
+ size_t N = a.shape(0);
154
+ T mu = reduce_mean<T>(a);
155
+ const T* A = a.data();
156
+ T var = T(0);
157
+ for (size_t i = 0; i < N; ++i) {
158
+ T d = A[i] - mu;
159
+ var += d * d;
160
+ }
161
+ return var / T(N);
162
+ }
163
+
164
+ template <typename T>
165
+ T reduce_std(nb::ndarray<T, nb::c_contig> a) {
166
+ return std::sqrt(reduce_var<T>(a));
167
+ }
168
+
169
+
170
+ template <typename T>
171
+ bool reduce_any(nb::ndarray<T, nb::c_contig> a) {
172
+ const T* A = a.data();
173
+ size_t N = a.shape(0);
174
+ for (size_t i = 0; i < N; ++i) if (A[i] != T(0)) return true;
175
+ return false;
176
+ }
177
+
178
+ template <typename T>
179
+ bool reduce_all(nb::ndarray<T, nb::c_contig> a) {
180
+ const T* A = a.data();
181
+ size_t N = a.shape(0);
182
+ for (size_t i = 0; i < N; ++i) if (A[i] == T(0)) return false;
183
+ return true;
184
+ }
185
+
186
+
187
+ template <typename T>
188
+ size_t argmax(nb::ndarray<T, nb::c_contig> a) {
189
+ const T* A = a.data();
190
+ size_t N = a.shape(0);
191
+ if (N == 0) throw std::runtime_error("argmax: zero-length input");
192
+ size_t idx = 0;
193
+ T best = A[0];
194
+ for (size_t i = 1; i < N; ++i) {
195
+ if (A[i] > best) { best = A[i]; idx = i; }
196
+ }
197
+ return idx;
198
+ }
199
+
200
+ template <typename T>
201
+ size_t argmin(nb::ndarray<T, nb::c_contig> a) {
202
+ const T* A = a.data();
203
+ size_t N = a.shape(0);
204
+ if (N == 0) throw std::runtime_error("argmin: zero-length input");
205
+ size_t idx = 0;
206
+ T best = A[0];
207
+ for (size_t i = 1; i < N; ++i) {
208
+ if (A[i] < best) { best = A[i]; idx = i; }
209
+ }
210
+ return idx;
211
+ }
212
+
213
+
214
+ template <typename T>
215
+ nb::ndarray<nb::numpy, T, nb::ndim<1>>
216
+ cumsum(nb::ndarray<T, nb::c_contig> a) {
217
+ size_t N = a.shape(0);
218
+ const T* A = a.data();
219
+ size_t bytes = N * sizeof(T);
220
+ void* raw = aligned_alloc64(bytes);
221
+ T* C = static_cast<T*>(raw);
222
+ #if defined(_MSC_VER)
223
+ nb::capsule deleter(C, [](void* p) noexcept { _aligned_free(p); });
224
+ #else
225
+ nb::capsule deleter(C, [](void* p) noexcept { free(p); });
226
+ #endif
227
+ T acc = T(0);
228
+ for (size_t i = 0; i < N; ++i) {
229
+ acc += A[i];
230
+ C[i] = acc;
231
+ }
232
+ return { C, { N }, deleter };
233
+ }
234
+
235
+ template <typename T>
236
+ nb::ndarray<nb::numpy, T, nb::ndim<1>>
237
+ cumprod(nb::ndarray<T, nb::c_contig> a) {
238
+ size_t N = a.shape(0);
239
+ const T* A = a.data();
240
+ size_t bytes = N * sizeof(T);
241
+ void* raw = aligned_alloc64(bytes);
242
+ T* C = static_cast<T*>(raw);
243
+ #if defined(_MSC_VER)
244
+ nb::capsule deleter(C, [](void* p) noexcept { _aligned_free(p); });
245
+ #else
246
+ nb::capsule deleter(C, [](void* p) noexcept { free(p); });
247
+ #endif
248
+ T acc = T(1);
249
+ for (size_t i = 0; i < N; ++i) {
250
+ acc *= A[i];
251
+ C[i] = acc;
252
+ }
253
+ return { C, { N }, deleter };
254
+ }
255
+
256
+ } // capnhook
257
+ } // HWY_NAMESPACE
258
+ } // hwy
259
+ HWY_AFTER_NAMESPACE();
260
+
261
+ namespace capnhook = hwy::HWY_NAMESPACE::capnhook;
simd/unary.hpp ADDED
@@ -0,0 +1,83 @@
1
+ #pragma once
2
+
3
+ #include <cstddef>
4
+ #include <cstdlib>
5
+ #include <cmath>
6
+ #include <nanobind/nanobind.h>
7
+ #include <nanobind/ndarray.h>
8
+ #include <hwy/highway.h>
9
+ #include <hwy/contrib/math/math-inl.h>
10
+
11
+ #include "../alloc.hpp"
12
+
13
+ namespace nb = nanobind;
14
+
15
+ HWY_BEFORE_NAMESPACE();
16
+ namespace hwy {
17
+ namespace HWY_NAMESPACE {
18
+ namespace capnhook {
19
+
20
+ template <typename T, typename Op>
21
+ nb::ndarray<nb::numpy, T, nb::ndim<1>> unary(nb::ndarray<T, nb::c_contig> a) {
22
+ const size_t N = a.shape(0);
23
+ const T* A = a.data();
24
+
25
+ const size_t bytes = N * sizeof(T);
26
+ void* raw = aligned_alloc64(bytes);
27
+ T* C = static_cast<T*>(raw);
28
+
29
+ #if defined(_MSC_VER)
30
+ nb::capsule deleter(C, [](void* p) noexcept { _aligned_free(p); });
31
+ #else
32
+ nb::capsule deleter(C, [](void* p) noexcept { free(p); });
33
+ #endif
34
+
35
+ const ScalableTag<T> d;
36
+ Op op;
37
+ size_t i = 0;
38
+ const size_t L = Lanes(d);
39
+
40
+ for (; i + L <= N; i += L) {
41
+ auto v = Load(d, A + i);
42
+ Store(op(d, v), d, C + i);
43
+ }
44
+
45
+ for (; i < N; ++i) {
46
+ C[i] = op(A[i]);
47
+ }
48
+
49
+ return { C, { N }, deleter };
50
+ }
51
+
52
+ #define DEFINE_SIMD_UNARY_OP(Symbol, expr_scalar, expr_simd) \
53
+ struct Symbol##Op { \
54
+ template <class D, class V> \
55
+ HWY_INLINE V operator()(D d, V v) const { \
56
+ return expr_simd; \
57
+ } \
58
+ HWY_INLINE float operator()(float x) const { return (expr_scalar); } \
59
+ HWY_INLINE double operator()(double x) const { return (expr_scalar); } \
60
+ }; \
61
+ inline nb::ndarray<nb::numpy, float, nb::ndim<1>> \
62
+ Symbol(nb::ndarray<float, nb::c_contig> a) { \
63
+ return unary<float, Symbol##Op>(a); \
64
+ } \
65
+ inline nb::ndarray<nb::numpy, double, nb::ndim<1>> \
66
+ Symbol(nb::ndarray<double, nb::c_contig> a) { \
67
+ return unary<double, Symbol##Op>(a); \
68
+ }
69
+
70
+ DEFINE_SIMD_UNARY_OP(exp, std::exp(x), hwy::HWY_NAMESPACE::Exp(d, v))
71
+ DEFINE_SIMD_UNARY_OP(log, std::log(x), hwy::HWY_NAMESPACE::Log(d, v))
72
+ DEFINE_SIMD_UNARY_OP(sqrt, std::sqrt(x), hwy::HWY_NAMESPACE::Sqrt(v))
73
+ DEFINE_SIMD_UNARY_OP(sin, std::sin(x), hwy::HWY_NAMESPACE::Sin(d, v))
74
+ DEFINE_SIMD_UNARY_OP(cos, std::cos(x), hwy::HWY_NAMESPACE::Cos(d, v))
75
+ DEFINE_SIMD_UNARY_OP(asin, std::asin(x), hwy::HWY_NAMESPACE::Asin(d, v))
76
+ DEFINE_SIMD_UNARY_OP(acos, std::acos(x), hwy::HWY_NAMESPACE::Acos(d, v))
77
+
78
+ } // capnhook
79
+ } // HWY_NAMESPACE
80
+ } // hwy
81
+ HWY_AFTER_NAMESPACE();
82
+
83
+ namespace capnhook = hwy::HWY_NAMESPACE::capnhook;