capnhook-ml 0.1.0 (cp312-cp312-macosx_11_0_arm64.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- capnhook_ml-0.1.0.dist-info/METADATA +736 -0
- capnhook_ml-0.1.0.dist-info/RECORD +10 -0
- capnhook_ml-0.1.0.dist-info/WHEEL +5 -0
- capnhook_ml-0.1.0.dist-info/licenses/LICENSE +674 -0
- capnhook_ml-0.1.0.dist-info/top_level.txt +2 -0
- capnhook_ml.cpython-312-darwin.so +0 -0
- simd/binary.hpp +84 -0
- simd/linalg.hpp +109 -0
- simd/reduce.hpp +261 -0
- simd/unary.hpp +83 -0
simd/binary.hpp
ADDED
@@ -0,0 +1,84 @@
#pragma once

#include <cstddef>
#include <cstdlib>
#include <stdexcept>
#include <nanobind/nanobind.h>
#include <nanobind/ndarray.h>
#include <hwy/highway.h>

#include "../alloc.hpp"

namespace nb = nanobind;

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace capnhook {

template <typename T, typename Op>
nb::ndarray<nb::numpy, T, nb::ndim<1>> binary(nb::ndarray<T, nb::c_contig> a,
                                              nb::ndarray<T, nb::c_contig> b) {
  const size_t N = a.shape(0);
  if (b.shape(0) != N) throw std::runtime_error("shape mismatch");
  const T* A = a.data();
  const T* B = b.data();

  const size_t bytes = N * sizeof(T);
  void* raw = aligned_alloc64(bytes);
  T* C = static_cast<T*>(raw);

#if defined(_MSC_VER)
  nb::capsule deleter(C, [](void* p) noexcept { _aligned_free(p); });
#else
  nb::capsule deleter(C, [](void* p) noexcept { free(p); });
#endif

  const ScalableTag<T> d;
  const size_t L = Lanes(d);
  Op op;
  size_t i = 0;

  for (; i + L <= N; i += L) {
    auto va = Load(d, A + i);
    auto vb = Load(d, B + i);
    auto vc = op(va, vb);
    Store(vc, d, C + i);
  }
  for (; i < N; ++i) {
    C[i] = op(A[i], B[i]);
  }

  return nb::ndarray<nb::numpy, T, nb::ndim<1>>(C, { N }, deleter);
}

#define DEFINE_SIMD_BINARY_OP(Symbol, expr_scalar, expr_simd) \
  struct Symbol##Op { \
    template <typename V> HWY_INLINE V operator()(V a, V b) const { \
      return (expr_simd); \
    } \
    HWY_INLINE float operator()(float a, float b) const { return (expr_scalar); } \
    HWY_INLINE double operator()(double a, double b) const { return (expr_scalar); } \
  }; \
  inline nb::ndarray<nb::numpy, float, nb::ndim<1>> \
  Symbol(nb::ndarray<float, nb::c_contig> a, \
         nb::ndarray<float, nb::c_contig> b) { \
    return binary<float, Symbol##Op>(a, b); \
  } \
  inline nb::ndarray<nb::numpy, double, nb::ndim<1>> \
  Symbol(nb::ndarray<double, nb::c_contig> a, \
         nb::ndarray<double, nb::c_contig> b) { \
    return binary<double, Symbol##Op>(a, b); \
  }

DEFINE_SIMD_BINARY_OP(add, a + b, Add(a, b))
DEFINE_SIMD_BINARY_OP(sub, a - b, Sub(a, b))
DEFINE_SIMD_BINARY_OP(mul, a * b, Mul(a, b))
DEFINE_SIMD_BINARY_OP(div, a / b, Div(a, b))

} // capnhook
} // HWY_NAMESPACE
} // hwy
HWY_AFTER_NAMESPACE();

namespace capnhook = hwy::HWY_NAMESPACE::capnhook;
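Every header in this wheel allocates its output through aligned_alloc64 from ../alloc.hpp, which is not part of the diff. Given that the capsule deleters call plain free() (or _aligned_free() under MSVC), the helper most likely wraps a 64-byte-aligned allocation along the lines sketched below; this is our assumption, not code from the package.

// Hypothetical reconstruction of ../alloc.hpp (not part of the published diff).
// Assumes 64-byte alignment and pointers that stay compatible with the
// free()/_aligned_free() capsule deleters used by the headers above.
#pragma once

#include <cstddef>
#include <new>
#if defined(_MSC_VER)
#include <malloc.h>
#else
#include <cstdlib>
#endif

inline void* aligned_alloc64(std::size_t bytes) {
#if defined(_MSC_VER)
  void* p = _aligned_malloc(bytes ? bytes : 64, 64);
  if (p == nullptr) throw std::bad_alloc();
#else
  void* p = nullptr;
  // posix_memalign keeps the result releasable with free().
  if (posix_memalign(&p, 64, bytes ? bytes : 64) != 0) throw std::bad_alloc();
#endif
  return p;
}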
simd/linalg.hpp
ADDED
@@ -0,0 +1,109 @@
#pragma once

#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <stdexcept>
#include <vector>
#include <cmath>
#include <algorithm>
#include <type_traits>
#include <nanobind/nanobind.h>
#include <nanobind/ndarray.h>
#include <hwy/highway.h>  // needed for HWY_BEFORE_NAMESPACE()/HWY_NAMESPACE below

#include "../alloc.hpp"

#ifdef USE_ACCELERATE
#include <Accelerate/Accelerate.h>
#else
#include <cblas.h>
#endif

namespace nb = nanobind;

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace capnhook {

template <typename T>
T dot(nb::ndarray<T, nb::c_contig> a, nb::ndarray<T, nb::c_contig> b) {
  const size_t N = a.shape(0);

  if (b.shape(0) != N) {
    throw std::runtime_error("dot: vectors must have the same length");
  }

  const T* A = a.data();
  const T* B = b.data();

  T result = 0;

  if constexpr (std::is_same_v<T, float>) {
    result = cblas_sdot(N, A, 1, B, 1);
  } else if constexpr (std::is_same_v<T, double>) {
    result = cblas_ddot(N, A, 1, B, 1);
  }

  return result;
}

template <typename T>
nb::ndarray<nb::numpy, T, nb::ndim<2>> matmul(nb::ndarray<T, nb::c_contig, nb::ndim<2>> A,
                                              nb::ndarray<T, nb::c_contig, nb::ndim<2>> B) {
  size_t M = A.shape(0), K = A.shape(1),
         K2 = B.shape(0), N = B.shape(1);
  if (K2 != K) throw std::runtime_error("matmul: inner dims must match");

  size_t bytes = M * N * sizeof(T);
  void* raw = aligned_alloc64(bytes);
  T* C = static_cast<T*>(raw);

  T alpha = T(1), beta = T(0);
  // row-major
  if constexpr (std::is_same_v<T, float>) {
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                M, N, K, alpha,
                A.data(), K,
                B.data(), N,
                beta,
                C, N);
  } else {
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                M, N, K, alpha,
                A.data(), K,
                B.data(), N,
                beta,
                C, N);
  }

#if defined(_MSC_VER)
  nb::capsule deleter(C, [](void* p) noexcept { _aligned_free(p); });
#else
  nb::capsule deleter(C, [](void* p) noexcept { free(p); });
#endif
  return { C, { M, N }, deleter };
}

template <typename T>
T trace(nb::ndarray<T, nb::c_contig, nb::ndim<2>> A) {
  size_t M = A.shape(0), N = A.shape(1);
  size_t n = std::min(M, N);
  const T* data = A.data();
  T sum = T(0);
  for (size_t i = 0; i < n; ++i)
    sum += data[i * N + i];
  return sum;
}

template <typename T>
T norm(nb::ndarray<T, nb::c_contig> a) {
  return std::sqrt(dot(a, a));
}

} // capnhook
} // HWY_NAMESPACE
} // hwy
HWY_AFTER_NAMESPACE();

namespace capnhook = hwy::HWY_NAMESPACE::capnhook;
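One detail worth spelling out: because the arrays are row-major (CblasRowMajor), the leading dimensions passed to cblas_sgemm/cblas_dgemm are the row strides, i.e. lda = K, ldb = N and ldc = N in the call above. The scalar reference below is purely illustrative (the names are ours, not part of the package) and computes the same C = A * B, which is handy for spot-checking small shapes.

// Illustrative scalar reference for the row-major GEMM call above.
#include <cstddef>

template <typename T>
void matmul_reference(const T* A, const T* B, T* C,
                      std::size_t M, std::size_t K, std::size_t N) {
  for (std::size_t i = 0; i < M; ++i) {
    for (std::size_t j = 0; j < N; ++j) {
      T acc = T(0);
      for (std::size_t k = 0; k < K; ++k) {
        acc += A[i * K + k] * B[k * N + j];  // row-major: row stride K for A, N for B
      }
      C[i * N + j] = acc;
    }
  }
}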
simd/reduce.hpp
ADDED
@@ -0,0 +1,261 @@
#pragma once

#include <cstddef>
#include <cstdlib>
#include <algorithm>
#include <cmath>
#include <stdexcept>
#include <nanobind/nanobind.h>
#include <nanobind/ndarray.h>
#include <hwy/highway.h>

#include "../alloc.hpp"
#include "binary.hpp"

namespace nb = nanobind;

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace capnhook {

template <typename T>
T reduce_sum(nb::ndarray<T, nb::c_contig> a) {
  const T* A = a.data();
  size_t N = a.shape(0);

  if (N == 0) throw std::runtime_error("reduce_sum: zero-length input");
  if (N == 1) return A[0];

  const ScalableTag<T> d;
  auto acc = Zero(d);
  size_t i = 0, L = Lanes(d);

  if (N < L) {
    T sum = T(0);
    for (size_t j = 0; j < N; j++) {
      sum += A[j];
    }
    return sum;
  }

  for (; i + L <= N; i += L) {
    acc = Add(acc, Load(d, A + i));
  }

  T total = GetLane(SumOfLanes(d, acc));
  for (; i < N; ++i) total += A[i];
  return total;
}

template <typename T>
T reduce_min(nb::ndarray<T, nb::c_contig> a) {
  const T* A = a.data();
  size_t N = a.shape(0);

  if (N == 0) throw std::runtime_error("reduce_min: zero-length input");
  if (N == 1) return A[0];

  const ScalableTag<T> d;
  size_t L = Lanes(d);

  if (N < L) {
    T min_val = A[0];
    for (size_t j = 1; j < N; j++) {
      min_val = std::min(min_val, A[j]);
    }
    return min_val;
  }

  auto acc = Load(d, A);
  size_t i = L;

  for (; i + L <= N; i += L) {
    acc = Min(acc, Load(d, A + i));
  }

  T m = GetLane(MinOfLanes(d, acc));
  for (; i < N; ++i) m = std::min(m, A[i]);
  return m;
}

template <typename T>
T reduce_max(nb::ndarray<T, nb::c_contig> a) {
  const T* A = a.data();
  size_t N = a.shape(0);

  if (N == 0) throw std::runtime_error("reduce_max: zero-length input");
  if (N == 1) return A[0];

  const ScalableTag<T> d;
  size_t L = Lanes(d);

  if (N < L) {
    T max_val = A[0];
    for (size_t j = 1; j < N; j++) {
      max_val = std::max(max_val, A[j]);
    }
    return max_val;
  }

  auto acc = Load(d, A);
  size_t i = L;

  for (; i + L <= N; i += L) {
    acc = Max(acc, Load(d, A + i));
  }

  T m = GetLane(MaxOfLanes(d, acc));
  for (; i < N; ++i) m = std::max(m, A[i]);
  return m;
}

template <typename T>
T reduce_prod(nb::ndarray<T, nb::c_contig> a) {
  const T* A = a.data();
  size_t N = a.shape(0);

  if (N == 0) throw std::runtime_error("reduce_prod: zero-length input");
  if (N == 1) return A[0];

  const ScalableTag<T> d;
  size_t L = Lanes(d);

  if (N < L) {
    T prod = T(1);
    for (size_t j = 0; j < N; j++) {
      prod *= A[j];
    }
    return prod;
  }

  auto acc = Set(d, T(1));
  size_t i = 0;

  for (; i + L <= N; i += L) {
    acc = Mul(acc, Load(d, A + i));
  }

  // Highway has no product-of-lanes reduction, so multiply the
  // accumulator lanes by hand (SumOfLanes would sum them instead).
  T product = T(1);
  for (size_t k = 0; k < L; ++k) product *= ExtractLane(acc, k);
  for (; i < N; ++i) product *= A[i];
  return product;
}

template <typename T>
T reduce_mean(nb::ndarray<T, nb::c_contig> a) {
  size_t N = a.shape(0);
  return reduce_sum<T>(a) / T(N);
}

template <typename T>
T reduce_var(nb::ndarray<T, nb::c_contig> a) {
  size_t N = a.shape(0);
  T mu = reduce_mean<T>(a);
  const T* A = a.data();
  T var = T(0);
  for (size_t i = 0; i < N; ++i) {
    T d = A[i] - mu;
    var += d * d;
  }
  return var / T(N);
}

template <typename T>
T reduce_std(nb::ndarray<T, nb::c_contig> a) {
  return std::sqrt(reduce_var<T>(a));
}

template <typename T>
bool reduce_any(nb::ndarray<T, nb::c_contig> a) {
  const T* A = a.data();
  size_t N = a.shape(0);
  for (size_t i = 0; i < N; ++i) if (A[i] != T(0)) return true;
  return false;
}

template <typename T>
bool reduce_all(nb::ndarray<T, nb::c_contig> a) {
  const T* A = a.data();
  size_t N = a.shape(0);
  for (size_t i = 0; i < N; ++i) if (A[i] == T(0)) return false;
  return true;
}

template <typename T>
size_t argmax(nb::ndarray<T, nb::c_contig> a) {
  const T* A = a.data();
  size_t N = a.shape(0);
  if (N == 0) throw std::runtime_error("argmax: zero-length input");
  size_t idx = 0;
  T best = A[0];
  for (size_t i = 1; i < N; ++i) {
    if (A[i] > best) { best = A[i]; idx = i; }
  }
  return idx;
}

template <typename T>
size_t argmin(nb::ndarray<T, nb::c_contig> a) {
  const T* A = a.data();
  size_t N = a.shape(0);
  if (N == 0) throw std::runtime_error("argmin: zero-length input");
  size_t idx = 0;
  T best = A[0];
  for (size_t i = 1; i < N; ++i) {
    if (A[i] < best) { best = A[i]; idx = i; }
  }
  return idx;
}

template <typename T>
nb::ndarray<nb::numpy, T, nb::ndim<1>>
cumsum(nb::ndarray<T, nb::c_contig> a) {
  size_t N = a.shape(0);
  const T* A = a.data();
  size_t bytes = N * sizeof(T);
  void* raw = aligned_alloc64(bytes);
  T* C = static_cast<T*>(raw);
#if defined(_MSC_VER)
  nb::capsule deleter(C, [](void* p) noexcept { _aligned_free(p); });
#else
  nb::capsule deleter(C, [](void* p) noexcept { free(p); });
#endif
  T acc = T(0);
  for (size_t i = 0; i < N; ++i) {
    acc += A[i];
    C[i] = acc;
  }
  return { C, { N }, deleter };
}

template <typename T>
nb::ndarray<nb::numpy, T, nb::ndim<1>>
cumprod(nb::ndarray<T, nb::c_contig> a) {
  size_t N = a.shape(0);
  const T* A = a.data();
  size_t bytes = N * sizeof(T);
  void* raw = aligned_alloc64(bytes);
  T* C = static_cast<T*>(raw);
#if defined(_MSC_VER)
  nb::capsule deleter(C, [](void* p) noexcept { _aligned_free(p); });
#else
  nb::capsule deleter(C, [](void* p) noexcept { free(p); });
#endif
  T acc = T(1);
  for (size_t i = 0; i < N; ++i) {
    acc *= A[i];
    C[i] = acc;
  }
  return { C, { N }, deleter };
}

} // capnhook
} // HWY_NAMESPACE
} // hwy
HWY_AFTER_NAMESPACE();

namespace capnhook = hwy::HWY_NAMESPACE::capnhook;
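reduce_sum, reduce_min and reduce_max all share one loop shape: a scalar fallback when N < Lanes(d), a full-vector main loop, a cross-lane reduction of the accumulator, and a scalar tail for the leftover elements. The generic helper below is our own illustration of that pattern (names and factoring are assumptions, not code from the package).

// Illustrative only: the loop structure shared by the reductions above,
// factored into one helper. vec_op combines two vectors, lane_reduce
// collapses the accumulator to a scalar, scalar_op folds in tail elements.
#include <cstddef>
#include <hwy/highway.h>

HWY_BEFORE_NAMESPACE();
namespace reduce_sketch {
namespace hn = hwy::HWY_NAMESPACE;

template <typename T, class VecOp, class LaneReduce, class ScalarOp>
T reduce_pattern(const T* A, std::size_t N, T init,
                 VecOp vec_op, LaneReduce lane_reduce, ScalarOp scalar_op) {
  const hn::ScalableTag<T> d;
  const std::size_t L = hn::Lanes(d);
  std::size_t i = 0;
  T result = init;
  if (N >= L) {
    auto acc = hn::Set(d, init);                         // seed every lane
    for (; i + L <= N; i += L) {
      acc = vec_op(acc, hn::Load(d, A + i));             // SIMD main loop
    }
    result = lane_reduce(d, acc);                        // cross-lane reduction
  }
  for (; i < N; ++i) result = scalar_op(result, A[i]);   // scalar tail (also covers N < L)
  return result;
}

// e.g. a sum, matching reduce_sum above:
//   double s = reduce_pattern<double>(A, N, 0.0,
//       [](auto a, auto b) { return hn::Add(a, b); },
//       [](auto d, auto v) { return hn::GetLane(hn::SumOfLanes(d, v)); },
//       [](double a, double b) { return a + b; });

} // reduce_sketch
HWY_AFTER_NAMESPACE();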
simd/unary.hpp
ADDED
@@ -0,0 +1,83 @@
#pragma once

#include <cstddef>
#include <cstdlib>
#include <cmath>
#include <nanobind/nanobind.h>
#include <nanobind/ndarray.h>
#include <hwy/highway.h>
#include <hwy/contrib/math/math-inl.h>

#include "../alloc.hpp"

namespace nb = nanobind;

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace capnhook {

template <typename T, typename Op>
nb::ndarray<nb::numpy, T, nb::ndim<1>> unary(nb::ndarray<T, nb::c_contig> a) {
  const size_t N = a.shape(0);
  const T* A = a.data();

  const size_t bytes = N * sizeof(T);
  void* raw = aligned_alloc64(bytes);
  T* C = static_cast<T*>(raw);

#if defined(_MSC_VER)
  nb::capsule deleter(C, [](void* p) noexcept { _aligned_free(p); });
#else
  nb::capsule deleter(C, [](void* p) noexcept { free(p); });
#endif

  const ScalableTag<T> d;
  Op op;
  size_t i = 0;
  const size_t L = Lanes(d);

  for (; i + L <= N; i += L) {
    auto v = Load(d, A + i);
    Store(op(d, v), d, C + i);
  }

  for (; i < N; ++i) {
    C[i] = op(A[i]);
  }

  return { C, { N }, deleter };
}

#define DEFINE_SIMD_UNARY_OP(Symbol, expr_scalar, expr_simd) \
  struct Symbol##Op { \
    template <class D, class V> \
    HWY_INLINE V operator()(D d, V v) const { \
      return expr_simd; \
    } \
    HWY_INLINE float operator()(float x) const { return (expr_scalar); } \
    HWY_INLINE double operator()(double x) const { return (expr_scalar); } \
  }; \
  inline nb::ndarray<nb::numpy, float, nb::ndim<1>> \
  Symbol(nb::ndarray<float, nb::c_contig> a) { \
    return unary<float, Symbol##Op>(a); \
  } \
  inline nb::ndarray<nb::numpy, double, nb::ndim<1>> \
  Symbol(nb::ndarray<double, nb::c_contig> a) { \
    return unary<double, Symbol##Op>(a); \
  }

DEFINE_SIMD_UNARY_OP(exp, std::exp(x), hwy::HWY_NAMESPACE::Exp(d, v))
DEFINE_SIMD_UNARY_OP(log, std::log(x), hwy::HWY_NAMESPACE::Log(d, v))
DEFINE_SIMD_UNARY_OP(sqrt, std::sqrt(x), hwy::HWY_NAMESPACE::Sqrt(v))
DEFINE_SIMD_UNARY_OP(sin, std::sin(x), hwy::HWY_NAMESPACE::Sin(d, v))
DEFINE_SIMD_UNARY_OP(cos, std::cos(x), hwy::HWY_NAMESPACE::Cos(d, v))
DEFINE_SIMD_UNARY_OP(asin, std::asin(x), hwy::HWY_NAMESPACE::Asin(d, v))
DEFINE_SIMD_UNARY_OP(acos, std::acos(x), hwy::HWY_NAMESPACE::Acos(d, v))

} // capnhook
} // HWY_NAMESPACE
} // hwy
HWY_AFTER_NAMESPACE();

namespace capnhook = hwy::HWY_NAMESPACE::capnhook;
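The wheel ships only these headers plus the compiled capnhook_ml.cpython-312-darwin.so, so the nanobind module definition itself is not visible in this diff. As a rough sketch of how the templates above are typically registered, see below; the module name is taken from the wheel, but the Python-level names, include paths and the exact set of bindings are assumptions, not the package's actual binding code.

// Speculative binding sketch; the real NB_MODULE lives in the compiled
// extension and may differ in names, signatures and coverage.
#include <nanobind/nanobind.h>

#include "simd/binary.hpp"   // assumed include paths
#include "simd/unary.hpp"
#include "simd/reduce.hpp"
#include "simd/linalg.hpp"

namespace nb = nanobind;

NB_MODULE(capnhook_ml, m) {
  // Registering the same name twice yields a float/double overload set.
  m.def("add", &capnhook::binary<float, capnhook::addOp>);
  m.def("add", &capnhook::binary<double, capnhook::addOp>);
  m.def("exp", &capnhook::unary<float, capnhook::expOp>);
  m.def("exp", &capnhook::unary<double, capnhook::expOp>);
  m.def("dot", &capnhook::dot<float>);
  m.def("dot", &capnhook::dot<double>);
  m.def("matmul", &capnhook::matmul<float>);
  m.def("matmul", &capnhook::matmul<double>);
  m.def("reduce_sum", &capnhook::reduce_sum<float>);
  m.def("reduce_sum", &capnhook::reduce_sum<double>);
  m.def("argmax", &capnhook::argmax<float>);
  m.def("argmax", &capnhook::argmax<double>);
}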