gumath 0.2.0dev5 → 0.2.0dev8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CONTRIBUTING.md +7 -2
- data/Gemfile +0 -3
- data/ext/ruby_gumath/GPATH +0 -0
- data/ext/ruby_gumath/GRTAGS +0 -0
- data/ext/ruby_gumath/GTAGS +0 -0
- data/ext/ruby_gumath/extconf.rb +0 -5
- data/ext/ruby_gumath/functions.c +10 -2
- data/ext/ruby_gumath/gufunc_object.c +15 -4
- data/ext/ruby_gumath/gufunc_object.h +9 -3
- data/ext/ruby_gumath/gumath/Makefile +63 -0
- data/ext/ruby_gumath/gumath/Makefile.in +1 -0
- data/ext/ruby_gumath/gumath/config.h +56 -0
- data/ext/ruby_gumath/gumath/config.h.in +3 -0
- data/ext/ruby_gumath/gumath/config.log +497 -0
- data/ext/ruby_gumath/gumath/config.status +1034 -0
- data/ext/ruby_gumath/gumath/configure +375 -4
- data/ext/ruby_gumath/gumath/configure.ac +47 -3
- data/ext/ruby_gumath/gumath/libgumath/Makefile +236 -0
- data/ext/ruby_gumath/gumath/libgumath/Makefile.in +90 -24
- data/ext/ruby_gumath/gumath/libgumath/Makefile.vc +54 -15
- data/ext/ruby_gumath/gumath/libgumath/apply.c +92 -28
- data/ext/ruby_gumath/gumath/libgumath/apply.o +0 -0
- data/ext/ruby_gumath/gumath/libgumath/common.o +0 -0
- data/ext/ruby_gumath/gumath/libgumath/cpu_device_binary.o +0 -0
- data/ext/ruby_gumath/gumath/libgumath/cpu_device_unary.o +0 -0
- data/ext/ruby_gumath/gumath/libgumath/cpu_host_binary.o +0 -0
- data/ext/ruby_gumath/gumath/libgumath/cpu_host_unary.o +0 -0
- data/ext/ruby_gumath/gumath/libgumath/examples.o +0 -0
- data/ext/ruby_gumath/gumath/libgumath/extending/graph.c +27 -20
- data/ext/ruby_gumath/gumath/libgumath/extending/pdist.c +1 -1
- data/ext/ruby_gumath/gumath/libgumath/func.c +13 -9
- data/ext/ruby_gumath/gumath/libgumath/func.o +0 -0
- data/ext/ruby_gumath/gumath/libgumath/graph.o +0 -0
- data/ext/ruby_gumath/gumath/libgumath/gumath.h +55 -14
- data/ext/ruby_gumath/gumath/libgumath/kernels/common.c +513 -0
- data/ext/ruby_gumath/gumath/libgumath/kernels/common.h +155 -0
- data/ext/ruby_gumath/gumath/libgumath/kernels/contrib/bfloat16.h +520 -0
- data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.cc +1123 -0
- data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.h +1062 -0
- data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_msvc.cc +555 -0
- data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.cc +368 -0
- data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.h +335 -0
- data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_binary.c +2952 -0
- data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_unary.c +1100 -0
- data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.cu +1143 -0
- data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.h +1061 -0
- data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.cu +528 -0
- data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.h +463 -0
- data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_binary.c +2817 -0
- data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_unary.c +1331 -0
- data/ext/ruby_gumath/gumath/libgumath/kernels/device.hh +614 -0
- data/ext/ruby_gumath/gumath/libgumath/libgumath.a +0 -0
- data/ext/ruby_gumath/gumath/libgumath/libgumath.so +1 -0
- data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0 +1 -0
- data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0.2.0dev3 +0 -0
- data/ext/ruby_gumath/gumath/libgumath/nploops.o +0 -0
- data/ext/ruby_gumath/gumath/libgumath/pdist.o +0 -0
- data/ext/ruby_gumath/gumath/libgumath/quaternion.o +0 -0
- data/ext/ruby_gumath/gumath/libgumath/tbl.o +0 -0
- data/ext/ruby_gumath/gumath/libgumath/thread.c +17 -4
- data/ext/ruby_gumath/gumath/libgumath/thread.o +0 -0
- data/ext/ruby_gumath/gumath/libgumath/xndloops.c +110 -0
- data/ext/ruby_gumath/gumath/libgumath/xndloops.o +0 -0
- data/ext/ruby_gumath/gumath/python/gumath/__init__.py +150 -0
- data/ext/ruby_gumath/gumath/python/gumath/_gumath.c +446 -80
- data/ext/ruby_gumath/gumath/python/gumath/cuda.c +78 -0
- data/ext/ruby_gumath/gumath/python/gumath/examples.c +0 -5
- data/ext/ruby_gumath/gumath/python/gumath/functions.c +2 -2
- data/ext/ruby_gumath/gumath/python/gumath/gumath.h +246 -0
- data/ext/ruby_gumath/gumath/python/gumath/libgumath.a +0 -0
- data/ext/ruby_gumath/gumath/python/gumath/libgumath.so +1 -0
- data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0 +1 -0
- data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0.2.0dev3 +0 -0
- data/ext/ruby_gumath/gumath/python/gumath/pygumath.h +31 -2
- data/ext/ruby_gumath/gumath/python/gumath_aux.py +767 -0
- data/ext/ruby_gumath/gumath/python/randdec.py +535 -0
- data/ext/ruby_gumath/gumath/python/randfloat.py +177 -0
- data/ext/ruby_gumath/gumath/python/test_gumath.py +1504 -24
- data/ext/ruby_gumath/gumath/python/test_xndarray.py +462 -0
- data/ext/ruby_gumath/gumath/setup.py +67 -6
- data/ext/ruby_gumath/gumath/tools/detect_cuda_arch.cc +35 -0
- data/ext/ruby_gumath/include/gumath.h +55 -14
- data/ext/ruby_gumath/include/ruby_gumath.h +4 -1
- data/ext/ruby_gumath/lib/libgumath.a +0 -0
- data/ext/ruby_gumath/lib/libgumath.so.0.2.0dev3 +0 -0
- data/ext/ruby_gumath/ruby_gumath.c +231 -70
- data/ext/ruby_gumath/ruby_gumath.h +4 -1
- data/ext/ruby_gumath/ruby_gumath_internal.h +25 -0
- data/ext/ruby_gumath/util.c +34 -0
- data/ext/ruby_gumath/util.h +9 -0
- data/gumath.gemspec +3 -2
- data/lib/gumath.rb +55 -1
- data/lib/gumath/version.rb +2 -2
- data/lib/ruby_gumath.so +0 -0
- metadata +63 -10
- data/ext/ruby_gumath/gumath/libgumath/extending/bfloat16.c +0 -130
- data/ext/ruby_gumath/gumath/libgumath/kernels/binary.c +0 -547
- data/ext/ruby_gumath/gumath/libgumath/kernels/unary.c +0 -449
|
@@ -0,0 +1,614 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* BSD 3-Clause License
|
|
3
|
+
*
|
|
4
|
+
* Copyright (c) 2017-2018, plures
|
|
5
|
+
* All rights reserved.
|
|
6
|
+
*
|
|
7
|
+
* Redistribution and use in source and binary forms, with or without
|
|
8
|
+
* modification, are permitted provided that the following conditions are met:
|
|
9
|
+
*
|
|
10
|
+
* 1. Redistributions of source code must retain the above copyright notice,
|
|
11
|
+
* this list of conditions and the following disclaimer.
|
|
12
|
+
*
|
|
13
|
+
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
14
|
+
* this list of conditions and the following disclaimer in the documentation
|
|
15
|
+
* and/or other materials provided with the distribution.
|
|
16
|
+
*
|
|
17
|
+
* 3. Neither the name of the copyright holder nor the names of its
|
|
18
|
+
* contributors may be used to endorse or promote products derived from
|
|
19
|
+
* this software without specific prior written permission.
|
|
20
|
+
*
|
|
21
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
22
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
23
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
24
|
+
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
25
|
+
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
26
|
+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
27
|
+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
28
|
+
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
29
|
+
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
30
|
+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
#ifndef DEVICE_HH
|
|
35
|
+
#define DEVICE_HH
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
#include <cstdint>
|
|
39
|
+
#include <cinttypes>
|
|
40
|
+
#include <complex>
|
|
41
|
+
#include "contrib/bfloat16.h"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
/*
 * Compiler-dependent configuration.
 *
 *   DEVICE   - function qualifier: __device__ when __CUDACC__ is defined,
 *              empty otherwise.
 *   ISNAN(x) - NaN test: unqualified isnan() when __CUDACC__ is defined,
 *              std::isnan() otherwise.
 */
#ifdef __CUDACC__
#include <cuda_fp16.h>
#define DEVICE __device__
#define ISNAN(x) (isnan(x))
#else
#define DEVICE
#define ISNAN(x) (std::isnan(x))
#endif
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
/*****************************************************************************/
|
|
55
|
+
/* Divmod */
|
|
56
|
+
/*****************************************************************************/
|
|
57
|
+
|
|
58
|
+
/* Python: floatobject.c */
|
|
59
|
+
/*
 * Floor division with remainder for doubles (the original comment credits
 * CPython's floatobject.c).
 *
 *   q  - receives the quotient, snapped to an integral value
 *   r  - receives the remainder; a non-zero remainder gets the sign of wx,
 *        a zero remainder is a zero carrying the sign of wx
 *   vx - dividend
 *   wx - divisor (not checked against zero here)
 */
static inline DEVICE void
_divmod(double *q, double *r, double vx, double wx)
{
    double rem = fmod(vx, wx);

    /* fmod is typically exact, so vx - rem is *mathematically* an exact
       multiple of wx.  The floating point subtraction is only an
       approximation, though, so the quotient may not be an exact integral
       value after the division, although it is always very close to one. */
    double quot = (vx - rem) / wx;
    double snapped;

    if (rem == 0.0) {
        /* With signed zeroes fmod returns different results across
           platforms; force the sign of the denominator. */
        rem = copysign(0.0, wx);
    }
    else if ((wx < 0) != (rem < 0)) {
        /* the remainder must have the same sign as the denominator */
        rem += wx;
        quot -= 1.0;
    }

    if (quot == 0.0) {
        /* zero quotient: carry the sign of the true quotient vx/wx */
        snapped = copysign(0.0, vx / wx);
    }
    else {
        /* snap the quotient to the nearest integral value */
        snapped = floor(quot);
        if (quot - snapped > 0.5) {
            snapped += 1.0;
        }
    }

    *q = snapped;
    *r = rem;
}
|
|
99
|
+
|
|
100
|
+
/*
 * Single precision counterpart of the double _divmod: floor division with
 * remainder.
 *
 *   q  - receives the quotient, snapped to an integral value
 *   r  - receives the remainder; a non-zero remainder gets the sign of wx,
 *        a zero remainder is a zero carrying the sign of wx
 *   vx - dividend
 *   wx - divisor (not checked against zero here)
 */
static inline DEVICE void
_divmod(float *q, float *r, float vx, float wx)
{
    float rem = fmodf(vx, wx);

    /* fmod is typically exact, so vx - rem is *mathematically* an exact
       multiple of wx.  The floating point subtraction is only an
       approximation, though, so the quotient may not be an exact integral
       value after the division, although it is always very close to one. */
    float quot = (vx - rem) / wx;
    float snapped;

    if (rem == 0.0f) {
        /* With signed zeroes fmod returns different results across
           platforms; force the sign of the denominator. */
        rem = copysignf(0.0, wx);
    }
    else if ((wx < 0) != (rem < 0)) {
        /* the remainder must have the same sign as the denominator */
        rem += wx;
        quot -= 1.0;
    }

    if (quot == 0.0f) {
        /* zero quotient: carry the sign of the true quotient vx/wx */
        snapped = copysignf(0.0, vx / wx);
    }
    else {
        /* snap the quotient to the nearest integral value */
        snapped = floorf(quot);
        if (quot - snapped > 0.5) {
            snapped += 1.0;
        }
    }

    *q = snapped;
    *r = rem;
}
|
|
140
|
+
|
|
141
|
+
/*
 * bfloat16 floor division with remainder: both operands are converted to
 * float, the float _divmod does the work, and the two results are converted
 * back to bfloat16_t.
 */
static inline DEVICE void
_divmod(bfloat16_t *q, bfloat16_t *r, bfloat16_t a, bfloat16_t b)
{
    float quot;
    float rem;

    _divmod(&quot, &rem, (float)a, (float)b);

    *r = (bfloat16_t)rem;
    *q = (bfloat16_t)quot;
}
|
|
152
|
+
|
|
153
|
+
#define divmod_unsigned(T) \
|
|
154
|
+
static inline DEVICE void \
|
|
155
|
+
_divmod(T *q, T *r, T a, T b) \
|
|
156
|
+
{ \
|
|
157
|
+
if (b == 0) { \
|
|
158
|
+
*q = 0; \
|
|
159
|
+
*r = 0; \
|
|
160
|
+
} \
|
|
161
|
+
else { \
|
|
162
|
+
*q = a / b; \
|
|
163
|
+
*r = a % b; \
|
|
164
|
+
} \
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
divmod_unsigned(uint8_t)
|
|
168
|
+
divmod_unsigned(uint16_t)
|
|
169
|
+
divmod_unsigned(uint32_t)
|
|
170
|
+
divmod_unsigned(uint64_t)
|
|
171
|
+
|
|
172
|
+
#define divmod_signed(T, MIN) \
|
|
173
|
+
static inline DEVICE void \
|
|
174
|
+
_divmod(T *q, T *r, T a, T b) \
|
|
175
|
+
{ \
|
|
176
|
+
if (b == 0 || (a == MIN && b == -1)) { \
|
|
177
|
+
*q = 0; \
|
|
178
|
+
*r = 0; \
|
|
179
|
+
} \
|
|
180
|
+
else { \
|
|
181
|
+
T qq = a / b; \
|
|
182
|
+
T rr = a % b; \
|
|
183
|
+
\
|
|
184
|
+
*q = rr ? (qq - ((a < 0) ^ (b < 0))) : qq; \
|
|
185
|
+
*r = a - *q * b; \
|
|
186
|
+
} \
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
divmod_signed(int8_t, INT8_MIN)
|
|
190
|
+
divmod_signed(int16_t, INT16_MIN)
|
|
191
|
+
divmod_signed(int32_t, INT32_MIN)
|
|
192
|
+
divmod_signed(int64_t, INT64_MIN)
|
|
193
|
+
|
|
194
|
+
template <class T>
|
|
195
|
+
static inline DEVICE T
|
|
196
|
+
_floor_divide(T a, T b)
|
|
197
|
+
{
|
|
198
|
+
T q;
|
|
199
|
+
T r;
|
|
200
|
+
|
|
201
|
+
_divmod(&q, &r, a, b);
|
|
202
|
+
|
|
203
|
+
return q;
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
template <class T>
|
|
207
|
+
static inline DEVICE T
|
|
208
|
+
_remainder(T a, T b)
|
|
209
|
+
{
|
|
210
|
+
T q;
|
|
211
|
+
T r;
|
|
212
|
+
|
|
213
|
+
_divmod(&q, &r, a, b);
|
|
214
|
+
|
|
215
|
+
return r;
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
/*****************************************************************************/
|
|
220
|
+
/* Abs */
|
|
221
|
+
/*****************************************************************************/
|
|
222
|
+
|
|
223
|
+
#ifdef __CUDACC__
|
|
224
|
+
#define abs_unsigned(T) \
|
|
225
|
+
static inline DEVICE T \
|
|
226
|
+
_abs(T x) \
|
|
227
|
+
{ \
|
|
228
|
+
return x; \
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
abs_unsigned(bool)
|
|
232
|
+
|
|
233
|
+
abs_unsigned(uint8_t)
|
|
234
|
+
abs_unsigned(uint16_t)
|
|
235
|
+
abs_unsigned(uint32_t)
|
|
236
|
+
abs_unsigned(uint64_t)
|
|
237
|
+
|
|
238
|
+
#define abs_signed(T) \
|
|
239
|
+
static inline DEVICE T \
|
|
240
|
+
_abs(T x) \
|
|
241
|
+
{ \
|
|
242
|
+
return x < 0 ? -x : x; \
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
abs_signed(int8_t)
|
|
246
|
+
abs_signed(int16_t)
|
|
247
|
+
abs_signed(int32_t)
|
|
248
|
+
abs_signed(int64_t)
|
|
249
|
+
|
|
250
|
+
/* Absolute value of a float32_t, computed with fabsf(). */
static inline DEVICE float32_t
_abs(float32_t x)
{
    const float32_t magnitude = fabsf(x);
    return magnitude;
}
|
|
255
|
+
|
|
256
|
+
/* Absolute value of a float64_t, computed with fabs(). */
static inline DEVICE float64_t
_abs(float64_t x)
{
    const float64_t magnitude = fabs(x);
    return magnitude;
}
|
|
261
|
+
|
|
262
|
+
/*
 * _abs for complex64_t: forwards to thrust::abs() and returns the result
 * converted to complex64_t.
 */
static inline DEVICE complex64_t
_abs(complex64_t x)
{
    /* the conversion to complex64_t happens on return, as before */
    return thrust::abs(x);
}
|
|
267
|
+
|
|
268
|
+
/*
 * _abs for complex128_t: forwards to thrust::abs() and returns the result
 * converted to complex128_t.
 */
static inline DEVICE complex128_t
_abs(complex128_t x)
{
    /* the conversion to complex128_t happens on return, as before */
    return thrust::abs(x);
}
|
|
273
|
+
#endif
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
/*****************************************************************************/
|
|
277
|
+
/* Pow */
|
|
278
|
+
/*****************************************************************************/
|
|
279
|
+
|
|
280
|
+
#define pow_unsigned(name, T, mask) \
|
|
281
|
+
static inline DEVICE T \
|
|
282
|
+
name(T base, T exp) \
|
|
283
|
+
{ \
|
|
284
|
+
uint64_t r = 1; \
|
|
285
|
+
\
|
|
286
|
+
while (exp > 0) { \
|
|
287
|
+
if (exp & 1) { \
|
|
288
|
+
r = (r * base) & mask; \
|
|
289
|
+
} \
|
|
290
|
+
base = (base * base) & mask; \
|
|
291
|
+
exp >>= 1; \
|
|
292
|
+
} \
|
|
293
|
+
\
|
|
294
|
+
return (T)r; \
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
pow_unsigned(_pow, uint8_t, UINT8_MAX)
|
|
298
|
+
pow_unsigned(_pow, uint16_t, UINT16_MAX)
|
|
299
|
+
pow_unsigned(_pow, uint32_t, UINT32_MAX)
|
|
300
|
+
pow_unsigned(_pow, uint64_t, UINT64_MAX)
|
|
301
|
+
|
|
302
|
+
pow_unsigned(_pow_int8_t, uint8_t, INT8_MAX)
|
|
303
|
+
pow_unsigned(_pow_int16_t, uint16_t, INT16_MAX)
|
|
304
|
+
pow_unsigned(_pow_int32_t, uint32_t, INT32_MAX)
|
|
305
|
+
pow_unsigned(_pow_int64_t, uint64_t, INT64_MAX)
|
|
306
|
+
|
|
307
|
+
#define pow_signed(T, U, MIN, MAX) \
|
|
308
|
+
static inline DEVICE T \
|
|
309
|
+
_pow(T ibase, T exp) \
|
|
310
|
+
{ \
|
|
311
|
+
U base; \
|
|
312
|
+
T r; \
|
|
313
|
+
\
|
|
314
|
+
if (ibase < 0) { \
|
|
315
|
+
base = (U)(-ibase); \
|
|
316
|
+
r = (T)_pow_##T(base, exp); \
|
|
317
|
+
return (exp % 2 == 0) ? r : -r; \
|
|
318
|
+
} \
|
|
319
|
+
else { \
|
|
320
|
+
base = (U)ibase; \
|
|
321
|
+
return _pow_##T(base, exp); \
|
|
322
|
+
} \
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
pow_signed(int8_t, uint8_t, INT8_MIN, INT8_MAX)
|
|
326
|
+
pow_signed(int16_t, uint16_t, INT16_MIN, INT16_MAX)
|
|
327
|
+
pow_signed(int32_t, uint32_t, INT32_MIN, INT32_MAX)
|
|
328
|
+
pow_signed(int64_t, uint64_t, INT64_MIN, INT64_MAX)
|
|
329
|
+
|
|
330
|
+
/*
 * bfloat16 power: both operands are converted to float, powf() is applied
 * and the result is converted back to bfloat16_t.
 */
static inline DEVICE bfloat16_t
_pow(bfloat16_t x, bfloat16_t y)
{
    const float base = (float)x;
    const float exponent = (float)y;

    return (bfloat16_t)powf(base, exponent);
}
|
|
335
|
+
|
|
336
|
+
/* float32_t power: x raised to y via powf(). */
static inline DEVICE float32_t
_pow(float32_t x, float32_t y)
{
    const float32_t result = powf(x, y);
    return result;
}
|
|
341
|
+
|
|
342
|
+
/* float64_t power: x raised to y via pow(). */
static inline DEVICE float64_t
_pow(float64_t x, float64_t y)
{
    const float64_t result = pow(x, y);
    return result;
}
|
|
347
|
+
|
|
348
|
+
#ifdef __CUDACC__
|
|
349
|
+
/*
 * half power: both operands are widened with __half2float(), pow() is
 * applied and the result is narrowed with __float2half().
 */
static inline DEVICE half
_pow(half x, half y)
{
    const float base = __half2float(x);
    const float exponent = __half2float(y);

    return __float2half(pow(base, exponent));
}
|
|
354
|
+
#endif
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
/*****************************************************************************/
|
|
358
|
+
/* Complex pow */
|
|
359
|
+
/*****************************************************************************/
|
|
360
|
+
|
|
361
|
+
#ifdef __CUDACC__
|
|
362
|
+
template <class T>
|
|
363
|
+
using Complex = thrust::complex<T>;
|
|
364
|
+
|
|
365
|
+
template <class T>
|
|
366
|
+
static inline DEVICE Complex<T>
|
|
367
|
+
_cpow(Complex<T> x, Complex<T> y)
|
|
368
|
+
{
|
|
369
|
+
return thrust::pow<T>(x, y);
|
|
370
|
+
}
|
|
371
|
+
#else
|
|
372
|
+
template <class T>
|
|
373
|
+
using Complex = std::complex<T>;
|
|
374
|
+
|
|
375
|
+
template <class T>
|
|
376
|
+
static inline DEVICE Complex<T>
|
|
377
|
+
_cpow(Complex<T> x, Complex<T> y)
|
|
378
|
+
{
|
|
379
|
+
return std::pow<T>(x, y);
|
|
380
|
+
}
|
|
381
|
+
#endif
|
|
382
|
+
|
|
383
|
+
/*
 * Overloaded wrappers that select the double or the float variant of a math
 * function from the argument type, so that templated code can call a single
 * name for both precisions.
 */

/* hypot */
static inline DEVICE double
xhypot(double x, double y)
{
    return hypot(x, y);
}

static inline DEVICE float
xhypot(float x, float y)
{
    return hypotf(x, y);
}

/* pow */
static inline DEVICE double
xpow(double x, double y)
{
    return pow(x, y);
}

static inline DEVICE float
xpow(float x, float y)
{
    return powf(x, y);
}

/* atan2: the arguments are forwarded in the order given */
static inline DEVICE double
xatan2(double x, double y)
{
    return atan2(x, y);
}

static inline DEVICE float
xatan2(float x, float y)
{
    return atan2f(x, y);
}

/* exp */
static inline DEVICE double
xexp(double x)
{
    return exp(x);
}

static inline DEVICE float
xexp(float x)
{
    return expf(x);
}

/* log */
static inline DEVICE double
xlog(double x)
{
    return log(x);
}

static inline DEVICE float
xlog(float x)
{
    return logf(x);
}
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
/* Python: complexobject.c */
|
|
396
|
+
template <class T>
|
|
397
|
+
static inline DEVICE Complex<T>
|
|
398
|
+
c_quot(const Complex<T> a, const Complex<T> b)
|
|
399
|
+
{
|
|
400
|
+
/* This algorithm is better, and is pretty obvious: first divide the
|
|
401
|
+
* numerators and denominator by whichever of {b.real, b.imag} has
|
|
402
|
+
* larger magnitude. The earliest reference I found was to CACM
|
|
403
|
+
* Algorithm 116 (Complex Division, Robert L. Smith, Stanford
|
|
404
|
+
* University). As usual, though, we're still ignoring all IEEE
|
|
405
|
+
* endcases.
|
|
406
|
+
*/
|
|
407
|
+
const T abs_breal = b.real() < 0 ? -b.real() : b.real();
|
|
408
|
+
const T abs_bimag = b.imag() < 0 ? -b.imag() : b.imag();
|
|
409
|
+
T real, imag;
|
|
410
|
+
|
|
411
|
+
if (abs_breal >= abs_bimag) {
|
|
412
|
+
/* divide tops and bottom by b.real */
|
|
413
|
+
if (abs_breal == 0.0) {
|
|
414
|
+
// errno = EDOM;
|
|
415
|
+
real = imag = 0.0;
|
|
416
|
+
}
|
|
417
|
+
else {
|
|
418
|
+
const T ratio = b.imag() / b.real();
|
|
419
|
+
const T denom = b.real() + b.imag() * ratio;
|
|
420
|
+
real = (a.real() + a.imag() * ratio) / denom;
|
|
421
|
+
imag = (a.imag() - a.real() * ratio) / denom;
|
|
422
|
+
}
|
|
423
|
+
}
|
|
424
|
+
else if (abs_bimag >= abs_breal) {
|
|
425
|
+
/* divide tops and bottom by b.imag */
|
|
426
|
+
const T ratio = b.real() / b.imag();
|
|
427
|
+
const T denom = b.real() * ratio + b.imag();
|
|
428
|
+
real = (a.real() * ratio + a.imag()) / denom;
|
|
429
|
+
imag = (a.imag() * ratio - a.real()) / denom;
|
|
430
|
+
}
|
|
431
|
+
else {
|
|
432
|
+
/* At least one of b.real or b.imag is a NaN */
|
|
433
|
+
real = imag = NAN;
|
|
434
|
+
}
|
|
435
|
+
|
|
436
|
+
return Complex<T>{real, imag};
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
template <class T>
|
|
440
|
+
static inline DEVICE Complex<T>
|
|
441
|
+
c_pow(const Complex<T> a, const Complex<T> b)
|
|
442
|
+
{
|
|
443
|
+
if (b.real() == 0 && b.imag() == 0) {
|
|
444
|
+
return Complex<T>{1, 0};
|
|
445
|
+
}
|
|
446
|
+
else if (a.real() == 0 && a.imag() == 0) {
|
|
447
|
+
// if (b.imag() != 0 || b.real() < 0)
|
|
448
|
+
// errno = EDOM;
|
|
449
|
+
return Complex<T>{0, 0};
|
|
450
|
+
}
|
|
451
|
+
else {
|
|
452
|
+
T vabs = xhypot(a.real(), a.imag());
|
|
453
|
+
T len = xpow(vabs, b.real());
|
|
454
|
+
T at = xatan2(a.imag(), a.real());
|
|
455
|
+
T phase = at * b.real();
|
|
456
|
+
|
|
457
|
+
if (b.imag() != 0) {
|
|
458
|
+
len /= xexp(at * b.imag());
|
|
459
|
+
phase += b.imag() * xlog(vabs);
|
|
460
|
+
}
|
|
461
|
+
|
|
462
|
+
T real = len*cos(phase);
|
|
463
|
+
T imag = len*sin(phase);
|
|
464
|
+
|
|
465
|
+
return Complex<T>{real, imag};
|
|
466
|
+
}
|
|
467
|
+
}
|
|
468
|
+
|
|
469
|
+
template <class T>
|
|
470
|
+
static inline DEVICE Complex<T>
|
|
471
|
+
c_powu(Complex<T> base, uint64_t exp)
|
|
472
|
+
{
|
|
473
|
+
Complex<T> r{1, 0};
|
|
474
|
+
|
|
475
|
+
while (exp > 0) {
|
|
476
|
+
if (exp & 1) {
|
|
477
|
+
r = r * base;
|
|
478
|
+
}
|
|
479
|
+
base = base * base;
|
|
480
|
+
exp >>= 1;
|
|
481
|
+
}
|
|
482
|
+
|
|
483
|
+
return r;
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
template <class T>
|
|
487
|
+
static inline DEVICE Complex<T>
|
|
488
|
+
c_powi(Complex<T> x, int64_t n)
|
|
489
|
+
{
|
|
490
|
+
if (n > 99 || n < -99) {
|
|
491
|
+
Complex<T> y{(T)n, 0};
|
|
492
|
+
return c_pow(x, y);
|
|
493
|
+
}
|
|
494
|
+
else if (n > 0) {
|
|
495
|
+
return c_powu(x, (uint64_t)n);
|
|
496
|
+
}
|
|
497
|
+
else {
|
|
498
|
+
Complex<T> one{1, 0};
|
|
499
|
+
return c_quot(one, c_powu(x, (uint64_t)(-n)));
|
|
500
|
+
}
|
|
501
|
+
}
|
|
502
|
+
|
|
503
|
+
template <class T>
|
|
504
|
+
static inline DEVICE Complex<T>
|
|
505
|
+
complex_pow(Complex<T> a, Complex<T> exponent)
|
|
506
|
+
{
|
|
507
|
+
int64_t int_exponent;
|
|
508
|
+
|
|
509
|
+
int_exponent = (int64_t)exponent.real();
|
|
510
|
+
if (exponent.imag() == 0 && exponent.real() == int_exponent) {
|
|
511
|
+
return c_powi(a, int_exponent);
|
|
512
|
+
}
|
|
513
|
+
else {
|
|
514
|
+
return c_pow(a, exponent);
|
|
515
|
+
}
|
|
516
|
+
}
|
|
517
|
+
|
|
518
|
+
template <class T>
|
|
519
|
+
static inline DEVICE Complex<T>
|
|
520
|
+
_pow(Complex<T> x, Complex<T> y)
|
|
521
|
+
{
|
|
522
|
+
Complex<double> a = x;
|
|
523
|
+
Complex<double> b = y;
|
|
524
|
+
Complex<double> r = complex_pow(a, b);
|
|
525
|
+
return (Complex<T>)r;
|
|
526
|
+
}
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
/*****************************************************************************/
|
|
530
|
+
/* Lexicographic comparison for complex numbers */
|
|
531
|
+
/*****************************************************************************/
|
|
532
|
+
|
|
533
|
+
template <class T>
|
|
534
|
+
static inline DEVICE bool
|
|
535
|
+
_isnan(T a)
|
|
536
|
+
{
|
|
537
|
+
return ISNAN(a.real()) || ISNAN(a.imag());
|
|
538
|
+
}
|
|
539
|
+
|
|
540
|
+
template <class T, class U>
|
|
541
|
+
static inline DEVICE bool
|
|
542
|
+
lexorder_lt(T a, U b)
|
|
543
|
+
{
|
|
544
|
+
if (_isnan(a) || _isnan(b)) {
|
|
545
|
+
return false;
|
|
546
|
+
}
|
|
547
|
+
|
|
548
|
+
return a.real() < b.real() || (a.real() == b.real() && a.imag() < b.imag());
|
|
549
|
+
}
|
|
550
|
+
|
|
551
|
+
template <class T, class U>
|
|
552
|
+
static inline DEVICE bool
|
|
553
|
+
lexorder_le(T a, U b)
|
|
554
|
+
{
|
|
555
|
+
if (_isnan(a) || _isnan(b)) {
|
|
556
|
+
return false;
|
|
557
|
+
}
|
|
558
|
+
|
|
559
|
+
return a.real() < b.real() || (a.real() == b.real() && a.imag() <= b.imag());
|
|
560
|
+
}
|
|
561
|
+
|
|
562
|
+
template <class T, class U>
|
|
563
|
+
static inline DEVICE bool
|
|
564
|
+
lexorder_ge(T a, U b)
|
|
565
|
+
{
|
|
566
|
+
if (_isnan(a) || _isnan(b)) {
|
|
567
|
+
return false;
|
|
568
|
+
}
|
|
569
|
+
|
|
570
|
+
return a.real() > b.real() || (a.real() == b.real() && a.imag() >= b.imag());
|
|
571
|
+
}
|
|
572
|
+
|
|
573
|
+
template <class T, class U>
|
|
574
|
+
static inline DEVICE bool
|
|
575
|
+
lexorder_gt(T a, U b)
|
|
576
|
+
{
|
|
577
|
+
if (_isnan(a) || _isnan(b)) {
|
|
578
|
+
return false;
|
|
579
|
+
}
|
|
580
|
+
|
|
581
|
+
return a.real() > b.real() || (a.real() == b.real() && a.imag() > b.imag());
|
|
582
|
+
}
|
|
583
|
+
|
|
584
|
+
template <class T, class U>
|
|
585
|
+
static inline DEVICE bool
|
|
586
|
+
lexorder_eqn(T a, U b)
|
|
587
|
+
{
|
|
588
|
+
bool real_equal = a.real() == b.real() || (ISNAN(a.real()) && ISNAN(b.real()));
|
|
589
|
+
bool imag_equal = a.imag() == b.imag() || (ISNAN(a.imag()) && ISNAN(b.imag()));
|
|
590
|
+
|
|
591
|
+
return real_equal && imag_equal;
|
|
592
|
+
}
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
/*****************************************************************************/
|
|
596
|
+
/* Half equality */
|
|
597
|
+
/*****************************************************************************/
|
|
598
|
+
|
|
599
|
+
#ifdef __CUDACC__
|
|
600
|
+
/* Inequality of two half values: the negation of __heq(a, b). */
static inline DEVICE bool
half_ne(half a, half b)
{
    const bool equal = __heq(a, b);
    return !equal;
}
|
|
605
|
+
|
|
606
|
+
/*
 * Equality of two half values in which a NaN matches a NaN: true if
 * __heq(a, b) holds or if both operands are NaN according to __hisnan().
 */
static inline DEVICE bool
half_eqn(half a, half b)
{
    if (__heq(a, b)) {
        return true;
    }

    return __hisnan(a) && __hisnan(b);
}
|
|
611
|
+
#endif
|
|
612
|
+
|
|
613
|
+
|
|
614
|
+
#endif /* DEVICE_HH */
|