gumath 0.2.0dev5 → 0.2.0dev8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (99) hide show
  1. checksums.yaml +4 -4
  2. data/CONTRIBUTING.md +7 -2
  3. data/Gemfile +0 -3
  4. data/ext/ruby_gumath/GPATH +0 -0
  5. data/ext/ruby_gumath/GRTAGS +0 -0
  6. data/ext/ruby_gumath/GTAGS +0 -0
  7. data/ext/ruby_gumath/extconf.rb +0 -5
  8. data/ext/ruby_gumath/functions.c +10 -2
  9. data/ext/ruby_gumath/gufunc_object.c +15 -4
  10. data/ext/ruby_gumath/gufunc_object.h +9 -3
  11. data/ext/ruby_gumath/gumath/Makefile +63 -0
  12. data/ext/ruby_gumath/gumath/Makefile.in +1 -0
  13. data/ext/ruby_gumath/gumath/config.h +56 -0
  14. data/ext/ruby_gumath/gumath/config.h.in +3 -0
  15. data/ext/ruby_gumath/gumath/config.log +497 -0
  16. data/ext/ruby_gumath/gumath/config.status +1034 -0
  17. data/ext/ruby_gumath/gumath/configure +375 -4
  18. data/ext/ruby_gumath/gumath/configure.ac +47 -3
  19. data/ext/ruby_gumath/gumath/libgumath/Makefile +236 -0
  20. data/ext/ruby_gumath/gumath/libgumath/Makefile.in +90 -24
  21. data/ext/ruby_gumath/gumath/libgumath/Makefile.vc +54 -15
  22. data/ext/ruby_gumath/gumath/libgumath/apply.c +92 -28
  23. data/ext/ruby_gumath/gumath/libgumath/apply.o +0 -0
  24. data/ext/ruby_gumath/gumath/libgumath/common.o +0 -0
  25. data/ext/ruby_gumath/gumath/libgumath/cpu_device_binary.o +0 -0
  26. data/ext/ruby_gumath/gumath/libgumath/cpu_device_unary.o +0 -0
  27. data/ext/ruby_gumath/gumath/libgumath/cpu_host_binary.o +0 -0
  28. data/ext/ruby_gumath/gumath/libgumath/cpu_host_unary.o +0 -0
  29. data/ext/ruby_gumath/gumath/libgumath/examples.o +0 -0
  30. data/ext/ruby_gumath/gumath/libgumath/extending/graph.c +27 -20
  31. data/ext/ruby_gumath/gumath/libgumath/extending/pdist.c +1 -1
  32. data/ext/ruby_gumath/gumath/libgumath/func.c +13 -9
  33. data/ext/ruby_gumath/gumath/libgumath/func.o +0 -0
  34. data/ext/ruby_gumath/gumath/libgumath/graph.o +0 -0
  35. data/ext/ruby_gumath/gumath/libgumath/gumath.h +55 -14
  36. data/ext/ruby_gumath/gumath/libgumath/kernels/common.c +513 -0
  37. data/ext/ruby_gumath/gumath/libgumath/kernels/common.h +155 -0
  38. data/ext/ruby_gumath/gumath/libgumath/kernels/contrib/bfloat16.h +520 -0
  39. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.cc +1123 -0
  40. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.h +1062 -0
  41. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_msvc.cc +555 -0
  42. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.cc +368 -0
  43. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.h +335 -0
  44. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_binary.c +2952 -0
  45. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_unary.c +1100 -0
  46. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.cu +1143 -0
  47. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.h +1061 -0
  48. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.cu +528 -0
  49. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.h +463 -0
  50. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_binary.c +2817 -0
  51. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_unary.c +1331 -0
  52. data/ext/ruby_gumath/gumath/libgumath/kernels/device.hh +614 -0
  53. data/ext/ruby_gumath/gumath/libgumath/libgumath.a +0 -0
  54. data/ext/ruby_gumath/gumath/libgumath/libgumath.so +1 -0
  55. data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0 +1 -0
  56. data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0.2.0dev3 +0 -0
  57. data/ext/ruby_gumath/gumath/libgumath/nploops.o +0 -0
  58. data/ext/ruby_gumath/gumath/libgumath/pdist.o +0 -0
  59. data/ext/ruby_gumath/gumath/libgumath/quaternion.o +0 -0
  60. data/ext/ruby_gumath/gumath/libgumath/tbl.o +0 -0
  61. data/ext/ruby_gumath/gumath/libgumath/thread.c +17 -4
  62. data/ext/ruby_gumath/gumath/libgumath/thread.o +0 -0
  63. data/ext/ruby_gumath/gumath/libgumath/xndloops.c +110 -0
  64. data/ext/ruby_gumath/gumath/libgumath/xndloops.o +0 -0
  65. data/ext/ruby_gumath/gumath/python/gumath/__init__.py +150 -0
  66. data/ext/ruby_gumath/gumath/python/gumath/_gumath.c +446 -80
  67. data/ext/ruby_gumath/gumath/python/gumath/cuda.c +78 -0
  68. data/ext/ruby_gumath/gumath/python/gumath/examples.c +0 -5
  69. data/ext/ruby_gumath/gumath/python/gumath/functions.c +2 -2
  70. data/ext/ruby_gumath/gumath/python/gumath/gumath.h +246 -0
  71. data/ext/ruby_gumath/gumath/python/gumath/libgumath.a +0 -0
  72. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so +1 -0
  73. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0 +1 -0
  74. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0.2.0dev3 +0 -0
  75. data/ext/ruby_gumath/gumath/python/gumath/pygumath.h +31 -2
  76. data/ext/ruby_gumath/gumath/python/gumath_aux.py +767 -0
  77. data/ext/ruby_gumath/gumath/python/randdec.py +535 -0
  78. data/ext/ruby_gumath/gumath/python/randfloat.py +177 -0
  79. data/ext/ruby_gumath/gumath/python/test_gumath.py +1504 -24
  80. data/ext/ruby_gumath/gumath/python/test_xndarray.py +462 -0
  81. data/ext/ruby_gumath/gumath/setup.py +67 -6
  82. data/ext/ruby_gumath/gumath/tools/detect_cuda_arch.cc +35 -0
  83. data/ext/ruby_gumath/include/gumath.h +55 -14
  84. data/ext/ruby_gumath/include/ruby_gumath.h +4 -1
  85. data/ext/ruby_gumath/lib/libgumath.a +0 -0
  86. data/ext/ruby_gumath/lib/libgumath.so.0.2.0dev3 +0 -0
  87. data/ext/ruby_gumath/ruby_gumath.c +231 -70
  88. data/ext/ruby_gumath/ruby_gumath.h +4 -1
  89. data/ext/ruby_gumath/ruby_gumath_internal.h +25 -0
  90. data/ext/ruby_gumath/util.c +34 -0
  91. data/ext/ruby_gumath/util.h +9 -0
  92. data/gumath.gemspec +3 -2
  93. data/lib/gumath.rb +55 -1
  94. data/lib/gumath/version.rb +2 -2
  95. data/lib/ruby_gumath.so +0 -0
  96. metadata +63 -10
  97. data/ext/ruby_gumath/gumath/libgumath/extending/bfloat16.c +0 -130
  98. data/ext/ruby_gumath/gumath/libgumath/kernels/binary.c +0 -547
  99. data/ext/ruby_gumath/gumath/libgumath/kernels/unary.c +0 -449
@@ -0,0 +1,614 @@
1
+ /*
2
+ * BSD 3-Clause License
3
+ *
4
+ * Copyright (c) 2017-2018, plures
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ *
10
+ * 1. Redistributions of source code must retain the above copyright notice,
11
+ * this list of conditions and the following disclaimer.
12
+ *
13
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
14
+ * this list of conditions and the following disclaimer in the documentation
15
+ * and/or other materials provided with the distribution.
16
+ *
17
+ * 3. Neither the name of the copyright holder nor the names of its
18
+ * contributors may be used to endorse or promote products derived from
19
+ * this software without specific prior written permission.
20
+ *
21
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ */
32
+
33
+
34
+ #ifndef DEVICE_HH
35
+ #define DEVICE_HH
36
+
37
+
38
+ #include <cstdint>
39
+ #include <cinttypes>
40
+ #include <complex>
41
+ #include "contrib/bfloat16.h"
42
+
43
+
44
+ #ifdef __CUDACC__
45
+ #include <cuda_fp16.h>
46
+ #define DEVICE __device__
47
+ #define ISNAN(x) (isnan(x))
48
+ #else
49
+ #define DEVICE
50
+ #define ISNAN(x) (std::isnan(x))
51
+ #endif
52
+
53
+
54
+ /*****************************************************************************/
55
+ /* Divmod */
56
+ /*****************************************************************************/
57
+
58
+ /* Python: floatobject.c */
59
+ static inline DEVICE void
60
+ _divmod(double *q, double *r, double vx, double wx)
61
+ {
62
+ double div, mod, floordiv;
63
+
64
+ mod = fmod(vx, wx);
65
+ /* fmod is typically exact, so vx-mod is *mathematically* an
66
+ exact multiple of wx. But this is fp arithmetic, and fp
67
+ vx - mod is an approximation; the result is that div may
68
+ not be an exact integral value after the division, although
69
+ it will always be very close to one.
70
+ */
71
+ div = (vx - mod) / wx;
72
+ if (mod) {
73
+ /* ensure the remainder has the same sign as the denominator */
74
+ if ((wx < 0) != (mod < 0)) {
75
+ mod += wx;
76
+ div -= 1.0;
77
+ }
78
+ }
79
+ else {
80
+ /* the remainder is zero, and in the presence of signed zeroes
81
+ fmod returns different results across platforms; ensure
82
+ it has the same sign as the denominator. */
83
+ mod = copysign(0.0, wx);
84
+ }
85
+ /* snap quotient to nearest integral value */
86
+ if (div) {
87
+ floordiv = floor(div);
88
+ if (div - floordiv > 0.5)
89
+ floordiv += 1.0;
90
+ }
91
+ else {
92
+ /* div is zero - get the same sign as the true quotient */
93
+ floordiv = copysign(0.0, vx / wx); /* zero w/ sign of vx/wx */
94
+ }
95
+
96
+ *q = floordiv;
97
+ *r = mod;
98
+ }
99
+
100
+ static inline DEVICE void
101
+ _divmod(float *q, float *r, float vx, float wx)
102
+ {
103
+ float div, mod, floordiv;
104
+
105
+ mod = fmodf(vx, wx);
106
+ /* fmod is typically exact, so vx-mod is *mathematically* an
107
+ exact multiple of wx. But this is fp arithmetic, and fp
108
+ vx - mod is an approximation; the result is that div may
109
+ not be an exact integral value after the division, although
110
+ it will always be very close to one.
111
+ */
112
+ div = (vx - mod) / wx;
113
+ if (mod) {
114
+ /* ensure the remainder has the same sign as the denominator */
115
+ if ((wx < 0) != (mod < 0)) {
116
+ mod += wx;
117
+ div -= 1.0;
118
+ }
119
+ }
120
+ else {
121
+ /* the remainder is zero, and in the presence of signed zeroes
122
+ fmod returns different results across platforms; ensure
123
+ it has the same sign as the denominator. */
124
+ mod = copysignf(0.0, wx);
125
+ }
126
+ /* snap quotient to nearest integral value */
127
+ if (div) {
128
+ floordiv = floorf(div);
129
+ if (div - floordiv > 0.5)
130
+ floordiv += 1.0;
131
+ }
132
+ else {
133
+ /* div is zero - get the same sign as the true quotient */
134
+ floordiv = copysignf(0.0, vx / wx); /* zero w/ sign of vx/wx */
135
+ }
136
+
137
+ *q = floordiv;
138
+ *r = mod;
139
+ }
140
+
141
+ static inline DEVICE void
142
+ _divmod(bfloat16_t *q, bfloat16_t *r, bfloat16_t a, bfloat16_t b)
143
+ {
144
+ float qq;
145
+ float rr;
146
+
147
+ _divmod(&qq, &rr, (float)a, (float)b);
148
+
149
+ *q = (bfloat16_t)qq;
150
+ *r = (bfloat16_t)rr;
151
+ }
152
+
153
+ #define divmod_unsigned(T) \
154
+ static inline DEVICE void \
155
+ _divmod(T *q, T *r, T a, T b) \
156
+ { \
157
+ if (b == 0) { \
158
+ *q = 0; \
159
+ *r = 0; \
160
+ } \
161
+ else { \
162
+ *q = a / b; \
163
+ *r = a % b; \
164
+ } \
165
+ }
166
+
167
+ divmod_unsigned(uint8_t)
168
+ divmod_unsigned(uint16_t)
169
+ divmod_unsigned(uint32_t)
170
+ divmod_unsigned(uint64_t)
171
+
172
+ #define divmod_signed(T, MIN) \
173
+ static inline DEVICE void \
174
+ _divmod(T *q, T *r, T a, T b) \
175
+ { \
176
+ if (b == 0 || (a == MIN && b == -1)) { \
177
+ *q = 0; \
178
+ *r = 0; \
179
+ } \
180
+ else { \
181
+ T qq = a / b; \
182
+ T rr = a % b; \
183
+ \
184
+ *q = rr ? (qq - ((a < 0) ^ (b < 0))) : qq; \
185
+ *r = a - *q * b; \
186
+ } \
187
+ }
188
+
189
+ divmod_signed(int8_t, INT8_MIN)
190
+ divmod_signed(int16_t, INT16_MIN)
191
+ divmod_signed(int32_t, INT32_MIN)
192
+ divmod_signed(int64_t, INT64_MIN)
193
+
194
+ template <class T>
195
+ static inline DEVICE T
196
+ _floor_divide(T a, T b)
197
+ {
198
+ T q;
199
+ T r;
200
+
201
+ _divmod(&q, &r, a, b);
202
+
203
+ return q;
204
+ }
205
+
206
+ template <class T>
207
+ static inline DEVICE T
208
+ _remainder(T a, T b)
209
+ {
210
+ T q;
211
+ T r;
212
+
213
+ _divmod(&q, &r, a, b);
214
+
215
+ return r;
216
+ }
217
+
218
+
219
+ /*****************************************************************************/
220
+ /* Abs */
221
+ /*****************************************************************************/
222
+
223
#ifdef __CUDACC__
/* _abs() overloads; only compiled when building with a CUDA compiler. */

/* bool and unsigned integers: the value is returned unchanged. */
#define abs_unsigned(T) \
static inline DEVICE T  \
_abs(T x)               \
{                       \
    return x;           \
}

abs_unsigned(bool)

abs_unsigned(uint8_t)
abs_unsigned(uint16_t)
abs_unsigned(uint32_t)
abs_unsigned(uint64_t)

/*
 * Signed integers.  The negation is carried out in uint64_t arithmetic,
 * which wraps instead of overflowing, so the most negative value of
 * int32_t/int64_t no longer triggers signed-overflow undefined behaviour.
 * On two's-complement targets the most negative value maps to itself;
 * every other input yields the usual absolute value.
 */
#define abs_signed(T) \
static inline DEVICE T                                     \
_abs(T x)                                                  \
{                                                          \
    return x < 0 ? (T)((uint64_t)0 - (uint64_t)x) : x;     \
}

abs_signed(int8_t)
abs_signed(int16_t)
abs_signed(int32_t)
abs_signed(int64_t)

/* Single-precision absolute value via fabsf(). */
static inline DEVICE float32_t
_abs(float32_t x)
{
    return fabsf(x);
}

/* Double-precision absolute value via fabs(). */
static inline DEVICE float64_t
_abs(float64_t x)
{
    return fabs(x);
}

/* Complex magnitude from thrust::abs(), returned converted to complex64_t. */
static inline DEVICE complex64_t
_abs(complex64_t x)
{
    return thrust::abs(x);
}

/* Complex magnitude from thrust::abs(), returned converted to complex128_t. */
static inline DEVICE complex128_t
_abs(complex128_t x)
{
    return thrust::abs(x);
}
#endif
274
+
275
+
276
+ /*****************************************************************************/
277
+ /* Pow */
278
+ /*****************************************************************************/
279
+
280
+ #define pow_unsigned(name, T, mask) \
281
+ static inline DEVICE T \
282
+ name(T base, T exp) \
283
+ { \
284
+ uint64_t r = 1; \
285
+ \
286
+ while (exp > 0) { \
287
+ if (exp & 1) { \
288
+ r = (r * base) & mask; \
289
+ } \
290
+ base = (base * base) & mask; \
291
+ exp >>= 1; \
292
+ } \
293
+ \
294
+ return (T)r; \
295
+ }
296
+
297
+ pow_unsigned(_pow, uint8_t, UINT8_MAX)
298
+ pow_unsigned(_pow, uint16_t, UINT16_MAX)
299
+ pow_unsigned(_pow, uint32_t, UINT32_MAX)
300
+ pow_unsigned(_pow, uint64_t, UINT64_MAX)
301
+
302
+ pow_unsigned(_pow_int8_t, uint8_t, INT8_MAX)
303
+ pow_unsigned(_pow_int16_t, uint16_t, INT16_MAX)
304
+ pow_unsigned(_pow_int32_t, uint32_t, INT32_MAX)
305
+ pow_unsigned(_pow_int64_t, uint64_t, INT64_MAX)
306
+
307
+ #define pow_signed(T, U, MIN, MAX) \
308
+ static inline DEVICE T \
309
+ _pow(T ibase, T exp) \
310
+ { \
311
+ U base; \
312
+ T r; \
313
+ \
314
+ if (ibase < 0) { \
315
+ base = (U)(-ibase); \
316
+ r = (T)_pow_##T(base, exp); \
317
+ return (exp % 2 == 0) ? r : -r; \
318
+ } \
319
+ else { \
320
+ base = (U)ibase; \
321
+ return _pow_##T(base, exp); \
322
+ } \
323
+ }
324
+
325
+ pow_signed(int8_t, uint8_t, INT8_MIN, INT8_MAX)
326
+ pow_signed(int16_t, uint16_t, INT16_MIN, INT16_MAX)
327
+ pow_signed(int32_t, uint32_t, INT32_MIN, INT32_MAX)
328
+ pow_signed(int64_t, uint64_t, INT64_MIN, INT64_MAX)
329
+
330
+ static inline DEVICE bfloat16_t
331
+ _pow(bfloat16_t x, bfloat16_t y)
332
+ {
333
+ return (bfloat16_t)powf((float)x, (float)y);
334
+ }
335
+
336
+ static inline DEVICE float32_t
337
+ _pow(float32_t x, float32_t y)
338
+ {
339
+ return powf(x, y);
340
+ }
341
+
342
+ static inline DEVICE float64_t
343
+ _pow(float64_t x, float64_t y)
344
+ {
345
+ return pow(x, y);
346
+ }
347
+
348
#ifdef __CUDACC__
/*
 * Half-precision power (CUDA builds only): convert both operands to
 * float, call pow() and convert the result back to half.
 */
static inline DEVICE half
_pow(half x, half y)
{
    const float wide_base = __half2float(x);
    const float wide_exp = __half2float(y);

    return __float2half(pow(wide_base, wide_exp));
}
#endif
355
+
356
+
357
+ /*****************************************************************************/
358
+ /* Complex pow */
359
+ /*****************************************************************************/
360
+
361
+ #ifdef __CUDACC__
362
+ template <class T>
363
+ using Complex = thrust::complex<T>;
364
+
365
+ template <class T>
366
+ static inline DEVICE Complex<T>
367
+ _cpow(Complex<T> x, Complex<T> y)
368
+ {
369
+ return thrust::pow<T>(x, y);
370
+ }
371
+ #else
372
+ template <class T>
373
+ using Complex = std::complex<T>;
374
+
375
+ template <class T>
376
+ static inline DEVICE Complex<T>
377
+ _cpow(Complex<T> x, Complex<T> y)
378
+ {
379
+ return std::pow<T>(x, y);
380
+ }
381
+ #endif
382
+
383
+ static inline DEVICE double xhypot(double x, double y) { return hypot(x, y); }
384
+ static inline DEVICE double xpow(double x, double y) { return pow(x, y); }
385
+ static inline DEVICE double xatan2(double x, double y) { return atan2(x, y); }
386
+ static inline DEVICE double xexp(double x) { return exp(x); }
387
+ static inline DEVICE double xlog(double x) { return log(x); }
388
+ static inline DEVICE float xhypot(float x, float y) { return hypotf(x, y); }
389
+ static inline DEVICE float xpow(float x, float y) { return powf(x, y); }
390
+ static inline DEVICE float xatan2(float x, float y) { return atan2f(x, y); }
391
+ static inline DEVICE float xexp(float x) { return expf(x); }
392
+ static inline DEVICE float xlog(float x) { return logf(x); }
393
+
394
+
395
+ /* Python: complexobject.c */
396
+ template <class T>
397
+ static inline DEVICE Complex<T>
398
+ c_quot(const Complex<T> a, const Complex<T> b)
399
+ {
400
+ /* This algorithm is better, and is pretty obvious: first divide the
401
+ * numerators and denominator by whichever of {b.real, b.imag} has
402
+ * larger magnitude. The earliest reference I found was to CACM
403
+ * Algorithm 116 (Complex Division, Robert L. Smith, Stanford
404
+ * University). As usual, though, we're still ignoring all IEEE
405
+ * endcases.
406
+ */
407
+ const T abs_breal = b.real() < 0 ? -b.real() : b.real();
408
+ const T abs_bimag = b.imag() < 0 ? -b.imag() : b.imag();
409
+ T real, imag;
410
+
411
+ if (abs_breal >= abs_bimag) {
412
+ /* divide tops and bottom by b.real */
413
+ if (abs_breal == 0.0) {
414
+ // errno = EDOM;
415
+ real = imag = 0.0;
416
+ }
417
+ else {
418
+ const T ratio = b.imag() / b.real();
419
+ const T denom = b.real() + b.imag() * ratio;
420
+ real = (a.real() + a.imag() * ratio) / denom;
421
+ imag = (a.imag() - a.real() * ratio) / denom;
422
+ }
423
+ }
424
+ else if (abs_bimag >= abs_breal) {
425
+ /* divide tops and bottom by b.imag */
426
+ const T ratio = b.real() / b.imag();
427
+ const T denom = b.real() * ratio + b.imag();
428
+ real = (a.real() * ratio + a.imag()) / denom;
429
+ imag = (a.imag() * ratio - a.real()) / denom;
430
+ }
431
+ else {
432
+ /* At least one of b.real or b.imag is a NaN */
433
+ real = imag = NAN;
434
+ }
435
+
436
+ return Complex<T>{real, imag};
437
+ }
438
+
439
+ template <class T>
440
+ static inline DEVICE Complex<T>
441
+ c_pow(const Complex<T> a, const Complex<T> b)
442
+ {
443
+ if (b.real() == 0 && b.imag() == 0) {
444
+ return Complex<T>{1, 0};
445
+ }
446
+ else if (a.real() == 0 && a.imag() == 0) {
447
+ // if (b.imag() != 0 || b.real() < 0)
448
+ // errno = EDOM;
449
+ return Complex<T>{0, 0};
450
+ }
451
+ else {
452
+ T vabs = xhypot(a.real(), a.imag());
453
+ T len = xpow(vabs, b.real());
454
+ T at = xatan2(a.imag(), a.real());
455
+ T phase = at * b.real();
456
+
457
+ if (b.imag() != 0) {
458
+ len /= xexp(at * b.imag());
459
+ phase += b.imag() * xlog(vabs);
460
+ }
461
+
462
+ T real = len*cos(phase);
463
+ T imag = len*sin(phase);
464
+
465
+ return Complex<T>{real, imag};
466
+ }
467
+ }
468
+
469
+ template <class T>
470
+ static inline DEVICE Complex<T>
471
+ c_powu(Complex<T> base, uint64_t exp)
472
+ {
473
+ Complex<T> r{1, 0};
474
+
475
+ while (exp > 0) {
476
+ if (exp & 1) {
477
+ r = r * base;
478
+ }
479
+ base = base * base;
480
+ exp >>= 1;
481
+ }
482
+
483
+ return r;
484
+ }
485
+
486
+ template <class T>
487
+ static inline DEVICE Complex<T>
488
+ c_powi(Complex<T> x, int64_t n)
489
+ {
490
+ if (n > 99 || n < -99) {
491
+ Complex<T> y{(T)n, 0};
492
+ return c_pow(x, y);
493
+ }
494
+ else if (n > 0) {
495
+ return c_powu(x, (uint64_t)n);
496
+ }
497
+ else {
498
+ Complex<T> one{1, 0};
499
+ return c_quot(one, c_powu(x, (uint64_t)(-n)));
500
+ }
501
+ }
502
+
503
+ template <class T>
504
+ static inline DEVICE Complex<T>
505
+ complex_pow(Complex<T> a, Complex<T> exponent)
506
+ {
507
+ int64_t int_exponent;
508
+
509
+ int_exponent = (int64_t)exponent.real();
510
+ if (exponent.imag() == 0 && exponent.real() == int_exponent) {
511
+ return c_powi(a, int_exponent);
512
+ }
513
+ else {
514
+ return c_pow(a, exponent);
515
+ }
516
+ }
517
+
518
+ template <class T>
519
+ static inline DEVICE Complex<T>
520
+ _pow(Complex<T> x, Complex<T> y)
521
+ {
522
+ Complex<double> a = x;
523
+ Complex<double> b = y;
524
+ Complex<double> r = complex_pow(a, b);
525
+ return (Complex<T>)r;
526
+ }
527
+
528
+
529
+ /*****************************************************************************/
530
+ /* Lexicographic comparison for complex numbers */
531
+ /*****************************************************************************/
532
+
533
+ template <class T>
534
+ static inline DEVICE bool
535
+ _isnan(T a)
536
+ {
537
+ return ISNAN(a.real()) || ISNAN(a.imag());
538
+ }
539
+
540
+ template <class T, class U>
541
+ static inline DEVICE bool
542
+ lexorder_lt(T a, U b)
543
+ {
544
+ if (_isnan(a) || _isnan(b)) {
545
+ return false;
546
+ }
547
+
548
+ return a.real() < b.real() || (a.real() == b.real() && a.imag() < b.imag());
549
+ }
550
+
551
+ template <class T, class U>
552
+ static inline DEVICE bool
553
+ lexorder_le(T a, U b)
554
+ {
555
+ if (_isnan(a) || _isnan(b)) {
556
+ return false;
557
+ }
558
+
559
+ return a.real() < b.real() || (a.real() == b.real() && a.imag() <= b.imag());
560
+ }
561
+
562
+ template <class T, class U>
563
+ static inline DEVICE bool
564
+ lexorder_ge(T a, U b)
565
+ {
566
+ if (_isnan(a) || _isnan(b)) {
567
+ return false;
568
+ }
569
+
570
+ return a.real() > b.real() || (a.real() == b.real() && a.imag() >= b.imag());
571
+ }
572
+
573
+ template <class T, class U>
574
+ static inline DEVICE bool
575
+ lexorder_gt(T a, U b)
576
+ {
577
+ if (_isnan(a) || _isnan(b)) {
578
+ return false;
579
+ }
580
+
581
+ return a.real() > b.real() || (a.real() == b.real() && a.imag() > b.imag());
582
+ }
583
+
584
+ template <class T, class U>
585
+ static inline DEVICE bool
586
+ lexorder_eqn(T a, U b)
587
+ {
588
+ bool real_equal = a.real() == b.real() || (ISNAN(a.real()) && ISNAN(b.real()));
589
+ bool imag_equal = a.imag() == b.imag() || (ISNAN(a.imag()) && ISNAN(b.imag()));
590
+
591
+ return real_equal && imag_equal;
592
+ }
593
+
594
+
595
+ /*****************************************************************************/
596
+ /* Half equality */
597
+ /*****************************************************************************/
598
+
599
#ifdef __CUDACC__
/* Half-precision inequality (CUDA builds only): the negation of __heq(). */
static inline DEVICE bool
half_ne(half a, half b)
{
    const bool equal = __heq(a, b);
    return !equal;
}

/* Half-precision equality in which two NaNs also count as equal. */
static inline DEVICE bool
half_eqn(half a, half b)
{
    if (__heq(a, b)) {
        return true;
    }

    return __hisnan(a) && __hisnan(b);
}
#endif
612
+
613
+
614
+ #endif /* DEVICE_HH */