gumath 0.2.0dev5 → 0.2.0dev8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99) hide show
  1. checksums.yaml +4 -4
  2. data/CONTRIBUTING.md +7 -2
  3. data/Gemfile +0 -3
  4. data/ext/ruby_gumath/GPATH +0 -0
  5. data/ext/ruby_gumath/GRTAGS +0 -0
  6. data/ext/ruby_gumath/GTAGS +0 -0
  7. data/ext/ruby_gumath/extconf.rb +0 -5
  8. data/ext/ruby_gumath/functions.c +10 -2
  9. data/ext/ruby_gumath/gufunc_object.c +15 -4
  10. data/ext/ruby_gumath/gufunc_object.h +9 -3
  11. data/ext/ruby_gumath/gumath/Makefile +63 -0
  12. data/ext/ruby_gumath/gumath/Makefile.in +1 -0
  13. data/ext/ruby_gumath/gumath/config.h +56 -0
  14. data/ext/ruby_gumath/gumath/config.h.in +3 -0
  15. data/ext/ruby_gumath/gumath/config.log +497 -0
  16. data/ext/ruby_gumath/gumath/config.status +1034 -0
  17. data/ext/ruby_gumath/gumath/configure +375 -4
  18. data/ext/ruby_gumath/gumath/configure.ac +47 -3
  19. data/ext/ruby_gumath/gumath/libgumath/Makefile +236 -0
  20. data/ext/ruby_gumath/gumath/libgumath/Makefile.in +90 -24
  21. data/ext/ruby_gumath/gumath/libgumath/Makefile.vc +54 -15
  22. data/ext/ruby_gumath/gumath/libgumath/apply.c +92 -28
  23. data/ext/ruby_gumath/gumath/libgumath/apply.o +0 -0
  24. data/ext/ruby_gumath/gumath/libgumath/common.o +0 -0
  25. data/ext/ruby_gumath/gumath/libgumath/cpu_device_binary.o +0 -0
  26. data/ext/ruby_gumath/gumath/libgumath/cpu_device_unary.o +0 -0
  27. data/ext/ruby_gumath/gumath/libgumath/cpu_host_binary.o +0 -0
  28. data/ext/ruby_gumath/gumath/libgumath/cpu_host_unary.o +0 -0
  29. data/ext/ruby_gumath/gumath/libgumath/examples.o +0 -0
  30. data/ext/ruby_gumath/gumath/libgumath/extending/graph.c +27 -20
  31. data/ext/ruby_gumath/gumath/libgumath/extending/pdist.c +1 -1
  32. data/ext/ruby_gumath/gumath/libgumath/func.c +13 -9
  33. data/ext/ruby_gumath/gumath/libgumath/func.o +0 -0
  34. data/ext/ruby_gumath/gumath/libgumath/graph.o +0 -0
  35. data/ext/ruby_gumath/gumath/libgumath/gumath.h +55 -14
  36. data/ext/ruby_gumath/gumath/libgumath/kernels/common.c +513 -0
  37. data/ext/ruby_gumath/gumath/libgumath/kernels/common.h +155 -0
  38. data/ext/ruby_gumath/gumath/libgumath/kernels/contrib/bfloat16.h +520 -0
  39. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.cc +1123 -0
  40. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.h +1062 -0
  41. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_msvc.cc +555 -0
  42. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.cc +368 -0
  43. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.h +335 -0
  44. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_binary.c +2952 -0
  45. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_unary.c +1100 -0
  46. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.cu +1143 -0
  47. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.h +1061 -0
  48. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.cu +528 -0
  49. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.h +463 -0
  50. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_binary.c +2817 -0
  51. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_unary.c +1331 -0
  52. data/ext/ruby_gumath/gumath/libgumath/kernels/device.hh +614 -0
  53. data/ext/ruby_gumath/gumath/libgumath/libgumath.a +0 -0
  54. data/ext/ruby_gumath/gumath/libgumath/libgumath.so +1 -0
  55. data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0 +1 -0
  56. data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0.2.0dev3 +0 -0
  57. data/ext/ruby_gumath/gumath/libgumath/nploops.o +0 -0
  58. data/ext/ruby_gumath/gumath/libgumath/pdist.o +0 -0
  59. data/ext/ruby_gumath/gumath/libgumath/quaternion.o +0 -0
  60. data/ext/ruby_gumath/gumath/libgumath/tbl.o +0 -0
  61. data/ext/ruby_gumath/gumath/libgumath/thread.c +17 -4
  62. data/ext/ruby_gumath/gumath/libgumath/thread.o +0 -0
  63. data/ext/ruby_gumath/gumath/libgumath/xndloops.c +110 -0
  64. data/ext/ruby_gumath/gumath/libgumath/xndloops.o +0 -0
  65. data/ext/ruby_gumath/gumath/python/gumath/__init__.py +150 -0
  66. data/ext/ruby_gumath/gumath/python/gumath/_gumath.c +446 -80
  67. data/ext/ruby_gumath/gumath/python/gumath/cuda.c +78 -0
  68. data/ext/ruby_gumath/gumath/python/gumath/examples.c +0 -5
  69. data/ext/ruby_gumath/gumath/python/gumath/functions.c +2 -2
  70. data/ext/ruby_gumath/gumath/python/gumath/gumath.h +246 -0
  71. data/ext/ruby_gumath/gumath/python/gumath/libgumath.a +0 -0
  72. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so +1 -0
  73. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0 +1 -0
  74. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0.2.0dev3 +0 -0
  75. data/ext/ruby_gumath/gumath/python/gumath/pygumath.h +31 -2
  76. data/ext/ruby_gumath/gumath/python/gumath_aux.py +767 -0
  77. data/ext/ruby_gumath/gumath/python/randdec.py +535 -0
  78. data/ext/ruby_gumath/gumath/python/randfloat.py +177 -0
  79. data/ext/ruby_gumath/gumath/python/test_gumath.py +1504 -24
  80. data/ext/ruby_gumath/gumath/python/test_xndarray.py +462 -0
  81. data/ext/ruby_gumath/gumath/setup.py +67 -6
  82. data/ext/ruby_gumath/gumath/tools/detect_cuda_arch.cc +35 -0
  83. data/ext/ruby_gumath/include/gumath.h +55 -14
  84. data/ext/ruby_gumath/include/ruby_gumath.h +4 -1
  85. data/ext/ruby_gumath/lib/libgumath.a +0 -0
  86. data/ext/ruby_gumath/lib/libgumath.so.0.2.0dev3 +0 -0
  87. data/ext/ruby_gumath/ruby_gumath.c +231 -70
  88. data/ext/ruby_gumath/ruby_gumath.h +4 -1
  89. data/ext/ruby_gumath/ruby_gumath_internal.h +25 -0
  90. data/ext/ruby_gumath/util.c +34 -0
  91. data/ext/ruby_gumath/util.h +9 -0
  92. data/gumath.gemspec +3 -2
  93. data/lib/gumath.rb +55 -1
  94. data/lib/gumath/version.rb +2 -2
  95. data/lib/ruby_gumath.so +0 -0
  96. metadata +63 -10
  97. data/ext/ruby_gumath/gumath/libgumath/extending/bfloat16.c +0 -130
  98. data/ext/ruby_gumath/gumath/libgumath/kernels/binary.c +0 -547
  99. data/ext/ruby_gumath/gumath/libgumath/kernels/unary.c +0 -449
@@ -0,0 +1,614 @@
1
+ /*
2
+ * BSD 3-Clause License
3
+ *
4
+ * Copyright (c) 2017-2018, plures
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ *
10
+ * 1. Redistributions of source code must retain the above copyright notice,
11
+ * this list of conditions and the following disclaimer.
12
+ *
13
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
14
+ * this list of conditions and the following disclaimer in the documentation
15
+ * and/or other materials provided with the distribution.
16
+ *
17
+ * 3. Neither the name of the copyright holder nor the names of its
18
+ * contributors may be used to endorse or promote products derived from
19
+ * this software without specific prior written permission.
20
+ *
21
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ */
32
+
33
+
34
+ #ifndef DEVICE_HH
35
+ #define DEVICE_HH
36
+
37
+
38
+ #include <cstdint>
39
+ #include <cinttypes>
40
+ #include <complex>
41
+ #include "contrib/bfloat16.h"
42
+
43
+
44
/*
 * Compiler-dependent helpers.
 *
 *   DEVICE   - function qualifier: __device__ when compiled by nvcc,
 *              empty otherwise.
 *   ISNAN(x) - NaN test: unqualified isnan() when compiled by nvcc,
 *              std::isnan() otherwise.
 */
#if !defined(__CUDACC__)
  #define DEVICE
  #define ISNAN(x) (std::isnan(x))
#else
  #include <cuda_fp16.h>
  #define DEVICE __device__
  #define ISNAN(x) (isnan(x))
#endif
52
+
53
+
54
+ /*****************************************************************************/
55
+ /* Divmod */
56
+ /*****************************************************************************/
57
+
58
+ /* Python: floatobject.c */
59
+ static inline DEVICE void
60
+ _divmod(double *q, double *r, double vx, double wx)
61
+ {
62
+ double div, mod, floordiv;
63
+
64
+ mod = fmod(vx, wx);
65
+ /* fmod is typically exact, so vx-mod is *mathematically* an
66
+ exact multiple of wx. But this is fp arithmetic, and fp
67
+ vx - mod is an approximation; the result is that div may
68
+ not be an exact integral value after the division, although
69
+ it will always be very close to one.
70
+ */
71
+ div = (vx - mod) / wx;
72
+ if (mod) {
73
+ /* ensure the remainder has the same sign as the denominator */
74
+ if ((wx < 0) != (mod < 0)) {
75
+ mod += wx;
76
+ div -= 1.0;
77
+ }
78
+ }
79
+ else {
80
+ /* the remainder is zero, and in the presence of signed zeroes
81
+ fmod returns different results across platforms; ensure
82
+ it has the same sign as the denominator. */
83
+ mod = copysign(0.0, wx);
84
+ }
85
+ /* snap quotient to nearest integral value */
86
+ if (div) {
87
+ floordiv = floor(div);
88
+ if (div - floordiv > 0.5)
89
+ floordiv += 1.0;
90
+ }
91
+ else {
92
+ /* div is zero - get the same sign as the true quotient */
93
+ floordiv = copysign(0.0, vx / wx); /* zero w/ sign of vx/wx */
94
+ }
95
+
96
+ *q = floordiv;
97
+ *r = mod;
98
+ }
99
+
100
+ static inline DEVICE void
101
+ _divmod(float *q, float *r, float vx, float wx)
102
+ {
103
+ float div, mod, floordiv;
104
+
105
+ mod = fmodf(vx, wx);
106
+ /* fmod is typically exact, so vx-mod is *mathematically* an
107
+ exact multiple of wx. But this is fp arithmetic, and fp
108
+ vx - mod is an approximation; the result is that div may
109
+ not be an exact integral value after the division, although
110
+ it will always be very close to one.
111
+ */
112
+ div = (vx - mod) / wx;
113
+ if (mod) {
114
+ /* ensure the remainder has the same sign as the denominator */
115
+ if ((wx < 0) != (mod < 0)) {
116
+ mod += wx;
117
+ div -= 1.0;
118
+ }
119
+ }
120
+ else {
121
+ /* the remainder is zero, and in the presence of signed zeroes
122
+ fmod returns different results across platforms; ensure
123
+ it has the same sign as the denominator. */
124
+ mod = copysignf(0.0, wx);
125
+ }
126
+ /* snap quotient to nearest integral value */
127
+ if (div) {
128
+ floordiv = floorf(div);
129
+ if (div - floordiv > 0.5)
130
+ floordiv += 1.0;
131
+ }
132
+ else {
133
+ /* div is zero - get the same sign as the true quotient */
134
+ floordiv = copysignf(0.0, vx / wx); /* zero w/ sign of vx/wx */
135
+ }
136
+
137
+ *q = floordiv;
138
+ *r = mod;
139
+ }
140
+
141
+ static inline DEVICE void
142
+ _divmod(bfloat16_t *q, bfloat16_t *r, bfloat16_t a, bfloat16_t b)
143
+ {
144
+ float qq;
145
+ float rr;
146
+
147
+ _divmod(&qq, &rr, (float)a, (float)b);
148
+
149
+ *q = (bfloat16_t)qq;
150
+ *r = (bfloat16_t)rr;
151
+ }
152
+
153
+ #define divmod_unsigned(T) \
154
+ static inline DEVICE void \
155
+ _divmod(T *q, T *r, T a, T b) \
156
+ { \
157
+ if (b == 0) { \
158
+ *q = 0; \
159
+ *r = 0; \
160
+ } \
161
+ else { \
162
+ *q = a / b; \
163
+ *r = a % b; \
164
+ } \
165
+ }
166
+
167
+ divmod_unsigned(uint8_t)
168
+ divmod_unsigned(uint16_t)
169
+ divmod_unsigned(uint32_t)
170
+ divmod_unsigned(uint64_t)
171
+
172
+ #define divmod_signed(T, MIN) \
173
+ static inline DEVICE void \
174
+ _divmod(T *q, T *r, T a, T b) \
175
+ { \
176
+ if (b == 0 || (a == MIN && b == -1)) { \
177
+ *q = 0; \
178
+ *r = 0; \
179
+ } \
180
+ else { \
181
+ T qq = a / b; \
182
+ T rr = a % b; \
183
+ \
184
+ *q = rr ? (qq - ((a < 0) ^ (b < 0))) : qq; \
185
+ *r = a - *q * b; \
186
+ } \
187
+ }
188
+
189
+ divmod_signed(int8_t, INT8_MIN)
190
+ divmod_signed(int16_t, INT16_MIN)
191
+ divmod_signed(int32_t, INT32_MIN)
192
+ divmod_signed(int64_t, INT64_MIN)
193
+
194
+ template <class T>
195
+ static inline DEVICE T
196
+ _floor_divide(T a, T b)
197
+ {
198
+ T q;
199
+ T r;
200
+
201
+ _divmod(&q, &r, a, b);
202
+
203
+ return q;
204
+ }
205
+
206
+ template <class T>
207
+ static inline DEVICE T
208
+ _remainder(T a, T b)
209
+ {
210
+ T q;
211
+ T r;
212
+
213
+ _divmod(&q, &r, a, b);
214
+
215
+ return r;
216
+ }
217
+
218
+
219
+ /*****************************************************************************/
220
+ /* Abs */
221
+ /*****************************************************************************/
222
+
223
+ #ifdef __CUDACC__
224
+ #define abs_unsigned(T) \
225
+ static inline DEVICE T \
226
+ _abs(T x) \
227
+ { \
228
+ return x; \
229
+ }
230
+
231
+ abs_unsigned(bool)
232
+
233
+ abs_unsigned(uint8_t)
234
+ abs_unsigned(uint16_t)
235
+ abs_unsigned(uint32_t)
236
+ abs_unsigned(uint64_t)
237
+
238
+ #define abs_signed(T) \
239
+ static inline DEVICE T \
240
+ _abs(T x) \
241
+ { \
242
+ return x < 0 ? -x : x; \
243
+ }
244
+
245
+ abs_signed(int8_t)
246
+ abs_signed(int16_t)
247
+ abs_signed(int32_t)
248
+ abs_signed(int64_t)
249
+
250
+ static inline DEVICE float32_t
251
+ _abs(float32_t x)
252
+ {
253
+ return fabsf(x);
254
+ }
255
+
256
+ static inline DEVICE float64_t
257
+ _abs(float64_t x)
258
+ {
259
+ return fabs(x);
260
+ }
261
+
262
+ static inline DEVICE complex64_t
263
+ _abs(complex64_t x)
264
+ {
265
+ return thrust::abs(x);
266
+ }
267
+
268
+ static inline DEVICE complex128_t
269
+ _abs(complex128_t x)
270
+ {
271
+ return thrust::abs(x);
272
+ }
273
+ #endif
274
+
275
+
276
+ /*****************************************************************************/
277
+ /* Pow */
278
+ /*****************************************************************************/
279
+
280
+ #define pow_unsigned(name, T, mask) \
281
+ static inline DEVICE T \
282
+ name(T base, T exp) \
283
+ { \
284
+ uint64_t r = 1; \
285
+ \
286
+ while (exp > 0) { \
287
+ if (exp & 1) { \
288
+ r = (r * base) & mask; \
289
+ } \
290
+ base = (base * base) & mask; \
291
+ exp >>= 1; \
292
+ } \
293
+ \
294
+ return (T)r; \
295
+ }
296
+
297
+ pow_unsigned(_pow, uint8_t, UINT8_MAX)
298
+ pow_unsigned(_pow, uint16_t, UINT16_MAX)
299
+ pow_unsigned(_pow, uint32_t, UINT32_MAX)
300
+ pow_unsigned(_pow, uint64_t, UINT64_MAX)
301
+
302
+ pow_unsigned(_pow_int8_t, uint8_t, INT8_MAX)
303
+ pow_unsigned(_pow_int16_t, uint16_t, INT16_MAX)
304
+ pow_unsigned(_pow_int32_t, uint32_t, INT32_MAX)
305
+ pow_unsigned(_pow_int64_t, uint64_t, INT64_MAX)
306
+
307
+ #define pow_signed(T, U, MIN, MAX) \
308
+ static inline DEVICE T \
309
+ _pow(T ibase, T exp) \
310
+ { \
311
+ U base; \
312
+ T r; \
313
+ \
314
+ if (ibase < 0) { \
315
+ base = (U)(-ibase); \
316
+ r = (T)_pow_##T(base, exp); \
317
+ return (exp % 2 == 0) ? r : -r; \
318
+ } \
319
+ else { \
320
+ base = (U)ibase; \
321
+ return _pow_##T(base, exp); \
322
+ } \
323
+ }
324
+
325
+ pow_signed(int8_t, uint8_t, INT8_MIN, INT8_MAX)
326
+ pow_signed(int16_t, uint16_t, INT16_MIN, INT16_MAX)
327
+ pow_signed(int32_t, uint32_t, INT32_MIN, INT32_MAX)
328
+ pow_signed(int64_t, uint64_t, INT64_MIN, INT64_MAX)
329
+
330
+ static inline DEVICE bfloat16_t
331
+ _pow(bfloat16_t x, bfloat16_t y)
332
+ {
333
+ return (bfloat16_t)powf((float)x, (float)y);
334
+ }
335
+
336
+ static inline DEVICE float32_t
337
+ _pow(float32_t x, float32_t y)
338
+ {
339
+ return powf(x, y);
340
+ }
341
+
342
+ static inline DEVICE float64_t
343
+ _pow(float64_t x, float64_t y)
344
+ {
345
+ return pow(x, y);
346
+ }
347
+
348
#if defined(__CUDACC__)
/*
 * half power (nvcc only): both operands are converted with
 * __half2float(), pow() is applied and the result is converted back
 * with __float2half().
 */
static inline DEVICE half
_pow(half x, half y)
{
    return __float2half(
        pow(__half2float(x), __half2float(y)));
}
#endif
355
+
356
+
357
+ /*****************************************************************************/
358
+ /* Complex pow */
359
+ /*****************************************************************************/
360
+
361
+ #ifdef __CUDACC__
362
+ template <class T>
363
+ using Complex = thrust::complex<T>;
364
+
365
+ template <class T>
366
+ static inline DEVICE Complex<T>
367
+ _cpow(Complex<T> x, Complex<T> y)
368
+ {
369
+ return thrust::pow<T>(x, y);
370
+ }
371
+ #else
372
+ template <class T>
373
+ using Complex = std::complex<T>;
374
+
375
+ template <class T>
376
+ static inline DEVICE Complex<T>
377
+ _cpow(Complex<T> x, Complex<T> y)
378
+ {
379
+ return std::pow<T>(x, y);
380
+ }
381
+ #endif
382
+
383
+ static inline DEVICE double xhypot(double x, double y) { return hypot(x, y); }
384
+ static inline DEVICE double xpow(double x, double y) { return pow(x, y); }
385
+ static inline DEVICE double xatan2(double x, double y) { return atan2(x, y); }
386
+ static inline DEVICE double xexp(double x) { return exp(x); }
387
+ static inline DEVICE double xlog(double x) { return log(x); }
388
+ static inline DEVICE float xhypot(float x, float y) { return hypotf(x, y); }
389
+ static inline DEVICE float xpow(float x, float y) { return powf(x, y); }
390
+ static inline DEVICE float xatan2(float x, float y) { return atan2f(x, y); }
391
+ static inline DEVICE float xexp(float x) { return expf(x); }
392
+ static inline DEVICE float xlog(float x) { return logf(x); }
393
+
394
+
395
+ /* Python: complexobject.c */
396
+ template <class T>
397
+ static inline DEVICE Complex<T>
398
+ c_quot(const Complex<T> a, const Complex<T> b)
399
+ {
400
+ /* This algorithm is better, and is pretty obvious: first divide the
401
+ * numerators and denominator by whichever of {b.real, b.imag} has
402
+ * larger magnitude. The earliest reference I found was to CACM
403
+ * Algorithm 116 (Complex Division, Robert L. Smith, Stanford
404
+ * University). As usual, though, we're still ignoring all IEEE
405
+ * endcases.
406
+ */
407
+ const T abs_breal = b.real() < 0 ? -b.real() : b.real();
408
+ const T abs_bimag = b.imag() < 0 ? -b.imag() : b.imag();
409
+ T real, imag;
410
+
411
+ if (abs_breal >= abs_bimag) {
412
+ /* divide tops and bottom by b.real */
413
+ if (abs_breal == 0.0) {
414
+ // errno = EDOM;
415
+ real = imag = 0.0;
416
+ }
417
+ else {
418
+ const T ratio = b.imag() / b.real();
419
+ const T denom = b.real() + b.imag() * ratio;
420
+ real = (a.real() + a.imag() * ratio) / denom;
421
+ imag = (a.imag() - a.real() * ratio) / denom;
422
+ }
423
+ }
424
+ else if (abs_bimag >= abs_breal) {
425
+ /* divide tops and bottom by b.imag */
426
+ const T ratio = b.real() / b.imag();
427
+ const T denom = b.real() * ratio + b.imag();
428
+ real = (a.real() * ratio + a.imag()) / denom;
429
+ imag = (a.imag() * ratio - a.real()) / denom;
430
+ }
431
+ else {
432
+ /* At least one of b.real or b.imag is a NaN */
433
+ real = imag = NAN;
434
+ }
435
+
436
+ return Complex<T>{real, imag};
437
+ }
438
+
439
+ template <class T>
440
+ static inline DEVICE Complex<T>
441
+ c_pow(const Complex<T> a, const Complex<T> b)
442
+ {
443
+ if (b.real() == 0 && b.imag() == 0) {
444
+ return Complex<T>{1, 0};
445
+ }
446
+ else if (a.real() == 0 && a.imag() == 0) {
447
+ // if (b.imag() != 0 || b.real() < 0)
448
+ // errno = EDOM;
449
+ return Complex<T>{0, 0};
450
+ }
451
+ else {
452
+ T vabs = xhypot(a.real(), a.imag());
453
+ T len = xpow(vabs, b.real());
454
+ T at = xatan2(a.imag(), a.real());
455
+ T phase = at * b.real();
456
+
457
+ if (b.imag() != 0) {
458
+ len /= xexp(at * b.imag());
459
+ phase += b.imag() * xlog(vabs);
460
+ }
461
+
462
+ T real = len*cos(phase);
463
+ T imag = len*sin(phase);
464
+
465
+ return Complex<T>{real, imag};
466
+ }
467
+ }
468
+
469
+ template <class T>
470
+ static inline DEVICE Complex<T>
471
+ c_powu(Complex<T> base, uint64_t exp)
472
+ {
473
+ Complex<T> r{1, 0};
474
+
475
+ while (exp > 0) {
476
+ if (exp & 1) {
477
+ r = r * base;
478
+ }
479
+ base = base * base;
480
+ exp >>= 1;
481
+ }
482
+
483
+ return r;
484
+ }
485
+
486
+ template <class T>
487
+ static inline DEVICE Complex<T>
488
+ c_powi(Complex<T> x, int64_t n)
489
+ {
490
+ if (n > 99 || n < -99) {
491
+ Complex<T> y{(T)n, 0};
492
+ return c_pow(x, y);
493
+ }
494
+ else if (n > 0) {
495
+ return c_powu(x, (uint64_t)n);
496
+ }
497
+ else {
498
+ Complex<T> one{1, 0};
499
+ return c_quot(one, c_powu(x, (uint64_t)(-n)));
500
+ }
501
+ }
502
+
503
+ template <class T>
504
+ static inline DEVICE Complex<T>
505
+ complex_pow(Complex<T> a, Complex<T> exponent)
506
+ {
507
+ int64_t int_exponent;
508
+
509
+ int_exponent = (int64_t)exponent.real();
510
+ if (exponent.imag() == 0 && exponent.real() == int_exponent) {
511
+ return c_powi(a, int_exponent);
512
+ }
513
+ else {
514
+ return c_pow(a, exponent);
515
+ }
516
+ }
517
+
518
+ template <class T>
519
+ static inline DEVICE Complex<T>
520
+ _pow(Complex<T> x, Complex<T> y)
521
+ {
522
+ Complex<double> a = x;
523
+ Complex<double> b = y;
524
+ Complex<double> r = complex_pow(a, b);
525
+ return (Complex<T>)r;
526
+ }
527
+
528
+
529
+ /*****************************************************************************/
530
+ /* Lexicographic comparison for complex numbers */
531
+ /*****************************************************************************/
532
+
533
+ template <class T>
534
+ static inline DEVICE bool
535
+ _isnan(T a)
536
+ {
537
+ return ISNAN(a.real()) || ISNAN(a.imag());
538
+ }
539
+
540
+ template <class T, class U>
541
+ static inline DEVICE bool
542
+ lexorder_lt(T a, U b)
543
+ {
544
+ if (_isnan(a) || _isnan(b)) {
545
+ return false;
546
+ }
547
+
548
+ return a.real() < b.real() || (a.real() == b.real() && a.imag() < b.imag());
549
+ }
550
+
551
+ template <class T, class U>
552
+ static inline DEVICE bool
553
+ lexorder_le(T a, U b)
554
+ {
555
+ if (_isnan(a) || _isnan(b)) {
556
+ return false;
557
+ }
558
+
559
+ return a.real() < b.real() || (a.real() == b.real() && a.imag() <= b.imag());
560
+ }
561
+
562
+ template <class T, class U>
563
+ static inline DEVICE bool
564
+ lexorder_ge(T a, U b)
565
+ {
566
+ if (_isnan(a) || _isnan(b)) {
567
+ return false;
568
+ }
569
+
570
+ return a.real() > b.real() || (a.real() == b.real() && a.imag() >= b.imag());
571
+ }
572
+
573
+ template <class T, class U>
574
+ static inline DEVICE bool
575
+ lexorder_gt(T a, U b)
576
+ {
577
+ if (_isnan(a) || _isnan(b)) {
578
+ return false;
579
+ }
580
+
581
+ return a.real() > b.real() || (a.real() == b.real() && a.imag() > b.imag());
582
+ }
583
+
584
+ template <class T, class U>
585
+ static inline DEVICE bool
586
+ lexorder_eqn(T a, U b)
587
+ {
588
+ bool real_equal = a.real() == b.real() || (ISNAN(a.real()) && ISNAN(b.real()));
589
+ bool imag_equal = a.imag() == b.imag() || (ISNAN(a.imag()) && ISNAN(b.imag()));
590
+
591
+ return real_equal && imag_equal;
592
+ }
593
+
594
+
595
+ /*****************************************************************************/
596
+ /* Half equality */
597
+ /*****************************************************************************/
598
+
599
+ #ifdef __CUDACC__
600
+ static inline DEVICE bool
601
+ half_ne(half a, half b)
602
+ {
603
+ return !__heq(a, b);
604
+ }
605
+
606
+ static inline DEVICE bool
607
+ half_eqn(half a, half b)
608
+ {
609
+ return __heq(a, b) || (__hisnan(a) && __hisnan(b));
610
+ }
611
+ #endif
612
+
613
+
614
+ #endif /* DEVICE_HH */