gumath 0.2.0dev5 → 0.2.0dev8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (99) hide show
  1. checksums.yaml +4 -4
  2. data/CONTRIBUTING.md +7 -2
  3. data/Gemfile +0 -3
  4. data/ext/ruby_gumath/GPATH +0 -0
  5. data/ext/ruby_gumath/GRTAGS +0 -0
  6. data/ext/ruby_gumath/GTAGS +0 -0
  7. data/ext/ruby_gumath/extconf.rb +0 -5
  8. data/ext/ruby_gumath/functions.c +10 -2
  9. data/ext/ruby_gumath/gufunc_object.c +15 -4
  10. data/ext/ruby_gumath/gufunc_object.h +9 -3
  11. data/ext/ruby_gumath/gumath/Makefile +63 -0
  12. data/ext/ruby_gumath/gumath/Makefile.in +1 -0
  13. data/ext/ruby_gumath/gumath/config.h +56 -0
  14. data/ext/ruby_gumath/gumath/config.h.in +3 -0
  15. data/ext/ruby_gumath/gumath/config.log +497 -0
  16. data/ext/ruby_gumath/gumath/config.status +1034 -0
  17. data/ext/ruby_gumath/gumath/configure +375 -4
  18. data/ext/ruby_gumath/gumath/configure.ac +47 -3
  19. data/ext/ruby_gumath/gumath/libgumath/Makefile +236 -0
  20. data/ext/ruby_gumath/gumath/libgumath/Makefile.in +90 -24
  21. data/ext/ruby_gumath/gumath/libgumath/Makefile.vc +54 -15
  22. data/ext/ruby_gumath/gumath/libgumath/apply.c +92 -28
  23. data/ext/ruby_gumath/gumath/libgumath/apply.o +0 -0
  24. data/ext/ruby_gumath/gumath/libgumath/common.o +0 -0
  25. data/ext/ruby_gumath/gumath/libgumath/cpu_device_binary.o +0 -0
  26. data/ext/ruby_gumath/gumath/libgumath/cpu_device_unary.o +0 -0
  27. data/ext/ruby_gumath/gumath/libgumath/cpu_host_binary.o +0 -0
  28. data/ext/ruby_gumath/gumath/libgumath/cpu_host_unary.o +0 -0
  29. data/ext/ruby_gumath/gumath/libgumath/examples.o +0 -0
  30. data/ext/ruby_gumath/gumath/libgumath/extending/graph.c +27 -20
  31. data/ext/ruby_gumath/gumath/libgumath/extending/pdist.c +1 -1
  32. data/ext/ruby_gumath/gumath/libgumath/func.c +13 -9
  33. data/ext/ruby_gumath/gumath/libgumath/func.o +0 -0
  34. data/ext/ruby_gumath/gumath/libgumath/graph.o +0 -0
  35. data/ext/ruby_gumath/gumath/libgumath/gumath.h +55 -14
  36. data/ext/ruby_gumath/gumath/libgumath/kernels/common.c +513 -0
  37. data/ext/ruby_gumath/gumath/libgumath/kernels/common.h +155 -0
  38. data/ext/ruby_gumath/gumath/libgumath/kernels/contrib/bfloat16.h +520 -0
  39. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.cc +1123 -0
  40. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.h +1062 -0
  41. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_msvc.cc +555 -0
  42. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.cc +368 -0
  43. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.h +335 -0
  44. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_binary.c +2952 -0
  45. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_unary.c +1100 -0
  46. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.cu +1143 -0
  47. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.h +1061 -0
  48. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.cu +528 -0
  49. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.h +463 -0
  50. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_binary.c +2817 -0
  51. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_unary.c +1331 -0
  52. data/ext/ruby_gumath/gumath/libgumath/kernels/device.hh +614 -0
  53. data/ext/ruby_gumath/gumath/libgumath/libgumath.a +0 -0
  54. data/ext/ruby_gumath/gumath/libgumath/libgumath.so +1 -0
  55. data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0 +1 -0
  56. data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0.2.0dev3 +0 -0
  57. data/ext/ruby_gumath/gumath/libgumath/nploops.o +0 -0
  58. data/ext/ruby_gumath/gumath/libgumath/pdist.o +0 -0
  59. data/ext/ruby_gumath/gumath/libgumath/quaternion.o +0 -0
  60. data/ext/ruby_gumath/gumath/libgumath/tbl.o +0 -0
  61. data/ext/ruby_gumath/gumath/libgumath/thread.c +17 -4
  62. data/ext/ruby_gumath/gumath/libgumath/thread.o +0 -0
  63. data/ext/ruby_gumath/gumath/libgumath/xndloops.c +110 -0
  64. data/ext/ruby_gumath/gumath/libgumath/xndloops.o +0 -0
  65. data/ext/ruby_gumath/gumath/python/gumath/__init__.py +150 -0
  66. data/ext/ruby_gumath/gumath/python/gumath/_gumath.c +446 -80
  67. data/ext/ruby_gumath/gumath/python/gumath/cuda.c +78 -0
  68. data/ext/ruby_gumath/gumath/python/gumath/examples.c +0 -5
  69. data/ext/ruby_gumath/gumath/python/gumath/functions.c +2 -2
  70. data/ext/ruby_gumath/gumath/python/gumath/gumath.h +246 -0
  71. data/ext/ruby_gumath/gumath/python/gumath/libgumath.a +0 -0
  72. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so +1 -0
  73. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0 +1 -0
  74. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0.2.0dev3 +0 -0
  75. data/ext/ruby_gumath/gumath/python/gumath/pygumath.h +31 -2
  76. data/ext/ruby_gumath/gumath/python/gumath_aux.py +767 -0
  77. data/ext/ruby_gumath/gumath/python/randdec.py +535 -0
  78. data/ext/ruby_gumath/gumath/python/randfloat.py +177 -0
  79. data/ext/ruby_gumath/gumath/python/test_gumath.py +1504 -24
  80. data/ext/ruby_gumath/gumath/python/test_xndarray.py +462 -0
  81. data/ext/ruby_gumath/gumath/setup.py +67 -6
  82. data/ext/ruby_gumath/gumath/tools/detect_cuda_arch.cc +35 -0
  83. data/ext/ruby_gumath/include/gumath.h +55 -14
  84. data/ext/ruby_gumath/include/ruby_gumath.h +4 -1
  85. data/ext/ruby_gumath/lib/libgumath.a +0 -0
  86. data/ext/ruby_gumath/lib/libgumath.so.0.2.0dev3 +0 -0
  87. data/ext/ruby_gumath/ruby_gumath.c +231 -70
  88. data/ext/ruby_gumath/ruby_gumath.h +4 -1
  89. data/ext/ruby_gumath/ruby_gumath_internal.h +25 -0
  90. data/ext/ruby_gumath/util.c +34 -0
  91. data/ext/ruby_gumath/util.h +9 -0
  92. data/gumath.gemspec +3 -2
  93. data/lib/gumath.rb +55 -1
  94. data/lib/gumath/version.rb +2 -2
  95. data/lib/ruby_gumath.so +0 -0
  96. metadata +63 -10
  97. data/ext/ruby_gumath/gumath/libgumath/extending/bfloat16.c +0 -130
  98. data/ext/ruby_gumath/gumath/libgumath/kernels/binary.c +0 -547
  99. data/ext/ruby_gumath/gumath/libgumath/kernels/unary.c +0 -449
@@ -0,0 +1,1143 @@
1
+ /*
2
+ * BSD 3-Clause License
3
+ *
4
+ * Copyright (c) 2017-2018, plures
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ *
10
+ * 1. Redistributions of source code must retain the above copyright notice,
11
+ * this list of conditions and the following disclaimer.
12
+ *
13
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
14
+ * this list of conditions and the following disclaimer in the documentation
15
+ * and/or other materials provided with the distribution.
16
+ *
17
+ * 3. Neither the name of the copyright holder nor the names of its
18
+ * contributors may be used to endorse or promote products derived from
19
+ * this software without specific prior written permission.
20
+ *
21
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ */
32
+
33
+
34
+ #include <cinttypes>
35
+ #include <thrust/complex.h>
36
+ #include "contrib/bfloat16.h"
37
+ #include "cuda_device_binary.h"
38
+ #include "device.hh"
39
+
40
+
41
+ /*****************************************************************************/
42
+ /* CUDA device binary kernels */
43
+ /*****************************************************************************/
44
+
45
/*
 * Kernel generators for one (t0, t1) -> t2 signature of a binary operation.
 *
 * CUDA_DEVICE_BINARY(name, func, t0, t1, t2, common) expands to three
 * static __global__ kernels and their extern "C" launchers:
 *
 *   gm_cuda_device_fixed_1D_C_<name>_<t0>_<t1>_<t2>(a0, a1, a2, N)
 *       x2[i] = func(x0[i], x1[i]) for 0 <= i < N.
 *
 *   gm_cuda_device_fixed_1D_S_<name>_<t0>_<t1>_<t2>(a0, a1, a2,
 *                                                   s0, s1, s2, N)
 *       Same, but element i of operand k is accessed at index i * sk of the
 *       typed pointer (the strides multiply element indices, not bytes).
 *
 *   gm_cuda_device_0D_<name>_<t0>_<t1>_<t2>(a0, a1, a2)
 *       One element, launched as <<<1, 1>>>.
 *
 * The char pointers are reinterpreted as t0##_t / t1##_t / t2##_t, and both
 * inputs are cast to common##_t before func is applied.  The launchers use
 * no explicit stream and do not query the launch status; callers are
 * expected to do their own error checking.
 *
 * The 1D launchers return without launching when N <= 0 (a grid of zero
 * blocks is not a valid launch configuration), and clamp the block count to
 * the largest grid x-dimension; the kernels loop with a stride of the total
 * thread count, so a clamped grid still visits every element.
 *
 * CUDA_DEVICE_NOIMPL and CUDA_DEVICE_NOKERN take the same arguments and
 * expand to nothing: they keep a signature listed in the type tables below
 * without generating a kernel for it in this file.
 */
#define CUDA_DEVICE_BINARY(name, func, t0, t1, t2, common) \
static __global__ void \
_1D_C_##name##_##t0##_##t1##_##t2( \
    const t0##_t *x0, const t1##_t *x1, t2##_t *x2, \
    const int64_t N) \
{ \
    /* Widen before multiplying: the built-in index variables are 32-bit */ \
    /* unsigned, so their product would wrap for very large grids.       */ \
    const int64_t first = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; \
    const int64_t step = (int64_t)blockDim.x * gridDim.x; \
    \
    for (int64_t i = first; i < N; i += step) { \
        x2[i] = func((common##_t)x0[i], (common##_t)x1[i]); \
    } \
} \
\
extern "C" void \
gm_cuda_device_fixed_1D_C_##name##_##t0##_##t1##_##t2( \
    const char *a0, const char *a1, char *a2, \
    const int64_t N) \
{ \
    const t0##_t *x0 = (const t0##_t *)a0; \
    const t1##_t *x1 = (const t1##_t *)a1; \
    t2##_t *x2 = (t2##_t *)a2; \
    const int blockSize = 256; \
    int64_t numBlocks; \
    \
    /* Nothing to compute; a zero-block launch would be rejected. */ \
    if (N <= 0) { \
        return; \
    } \
    \
    /* Ceiling division written so that it cannot overflow near INT64_MAX. */ \
    numBlocks = N / blockSize + (N % blockSize != 0); \
    /* Largest grid x-dimension; the kernel's stride loop covers the rest. */ \
    if (numBlocks > 2147483647) { \
        numBlocks = 2147483647; \
    } \
    \
    _1D_C_##name##_##t0##_##t1##_##t2<<<(unsigned int)numBlocks, blockSize>>>( \
        x0, x1, x2, N); \
} \
\
static __global__ void \
_1D_S_##name##_##t0##_##t1##_##t2( \
    const t0##_t *x0, const t1##_t *x1, t2##_t *x2, \
    const int64_t s0, const int64_t s1, const int64_t s2, \
    const int64_t N) \
{ \
    /* 64-bit index arithmetic, see the contiguous kernel above. */ \
    const int64_t first = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; \
    const int64_t step = (int64_t)blockDim.x * gridDim.x; \
    \
    for (int64_t i = first; i < N; i += step) { \
        const int64_t i0 = i * s0; \
        const int64_t i1 = i * s1; \
        const int64_t i2 = i * s2; \
        x2[i2] = func((common##_t)x0[i0], (common##_t)x1[i1]); \
    } \
} \
\
extern "C" void \
gm_cuda_device_fixed_1D_S_##name##_##t0##_##t1##_##t2( \
    const char *a0, const char *a1, char *a2, \
    const int64_t s0, const int64_t s1, const int64_t s2, \
    const int64_t N) \
{ \
    const t0##_t *x0 = (const t0##_t *)a0; \
    const t1##_t *x1 = (const t1##_t *)a1; \
    t2##_t *x2 = (t2##_t *)a2; \
    const int blockSize = 256; \
    int64_t numBlocks; \
    \
    /* Nothing to compute; a zero-block launch would be rejected. */ \
    if (N <= 0) { \
        return; \
    } \
    \
    /* Ceiling division written so that it cannot overflow near INT64_MAX. */ \
    numBlocks = N / blockSize + (N % blockSize != 0); \
    /* Largest grid x-dimension; the kernel's stride loop covers the rest. */ \
    if (numBlocks > 2147483647) { \
        numBlocks = 2147483647; \
    } \
    \
    _1D_S_##name##_##t0##_##t1##_##t2<<<(unsigned int)numBlocks, blockSize>>>( \
        x0, x1, x2, s0, s1, s2, N); \
} \
\
static __global__ void \
_0D_##name##_##t0##_##t1##_##t2(const t0##_t *x0, const t1##_t *x1, t2##_t *x2) \
{ \
    *x2 = func((common##_t)*x0, (common##_t)*x1); \
} \
\
extern "C" void \
gm_cuda_device_0D_##name##_##t0##_##t1##_##t2( \
    const char *a0, const char *a1, char *a2) \
{ \
    const t0##_t *x0 = (const t0##_t *)a0; \
    const t1##_t *x1 = (const t1##_t *)a1; \
    t2##_t *x2 = (t2##_t *)a2; \
    \
    _0D_##name##_##t0##_##t1##_##t2<<<1, 1>>>(x0, x1, x2); \
}

/* Same argument list as CUDA_DEVICE_BINARY, but generate nothing. */
#define CUDA_DEVICE_NOIMPL(name, func, t0, t1, t2, common)
#define CUDA_DEVICE_NOKERN(name, func, t0, t1, t2, common)
125
+
126
+
127
+ /*****************************************************************************/
128
+ /* Arithmetic */
129
+ /*****************************************************************************/
130
+
131
/*
 * CUDA_DEVICE_ALL_BINARY(name, func, hfunc)
 *
 * Instantiates the kernels for the binary operation `name` over the full
 * table of input type pairs.  Each row reads
 *
 *     (name, function, t0, t1, result type, common type)
 *
 * where both inputs are cast to the common type before the function is
 * applied.  Rows whose common type is float16 use `hfunc`; all other rows
 * use `func`.  Every row that involves complex32 is CUDA_DEVICE_NOIMPL,
 * i.e. it is listed but no kernel is generated for it.  uint64 is only
 * combined with unsigned types, and int64 only with the other integer types
 * apart from uint64.
 */
#define CUDA_DEVICE_ALL_BINARY(name, func, hfunc) \
    CUDA_DEVICE_BINARY(name, func, uint8, uint8, uint8, uint8) \
    CUDA_DEVICE_BINARY(name, func, uint8, uint16, uint16, uint16) \
    CUDA_DEVICE_BINARY(name, func, uint8, uint32, uint32, uint32) \
    CUDA_DEVICE_BINARY(name, func, uint8, uint64, uint64, uint64) \
    CUDA_DEVICE_BINARY(name, func, uint8, int8, int16, int16) \
    CUDA_DEVICE_BINARY(name, func, uint8, int16, int16, int16) \
    CUDA_DEVICE_BINARY(name, func, uint8, int32, int32, int32) \
    CUDA_DEVICE_BINARY(name, func, uint8, int64, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, uint8, bfloat16, bfloat16, bfloat16) \
    CUDA_DEVICE_BINARY(name, hfunc, uint8, float16, float16, float16) \
    CUDA_DEVICE_BINARY(name, func, uint8, float32, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, uint8, float64, float64, float64) \
    CUDA_DEVICE_NOIMPL(name, func, uint8, complex32, complex32, complex32) \
    CUDA_DEVICE_BINARY(name, func, uint8, complex64, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, uint8, complex128, complex128, complex128) \
    \
    CUDA_DEVICE_BINARY(name, func, uint16, uint8, uint16, uint16) \
    CUDA_DEVICE_BINARY(name, func, uint16, uint16, uint16, uint16) \
    CUDA_DEVICE_BINARY(name, func, uint16, uint32, uint32, uint32) \
    CUDA_DEVICE_BINARY(name, func, uint16, uint64, uint64, uint64) \
    CUDA_DEVICE_BINARY(name, func, uint16, int8, int32, int32) \
    CUDA_DEVICE_BINARY(name, func, uint16, int16, int32, int32) \
    CUDA_DEVICE_BINARY(name, func, uint16, int32, int32, int32) \
    CUDA_DEVICE_BINARY(name, func, uint16, int64, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, uint16, bfloat16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, uint16, float16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, uint16, float32, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, uint16, float64, float64, float64) \
    CUDA_DEVICE_NOIMPL(name, func, uint16, complex32, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, uint16, complex64, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, uint16, complex128, complex128, complex128) \
    \
    CUDA_DEVICE_BINARY(name, func, uint32, uint8, uint32, uint32) \
    CUDA_DEVICE_BINARY(name, func, uint32, uint16, uint32, uint32) \
    CUDA_DEVICE_BINARY(name, func, uint32, uint32, uint32, uint32) \
    CUDA_DEVICE_BINARY(name, func, uint32, uint64, uint64, uint64) \
    CUDA_DEVICE_BINARY(name, func, uint32, int8, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, uint32, int16, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, uint32, int32, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, uint32, int64, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, uint32, bfloat16, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, uint32, float16, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, uint32, float32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, uint32, float64, float64, float64) \
    CUDA_DEVICE_NOIMPL(name, func, uint32, complex32, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, uint32, complex64, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, uint32, complex128, complex128, complex128) \
    \
    CUDA_DEVICE_BINARY(name, func, uint64, uint8, uint64, uint64) \
    CUDA_DEVICE_BINARY(name, func, uint64, uint16, uint64, uint64) \
    CUDA_DEVICE_BINARY(name, func, uint64, uint32, uint64, uint64) \
    CUDA_DEVICE_BINARY(name, func, uint64, uint64, uint64, uint64) \
    \
    CUDA_DEVICE_BINARY(name, func, int8, uint8, int16, int16) \
    CUDA_DEVICE_BINARY(name, func, int8, uint16, int32, int32) \
    CUDA_DEVICE_BINARY(name, func, int8, uint32, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, int8, int8, int8, int8) \
    CUDA_DEVICE_BINARY(name, func, int8, int16, int16, int16) \
    CUDA_DEVICE_BINARY(name, func, int8, int32, int32, int32) \
    CUDA_DEVICE_BINARY(name, func, int8, int64, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, int8, bfloat16, bfloat16, bfloat16) \
    CUDA_DEVICE_BINARY(name, hfunc, int8, float16, float16, float16) \
    CUDA_DEVICE_BINARY(name, func, int8, float32, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, int8, float64, float64, float64) \
    CUDA_DEVICE_NOIMPL(name, func, int8, complex32, complex32, complex32) \
    CUDA_DEVICE_BINARY(name, func, int8, complex64, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, int8, complex128, complex128, complex128) \
    \
    CUDA_DEVICE_BINARY(name, func, int16, uint8, int16, int16) \
    CUDA_DEVICE_BINARY(name, func, int16, uint16, int32, int32) \
    CUDA_DEVICE_BINARY(name, func, int16, uint32, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, int16, int8, int16, int16) \
    CUDA_DEVICE_BINARY(name, func, int16, int16, int16, int16) \
    CUDA_DEVICE_BINARY(name, func, int16, int32, int32, int32) \
    CUDA_DEVICE_BINARY(name, func, int16, int64, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, int16, bfloat16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, int16, float16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, int16, float32, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, int16, float64, float64, float64) \
    CUDA_DEVICE_NOIMPL(name, func, int16, complex32, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, int16, complex64, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, int16, complex128, complex128, complex128) \
    \
    CUDA_DEVICE_BINARY(name, func, int32, uint8, int32, int32) \
    CUDA_DEVICE_BINARY(name, func, int32, uint16, int32, int32) \
    CUDA_DEVICE_BINARY(name, func, int32, uint32, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, int32, int8, int32, int32) \
    CUDA_DEVICE_BINARY(name, func, int32, int16, int32, int32) \
    CUDA_DEVICE_BINARY(name, func, int32, int32, int32, int32) \
    CUDA_DEVICE_BINARY(name, func, int32, int64, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, int32, bfloat16, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, int32, float16, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, int32, float32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, int32, float64, float64, float64) \
    CUDA_DEVICE_NOIMPL(name, func, int32, complex32, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, int32, complex64, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, int32, complex128, complex128, complex128) \
    \
    CUDA_DEVICE_BINARY(name, func, int64, uint8, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, int64, uint16, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, int64, uint32, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, int64, int8, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, int64, int16, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, int64, int32, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, int64, int64, int64, int64) \
    \
    CUDA_DEVICE_BINARY(name, func, bfloat16, uint8, bfloat16, bfloat16) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, uint16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, uint32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, int8, bfloat16, bfloat16) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, int16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, int32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, bfloat16, bfloat16, bfloat16) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, float16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, float32, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, float64, float64, float64) \
    /* Result type matches the mirrored (complex32, bfloat16) row. */ \
    CUDA_DEVICE_NOIMPL(name, func, bfloat16, complex32, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, complex64, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, complex128, complex128, complex128) \
    \
    CUDA_DEVICE_BINARY(name, hfunc, float16, uint8, float16, float16) \
    CUDA_DEVICE_BINARY(name, func, float16, uint16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, float16, uint32, float64, float64) \
    CUDA_DEVICE_BINARY(name, hfunc, float16, int8, float16, float16) \
    CUDA_DEVICE_BINARY(name, func, float16, int16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, float16, int32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float16, bfloat16, float32, float32) \
    CUDA_DEVICE_BINARY(name, hfunc, float16, float16, float16, float16) \
    CUDA_DEVICE_BINARY(name, func, float16, float32, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, float16, float64, float64, float64) \
    CUDA_DEVICE_NOIMPL(name, func, float16, complex32, complex32, complex32) \
    CUDA_DEVICE_BINARY(name, func, float16, complex64, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, float16, complex128, complex128, complex128) \
    \
    CUDA_DEVICE_BINARY(name, func, float32, uint8, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, float32, uint16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, float32, uint32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float32, int8, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, float32, int16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, float32, int32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float32, bfloat16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, float32, float16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, float32, float32, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, float32, float64, float64, float64) \
    CUDA_DEVICE_NOIMPL(name, func, float32, complex32, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, float32, complex64, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, float32, complex128, complex128, complex128) \
    \
    CUDA_DEVICE_BINARY(name, func, float64, uint8, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float64, uint16, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float64, uint32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float64, int8, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float64, int16, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float64, int32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float64, bfloat16, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float64, float16, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float64, float32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float64, float64, float64, float64) \
    CUDA_DEVICE_NOIMPL(name, func, float64, complex32, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, float64, complex64, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, float64, complex128, complex128, complex128) \
    \
    CUDA_DEVICE_NOIMPL(name, func, complex32, uint8, complex32, complex32) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, uint16, complex64, complex64) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, uint32, complex128, complex128) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, int8, complex32, complex32) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, int16, complex64, complex64) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, int32, complex128, complex128) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, bfloat16, complex64, complex64) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, float16, complex32, complex32) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, float32, complex64, complex64) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, float64, complex128, complex128) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, complex32, complex32, complex32) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, complex64, complex64, complex64) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, complex128, complex128, complex128) \
    \
    CUDA_DEVICE_BINARY(name, func, complex64, uint8, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, complex64, uint16, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, complex64, uint32, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex64, int8, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, complex64, int16, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, complex64, int32, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex64, bfloat16, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, complex64, float16, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, complex64, float32, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, complex64, float64, complex128, complex128) \
    CUDA_DEVICE_NOIMPL(name, func, complex64, complex32, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, complex64, complex64, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, complex64, complex128, complex128, complex128) \
    \
    CUDA_DEVICE_BINARY(name, func, complex128, uint8, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, uint16, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, uint32, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, int8, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, int16, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, int32, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, bfloat16, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, float16, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, float32, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, float64, complex128, complex128) \
    CUDA_DEVICE_NOIMPL(name, func, complex128, complex32, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, complex64, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, complex128, complex128, complex128)
335
+
336
+ #define CUDA_DEVICE_ALL_BINARY_NO_COMPLEX(name, func, hfunc) \
337
+ CUDA_DEVICE_BINARY(name, func, uint8, uint8, uint8, uint8) \
338
+ CUDA_DEVICE_BINARY(name, func, uint8, uint16, uint16, uint16) \
339
+ CUDA_DEVICE_BINARY(name, func, uint8, uint32, uint32, uint32) \
340
+ CUDA_DEVICE_BINARY(name, func, uint8, uint64, uint64, uint64) \
341
+ CUDA_DEVICE_BINARY(name, func, uint8, int8, int16, int16) \
342
+ CUDA_DEVICE_BINARY(name, func, uint8, int16, int16, int16) \
343
+ CUDA_DEVICE_BINARY(name, func, uint8, int32, int32, int32) \
344
+ CUDA_DEVICE_BINARY(name, func, uint8, int64, int64, int64) \
345
+ CUDA_DEVICE_BINARY(name, func, uint8, bfloat16, bfloat16, bfloat16) \
346
+ CUDA_DEVICE_NOIMPL(name, hfunc, uint8, float16, float16, float16) \
347
+ CUDA_DEVICE_BINARY(name, func, uint8, float32, float32, float32) \
348
+ CUDA_DEVICE_BINARY(name, func, uint8, float64, float64, float64) \
349
+ CUDA_DEVICE_NOKERN(name, func, uint8, complex32, complex32, complex32) \
350
+ CUDA_DEVICE_NOKERN(name, func, uint8, complex64, complex64, complex64) \
351
+ CUDA_DEVICE_NOKERN(name, func, uint8, complex128, complex128, complex128) \
352
+ \
353
+ CUDA_DEVICE_BINARY(name, func, uint16, uint8, uint16, uint16) \
354
+ CUDA_DEVICE_BINARY(name, func, uint16, uint16, uint16, uint16) \
355
+ CUDA_DEVICE_BINARY(name, func, uint16, uint32, uint32, uint32) \
356
+ CUDA_DEVICE_BINARY(name, func, uint16, uint64, uint64, uint64) \
357
+ CUDA_DEVICE_BINARY(name, func, uint16, int8, int32, int32) \
358
+ CUDA_DEVICE_BINARY(name, func, uint16, int16, int32, int32) \
359
+ CUDA_DEVICE_BINARY(name, func, uint16, int32, int32, int32) \
360
+ CUDA_DEVICE_BINARY(name, func, uint16, int64, int64, int64) \
361
+ CUDA_DEVICE_BINARY(name, func, uint16, bfloat16, float32, float32) \
362
+ CUDA_DEVICE_BINARY(name, func, uint16, float16, float32, float32) \
363
+ CUDA_DEVICE_BINARY(name, func, uint16, float32, float32, float32) \
364
+ CUDA_DEVICE_BINARY(name, func, uint16, float64, float64, float64) \
365
+ CUDA_DEVICE_NOKERN(name, func, uint16, complex32, complex64, complex64) \
366
+ CUDA_DEVICE_NOKERN(name, func, uint16, complex64, complex64, complex64) \
367
+ CUDA_DEVICE_NOKERN(name, func, uint16, complex128, complex128, complex128) \
368
+ \
369
+ CUDA_DEVICE_BINARY(name, func, uint32, uint8, uint32, uint32) \
370
+ CUDA_DEVICE_BINARY(name, func, uint32, uint16, uint32, uint32) \
371
+ CUDA_DEVICE_BINARY(name, func, uint32, uint32, uint32, uint32) \
372
+ CUDA_DEVICE_BINARY(name, func, uint32, uint64, uint64, uint64) \
373
+ CUDA_DEVICE_BINARY(name, func, uint32, int8, int64, int64) \
374
+ CUDA_DEVICE_BINARY(name, func, uint32, int16, int64, int64) \
375
+ CUDA_DEVICE_BINARY(name, func, uint32, int32, int64, int64) \
376
+ CUDA_DEVICE_BINARY(name, func, uint32, int64, int64, int64) \
377
+ CUDA_DEVICE_BINARY(name, func, uint32, bfloat16, float64, float64) \
378
+ CUDA_DEVICE_BINARY(name, func, uint32, float16, float64, float64) \
379
+ CUDA_DEVICE_BINARY(name, func, uint32, float32, float64, float64) \
380
+ CUDA_DEVICE_BINARY(name, func, uint32, float64, float64, float64) \
381
+ CUDA_DEVICE_NOKERN(name, func, uint32, complex32, complex128, complex128) \
382
+ CUDA_DEVICE_NOKERN(name, func, uint32, complex64, complex128, complex128) \
383
+ CUDA_DEVICE_NOKERN(name, func, uint32, complex128, complex128, complex128) \
384
+ \
385
+ CUDA_DEVICE_BINARY(name, func, uint64, uint8, uint64, uint64) \
386
+ CUDA_DEVICE_BINARY(name, func, uint64, uint16, uint64, uint64) \
387
+ CUDA_DEVICE_BINARY(name, func, uint64, uint32, uint64, uint64) \
388
+ CUDA_DEVICE_BINARY(name, func, uint64, uint64, uint64, uint64) \
389
+ \
390
+ CUDA_DEVICE_BINARY(name, func, int8, uint8, int16, int16) \
391
+ CUDA_DEVICE_BINARY(name, func, int8, uint16, int32, int32) \
392
+ CUDA_DEVICE_BINARY(name, func, int8, uint32, int64, int64) \
393
+ CUDA_DEVICE_BINARY(name, func, int8, int8, int8, int8) \
394
+ CUDA_DEVICE_BINARY(name, func, int8, int16, int16, int16) \
395
+ CUDA_DEVICE_BINARY(name, func, int8, int32, int32, int32) \
396
+ CUDA_DEVICE_BINARY(name, func, int8, int64, int64, int64) \
397
+ CUDA_DEVICE_BINARY(name, func, int8, bfloat16, bfloat16, bfloat16) \
398
+ CUDA_DEVICE_NOIMPL(name, hfunc, int8, float16, float16, float16) \
399
+ CUDA_DEVICE_BINARY(name, func, int8, float32, float32, float32) \
400
+ CUDA_DEVICE_BINARY(name, func, int8, float64, float64, float64) \
401
+ CUDA_DEVICE_NOKERN(name, func, int8, complex32, complex32, complex32) \
402
+ CUDA_DEVICE_NOKERN(name, func, int8, complex64, complex64, complex64) \
403
+ CUDA_DEVICE_NOKERN(name, func, int8, complex128, complex128, complex128) \
404
+ \
405
+ CUDA_DEVICE_BINARY(name, func, int16, uint8, int16, int16) \
406
+ CUDA_DEVICE_BINARY(name, func, int16, uint16, int32, int32) \
407
+ CUDA_DEVICE_BINARY(name, func, int16, uint32, int64, int64) \
408
+ CUDA_DEVICE_BINARY(name, func, int16, int8, int16, int16) \
409
+ CUDA_DEVICE_BINARY(name, func, int16, int16, int16, int16) \
410
+ CUDA_DEVICE_BINARY(name, func, int16, int32, int32, int32) \
411
+ CUDA_DEVICE_BINARY(name, func, int16, int64, int64, int64) \
412
+ CUDA_DEVICE_BINARY(name, func, int16, bfloat16, float32, float32) \
413
+ CUDA_DEVICE_BINARY(name, func, int16, float16, float32, float32) \
414
+ CUDA_DEVICE_BINARY(name, func, int16, float32, float32, float32) \
415
+ CUDA_DEVICE_BINARY(name, func, int16, float64, float64, float64) \
416
+ CUDA_DEVICE_NOKERN(name, func, int16, complex32, complex64, complex64) \
417
+ CUDA_DEVICE_NOKERN(name, func, int16, complex64, complex64, complex64) \
418
+ CUDA_DEVICE_NOKERN(name, func, int16, complex128, complex128, complex128) \
419
+ \
420
+ CUDA_DEVICE_BINARY(name, func, int32, uint8, int32, int32) \
421
+ CUDA_DEVICE_BINARY(name, func, int32, uint16, int32, int32) \
422
+ CUDA_DEVICE_BINARY(name, func, int32, uint32, int64, int64) \
423
+ CUDA_DEVICE_BINARY(name, func, int32, int8, int32, int32) \
424
+ CUDA_DEVICE_BINARY(name, func, int32, int16, int32, int32) \
425
+ CUDA_DEVICE_BINARY(name, func, int32, int32, int32, int32) \
426
+ CUDA_DEVICE_BINARY(name, func, int32, int64, int64, int64) \
427
+ CUDA_DEVICE_BINARY(name, func, int32, bfloat16, float64, float64) \
428
+ CUDA_DEVICE_BINARY(name, func, int32, float16, float64, float64) \
429
+ CUDA_DEVICE_BINARY(name, func, int32, float32, float64, float64) \
430
+ CUDA_DEVICE_BINARY(name, func, int32, float64, float64, float64) \
431
+ CUDA_DEVICE_NOKERN(name, func, int32, complex32, complex128, complex128) \
432
+ CUDA_DEVICE_NOKERN(name, func, int32, complex64, complex128, complex128) \
433
+ CUDA_DEVICE_NOKERN(name, func, int32, complex128, complex128, complex128) \
434
+ \
435
+ CUDA_DEVICE_BINARY(name, func, int64, uint8, int64, int64) \
436
+ CUDA_DEVICE_BINARY(name, func, int64, uint16, int64, int64) \
437
+ CUDA_DEVICE_BINARY(name, func, int64, uint32, int64, int64) \
438
+ CUDA_DEVICE_BINARY(name, func, int64, int8, int64, int64) \
439
+ CUDA_DEVICE_BINARY(name, func, int64, int16, int64, int64) \
440
+ CUDA_DEVICE_BINARY(name, func, int64, int32, int64, int64) \
441
+ CUDA_DEVICE_BINARY(name, func, int64, int64, int64, int64) \
442
+ \
443
+ CUDA_DEVICE_BINARY(name, func, bfloat16, uint8, bfloat16, bfloat16) \
444
+ CUDA_DEVICE_BINARY(name, func, bfloat16, uint16, float32, float32) \
445
+ CUDA_DEVICE_BINARY(name, func, bfloat16, uint32, float64, float64) \
446
+ CUDA_DEVICE_BINARY(name, func, bfloat16, int8, bfloat16, bfloat16) \
447
+ CUDA_DEVICE_BINARY(name, func, bfloat16, int16, float32, float32) \
448
+ CUDA_DEVICE_BINARY(name, func, bfloat16, int32, float64, float64) \
449
+ CUDA_DEVICE_BINARY(name, func, bfloat16, bfloat16, bfloat16, bfloat16) \
450
+ CUDA_DEVICE_BINARY(name, func, bfloat16, float16, float32, float32) \
451
+ CUDA_DEVICE_BINARY(name, func, bfloat16, float32, float32, float32) \
452
+ CUDA_DEVICE_BINARY(name, func, bfloat16, float64, float64, float64) \
453
+ CUDA_DEVICE_NOKERN(name, func, bfloat16, complex32, complex32, complex32) \
454
+ CUDA_DEVICE_NOKERN(name, func, bfloat16, complex64, complex64, complex64) \
455
+ CUDA_DEVICE_NOKERN(name, func, bfloat16, complex128, complex128, complex128) \
456
+ \
457
+ CUDA_DEVICE_NOIMPL(name, hfunc, float16, uint8, float16, float16) \
458
+ CUDA_DEVICE_BINARY(name, func, float16, uint16, float32, float32) \
459
+ CUDA_DEVICE_BINARY(name, func, float16, uint32, float64, float64) \
460
+ CUDA_DEVICE_NOIMPL(name, hfunc, float16, int8, float16, float16) \
461
+ CUDA_DEVICE_BINARY(name, func, float16, int16, float32, float32) \
462
+ CUDA_DEVICE_BINARY(name, func, float16, int32, float64, float64) \
463
+ CUDA_DEVICE_BINARY(name, func, float16, bfloat16, float32, float32) \
464
+ CUDA_DEVICE_NOIMPL(name, hfunc, float16, float16, float16, float16) \
465
+ CUDA_DEVICE_BINARY(name, func, float16, float32, float32, float32) \
466
+ CUDA_DEVICE_BINARY(name, func, float16, float64, float64, float64) \
467
+ CUDA_DEVICE_NOKERN(name, func, float16, complex32, complex32, complex32) \
468
+ CUDA_DEVICE_NOKERN(name, func, float16, complex64, complex64, complex64) \
469
+ CUDA_DEVICE_NOKERN(name, func, float16, complex128, complex128, complex128) \
470
+ \
471
+ CUDA_DEVICE_BINARY(name, func, float32, uint8, float32, float32) \
472
+ CUDA_DEVICE_BINARY(name, func, float32, uint16, float32, float32) \
473
+ CUDA_DEVICE_BINARY(name, func, float32, uint32, float64, float64) \
474
+ CUDA_DEVICE_BINARY(name, func, float32, int8, float32, float32) \
475
+ CUDA_DEVICE_BINARY(name, func, float32, int16, float32, float32) \
476
+ CUDA_DEVICE_BINARY(name, func, float32, int32, float64, float64) \
477
+ CUDA_DEVICE_BINARY(name, func, float32, bfloat16, float32, float32) \
478
+ CUDA_DEVICE_BINARY(name, func, float32, float16, float32, float32) \
479
+ CUDA_DEVICE_BINARY(name, func, float32, float32, float32, float32) \
480
+ CUDA_DEVICE_BINARY(name, func, float32, float64, float64, float64) \
481
+ CUDA_DEVICE_NOKERN(name, func, float32, complex32, complex64, complex64) \
482
+ CUDA_DEVICE_NOKERN(name, func, float32, complex64, complex64, complex64) \
483
+ CUDA_DEVICE_NOKERN(name, func, float32, complex128, complex128, complex128) \
484
+ \
485
+ CUDA_DEVICE_BINARY(name, func, float64, uint8, float64, float64) \
486
+ CUDA_DEVICE_BINARY(name, func, float64, uint16, float64, float64) \
487
+ CUDA_DEVICE_BINARY(name, func, float64, uint32, float64, float64) \
488
+ CUDA_DEVICE_BINARY(name, func, float64, int8, float64, float64) \
489
+ CUDA_DEVICE_BINARY(name, func, float64, int16, float64, float64) \
490
+ CUDA_DEVICE_BINARY(name, func, float64, int32, float64, float64) \
491
+ CUDA_DEVICE_BINARY(name, func, float64, bfloat16, float64, float64) \
492
+ CUDA_DEVICE_BINARY(name, func, float64, float16, float64, float64) \
493
+ CUDA_DEVICE_BINARY(name, func, float64, float32, float64, float64) \
494
+ CUDA_DEVICE_BINARY(name, func, float64, float64, float64, float64) \
495
+ CUDA_DEVICE_NOKERN(name, func, float64, complex32, complex128, complex128) \
496
+ CUDA_DEVICE_NOKERN(name, func, float64, complex64, complex128, complex128) \
497
+ CUDA_DEVICE_NOKERN(name, func, float64, complex128, complex128, complex128) \
498
+ \
499
+ CUDA_DEVICE_NOKERN(name, func, complex32, uint8, complex32, complex32) \
500
+ CUDA_DEVICE_NOKERN(name, func, complex32, uint16, complex64, complex64) \
501
+ CUDA_DEVICE_NOKERN(name, func, complex32, uint32, complex128, complex128) \
502
+ CUDA_DEVICE_NOKERN(name, func, complex32, int8, complex32, complex32) \
503
+ CUDA_DEVICE_NOKERN(name, func, complex32, int16, complex64, complex64) \
504
+ CUDA_DEVICE_NOKERN(name, func, complex32, int32, complex128, complex128) \
505
+ CUDA_DEVICE_NOKERN(name, func, complex32, bfloat16, complex64, complex64) \
506
+ CUDA_DEVICE_NOKERN(name, func, complex32, float16, complex32, complex32) \
507
+ CUDA_DEVICE_NOKERN(name, func, complex32, float32, complex64, complex64) \
508
+ CUDA_DEVICE_NOKERN(name, func, complex32, float64, complex128, complex128) \
509
+ CUDA_DEVICE_NOKERN(name, func, complex32, complex32, complex32, complex32) \
510
+ CUDA_DEVICE_NOKERN(name, func, complex32, complex64, complex64, complex64) \
511
+ CUDA_DEVICE_NOKERN(name, func, complex32, complex128, complex128, complex128) \
512
+ \
513
+ CUDA_DEVICE_NOKERN(name, func, complex64, uint8, complex64, complex64) \
514
+ CUDA_DEVICE_NOKERN(name, func, complex64, uint16, complex64, complex64) \
515
+ CUDA_DEVICE_NOKERN(name, func, complex64, uint32, complex128, complex128) \
516
+ CUDA_DEVICE_NOKERN(name, func, complex64, int8, complex64, complex64) \
517
+ CUDA_DEVICE_NOKERN(name, func, complex64, int16, complex64, complex64) \
518
+ CUDA_DEVICE_NOKERN(name, func, complex64, int32, complex128, complex128) \
519
+ CUDA_DEVICE_NOKERN(name, func, complex64, bfloat16, complex64, complex64) \
520
+ CUDA_DEVICE_NOKERN(name, func, complex64, float16, complex64, complex64) \
521
+ CUDA_DEVICE_NOKERN(name, func, complex64, float32, complex64, complex64) \
522
+ CUDA_DEVICE_NOKERN(name, func, complex64, float64, complex128, complex128) \
523
+ CUDA_DEVICE_NOKERN(name, func, complex64, complex32, complex64, complex64) \
524
+ CUDA_DEVICE_NOKERN(name, func, complex64, complex64, complex64, complex64) \
525
+ CUDA_DEVICE_NOKERN(name, func, complex64, complex128, complex128, complex128) \
526
+ \
527
+ CUDA_DEVICE_NOKERN(name, func, complex128, uint8, complex128, complex128) \
528
+ CUDA_DEVICE_NOKERN(name, func, complex128, uint16, complex128, complex128) \
529
+ CUDA_DEVICE_NOKERN(name, func, complex128, uint32, complex128, complex128) \
530
+ CUDA_DEVICE_NOKERN(name, func, complex128, int8, complex128, complex128) \
531
+ CUDA_DEVICE_NOKERN(name, func, complex128, int16, complex128, complex128) \
532
+ CUDA_DEVICE_NOKERN(name, func, complex128, int32, complex128, complex128) \
533
+ CUDA_DEVICE_NOKERN(name, func, complex128, bfloat16, complex128, complex128) \
534
+ CUDA_DEVICE_NOKERN(name, func, complex128, float16, complex128, complex128) \
535
+ CUDA_DEVICE_NOKERN(name, func, complex128, float32, complex128, complex128) \
536
+ CUDA_DEVICE_NOKERN(name, func, complex128, float64, complex128, complex128) \
537
+ CUDA_DEVICE_NOKERN(name, func, complex128, complex32, complex128, complex128) \
538
+ CUDA_DEVICE_NOKERN(name, func, complex128, complex64, complex128, complex128) \
539
+ CUDA_DEVICE_NOKERN(name, func, complex128, complex128, complex128, complex128) \
540
+
541
/*
 * Type table for binary operations that produce a floating point (or complex)
 * result, e.g. true division.
 *
 * The table is split into one helper macro per left operand type (t0).  Each
 * helper lists the supported right operand types (t1) in a fixed order and
 * forwards (name, func-or-hfunc, t0, t1, t2, t3) to CUDA_DEVICE_BINARY,
 * CUDA_DEVICE_NOIMPL or CUDA_DEVICE_NOKERN, which are defined elsewhere in
 * this file.  The last two arguments are presumably the output type and the
 * common type the operands are converted to -- confirm against the
 * definition of CUDA_DEVICE_BINARY.
 *
 * `hfunc` is passed instead of `func` exactly for the entries whose last two
 * type arguments are float16.  Not every (t0, t1) pair is listed: e.g. the
 * uint64 rows only pair with unsigned integers and the int64 rows only with
 * integers.
 */

/* t0 = uint8 */
#define CUDA_DEVICE_FLOAT_RETURN_LHS_UINT8(name, func, hfunc) \
    CUDA_DEVICE_BINARY(name, hfunc, uint8, uint8, float16, float16) \
    CUDA_DEVICE_BINARY(name, func, uint8, uint16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, uint8, uint32, float64, float64) \
    CUDA_DEVICE_NOKERN(name, func, uint8, uint64, uint64, uint64) \
    CUDA_DEVICE_BINARY(name, hfunc, uint8, int8, float16, float16) \
    CUDA_DEVICE_BINARY(name, func, uint8, int16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, uint8, int32, float64, float64) \
    CUDA_DEVICE_NOKERN(name, func, uint8, int64, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, uint8, bfloat16, bfloat16, bfloat16) \
    CUDA_DEVICE_BINARY(name, hfunc, uint8, float16, float16, float16) \
    CUDA_DEVICE_BINARY(name, func, uint8, float32, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, uint8, float64, float64, float64) \
    CUDA_DEVICE_NOIMPL(name, func, uint8, complex32, complex32, complex32) \
    CUDA_DEVICE_BINARY(name, func, uint8, complex64, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, uint8, complex128, complex128, complex128)

/* t0 = uint16 */
#define CUDA_DEVICE_FLOAT_RETURN_LHS_UINT16(name, func, hfunc) \
    CUDA_DEVICE_BINARY(name, func, uint16, uint8, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, uint16, uint16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, uint16, uint32, float64, float64) \
    CUDA_DEVICE_NOKERN(name, func, uint16, uint64, uint64, uint64) \
    CUDA_DEVICE_BINARY(name, func, uint16, int8, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, uint16, int16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, uint16, int32, float64, float64) \
    CUDA_DEVICE_NOKERN(name, func, uint16, int64, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, uint16, bfloat16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, uint16, float16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, uint16, float32, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, uint16, float64, float64, float64) \
    CUDA_DEVICE_NOIMPL(name, func, uint16, complex32, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, uint16, complex64, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, uint16, complex128, complex128, complex128)

/* t0 = uint32 */
#define CUDA_DEVICE_FLOAT_RETURN_LHS_UINT32(name, func, hfunc) \
    CUDA_DEVICE_BINARY(name, func, uint32, uint8, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, uint32, uint16, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, uint32, uint32, float64, float64) \
    CUDA_DEVICE_NOKERN(name, func, uint32, uint64, uint64, uint64) \
    CUDA_DEVICE_BINARY(name, func, uint32, int8, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, uint32, int16, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, uint32, int32, float64, float64) \
    CUDA_DEVICE_NOKERN(name, func, uint32, int64, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, uint32, bfloat16, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, uint32, float16, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, uint32, float32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, uint32, float64, float64, float64) \
    CUDA_DEVICE_NOIMPL(name, func, uint32, complex32, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, uint32, complex64, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, uint32, complex128, complex128, complex128)

/* t0 = uint64 */
#define CUDA_DEVICE_FLOAT_RETURN_LHS_UINT64(name, func, hfunc) \
    CUDA_DEVICE_NOKERN(name, func, uint64, uint8, uint64, uint64) \
    CUDA_DEVICE_NOKERN(name, func, uint64, uint16, uint64, uint64) \
    CUDA_DEVICE_NOKERN(name, func, uint64, uint32, uint64, uint64) \
    CUDA_DEVICE_NOKERN(name, func, uint64, uint64, uint64, uint64)

/* t0 = int8 */
#define CUDA_DEVICE_FLOAT_RETURN_LHS_INT8(name, func, hfunc) \
    CUDA_DEVICE_BINARY(name, hfunc, int8, uint8, float16, float16) \
    CUDA_DEVICE_BINARY(name, func, int8, uint16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, int8, uint32, float64, float64) \
    CUDA_DEVICE_BINARY(name, hfunc, int8, int8, float16, float16) \
    CUDA_DEVICE_BINARY(name, func, int8, int16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, int8, int32, float64, float64) \
    CUDA_DEVICE_NOKERN(name, func, int8, int64, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, int8, bfloat16, bfloat16, bfloat16) \
    CUDA_DEVICE_BINARY(name, hfunc, int8, float16, float16, float16) \
    CUDA_DEVICE_BINARY(name, func, int8, float32, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, int8, float64, float64, float64) \
    CUDA_DEVICE_NOIMPL(name, func, int8, complex32, complex32, complex32) \
    CUDA_DEVICE_BINARY(name, func, int8, complex64, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, int8, complex128, complex128, complex128)

/* t0 = int16 */
#define CUDA_DEVICE_FLOAT_RETURN_LHS_INT16(name, func, hfunc) \
    CUDA_DEVICE_BINARY(name, func, int16, uint8, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, int16, uint16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, int16, uint32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, int16, int8, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, int16, int16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, int16, int32, float64, float64) \
    CUDA_DEVICE_NOKERN(name, func, int16, int64, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, int16, bfloat16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, int16, float16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, int16, float32, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, int16, float64, float64, float64) \
    CUDA_DEVICE_NOIMPL(name, func, int16, complex32, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, int16, complex64, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, int16, complex128, complex128, complex128)

/* t0 = int32 */
#define CUDA_DEVICE_FLOAT_RETURN_LHS_INT32(name, func, hfunc) \
    CUDA_DEVICE_BINARY(name, func, int32, uint8, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, int32, uint16, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, int32, uint32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, int32, int8, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, int32, int16, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, int32, int32, float64, float64) \
    CUDA_DEVICE_NOKERN(name, func, int32, int64, int64, int64) \
    CUDA_DEVICE_BINARY(name, func, int32, bfloat16, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, int32, float16, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, int32, float32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, int32, float64, float64, float64) \
    CUDA_DEVICE_NOIMPL(name, func, int32, complex32, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, int32, complex64, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, int32, complex128, complex128, complex128)

/* t0 = int64 */
#define CUDA_DEVICE_FLOAT_RETURN_LHS_INT64(name, func, hfunc) \
    CUDA_DEVICE_NOKERN(name, func, int64, uint8, int64, int64) \
    CUDA_DEVICE_NOKERN(name, func, int64, uint16, int64, int64) \
    CUDA_DEVICE_NOKERN(name, func, int64, uint32, int64, int64) \
    CUDA_DEVICE_NOKERN(name, func, int64, int8, int64, int64) \
    CUDA_DEVICE_NOKERN(name, func, int64, int16, int64, int64) \
    CUDA_DEVICE_NOKERN(name, func, int64, int32, int64, int64) \
    CUDA_DEVICE_NOKERN(name, func, int64, int64, int64, int64)

/* t0 = bfloat16 */
#define CUDA_DEVICE_FLOAT_RETURN_LHS_BFLOAT16(name, func, hfunc) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, uint8, bfloat16, bfloat16) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, uint16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, uint32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, int8, bfloat16, bfloat16) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, int16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, int32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, bfloat16, bfloat16, bfloat16) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, float16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, float32, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, float64, float64, float64) \
    CUDA_DEVICE_NOIMPL(name, func, bfloat16, complex32, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, complex64, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, bfloat16, complex128, complex128, complex128)

/* t0 = float16 */
#define CUDA_DEVICE_FLOAT_RETURN_LHS_FLOAT16(name, func, hfunc) \
    CUDA_DEVICE_BINARY(name, hfunc, float16, uint8, float16, float16) \
    CUDA_DEVICE_BINARY(name, func, float16, uint16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, float16, uint32, float64, float64) \
    CUDA_DEVICE_BINARY(name, hfunc, float16, int8, float16, float16) \
    CUDA_DEVICE_BINARY(name, func, float16, int16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, float16, int32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float16, bfloat16, float32, float32) \
    CUDA_DEVICE_BINARY(name, hfunc, float16, float16, float16, float16) \
    CUDA_DEVICE_BINARY(name, func, float16, float32, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, float16, float64, float64, float64) \
    CUDA_DEVICE_NOIMPL(name, func, float16, complex32, complex32, complex32) \
    CUDA_DEVICE_BINARY(name, func, float16, complex64, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, float16, complex128, complex128, complex128)

/* t0 = float32 */
#define CUDA_DEVICE_FLOAT_RETURN_LHS_FLOAT32(name, func, hfunc) \
    CUDA_DEVICE_BINARY(name, func, float32, uint8, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, float32, uint16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, float32, uint32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float32, int8, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, float32, int16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, float32, int32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float32, bfloat16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, float32, float16, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, float32, float32, float32, float32) \
    CUDA_DEVICE_BINARY(name, func, float32, float64, float64, float64) \
    CUDA_DEVICE_NOIMPL(name, func, float32, complex32, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, float32, complex64, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, float32, complex128, complex128, complex128)

/* t0 = float64 */
#define CUDA_DEVICE_FLOAT_RETURN_LHS_FLOAT64(name, func, hfunc) \
    CUDA_DEVICE_BINARY(name, func, float64, uint8, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float64, uint16, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float64, uint32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float64, int8, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float64, int16, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float64, int32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float64, bfloat16, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float64, float16, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float64, float32, float64, float64) \
    CUDA_DEVICE_BINARY(name, func, float64, float64, float64, float64) \
    CUDA_DEVICE_NOIMPL(name, func, float64, complex32, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, float64, complex64, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, float64, complex128, complex128, complex128)

/* t0 = complex32: every entry is CUDA_DEVICE_NOIMPL */
#define CUDA_DEVICE_FLOAT_RETURN_LHS_COMPLEX32(name, func, hfunc) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, uint8, complex32, complex32) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, uint16, complex64, complex64) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, uint32, complex128, complex128) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, int8, complex32, complex32) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, int16, complex64, complex64) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, int32, complex128, complex128) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, bfloat16, complex64, complex64) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, float16, complex32, complex32) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, float32, complex64, complex64) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, float64, complex128, complex128) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, complex32, complex32, complex32) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, complex64, complex64, complex64) \
    CUDA_DEVICE_NOIMPL(name, func, complex32, complex128, complex128, complex128)

/* t0 = complex64 */
#define CUDA_DEVICE_FLOAT_RETURN_LHS_COMPLEX64(name, func, hfunc) \
    CUDA_DEVICE_BINARY(name, func, complex64, uint8, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, complex64, uint16, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, complex64, uint32, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex64, int8, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, complex64, int16, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, complex64, int32, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex64, bfloat16, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, complex64, float16, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, complex64, float32, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, complex64, float64, complex128, complex128) \
    CUDA_DEVICE_NOIMPL(name, func, complex64, complex32, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, complex64, complex64, complex64, complex64) \
    CUDA_DEVICE_BINARY(name, func, complex64, complex128, complex128, complex128)

/* t0 = complex128 */
#define CUDA_DEVICE_FLOAT_RETURN_LHS_COMPLEX128(name, func, hfunc) \
    CUDA_DEVICE_BINARY(name, func, complex128, uint8, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, uint16, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, uint32, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, int8, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, int16, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, int32, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, bfloat16, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, float16, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, float32, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, float64, complex128, complex128) \
    CUDA_DEVICE_NOIMPL(name, func, complex128, complex32, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, complex64, complex128, complex128) \
    CUDA_DEVICE_BINARY(name, func, complex128, complex128, complex128, complex128)

/*
 * Full table: chains the per-type rows in the order uint*, int*, bfloat16,
 * float*, complex*, so the entries are emitted in the same sequence as a
 * single flat list would produce.
 */
#define CUDA_DEVICE_ALL_BINARY_FLOAT_RETURN(name, func, hfunc) \
    CUDA_DEVICE_FLOAT_RETURN_LHS_UINT8(name, func, hfunc) \
    CUDA_DEVICE_FLOAT_RETURN_LHS_UINT16(name, func, hfunc) \
    CUDA_DEVICE_FLOAT_RETURN_LHS_UINT32(name, func, hfunc) \
    CUDA_DEVICE_FLOAT_RETURN_LHS_UINT64(name, func, hfunc) \
    CUDA_DEVICE_FLOAT_RETURN_LHS_INT8(name, func, hfunc) \
    CUDA_DEVICE_FLOAT_RETURN_LHS_INT16(name, func, hfunc) \
    CUDA_DEVICE_FLOAT_RETURN_LHS_INT32(name, func, hfunc) \
    CUDA_DEVICE_FLOAT_RETURN_LHS_INT64(name, func, hfunc) \
    CUDA_DEVICE_FLOAT_RETURN_LHS_BFLOAT16(name, func, hfunc) \
    CUDA_DEVICE_FLOAT_RETURN_LHS_FLOAT16(name, func, hfunc) \
    CUDA_DEVICE_FLOAT_RETURN_LHS_FLOAT32(name, func, hfunc) \
    CUDA_DEVICE_FLOAT_RETURN_LHS_FLOAT64(name, func, hfunc) \
    CUDA_DEVICE_FLOAT_RETURN_LHS_COMPLEX32(name, func, hfunc) \
    CUDA_DEVICE_FLOAT_RETURN_LHS_COMPLEX64(name, func, hfunc) \
    CUDA_DEVICE_FLOAT_RETURN_LHS_COMPLEX128(name, func, hfunc)
745
+
746
/*
 * Arithmetic operators and their kernel families.
 *
 * Each instantiation passes (name, func, hfunc) to one of the type tables.
 * `name` only identifies the kernel family; `func` and `hfunc` are what the
 * tables forward to the per-type helper macros (hfunc is the variant passed
 * for float16 entries, e.g. __hadd, __hsub, __hmul, __hdiv).
 *
 * The operator macros are fully parenthesized so that they expand correctly
 * for compound arguments and inside larger expressions.
 */
#define add(x, y) ((x) + (y))
CUDA_DEVICE_ALL_BINARY(add, add, __hadd)

#define subtract(x, y) ((x) - (y))
CUDA_DEVICE_ALL_BINARY(subtract, subtract, __hsub)

#define multiply(x, y) ((x) * (y))
CUDA_DEVICE_ALL_BINARY(multiply, multiply, __hmul)

/*
 * The kernels below are generated from _floor_divide, not from this macro.
 * The macro previously expanded to a multiplication (copied from `multiply`);
 * it now delegates to _floor_divide so that any direct use agrees with the
 * kernels.
 */
#define floor_divide(x, y) _floor_divide((x), (y))
CUDA_DEVICE_ALL_BINARY_NO_COMPLEX(floor_divide, _floor_divide, _floor_divide)

/*
 * The kernels below are generated from _remainder, not from this macro.
 * NOTE(review): the macro uses the built-in % operator, which may differ
 * from _remainder -- confirm before using the macro directly.
 */
#define remainder(x, y) ((x) % (y))
CUDA_DEVICE_ALL_BINARY_NO_COMPLEX(remainder, _remainder, _remainder)

#define divide(x, y) ((x) / (y))
CUDA_DEVICE_ALL_BINARY_FLOAT_RETURN(divide, divide, __hdiv)

/* power has no operator macro; _pow is used for all entries. */
CUDA_DEVICE_ALL_BINARY(power, _pow, _pow)
765
+
766
+
767
+ /*****************************************************************************/
768
+ /* Comparison */
769
+ /*****************************************************************************/
770
+
771
+ #define CUDA_DEVICE_ALL_COMPARISON(name, func, hfunc, cfunc) \
772
+ CUDA_DEVICE_BINARY(name, func, uint8, uint8, bool, uint8) \
773
+ CUDA_DEVICE_BINARY(name, func, uint8, uint16, bool, uint16) \
774
+ CUDA_DEVICE_BINARY(name, func, uint8, uint32, bool, uint32) \
775
+ CUDA_DEVICE_BINARY(name, func, uint8, uint64, bool, uint64) \
776
+ CUDA_DEVICE_BINARY(name, func, uint8, int8, bool, int16) \
777
+ CUDA_DEVICE_BINARY(name, func, uint8, int16, bool, int16) \
778
+ CUDA_DEVICE_BINARY(name, func, uint8, int32, bool, int32) \
779
+ CUDA_DEVICE_BINARY(name, func, uint8, int64, bool, int64) \
780
+ CUDA_DEVICE_BINARY(name, func, uint8, bfloat16, bool, bfloat16) \
781
+ CUDA_DEVICE_BINARY(name, hfunc, uint8, float16, bool, float16) \
782
+ CUDA_DEVICE_BINARY(name, func, uint8, float32, bool, float32) \
783
+ CUDA_DEVICE_BINARY(name, func, uint8, float64, bool, float64) \
784
+ CUDA_DEVICE_NOIMPL(name, cfunc, uint8, complex32, bool, complex32) \
785
+ CUDA_DEVICE_BINARY(name, cfunc, uint8, complex64, bool, complex64) \
786
+ CUDA_DEVICE_BINARY(name, cfunc, uint8, complex128, bool, complex128) \
787
+ \
788
+ CUDA_DEVICE_BINARY(name, func, uint16, uint8, bool, uint16) \
789
+ CUDA_DEVICE_BINARY(name, func, uint16, uint16, bool, uint16) \
790
+ CUDA_DEVICE_BINARY(name, func, uint16, uint32, bool, uint32) \
791
+ CUDA_DEVICE_BINARY(name, func, uint16, uint64, bool, uint64) \
792
+ CUDA_DEVICE_BINARY(name, func, uint16, int8, bool, int32) \
793
+ CUDA_DEVICE_BINARY(name, func, uint16, int16, bool, int32) \
794
+ CUDA_DEVICE_BINARY(name, func, uint16, int32, bool, int32) \
795
+ CUDA_DEVICE_BINARY(name, func, uint16, int64, bool, int64) \
796
+ CUDA_DEVICE_BINARY(name, func, uint16, bfloat16, bool, float32) \
797
+ CUDA_DEVICE_BINARY(name, func, uint16, float16, bool, float32) \
798
+ CUDA_DEVICE_BINARY(name, func, uint16, float32, bool, float32) \
799
+ CUDA_DEVICE_BINARY(name, func, uint16, float64, bool, float64) \
800
+ CUDA_DEVICE_NOIMPL(name, cfunc, uint16, complex32, bool, complex64) \
801
+ CUDA_DEVICE_BINARY(name, cfunc, uint16, complex64, bool, complex64) \
802
+ CUDA_DEVICE_BINARY(name, cfunc, uint16, complex128, bool, complex128) \
803
+ \
804
+ CUDA_DEVICE_BINARY(name, func, uint32, uint8, bool, uint32) \
805
+ CUDA_DEVICE_BINARY(name, func, uint32, uint16, bool, uint32) \
806
+ CUDA_DEVICE_BINARY(name, func, uint32, uint32, bool, uint32) \
807
+ CUDA_DEVICE_BINARY(name, func, uint32, uint64, bool, uint64) \
808
+ CUDA_DEVICE_BINARY(name, func, uint32, int8, bool, int64) \
809
+ CUDA_DEVICE_BINARY(name, func, uint32, int16, bool, int64) \
810
+ CUDA_DEVICE_BINARY(name, func, uint32, int32, bool, int64) \
811
+ CUDA_DEVICE_BINARY(name, func, uint32, int64, bool, int64) \
812
+ CUDA_DEVICE_BINARY(name, func, uint32, bfloat16, bool, float64) \
813
+ CUDA_DEVICE_BINARY(name, func, uint32, float16, bool, float64) \
814
+ CUDA_DEVICE_BINARY(name, func, uint32, float32, bool, float64) \
815
+ CUDA_DEVICE_BINARY(name, func, uint32, float64, bool, float64) \
816
+ CUDA_DEVICE_NOIMPL(name, cfunc, uint32, complex32, bool, complex128) \
817
+ CUDA_DEVICE_BINARY(name, cfunc, uint32, complex64, bool, complex128) \
818
+ CUDA_DEVICE_BINARY(name, cfunc, uint32, complex128, bool, complex128) \
819
+ \
820
+ CUDA_DEVICE_BINARY(name, func, uint64, uint8, bool, uint64) \
821
+ CUDA_DEVICE_BINARY(name, func, uint64, uint16, bool, uint64) \
822
+ CUDA_DEVICE_BINARY(name, func, uint64, uint32, bool, uint64) \
823
+ CUDA_DEVICE_BINARY(name, func, uint64, uint64, bool, uint64) \
824
+ \
825
+ CUDA_DEVICE_BINARY(name, func, int8, uint8, bool, int16) \
826
+ CUDA_DEVICE_BINARY(name, func, int8, uint16, bool, int32) \
827
+ CUDA_DEVICE_BINARY(name, func, int8, uint32, bool, int64) \
828
+ CUDA_DEVICE_BINARY(name, func, int8, int8, bool, int8) \
829
+ CUDA_DEVICE_BINARY(name, func, int8, int16, bool, int16) \
830
+ CUDA_DEVICE_BINARY(name, func, int8, int32, bool, int32) \
831
+ CUDA_DEVICE_BINARY(name, func, int8, int64, bool, int64) \
832
+ CUDA_DEVICE_BINARY(name, func, int8, bfloat16, bool, bfloat16) \
833
+ CUDA_DEVICE_BINARY(name, hfunc, int8, float16, bool, float16) \
834
+ CUDA_DEVICE_BINARY(name, func, int8, float32, bool, float32) \
835
+ CUDA_DEVICE_BINARY(name, func, int8, float64, bool, float64) \
836
+ CUDA_DEVICE_NOIMPL(name, cfunc, int8, complex32, bool, complex32) \
837
+ CUDA_DEVICE_BINARY(name, cfunc, int8, complex64, bool, complex64) \
838
+ CUDA_DEVICE_BINARY(name, cfunc, int8, complex128, bool, complex128) \
839
+ \
840
+ CUDA_DEVICE_BINARY(name, func, int16, uint8, bool, int16) \
841
+ CUDA_DEVICE_BINARY(name, func, int16, uint16, bool, int32) \
842
+ CUDA_DEVICE_BINARY(name, func, int16, uint32, bool, int64) \
843
+ CUDA_DEVICE_BINARY(name, func, int16, int8, bool, int16) \
844
+ CUDA_DEVICE_BINARY(name, func, int16, int16, bool, int16) \
845
+ CUDA_DEVICE_BINARY(name, func, int16, int32, bool, int32) \
846
+ CUDA_DEVICE_BINARY(name, func, int16, int64, bool, int64) \
847
+ CUDA_DEVICE_BINARY(name, func, int16, bfloat16, bool, float32) \
848
+ CUDA_DEVICE_BINARY(name, func, int16, float16, bool, float32) \
849
+ CUDA_DEVICE_BINARY(name, func, int16, float32, bool, float32) \
850
+ CUDA_DEVICE_BINARY(name, func, int16, float64, bool, float64) \
851
+ CUDA_DEVICE_NOIMPL(name, cfunc, int16, complex32, bool, complex64) \
852
+ CUDA_DEVICE_BINARY(name, cfunc, int16, complex64, bool, complex64) \
853
+ CUDA_DEVICE_BINARY(name, cfunc, int16, complex128, bool, complex128) \
854
+ \
855
+ CUDA_DEVICE_BINARY(name, func, int32, uint8, bool, int32) \
856
+ CUDA_DEVICE_BINARY(name, func, int32, uint16, bool, int32) \
857
+ CUDA_DEVICE_BINARY(name, func, int32, uint32, bool, int64) \
858
+ CUDA_DEVICE_BINARY(name, func, int32, int8, bool, int32) \
859
+ CUDA_DEVICE_BINARY(name, func, int32, int16, bool, int32) \
860
+ CUDA_DEVICE_BINARY(name, func, int32, int32, bool, int32) \
861
+ CUDA_DEVICE_BINARY(name, func, int32, int64, bool, int64) \
862
+ CUDA_DEVICE_BINARY(name, func, int32, bfloat16, bool, float64) \
863
+ CUDA_DEVICE_BINARY(name, func, int32, float16, bool, float64) \
864
+ CUDA_DEVICE_BINARY(name, func, int32, float32, bool, float64) \
865
+ CUDA_DEVICE_BINARY(name, func, int32, float64, bool, float64) \
866
+ CUDA_DEVICE_NOIMPL(name, cfunc, int32, complex32, bool, complex128) \
867
+ CUDA_DEVICE_BINARY(name, cfunc, int32, complex64, bool, complex128) \
868
+ CUDA_DEVICE_BINARY(name, cfunc, int32, complex128, bool, complex128) \
869
+ \
870
+ CUDA_DEVICE_BINARY(name, func, int64, uint8, bool, int64) \
871
+ CUDA_DEVICE_BINARY(name, func, int64, uint16, bool, int64) \
872
+ CUDA_DEVICE_BINARY(name, func, int64, uint32, bool, int64) \
873
+ CUDA_DEVICE_BINARY(name, func, int64, int8, bool, int64) \
874
+ CUDA_DEVICE_BINARY(name, func, int64, int16, bool, int64) \
875
+ CUDA_DEVICE_BINARY(name, func, int64, int32, bool, int64) \
876
+ CUDA_DEVICE_BINARY(name, func, int64, int64, bool, int64) \
877
+ \
878
+ CUDA_DEVICE_BINARY(name, func, bfloat16, uint8, bool, bfloat16) \
879
+ CUDA_DEVICE_BINARY(name, func, bfloat16, uint16, bool, float32) \
880
+ CUDA_DEVICE_BINARY(name, func, bfloat16, uint32, bool, float64) \
881
+ CUDA_DEVICE_BINARY(name, func, bfloat16, int8, bool, bfloat16) \
882
+ CUDA_DEVICE_BINARY(name, func, bfloat16, int16, bool, float32) \
883
+ CUDA_DEVICE_BINARY(name, func, bfloat16, int32, bool, float64) \
884
+ CUDA_DEVICE_BINARY(name, func, bfloat16, bfloat16, bool, bfloat16) \
885
+ CUDA_DEVICE_BINARY(name, func, bfloat16, float16, bool, float32) \
886
+ CUDA_DEVICE_BINARY(name, func, bfloat16, float32, bool, float32) \
887
+ CUDA_DEVICE_BINARY(name, func, bfloat16, float64, bool, float64) \
888
+ CUDA_DEVICE_NOIMPL(name, cfunc, bfloat16, complex32, bool, complex64) \
889
+ CUDA_DEVICE_BINARY(name, cfunc, bfloat16, complex64, bool, complex64) \
890
+ CUDA_DEVICE_BINARY(name, cfunc, bfloat16, complex128, bool, complex128) \
891
+ \
892
+ CUDA_DEVICE_BINARY(name, hfunc, float16, uint8, bool, float16) \
893
+ CUDA_DEVICE_BINARY(name, func, float16, uint16, bool, float32) \
894
+ CUDA_DEVICE_BINARY(name, func, float16, uint32, bool, float64) \
895
+ CUDA_DEVICE_BINARY(name, hfunc, float16, int8, bool, float16) \
896
+ CUDA_DEVICE_BINARY(name, func, float16, int16, bool, float32) \
897
+ CUDA_DEVICE_BINARY(name, func, float16, int32, bool, float64) \
898
+ CUDA_DEVICE_BINARY(name, func, float16, bfloat16, bool, float32) \
899
+ CUDA_DEVICE_BINARY(name, hfunc, float16, float16, bool, float16) \
900
+ CUDA_DEVICE_BINARY(name, func, float16, float32, bool, float32) \
901
+ CUDA_DEVICE_BINARY(name, func, float16, float64, bool, float64) \
902
+ CUDA_DEVICE_NOIMPL(name, cfunc, float16, complex32, bool, complex32) \
903
+ CUDA_DEVICE_BINARY(name, cfunc, float16, complex64, bool, complex64) \
904
+ CUDA_DEVICE_BINARY(name, cfunc, float16, complex128, bool, complex128) \
905
+ \
906
+ CUDA_DEVICE_BINARY(name, func, float32, uint8, bool, float32) \
907
+ CUDA_DEVICE_BINARY(name, func, float32, uint16, bool, float32) \
908
+ CUDA_DEVICE_BINARY(name, func, float32, uint32, bool, float64) \
909
+ CUDA_DEVICE_BINARY(name, func, float32, int8, bool, float32) \
910
+ CUDA_DEVICE_BINARY(name, func, float32, int16, bool, float32) \
911
+ CUDA_DEVICE_BINARY(name, func, float32, int32, bool, float64) \
912
+ CUDA_DEVICE_BINARY(name, func, float32, bfloat16, bool, float32) \
913
+ CUDA_DEVICE_BINARY(name, func, float32, float16, bool, float32) \
914
+ CUDA_DEVICE_BINARY(name, func, float32, float32, bool, float32) \
915
+ CUDA_DEVICE_BINARY(name, func, float32, float64, bool, float64) \
916
+ CUDA_DEVICE_NOIMPL(name, cfunc, float32, complex32, bool, complex64) \
917
+ CUDA_DEVICE_BINARY(name, cfunc, float32, complex64, bool, complex64) \
918
+ CUDA_DEVICE_BINARY(name, cfunc, float32, complex128, bool, complex128) \
919
+ \
920
+ CUDA_DEVICE_BINARY(name, func, float64, uint8, bool, float64) \
921
+ CUDA_DEVICE_BINARY(name, func, float64, uint16, bool, float64) \
922
+ CUDA_DEVICE_BINARY(name, func, float64, uint32, bool, float64) \
923
+ CUDA_DEVICE_BINARY(name, func, float64, int8, bool, float64) \
924
+ CUDA_DEVICE_BINARY(name, func, float64, int16, bool, float64) \
925
+ CUDA_DEVICE_BINARY(name, func, float64, int32, bool, float64) \
926
+ CUDA_DEVICE_BINARY(name, func, float64, bfloat16, bool, float64) \
927
+ CUDA_DEVICE_BINARY(name, func, float64, float16, bool, float64) \
928
+ CUDA_DEVICE_BINARY(name, func, float64, float32, bool, float64) \
929
+ CUDA_DEVICE_BINARY(name, func, float64, float64, bool, float64) \
930
+ CUDA_DEVICE_NOIMPL(name, cfunc, float64, complex32, bool, complex128) \
931
+ CUDA_DEVICE_BINARY(name, cfunc, float64, complex64, bool, complex128) \
932
+ CUDA_DEVICE_BINARY(name, cfunc, float64, complex128, bool, complex128) \
933
+ \
934
+ CUDA_DEVICE_NOIMPL(name, cfunc, complex32, uint8, bool, complex32) \
935
+ CUDA_DEVICE_NOIMPL(name, cfunc, complex32, uint16, bool, complex64) \
936
+ CUDA_DEVICE_NOIMPL(name, cfunc, complex32, uint32, bool, complex128) \
937
+ CUDA_DEVICE_NOIMPL(name, cfunc, complex32, int8, bool, complex32) \
938
+ CUDA_DEVICE_NOIMPL(name, cfunc, complex32, int16, bool, complex64) \
939
+ CUDA_DEVICE_NOIMPL(name, cfunc, complex32, int32, bool, complex128) \
940
+ CUDA_DEVICE_NOIMPL(name, cfunc, complex32, bfloat16, bool, complex64) \
941
+ CUDA_DEVICE_NOIMPL(name, cfunc, complex32, float16, bool, complex32) \
942
+ CUDA_DEVICE_NOIMPL(name, cfunc, complex32, float32, bool, complex64) \
943
+ CUDA_DEVICE_NOIMPL(name, cfunc, complex32, float64, bool, complex128) \
944
+ CUDA_DEVICE_NOIMPL(name, cfunc, complex32, complex32, bool, complex32) \
945
+ CUDA_DEVICE_NOIMPL(name, cfunc, complex32, complex64, bool, complex64) \
946
+ CUDA_DEVICE_NOIMPL(name, cfunc, complex32, complex128, bool, complex128) \
947
+ \
948
+ CUDA_DEVICE_BINARY(name, cfunc, complex64, uint8, bool, complex64) \
949
+ CUDA_DEVICE_BINARY(name, cfunc, complex64, uint16, bool, complex64) \
950
+ CUDA_DEVICE_BINARY(name, cfunc, complex64, uint32, bool, complex128) \
951
+ CUDA_DEVICE_BINARY(name, cfunc, complex64, int8, bool, complex64) \
952
+ CUDA_DEVICE_BINARY(name, cfunc, complex64, int16, bool, complex64) \
953
+ CUDA_DEVICE_BINARY(name, cfunc, complex64, int32, bool, complex128) \
954
+ CUDA_DEVICE_BINARY(name, cfunc, complex64, bfloat16, bool, complex64) \
955
+ CUDA_DEVICE_BINARY(name, cfunc, complex64, float16, bool, complex64) \
956
+ CUDA_DEVICE_BINARY(name, cfunc, complex64, float32, bool, complex64) \
957
+ CUDA_DEVICE_BINARY(name, cfunc, complex64, float64, bool, complex128) \
958
+ CUDA_DEVICE_NOIMPL(name, cfunc, complex64, complex32, bool, complex64) \
959
+ CUDA_DEVICE_BINARY(name, cfunc, complex64, complex64, bool, complex64) \
960
+ CUDA_DEVICE_BINARY(name, cfunc, complex64, complex128, bool, complex128) \
961
+ \
962
+ CUDA_DEVICE_BINARY(name, cfunc, complex128, uint8, bool, complex128) \
963
+ CUDA_DEVICE_BINARY(name, cfunc, complex128, uint16, bool, complex128) \
964
+ CUDA_DEVICE_BINARY(name, cfunc, complex128, uint32, bool, complex128) \
965
+ CUDA_DEVICE_BINARY(name, cfunc, complex128, int8, bool, complex128) \
966
+ CUDA_DEVICE_BINARY(name, cfunc, complex128, int16, bool, complex128) \
967
+ CUDA_DEVICE_BINARY(name, cfunc, complex128, int32, bool, complex128) \
968
+ CUDA_DEVICE_BINARY(name, cfunc, complex128, bfloat16, bool, complex128) \
969
+ CUDA_DEVICE_BINARY(name, cfunc, complex128, float16, bool, complex128) \
970
+ CUDA_DEVICE_BINARY(name, cfunc, complex128, float32, bool, complex128) \
971
+ CUDA_DEVICE_BINARY(name, cfunc, complex128, float64, bool, complex128) \
972
+ CUDA_DEVICE_NOIMPL(name, cfunc, complex128, complex32, bool, complex128) \
973
+ CUDA_DEVICE_BINARY(name, cfunc, complex128, complex64, bool, complex128) \
974
+ CUDA_DEVICE_BINARY(name, cfunc, complex128, complex128, bool, complex128)
975
+
976
+
977
/*
 * Element-wise comparison operations.
 *
 * Each function-like macro below is passed to CUDA_DEVICE_ALL_COMPARISON
 * together with two alternative operations.  NOTE(review): judging from the
 * rows of CUDA_DEVICE_ALL_COMPARISON, the third argument is presumably the
 * operation used for the float16 rows (hfunc) and the fourth the one used
 * for the complex rows (cfunc) -- confirm against the macro's parameter list.
 *
 * Both the parameters and the whole replacement list are parenthesized so
 * that the expansion keeps its meaning for arbitrary argument expressions
 * and inside arbitrary surrounding expressions.  Parameters can be evaluated
 * more than once (see equaln), so arguments must not have side effects.
 */
#define less(x, y) ((x) < (y))
CUDA_DEVICE_ALL_COMPARISON(less, less, __hlt, lexorder_lt)

#define less_equal(x, y) ((x) <= (y))
CUDA_DEVICE_ALL_COMPARISON(less_equal, less_equal, __hle, lexorder_le)

#define greater_equal(x, y) ((x) >= (y))
CUDA_DEVICE_ALL_COMPARISON(greater_equal, greater_equal, __hge, lexorder_ge)

#define greater(x, y) ((x) > (y))
CUDA_DEVICE_ALL_COMPARISON(greater, greater, __hgt, lexorder_gt)

#define equal(x, y) ((x) == (y))
CUDA_DEVICE_ALL_COMPARISON(equal, equal, __heq, equal)

#define not_equal(x, y) ((x) != (y))
CUDA_DEVICE_ALL_COMPARISON(not_equal, not_equal, half_ne, not_equal)

/* Equality that is also true when both operands are unequal to themselves. */
#define equaln(x, y) (((x) == (y)) || (((x) != (x)) && ((y) != (y))))
CUDA_DEVICE_ALL_COMPARISON(equaln, equaln, half_eqn, lexorder_eqn)
997
+
998
+
999
+ /*****************************************************************************/
1000
+ /* Bitwise */
1001
+ /*****************************************************************************/
1002
+
1003
/*
 * CUDA_DEVICE_ALL_BITWISE(name, func)
 *
 * Expands to one CUDA_DEVICE_BINARY(name, func, t0, t1, t2, t3) invocation
 * for every supported pair of bool/integer operand types (t0, t1).
 *
 * In every row the last two type arguments are identical.
 * NOTE(review): they are presumably the result type and the common type the
 * operands are converted to -- confirm against CUDA_DEVICE_BINARY.
 *
 * uint64 is only paired with bool and the unsigned types; there are no rows
 * that combine uint64 with a signed integer type.
 */
#define CUDA_DEVICE_ALL_BITWISE(name, func)                          \
    /* bool on the left */                                           \
    CUDA_DEVICE_BINARY(name, func, bool,   bool,   bool,   bool)     \
    CUDA_DEVICE_BINARY(name, func, bool,   uint8,  uint8,  uint8)    \
    CUDA_DEVICE_BINARY(name, func, bool,   uint16, uint16, uint16)   \
    CUDA_DEVICE_BINARY(name, func, bool,   uint32, uint32, uint32)   \
    CUDA_DEVICE_BINARY(name, func, bool,   uint64, uint64, uint64)   \
    CUDA_DEVICE_BINARY(name, func, bool,   int8,   int8,   int8)     \
    CUDA_DEVICE_BINARY(name, func, bool,   int16,  int16,  int16)    \
    CUDA_DEVICE_BINARY(name, func, bool,   int32,  int32,  int32)    \
    CUDA_DEVICE_BINARY(name, func, bool,   int64,  int64,  int64)    \
                                                                     \
    /* uint8 on the left */                                          \
    CUDA_DEVICE_BINARY(name, func, uint8,  bool,   uint8,  uint8)    \
    CUDA_DEVICE_BINARY(name, func, uint8,  uint8,  uint8,  uint8)    \
    CUDA_DEVICE_BINARY(name, func, uint8,  uint16, uint16, uint16)   \
    CUDA_DEVICE_BINARY(name, func, uint8,  uint32, uint32, uint32)   \
    CUDA_DEVICE_BINARY(name, func, uint8,  uint64, uint64, uint64)   \
    CUDA_DEVICE_BINARY(name, func, uint8,  int8,   int16,  int16)    \
    CUDA_DEVICE_BINARY(name, func, uint8,  int16,  int16,  int16)    \
    CUDA_DEVICE_BINARY(name, func, uint8,  int32,  int32,  int32)    \
    CUDA_DEVICE_BINARY(name, func, uint8,  int64,  int64,  int64)    \
                                                                     \
    /* uint16 on the left */                                         \
    CUDA_DEVICE_BINARY(name, func, uint16, bool,   uint16, uint16)   \
    CUDA_DEVICE_BINARY(name, func, uint16, uint8,  uint16, uint16)   \
    CUDA_DEVICE_BINARY(name, func, uint16, uint16, uint16, uint16)   \
    CUDA_DEVICE_BINARY(name, func, uint16, uint32, uint32, uint32)   \
    CUDA_DEVICE_BINARY(name, func, uint16, uint64, uint64, uint64)   \
    CUDA_DEVICE_BINARY(name, func, uint16, int8,   int32,  int32)    \
    CUDA_DEVICE_BINARY(name, func, uint16, int16,  int32,  int32)    \
    CUDA_DEVICE_BINARY(name, func, uint16, int32,  int32,  int32)    \
    CUDA_DEVICE_BINARY(name, func, uint16, int64,  int64,  int64)    \
                                                                     \
    /* uint32 on the left */                                         \
    CUDA_DEVICE_BINARY(name, func, uint32, bool,   uint32, uint32)   \
    CUDA_DEVICE_BINARY(name, func, uint32, uint8,  uint32, uint32)   \
    CUDA_DEVICE_BINARY(name, func, uint32, uint16, uint32, uint32)   \
    CUDA_DEVICE_BINARY(name, func, uint32, uint32, uint32, uint32)   \
    CUDA_DEVICE_BINARY(name, func, uint32, uint64, uint64, uint64)   \
    CUDA_DEVICE_BINARY(name, func, uint32, int8,   int64,  int64)    \
    CUDA_DEVICE_BINARY(name, func, uint32, int16,  int64,  int64)    \
    CUDA_DEVICE_BINARY(name, func, uint32, int32,  int64,  int64)    \
    CUDA_DEVICE_BINARY(name, func, uint32, int64,  int64,  int64)    \
                                                                     \
    /* uint64 on the left: bool and unsigned partners only */        \
    CUDA_DEVICE_BINARY(name, func, uint64, bool,   uint64, uint64)   \
    CUDA_DEVICE_BINARY(name, func, uint64, uint8,  uint64, uint64)   \
    CUDA_DEVICE_BINARY(name, func, uint64, uint16, uint64, uint64)   \
    CUDA_DEVICE_BINARY(name, func, uint64, uint32, uint64, uint64)   \
    CUDA_DEVICE_BINARY(name, func, uint64, uint64, uint64, uint64)   \
                                                                     \
    /* int8 on the left */                                           \
    CUDA_DEVICE_BINARY(name, func, int8,   bool,   int8,   int8)     \
    CUDA_DEVICE_BINARY(name, func, int8,   uint8,  int16,  int16)    \
    CUDA_DEVICE_BINARY(name, func, int8,   uint16, int32,  int32)    \
    CUDA_DEVICE_BINARY(name, func, int8,   uint32, int64,  int64)    \
    CUDA_DEVICE_BINARY(name, func, int8,   int8,   int8,   int8)     \
    CUDA_DEVICE_BINARY(name, func, int8,   int16,  int16,  int16)    \
    CUDA_DEVICE_BINARY(name, func, int8,   int32,  int32,  int32)    \
    CUDA_DEVICE_BINARY(name, func, int8,   int64,  int64,  int64)    \
                                                                     \
    /* int16 on the left */                                          \
    CUDA_DEVICE_BINARY(name, func, int16,  bool,   int16,  int16)    \
    CUDA_DEVICE_BINARY(name, func, int16,  uint8,  int16,  int16)    \
    CUDA_DEVICE_BINARY(name, func, int16,  uint16, int32,  int32)    \
    CUDA_DEVICE_BINARY(name, func, int16,  uint32, int64,  int64)    \
    CUDA_DEVICE_BINARY(name, func, int16,  int8,   int16,  int16)    \
    CUDA_DEVICE_BINARY(name, func, int16,  int16,  int16,  int16)    \
    CUDA_DEVICE_BINARY(name, func, int16,  int32,  int32,  int32)    \
    CUDA_DEVICE_BINARY(name, func, int16,  int64,  int64,  int64)    \
                                                                     \
    /* int32 on the left */                                          \
    CUDA_DEVICE_BINARY(name, func, int32,  bool,   int32,  int32)    \
    CUDA_DEVICE_BINARY(name, func, int32,  uint8,  int32,  int32)    \
    CUDA_DEVICE_BINARY(name, func, int32,  uint16, int32,  int32)    \
    CUDA_DEVICE_BINARY(name, func, int32,  uint32, int64,  int64)    \
    CUDA_DEVICE_BINARY(name, func, int32,  int8,   int32,  int32)    \
    CUDA_DEVICE_BINARY(name, func, int32,  int16,  int32,  int32)    \
    CUDA_DEVICE_BINARY(name, func, int32,  int32,  int32,  int32)    \
    CUDA_DEVICE_BINARY(name, func, int32,  int64,  int64,  int64)    \
                                                                     \
    /* int64 on the left */                                          \
    CUDA_DEVICE_BINARY(name, func, int64,  bool,   int64,  int64)    \
    CUDA_DEVICE_BINARY(name, func, int64,  uint8,  int64,  int64)    \
    CUDA_DEVICE_BINARY(name, func, int64,  uint16, int64,  int64)    \
    CUDA_DEVICE_BINARY(name, func, int64,  uint32, int64,  int64)    \
    CUDA_DEVICE_BINARY(name, func, int64,  int8,   int64,  int64)    \
    CUDA_DEVICE_BINARY(name, func, int64,  int16,  int64,  int64)    \
    CUDA_DEVICE_BINARY(name, func, int64,  int32,  int64,  int64)    \
    CUDA_DEVICE_BINARY(name, func, int64,  int64,  int64,  int64)
1085
+
1086
/*
 * Bitwise operations, each instantiated through CUDA_DEVICE_ALL_BITWISE.
 *
 * The operators &, | and ^ bind more loosely than the comparison operators,
 * so the parameters and the whole replacement list are parenthesized to keep
 * the expansion correct for any argument expression and in any context.
 */
#define bitwise_and(x, y) ((x) & (y))
CUDA_DEVICE_ALL_BITWISE(bitwise_and, bitwise_and)

#define bitwise_or(x, y) ((x) | (y))
CUDA_DEVICE_ALL_BITWISE(bitwise_or, bitwise_or)

#define bitwise_xor(x, y) ((x) ^ (y))
CUDA_DEVICE_ALL_BITWISE(bitwise_xor, bitwise_xor)
1094
+
1095
+
1096
+ /*****************************************************************************/
1097
+ /* Two return values */
1098
+ /*****************************************************************************/
1099
+
1100
/*
 * CUDA_DEVICE_BINARY_MV(name, func, t0, t1, t2, t3)
 *
 * Generates an element-wise kernel with two inputs and two outputs, plus the
 * extern "C" host function that launches it:
 *
 *   void gm_cuda_device_fixed_1D_C_<name>_<t0>_<t1>_<t2>_<t3>(
 *            const char *a0, const char *a1, char *a2, char *a3, int64_t N)
 *
 *   a0, a1  inputs, reinterpreted as arrays of t0##_t and t1##_t
 *   a2, a3  outputs, reinterpreted as arrays of t2##_t and t3##_t
 *   N       number of elements; nothing is launched when N <= 0
 *
 * For every i in [0, N) the kernel calls
 *   func(&x2[i], &x3[i], x0[i], x1[i]).
 *
 * Launch layout: 256 threads per block, ceil(N / 256) blocks.  The kernel
 * loop advances by the total number of launched threads, so it is correct
 * for any grid size.  The host function performs no synchronization and no
 * error check after the launch.
 *
 * NOTE(review): the four buffers are presumably device-accessible and hold
 * at least N elements each -- confirm against the callers.
 */
#define CUDA_DEVICE_BINARY_MV(name, func, t0, t1, t2, t3)                      \
static __global__ void                                                         \
_1D_C_##name##_##t0##_##t1##_##t2##_##t3(                                      \
    const t0##_t *x0, const t1##_t *x1, t2##_t *x2, t3##_t *x3,                \
    int64_t N)                                                                 \
{                                                                              \
    /* Widen before multiplying: the built-in index variables are 32-bit */    \
    /* unsigned, so their product wraps once it reaches 2^32.            */    \
    int64_t index = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;            \
    int64_t stride = (int64_t)blockDim.x * gridDim.x;                          \
                                                                               \
    for (int64_t i = index; i < N; i += stride) {                              \
        func(&x2[i], &x3[i], x0[i], x1[i]);                                    \
    }                                                                          \
}                                                                              \
                                                                               \
extern "C" void                                                                \
gm_cuda_device_fixed_1D_C_##name##_##t0##_##t1##_##t2##_##t3(                  \
    const char *a0, const char *a1, char *a2, char *a3,                        \
    int64_t N)                                                                 \
{                                                                              \
    const t0##_t *x0 = (const t0##_t *)a0;                                     \
    const t1##_t *x1 = (const t1##_t *)a1;                                     \
    t2##_t *x2 = (t2##_t *)a2;                                                 \
    t3##_t *x3 = (t3##_t *)a3;                                                 \
    const int blockSize = 256;                                                 \
    int64_t numBlocks;                                                         \
                                                                               \
    /* A grid with zero blocks is an invalid launch configuration. */          \
    if (N <= 0) {                                                              \
        return;                                                                \
    }                                                                          \
                                                                               \
    numBlocks = (N + blockSize - 1) / blockSize;                               \
                                                                               \
    _1D_C_##name##_##t0##_##t1##_##t2##_##t3<<<numBlocks, blockSize>>>(        \
        x0, x1, x2, x3, N);                                                    \
}
1129
+
1130
/*
 * Helper: instantiate CUDA_DEVICE_BINARY_MV with the same element type for
 * both inputs and both outputs.
 */
#define CUDA_DEVICE_BINARY_MV_UNIFORM(name, func, t) \
    CUDA_DEVICE_BINARY_MV(name, func, t, t, t, t)

/*
 * CUDA_DEVICE_ALL_BINARY_MV(name, func)
 *
 * Instantiates the two-output binary kernel and its host launcher for each
 * listed element type.  Inputs and outputs always share one type; float16
 * and the complex types are not part of this list.
 */
#define CUDA_DEVICE_ALL_BINARY_MV(name, func)             \
    /* unsigned integers */                               \
    CUDA_DEVICE_BINARY_MV_UNIFORM(name, func, uint8)      \
    CUDA_DEVICE_BINARY_MV_UNIFORM(name, func, uint16)     \
    CUDA_DEVICE_BINARY_MV_UNIFORM(name, func, uint32)     \
    CUDA_DEVICE_BINARY_MV_UNIFORM(name, func, uint64)     \
    /* signed integers */                                 \
    CUDA_DEVICE_BINARY_MV_UNIFORM(name, func, int8)       \
    CUDA_DEVICE_BINARY_MV_UNIFORM(name, func, int16)      \
    CUDA_DEVICE_BINARY_MV_UNIFORM(name, func, int32)      \
    CUDA_DEVICE_BINARY_MV_UNIFORM(name, func, int64)      \
    /* floating point */                                  \
    CUDA_DEVICE_BINARY_MV_UNIFORM(name, func, bfloat16)   \
    CUDA_DEVICE_BINARY_MV_UNIFORM(name, func, float32)    \
    CUDA_DEVICE_BINARY_MV_UNIFORM(name, func, float64)

/* divmod kernels: every element is processed by _divmod. */
CUDA_DEVICE_ALL_BINARY_MV(divmod, _divmod)