gumath 0.2.0dev5 → 0.2.0dev8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (99) hide show
  1. checksums.yaml +4 -4
  2. data/CONTRIBUTING.md +7 -2
  3. data/Gemfile +0 -3
  4. data/ext/ruby_gumath/GPATH +0 -0
  5. data/ext/ruby_gumath/GRTAGS +0 -0
  6. data/ext/ruby_gumath/GTAGS +0 -0
  7. data/ext/ruby_gumath/extconf.rb +0 -5
  8. data/ext/ruby_gumath/functions.c +10 -2
  9. data/ext/ruby_gumath/gufunc_object.c +15 -4
  10. data/ext/ruby_gumath/gufunc_object.h +9 -3
  11. data/ext/ruby_gumath/gumath/Makefile +63 -0
  12. data/ext/ruby_gumath/gumath/Makefile.in +1 -0
  13. data/ext/ruby_gumath/gumath/config.h +56 -0
  14. data/ext/ruby_gumath/gumath/config.h.in +3 -0
  15. data/ext/ruby_gumath/gumath/config.log +497 -0
  16. data/ext/ruby_gumath/gumath/config.status +1034 -0
  17. data/ext/ruby_gumath/gumath/configure +375 -4
  18. data/ext/ruby_gumath/gumath/configure.ac +47 -3
  19. data/ext/ruby_gumath/gumath/libgumath/Makefile +236 -0
  20. data/ext/ruby_gumath/gumath/libgumath/Makefile.in +90 -24
  21. data/ext/ruby_gumath/gumath/libgumath/Makefile.vc +54 -15
  22. data/ext/ruby_gumath/gumath/libgumath/apply.c +92 -28
  23. data/ext/ruby_gumath/gumath/libgumath/apply.o +0 -0
  24. data/ext/ruby_gumath/gumath/libgumath/common.o +0 -0
  25. data/ext/ruby_gumath/gumath/libgumath/cpu_device_binary.o +0 -0
  26. data/ext/ruby_gumath/gumath/libgumath/cpu_device_unary.o +0 -0
  27. data/ext/ruby_gumath/gumath/libgumath/cpu_host_binary.o +0 -0
  28. data/ext/ruby_gumath/gumath/libgumath/cpu_host_unary.o +0 -0
  29. data/ext/ruby_gumath/gumath/libgumath/examples.o +0 -0
  30. data/ext/ruby_gumath/gumath/libgumath/extending/graph.c +27 -20
  31. data/ext/ruby_gumath/gumath/libgumath/extending/pdist.c +1 -1
  32. data/ext/ruby_gumath/gumath/libgumath/func.c +13 -9
  33. data/ext/ruby_gumath/gumath/libgumath/func.o +0 -0
  34. data/ext/ruby_gumath/gumath/libgumath/graph.o +0 -0
  35. data/ext/ruby_gumath/gumath/libgumath/gumath.h +55 -14
  36. data/ext/ruby_gumath/gumath/libgumath/kernels/common.c +513 -0
  37. data/ext/ruby_gumath/gumath/libgumath/kernels/common.h +155 -0
  38. data/ext/ruby_gumath/gumath/libgumath/kernels/contrib/bfloat16.h +520 -0
  39. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.cc +1123 -0
  40. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.h +1062 -0
  41. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_msvc.cc +555 -0
  42. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.cc +368 -0
  43. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.h +335 -0
  44. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_binary.c +2952 -0
  45. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_unary.c +1100 -0
  46. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.cu +1143 -0
  47. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.h +1061 -0
  48. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.cu +528 -0
  49. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.h +463 -0
  50. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_binary.c +2817 -0
  51. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_unary.c +1331 -0
  52. data/ext/ruby_gumath/gumath/libgumath/kernels/device.hh +614 -0
  53. data/ext/ruby_gumath/gumath/libgumath/libgumath.a +0 -0
  54. data/ext/ruby_gumath/gumath/libgumath/libgumath.so +1 -0
  55. data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0 +1 -0
  56. data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0.2.0dev3 +0 -0
  57. data/ext/ruby_gumath/gumath/libgumath/nploops.o +0 -0
  58. data/ext/ruby_gumath/gumath/libgumath/pdist.o +0 -0
  59. data/ext/ruby_gumath/gumath/libgumath/quaternion.o +0 -0
  60. data/ext/ruby_gumath/gumath/libgumath/tbl.o +0 -0
  61. data/ext/ruby_gumath/gumath/libgumath/thread.c +17 -4
  62. data/ext/ruby_gumath/gumath/libgumath/thread.o +0 -0
  63. data/ext/ruby_gumath/gumath/libgumath/xndloops.c +110 -0
  64. data/ext/ruby_gumath/gumath/libgumath/xndloops.o +0 -0
  65. data/ext/ruby_gumath/gumath/python/gumath/__init__.py +150 -0
  66. data/ext/ruby_gumath/gumath/python/gumath/_gumath.c +446 -80
  67. data/ext/ruby_gumath/gumath/python/gumath/cuda.c +78 -0
  68. data/ext/ruby_gumath/gumath/python/gumath/examples.c +0 -5
  69. data/ext/ruby_gumath/gumath/python/gumath/functions.c +2 -2
  70. data/ext/ruby_gumath/gumath/python/gumath/gumath.h +246 -0
  71. data/ext/ruby_gumath/gumath/python/gumath/libgumath.a +0 -0
  72. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so +1 -0
  73. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0 +1 -0
  74. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0.2.0dev3 +0 -0
  75. data/ext/ruby_gumath/gumath/python/gumath/pygumath.h +31 -2
  76. data/ext/ruby_gumath/gumath/python/gumath_aux.py +767 -0
  77. data/ext/ruby_gumath/gumath/python/randdec.py +535 -0
  78. data/ext/ruby_gumath/gumath/python/randfloat.py +177 -0
  79. data/ext/ruby_gumath/gumath/python/test_gumath.py +1504 -24
  80. data/ext/ruby_gumath/gumath/python/test_xndarray.py +462 -0
  81. data/ext/ruby_gumath/gumath/setup.py +67 -6
  82. data/ext/ruby_gumath/gumath/tools/detect_cuda_arch.cc +35 -0
  83. data/ext/ruby_gumath/include/gumath.h +55 -14
  84. data/ext/ruby_gumath/include/ruby_gumath.h +4 -1
  85. data/ext/ruby_gumath/lib/libgumath.a +0 -0
  86. data/ext/ruby_gumath/lib/libgumath.so.0.2.0dev3 +0 -0
  87. data/ext/ruby_gumath/ruby_gumath.c +231 -70
  88. data/ext/ruby_gumath/ruby_gumath.h +4 -1
  89. data/ext/ruby_gumath/ruby_gumath_internal.h +25 -0
  90. data/ext/ruby_gumath/util.c +34 -0
  91. data/ext/ruby_gumath/util.h +9 -0
  92. data/gumath.gemspec +3 -2
  93. data/lib/gumath.rb +55 -1
  94. data/lib/gumath/version.rb +2 -2
  95. data/lib/ruby_gumath.so +0 -0
  96. metadata +63 -10
  97. data/ext/ruby_gumath/gumath/libgumath/extending/bfloat16.c +0 -130
  98. data/ext/ruby_gumath/gumath/libgumath/kernels/binary.c +0 -547
  99. data/ext/ruby_gumath/gumath/libgumath/kernels/unary.c +0 -449
@@ -0,0 +1,1061 @@
1
+ /*
2
+ * BSD 3-Clause License
3
+ *
4
+ * Copyright (c) 2017-2018, plures
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ *
10
+ * 1. Redistributions of source code must retain the above copyright notice,
11
+ * this list of conditions and the following disclaimer.
12
+ *
13
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
14
+ * this list of conditions and the following disclaimer in the documentation
15
+ * and/or other materials provided with the distribution.
16
+ *
17
+ * 3. Neither the name of the copyright holder nor the names of its
18
+ * contributors may be used to endorse or promote products derived from
19
+ * this software without specific prior written permission.
20
+ *
21
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ */
32
+
33
+
34
+ #ifndef CUDA_DEVICE_BINARY_H
35
+ #define CUDA_DEVICE_BINARY_H
36
+
37
+ /* Element typedefs: the C++ branch defines float16_t, bfloat16_t, complex64_t and complex128_t; the C branch only includes <stdint.h>. */
38
+ #ifdef __cplusplus
39
+ #include <cinttypes>
40
+ #include <cuda_fp16.h>
41
+ #include <thrust/complex.h>
42
+ #include "contrib/bfloat16.h"
43
+
44
+ typedef half float16_t;
45
+ typedef tf::bfloat16 bfloat16_t;
46
+ typedef thrust::complex<float> complex64_t;
47
+ typedef thrust::complex<double> complex128_t;
48
+ #else /* C: none of the four element typedefs above are defined in this branch */
49
+ #include <stdint.h>
50
+ #endif /* __cplusplus */
51
+
52
+ /* Scalar aliases defined outside the __cplusplus conditional, so both C and C++ includers see them. */
53
+ typedef bool bool_t; /* NOTE(review): in C, `bool` needs <stdbool.h>, which the #else branch above does not include -- presumably the including file provides it; confirm. */
54
+ typedef float float32_t;
55
+ typedef double float64_t;
56
+
57
+
58
+ /*****************************************************************************/
59
+ /* Cuda device kernel signature */
60
+ /*****************************************************************************/
61
+ /* CUDA_DEVICE_BINARY_DECL(name, t0, t1, t2) declares three functions: fixed_1D_C (a0, a1, a2, N), fixed_1D_S (adds s0, s1, s2) and 0D (pointers only). "C"/"S" presumably mean contiguous/strided -- confirm. */
62
+ #ifdef __cplusplus /* C++ includers: same prototypes, declared extern "C" */
63
+ #define CUDA_DEVICE_BINARY_DECL(name, t0, t1, t2) \
64
+ extern "C" void gm_cuda_device_fixed_1D_C_##name##_##t0##_##t1##_##t2( \
65
+ const char *a0, const char *a1, char *a2, \
66
+ const int64_t N); \
67
+ extern "C" void gm_cuda_device_fixed_1D_S_##name##_##t0##_##t1##_##t2( \
68
+ const char *a0, const char *a1, char *a2, \
69
+ const int64_t s0, const int64_t s1, const int64_t s2, \
70
+ const int64_t N); \
71
+ extern "C" void gm_cuda_device_0D_##name##_##t0##_##t1##_##t2( \
72
+ const char *a0, const char *a1, char *a2);
73
+ /* CUDA_DEVICE_BINARY_MV_DECL declares only the fixed_1D_C form, with a fourth type suffix t3 and two non-const pointers a2, a3 (presumably two outputs -- confirm). */
74
+ #define CUDA_DEVICE_BINARY_MV_DECL(name, t0, t1, t2, t3) \
75
+ extern "C" void gm_cuda_device_fixed_1D_C_##name##_##t0##_##t1##_##t2##_##t3( \
76
+ const char *a0, const char *a1, char *a2, char *a3, \
77
+ const int64_t N);
78
+ #else /* C includers: identical prototypes without the extern "C" linkage specifier */
79
+ #define CUDA_DEVICE_BINARY_DECL(name, t0, t1, t2) \
80
+ void gm_cuda_device_fixed_1D_C_##name##_##t0##_##t1##_##t2( \
81
+ const char *a0, const char *a1, char *a2, \
82
+ const int64_t N); \
83
+ void gm_cuda_device_fixed_1D_S_##name##_##t0##_##t1##_##t2( \
84
+ const char *a0, const char *a1, char *a2, \
85
+ const int64_t s0, const int64_t s1, const int64_t s2, \
86
+ const int64_t N); \
87
+ void gm_cuda_device_0D_##name##_##t0##_##t1##_##t2( \
88
+ const char *a0, const char *a1, char *a2);
89
+ /* C counterpart of CUDA_DEVICE_BINARY_MV_DECL: one fixed_1D_C prototype taking a0, a1, a2, a3 and N. */
90
+ #define CUDA_DEVICE_BINARY_MV_DECL(name, t0, t1, t2, t3) \
91
+ void gm_cuda_device_fixed_1D_C_##name##_##t0##_##t1##_##t2##_##t3( \
92
+ const char *a0, const char *a1, char *a2, char *a3, \
93
+ const int64_t N);
94
+ #endif /* __cplusplus */
95
+ /* Both macros expand to nothing, so signature-table rows tagged NOKERN or NOIMPL declare no function in this header. */
96
+ #define CUDA_DEVICE_NOKERN_DECL(name, t0, t1, t2)
97
+ #define CUDA_DEVICE_NOIMPL_DECL(name, t0, t1, t2)
98
+
99
+
100
+ /*****************************************************************************/
101
+ /* Arithmetic */
102
+ /*****************************************************************************/
103
+ /* One CUDA_DEVICE_BINARY_DECL row per listed (t0, t1) -> t2 signature of `name`; every row involving complex32 uses CUDA_DEVICE_NOIMPL_DECL and therefore declares nothing. */
104
+ #define CUDA_DEVICE_BINARY_ARITHMETIC_DECL(name) \
105
+ CUDA_DEVICE_BINARY_DECL(name, uint8, uint8, uint8) \
106
+ CUDA_DEVICE_BINARY_DECL(name, uint8, uint16, uint16) \
107
+ CUDA_DEVICE_BINARY_DECL(name, uint8, uint32, uint32) \
108
+ CUDA_DEVICE_BINARY_DECL(name, uint8, uint64, uint64) \
109
+ CUDA_DEVICE_BINARY_DECL(name, uint8, int8, int16) \
110
+ CUDA_DEVICE_BINARY_DECL(name, uint8, int16, int16) \
111
+ CUDA_DEVICE_BINARY_DECL(name, uint8, int32, int32) \
112
+ CUDA_DEVICE_BINARY_DECL(name, uint8, int64, int64) \
113
+ CUDA_DEVICE_BINARY_DECL(name, uint8, bfloat16, bfloat16) \
114
+ CUDA_DEVICE_BINARY_DECL(name, uint8, float16, float16) \
115
+ CUDA_DEVICE_BINARY_DECL(name, uint8, float32, float32) \
116
+ CUDA_DEVICE_BINARY_DECL(name, uint8, float64, float64) \
117
+ CUDA_DEVICE_NOIMPL_DECL(name, uint8, complex32, complex32) \
118
+ CUDA_DEVICE_BINARY_DECL(name, uint8, complex64, complex64) \
119
+ CUDA_DEVICE_BINARY_DECL(name, uint8, complex128, complex128) \
120
+ \
121
+ CUDA_DEVICE_BINARY_DECL(name, uint16, uint8, uint16) \
122
+ CUDA_DEVICE_BINARY_DECL(name, uint16, uint16, uint16) \
123
+ CUDA_DEVICE_BINARY_DECL(name, uint16, uint32, uint32) \
124
+ CUDA_DEVICE_BINARY_DECL(name, uint16, uint64, uint64) \
125
+ CUDA_DEVICE_BINARY_DECL(name, uint16, int8, int32) \
126
+ CUDA_DEVICE_BINARY_DECL(name, uint16, int16, int32) \
127
+ CUDA_DEVICE_BINARY_DECL(name, uint16, int32, int32) \
128
+ CUDA_DEVICE_BINARY_DECL(name, uint16, int64, int64) \
129
+ CUDA_DEVICE_BINARY_DECL(name, uint16, bfloat16, float32) \
130
+ CUDA_DEVICE_BINARY_DECL(name, uint16, float16, float32) \
131
+ CUDA_DEVICE_BINARY_DECL(name, uint16, float32, float32) \
132
+ CUDA_DEVICE_BINARY_DECL(name, uint16, float64, float64) \
133
+ CUDA_DEVICE_NOIMPL_DECL(name, uint16, complex32, complex64) \
134
+ CUDA_DEVICE_BINARY_DECL(name, uint16, complex64, complex64) \
135
+ CUDA_DEVICE_BINARY_DECL(name, uint16, complex128, complex128) \
136
+ \
137
+ CUDA_DEVICE_BINARY_DECL(name, uint32, uint8, uint32) \
138
+ CUDA_DEVICE_BINARY_DECL(name, uint32, uint16, uint32) \
139
+ CUDA_DEVICE_BINARY_DECL(name, uint32, uint32, uint32) \
140
+ CUDA_DEVICE_BINARY_DECL(name, uint32, uint64, uint64) \
141
+ CUDA_DEVICE_BINARY_DECL(name, uint32, int8, int64) \
142
+ CUDA_DEVICE_BINARY_DECL(name, uint32, int16, int64) \
143
+ CUDA_DEVICE_BINARY_DECL(name, uint32, int32, int64) \
144
+ CUDA_DEVICE_BINARY_DECL(name, uint32, int64, int64) \
145
+ CUDA_DEVICE_BINARY_DECL(name, uint32, bfloat16, float64) \
146
+ CUDA_DEVICE_BINARY_DECL(name, uint32, float16, float64) \
147
+ CUDA_DEVICE_BINARY_DECL(name, uint32, float32, float64) \
148
+ CUDA_DEVICE_BINARY_DECL(name, uint32, float64, float64) \
149
+ CUDA_DEVICE_NOIMPL_DECL(name, uint32, complex32, complex128) \
150
+ CUDA_DEVICE_BINARY_DECL(name, uint32, complex64, complex128) \
151
+ CUDA_DEVICE_BINARY_DECL(name, uint32, complex128, complex128) \
152
+ \
153
+ CUDA_DEVICE_BINARY_DECL(name, uint64, uint8, uint64) \
154
+ CUDA_DEVICE_BINARY_DECL(name, uint64, uint16, uint64) \
155
+ CUDA_DEVICE_BINARY_DECL(name, uint64, uint32, uint64) \
156
+ CUDA_DEVICE_BINARY_DECL(name, uint64, uint64, uint64) \
157
+ \
158
+ CUDA_DEVICE_BINARY_DECL(name, int8, uint8, int16) \
159
+ CUDA_DEVICE_BINARY_DECL(name, int8, uint16, int32) \
160
+ CUDA_DEVICE_BINARY_DECL(name, int8, uint32, int64) \
161
+ CUDA_DEVICE_BINARY_DECL(name, int8, int8, int8) \
162
+ CUDA_DEVICE_BINARY_DECL(name, int8, int16, int16) \
163
+ CUDA_DEVICE_BINARY_DECL(name, int8, int32, int32) \
164
+ CUDA_DEVICE_BINARY_DECL(name, int8, int64, int64) \
165
+ CUDA_DEVICE_BINARY_DECL(name, int8, bfloat16, bfloat16) \
166
+ CUDA_DEVICE_BINARY_DECL(name, int8, float16, float16) \
167
+ CUDA_DEVICE_BINARY_DECL(name, int8, float32, float32) \
168
+ CUDA_DEVICE_BINARY_DECL(name, int8, float64, float64) \
169
+ CUDA_DEVICE_NOIMPL_DECL(name, int8, complex32, complex32) \
170
+ CUDA_DEVICE_BINARY_DECL(name, int8, complex64, complex64) \
171
+ CUDA_DEVICE_BINARY_DECL(name, int8, complex128, complex128) \
172
+ \
173
+ CUDA_DEVICE_BINARY_DECL(name, int16, uint8, int16) \
174
+ CUDA_DEVICE_BINARY_DECL(name, int16, uint16, int32) \
175
+ CUDA_DEVICE_BINARY_DECL(name, int16, uint32, int64) \
176
+ CUDA_DEVICE_BINARY_DECL(name, int16, int8, int16) \
177
+ CUDA_DEVICE_BINARY_DECL(name, int16, int16, int16) \
178
+ CUDA_DEVICE_BINARY_DECL(name, int16, int32, int32) \
179
+ CUDA_DEVICE_BINARY_DECL(name, int16, int64, int64) \
180
+ CUDA_DEVICE_BINARY_DECL(name, int16, bfloat16, float32) \
181
+ CUDA_DEVICE_BINARY_DECL(name, int16, float16, float32) \
182
+ CUDA_DEVICE_BINARY_DECL(name, int16, float32, float32) \
183
+ CUDA_DEVICE_BINARY_DECL(name, int16, float64, float64) \
184
+ CUDA_DEVICE_NOIMPL_DECL(name, int16, complex32, complex64) \
185
+ CUDA_DEVICE_BINARY_DECL(name, int16, complex64, complex64) \
186
+ CUDA_DEVICE_BINARY_DECL(name, int16, complex128, complex128) \
187
+ \
188
+ CUDA_DEVICE_BINARY_DECL(name, int32, uint8, int32) \
189
+ CUDA_DEVICE_BINARY_DECL(name, int32, uint16, int32) \
190
+ CUDA_DEVICE_BINARY_DECL(name, int32, uint32, int64) \
191
+ CUDA_DEVICE_BINARY_DECL(name, int32, int8, int32) \
192
+ CUDA_DEVICE_BINARY_DECL(name, int32, int16, int32) \
193
+ CUDA_DEVICE_BINARY_DECL(name, int32, int32, int32) \
194
+ CUDA_DEVICE_BINARY_DECL(name, int32, int64, int64) \
195
+ CUDA_DEVICE_BINARY_DECL(name, int32, bfloat16, float64) \
196
+ CUDA_DEVICE_BINARY_DECL(name, int32, float16, float64) \
197
+ CUDA_DEVICE_BINARY_DECL(name, int32, float32, float64) \
198
+ CUDA_DEVICE_BINARY_DECL(name, int32, float64, float64) \
199
+ CUDA_DEVICE_NOIMPL_DECL(name, int32, complex32, complex128) \
200
+ CUDA_DEVICE_BINARY_DECL(name, int32, complex64, complex128) \
201
+ CUDA_DEVICE_BINARY_DECL(name, int32, complex128, complex128) \
202
+ \
203
+ CUDA_DEVICE_BINARY_DECL(name, int64, uint8, int64) \
204
+ CUDA_DEVICE_BINARY_DECL(name, int64, uint16, int64) \
205
+ CUDA_DEVICE_BINARY_DECL(name, int64, uint32, int64) \
206
+ CUDA_DEVICE_BINARY_DECL(name, int64, int8, int64) \
207
+ CUDA_DEVICE_BINARY_DECL(name, int64, int16, int64) \
208
+ CUDA_DEVICE_BINARY_DECL(name, int64, int32, int64) \
209
+ CUDA_DEVICE_BINARY_DECL(name, int64, int64, int64) \
210
+ \
211
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, uint8, bfloat16) \
212
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, uint16, float32) \
213
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, uint32, float64) \
214
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, int8, bfloat16) \
215
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, int16, float32) \
216
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, int32, float64) \
217
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, bfloat16, bfloat16) \
218
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, float16, float32) \
219
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, float32, float32) \
220
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, float64, float64) \
221
+ CUDA_DEVICE_NOIMPL_DECL(name, bfloat16, complex32, complex64) \
222
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, complex64, complex64) \
223
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, complex128, complex128) \
224
+ \
225
+ CUDA_DEVICE_BINARY_DECL(name, float16, uint8, float16) \
226
+ CUDA_DEVICE_BINARY_DECL(name, float16, uint16, float32) \
227
+ CUDA_DEVICE_BINARY_DECL(name, float16, uint32, float64) \
228
+ CUDA_DEVICE_BINARY_DECL(name, float16, int8, float16) \
229
+ CUDA_DEVICE_BINARY_DECL(name, float16, int16, float32) \
230
+ CUDA_DEVICE_BINARY_DECL(name, float16, int32, float64) \
231
+ CUDA_DEVICE_BINARY_DECL(name, float16, bfloat16, float32) \
232
+ CUDA_DEVICE_BINARY_DECL(name, float16, float16, float16) \
233
+ CUDA_DEVICE_BINARY_DECL(name, float16, float32, float32) \
234
+ CUDA_DEVICE_BINARY_DECL(name, float16, float64, float64) \
235
+ CUDA_DEVICE_NOIMPL_DECL(name, float16, complex32, complex32) \
236
+ CUDA_DEVICE_BINARY_DECL(name, float16, complex64, complex64) \
237
+ CUDA_DEVICE_BINARY_DECL(name, float16, complex128, complex128) \
238
+ \
239
+ CUDA_DEVICE_BINARY_DECL(name, float32, uint8, float32) \
240
+ CUDA_DEVICE_BINARY_DECL(name, float32, uint16, float32) \
241
+ CUDA_DEVICE_BINARY_DECL(name, float32, uint32, float64) \
242
+ CUDA_DEVICE_BINARY_DECL(name, float32, int8, float32) \
243
+ CUDA_DEVICE_BINARY_DECL(name, float32, int16, float32) \
244
+ CUDA_DEVICE_BINARY_DECL(name, float32, int32, float64) \
245
+ CUDA_DEVICE_BINARY_DECL(name, float32, bfloat16, float32) \
246
+ CUDA_DEVICE_BINARY_DECL(name, float32, float16, float32) \
247
+ CUDA_DEVICE_BINARY_DECL(name, float32, float32, float32) \
248
+ CUDA_DEVICE_BINARY_DECL(name, float32, float64, float64) \
249
+ CUDA_DEVICE_NOIMPL_DECL(name, float32, complex32, complex64) \
250
+ CUDA_DEVICE_BINARY_DECL(name, float32, complex64, complex64) \
251
+ CUDA_DEVICE_BINARY_DECL(name, float32, complex128, complex128) \
252
+ \
253
+ CUDA_DEVICE_BINARY_DECL(name, float64, uint8, float64) \
254
+ CUDA_DEVICE_BINARY_DECL(name, float64, uint16, float64) \
255
+ CUDA_DEVICE_BINARY_DECL(name, float64, uint32, float64) \
256
+ CUDA_DEVICE_BINARY_DECL(name, float64, int8, float64) \
257
+ CUDA_DEVICE_BINARY_DECL(name, float64, int16, float64) \
258
+ CUDA_DEVICE_BINARY_DECL(name, float64, int32, float64) \
259
+ CUDA_DEVICE_BINARY_DECL(name, float64, bfloat16, float64) \
260
+ CUDA_DEVICE_BINARY_DECL(name, float64, float16, float64) \
261
+ CUDA_DEVICE_BINARY_DECL(name, float64, float32, float64) \
262
+ CUDA_DEVICE_BINARY_DECL(name, float64, float64, float64) \
263
+ CUDA_DEVICE_NOIMPL_DECL(name, float64, complex32, complex128) \
264
+ CUDA_DEVICE_BINARY_DECL(name, float64, complex64, complex128) \
265
+ CUDA_DEVICE_BINARY_DECL(name, float64, complex128, complex128) \
266
+ \
267
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, uint8, complex32) \
268
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, uint16, complex64) \
269
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, uint32, complex128) \
270
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, int8, complex32) \
271
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, int16, complex64) \
272
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, int32, complex128) \
273
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, bfloat16, complex64) \
274
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, float16, complex32) \
275
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, float32, complex64) \
276
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, float64, complex128) \
277
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, complex32, complex32) \
278
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, complex64, complex64) \
279
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, complex128, complex128) \
280
+ \
281
+ CUDA_DEVICE_BINARY_DECL(name, complex64, uint8, complex64) \
282
+ CUDA_DEVICE_BINARY_DECL(name, complex64, uint16, complex64) \
283
+ CUDA_DEVICE_BINARY_DECL(name, complex64, uint32, complex128) \
284
+ CUDA_DEVICE_BINARY_DECL(name, complex64, int8, complex64) \
285
+ CUDA_DEVICE_BINARY_DECL(name, complex64, int16, complex64) \
286
+ CUDA_DEVICE_BINARY_DECL(name, complex64, int32, complex128) \
287
+ CUDA_DEVICE_BINARY_DECL(name, complex64, bfloat16, complex64) \
288
+ CUDA_DEVICE_BINARY_DECL(name, complex64, float16, complex64) \
289
+ CUDA_DEVICE_BINARY_DECL(name, complex64, float32, complex64) \
290
+ CUDA_DEVICE_BINARY_DECL(name, complex64, float64, complex128) \
291
+ CUDA_DEVICE_NOIMPL_DECL(name, complex64, complex32, complex64) \
292
+ CUDA_DEVICE_BINARY_DECL(name, complex64, complex64, complex64) \
293
+ CUDA_DEVICE_BINARY_DECL(name, complex64, complex128, complex128) \
294
+ \
295
+ CUDA_DEVICE_BINARY_DECL(name, complex128, uint8, complex128) \
296
+ CUDA_DEVICE_BINARY_DECL(name, complex128, uint16, complex128) \
297
+ CUDA_DEVICE_BINARY_DECL(name, complex128, uint32, complex128) \
298
+ CUDA_DEVICE_BINARY_DECL(name, complex128, int8, complex128) \
299
+ CUDA_DEVICE_BINARY_DECL(name, complex128, int16, complex128) \
300
+ CUDA_DEVICE_BINARY_DECL(name, complex128, int32, complex128) \
301
+ CUDA_DEVICE_BINARY_DECL(name, complex128, bfloat16, complex128) \
302
+ CUDA_DEVICE_BINARY_DECL(name, complex128, float16, complex128) \
303
+ CUDA_DEVICE_BINARY_DECL(name, complex128, float32, complex128) \
304
+ CUDA_DEVICE_BINARY_DECL(name, complex128, float64, complex128) \
305
+ CUDA_DEVICE_NOIMPL_DECL(name, complex128, complex32, complex128) \
306
+ CUDA_DEVICE_BINARY_DECL(name, complex128, complex64, complex128) \
307
+ CUDA_DEVICE_BINARY_DECL(name, complex128, complex128, complex128)
308
+
309
+ #define CUDA_DEVICE_BINARY_ARITHMETIC_NO_COMPLEX_DECL(name) \
310
+ CUDA_DEVICE_BINARY_DECL(name, uint8, uint8, uint8) \
311
+ CUDA_DEVICE_BINARY_DECL(name, uint8, uint16, uint16) \
312
+ CUDA_DEVICE_BINARY_DECL(name, uint8, uint32, uint32) \
313
+ CUDA_DEVICE_BINARY_DECL(name, uint8, uint64, uint64) \
314
+ CUDA_DEVICE_BINARY_DECL(name, uint8, int8, int16) \
315
+ CUDA_DEVICE_BINARY_DECL(name, uint8, int16, int16) \
316
+ CUDA_DEVICE_BINARY_DECL(name, uint8, int32, int32) \
317
+ CUDA_DEVICE_BINARY_DECL(name, uint8, int64, int64) \
318
+ CUDA_DEVICE_BINARY_DECL(name, uint8, bfloat16, bfloat16) \
319
+ CUDA_DEVICE_NOIMPL_DECL(name, uint8, float16, float16) \
320
+ CUDA_DEVICE_BINARY_DECL(name, uint8, float32, float32) \
321
+ CUDA_DEVICE_BINARY_DECL(name, uint8, float64, float64) \
322
+ CUDA_DEVICE_NOKERN_DECL(name, uint8, complex32, complex32) \
323
+ CUDA_DEVICE_NOKERN_DECL(name, uint8, complex64, complex64) \
324
+ CUDA_DEVICE_NOKERN_DECL(name, uint8, complex128, complex128) \
325
+ \
326
+ CUDA_DEVICE_BINARY_DECL(name, uint16, uint8, uint16) \
327
+ CUDA_DEVICE_BINARY_DECL(name, uint16, uint16, uint16) \
328
+ CUDA_DEVICE_BINARY_DECL(name, uint16, uint32, uint32) \
329
+ CUDA_DEVICE_BINARY_DECL(name, uint16, uint64, uint64) \
330
+ CUDA_DEVICE_BINARY_DECL(name, uint16, int8, int32) \
331
+ CUDA_DEVICE_BINARY_DECL(name, uint16, int16, int32) \
332
+ CUDA_DEVICE_BINARY_DECL(name, uint16, int32, int32) \
333
+ CUDA_DEVICE_BINARY_DECL(name, uint16, int64, int64) \
334
+ CUDA_DEVICE_BINARY_DECL(name, uint16, bfloat16, float32) \
335
+ CUDA_DEVICE_BINARY_DECL(name, uint16, float16, float32) \
336
+ CUDA_DEVICE_BINARY_DECL(name, uint16, float32, float32) \
337
+ CUDA_DEVICE_BINARY_DECL(name, uint16, float64, float64) \
338
+ CUDA_DEVICE_NOKERN_DECL(name, uint16, complex32, complex64) \
339
+ CUDA_DEVICE_NOKERN_DECL(name, uint16, complex64, complex64) \
340
+ CUDA_DEVICE_NOKERN_DECL(name, uint16, complex128, complex128) \
341
+ \
342
+ CUDA_DEVICE_BINARY_DECL(name, uint32, uint8, uint32) \
343
+ CUDA_DEVICE_BINARY_DECL(name, uint32, uint16, uint32) \
344
+ CUDA_DEVICE_BINARY_DECL(name, uint32, uint32, uint32) \
345
+ CUDA_DEVICE_BINARY_DECL(name, uint32, uint64, uint64) \
346
+ CUDA_DEVICE_BINARY_DECL(name, uint32, int8, int64) \
347
+ CUDA_DEVICE_BINARY_DECL(name, uint32, int16, int64) \
348
+ CUDA_DEVICE_BINARY_DECL(name, uint32, int32, int64) \
349
+ CUDA_DEVICE_BINARY_DECL(name, uint32, int64, int64) \
350
+ CUDA_DEVICE_BINARY_DECL(name, uint32, bfloat16, float64) \
351
+ CUDA_DEVICE_BINARY_DECL(name, uint32, float16, float64) \
352
+ CUDA_DEVICE_BINARY_DECL(name, uint32, float32, float64) \
353
+ CUDA_DEVICE_BINARY_DECL(name, uint32, float64, float64) \
354
+ CUDA_DEVICE_NOKERN_DECL(name, uint32, complex32, complex128) \
355
+ CUDA_DEVICE_NOKERN_DECL(name, uint32, complex64, complex128) \
356
+ CUDA_DEVICE_NOKERN_DECL(name, uint32, complex128, complex128) \
357
+ \
358
+ CUDA_DEVICE_BINARY_DECL(name, uint64, uint8, uint64) \
359
+ CUDA_DEVICE_BINARY_DECL(name, uint64, uint16, uint64) \
360
+ CUDA_DEVICE_BINARY_DECL(name, uint64, uint32, uint64) \
361
+ CUDA_DEVICE_BINARY_DECL(name, uint64, uint64, uint64) \
362
+ \
363
+ CUDA_DEVICE_BINARY_DECL(name, int8, uint8, int16) \
364
+ CUDA_DEVICE_BINARY_DECL(name, int8, uint16, int32) \
365
+ CUDA_DEVICE_BINARY_DECL(name, int8, uint32, int64) \
366
+ CUDA_DEVICE_BINARY_DECL(name, int8, int8, int8) \
367
+ CUDA_DEVICE_BINARY_DECL(name, int8, int16, int16) \
368
+ CUDA_DEVICE_BINARY_DECL(name, int8, int32, int32) \
369
+ CUDA_DEVICE_BINARY_DECL(name, int8, int64, int64) \
370
+ CUDA_DEVICE_BINARY_DECL(name, int8, bfloat16, bfloat16) \
371
+ CUDA_DEVICE_NOIMPL_DECL(name, int8, float16, float16) \
372
+ CUDA_DEVICE_BINARY_DECL(name, int8, float32, float32) \
373
+ CUDA_DEVICE_BINARY_DECL(name, int8, float64, float64) \
374
+ CUDA_DEVICE_NOKERN_DECL(name, int8, complex32, complex32) \
375
+ CUDA_DEVICE_NOKERN_DECL(name, int8, complex64, complex64) \
376
+ CUDA_DEVICE_NOKERN_DECL(name, int8, complex128, complex128) \
377
+ \
378
+ CUDA_DEVICE_BINARY_DECL(name, int16, uint8, int16) \
379
+ CUDA_DEVICE_BINARY_DECL(name, int16, uint16, int32) \
380
+ CUDA_DEVICE_BINARY_DECL(name, int16, uint32, int64) \
381
+ CUDA_DEVICE_BINARY_DECL(name, int16, int8, int16) \
382
+ CUDA_DEVICE_BINARY_DECL(name, int16, int16, int16) \
383
+ CUDA_DEVICE_BINARY_DECL(name, int16, int32, int32) \
384
+ CUDA_DEVICE_BINARY_DECL(name, int16, int64, int64) \
385
+ CUDA_DEVICE_BINARY_DECL(name, int16, bfloat16, float32) \
386
+ CUDA_DEVICE_BINARY_DECL(name, int16, float16, float32) \
387
+ CUDA_DEVICE_BINARY_DECL(name, int16, float32, float32) \
388
+ CUDA_DEVICE_BINARY_DECL(name, int16, float64, float64) \
389
+ CUDA_DEVICE_NOKERN_DECL(name, int16, complex32, complex64) \
390
+ CUDA_DEVICE_NOKERN_DECL(name, int16, complex64, complex64) \
391
+ CUDA_DEVICE_NOKERN_DECL(name, int16, complex128, complex128) \
392
+ \
393
+ CUDA_DEVICE_BINARY_DECL(name, int32, uint8, int32) \
394
+ CUDA_DEVICE_BINARY_DECL(name, int32, uint16, int32) \
395
+ CUDA_DEVICE_BINARY_DECL(name, int32, uint32, int64) \
396
+ CUDA_DEVICE_BINARY_DECL(name, int32, int8, int32) \
397
+ CUDA_DEVICE_BINARY_DECL(name, int32, int16, int32) \
398
+ CUDA_DEVICE_BINARY_DECL(name, int32, int32, int32) \
399
+ CUDA_DEVICE_BINARY_DECL(name, int32, int64, int64) \
400
+ CUDA_DEVICE_BINARY_DECL(name, int32, bfloat16, float64) \
401
+ CUDA_DEVICE_BINARY_DECL(name, int32, float16, float64) \
402
+ CUDA_DEVICE_BINARY_DECL(name, int32, float32, float64) \
403
+ CUDA_DEVICE_BINARY_DECL(name, int32, float64, float64) \
404
+ CUDA_DEVICE_NOKERN_DECL(name, int32, complex32, complex128) \
405
+ CUDA_DEVICE_NOKERN_DECL(name, int32, complex64, complex128) \
406
+ CUDA_DEVICE_NOKERN_DECL(name, int32, complex128, complex128) \
407
+ \
408
+ CUDA_DEVICE_BINARY_DECL(name, int64, uint8, int64) \
409
+ CUDA_DEVICE_BINARY_DECL(name, int64, uint16, int64) \
410
+ CUDA_DEVICE_BINARY_DECL(name, int64, uint32, int64) \
411
+ CUDA_DEVICE_BINARY_DECL(name, int64, int8, int64) \
412
+ CUDA_DEVICE_BINARY_DECL(name, int64, int16, int64) \
413
+ CUDA_DEVICE_BINARY_DECL(name, int64, int32, int64) \
414
+ CUDA_DEVICE_BINARY_DECL(name, int64, int64, int64) \
415
+ \
416
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, uint8, bfloat16) \
417
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, uint16, float32) \
418
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, uint32, float64) \
419
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, int8, bfloat16) \
420
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, int16, float32) \
421
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, int32, float64) \
422
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, bfloat16, bfloat16) \
423
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, float16, float32) \
424
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, float32, float32) \
425
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, float64, float64) \
426
+ CUDA_DEVICE_NOKERN_DECL(name, bfloat16, complex32, complex64) \
427
+ CUDA_DEVICE_NOKERN_DECL(name, bfloat16, complex64, complex64) \
428
+ CUDA_DEVICE_NOKERN_DECL(name, bfloat16, complex128, complex128) \
429
+ \
430
+ CUDA_DEVICE_BINARY_DECL(name, float32, uint8, float32) \
431
+ CUDA_DEVICE_NOIMPL_DECL(name, float16, uint8, float16) \
432
+ CUDA_DEVICE_BINARY_DECL(name, float16, uint16, float32) \
433
+ CUDA_DEVICE_BINARY_DECL(name, float16, uint32, float64) \
434
+ CUDA_DEVICE_NOIMPL_DECL(name, float16, int8, float16) \
435
+ CUDA_DEVICE_BINARY_DECL(name, float16, int16, float32) \
436
+ CUDA_DEVICE_BINARY_DECL(name, float16, int32, float64) \
437
+ CUDA_DEVICE_BINARY_DECL(name, float16, bfloat16, float32) \
438
+ CUDA_DEVICE_NOIMPL_DECL(name, float16, float16, float16) \
439
+ CUDA_DEVICE_BINARY_DECL(name, float16, float32, float32) \
440
+ CUDA_DEVICE_BINARY_DECL(name, float16, float64, float64) \
441
+ CUDA_DEVICE_NOKERN_DECL(name, float16, complex32, complex32) \
442
+ CUDA_DEVICE_NOKERN_DECL(name, float16, complex64, complex64) \
443
+ CUDA_DEVICE_NOKERN_DECL(name, float16, complex128, complex128) \
444
+ \
445
+ CUDA_DEVICE_BINARY_DECL(name, float32, uint8, float32) \
446
+ CUDA_DEVICE_BINARY_DECL(name, float32, uint16, float32) \
447
+ CUDA_DEVICE_BINARY_DECL(name, float32, uint32, float64) \
448
+ CUDA_DEVICE_BINARY_DECL(name, float32, int8, float32) \
449
+ CUDA_DEVICE_BINARY_DECL(name, float32, int16, float32) \
450
+ CUDA_DEVICE_BINARY_DECL(name, float32, int32, float64) \
451
+ CUDA_DEVICE_BINARY_DECL(name, float32, bfloat16, float32) \
452
+ CUDA_DEVICE_BINARY_DECL(name, float32, float16, float32) \
453
+ CUDA_DEVICE_BINARY_DECL(name, float32, float32, float32) \
454
+ CUDA_DEVICE_BINARY_DECL(name, float32, float64, float64) \
455
+ CUDA_DEVICE_NOKERN_DECL(name, float32, complex32, complex64) \
456
+ CUDA_DEVICE_NOKERN_DECL(name, float32, complex64, complex64) \
457
+ CUDA_DEVICE_NOKERN_DECL(name, float32, complex128, complex128) \
458
+ \
459
+ CUDA_DEVICE_BINARY_DECL(name, float64, uint8, float64) \
460
+ CUDA_DEVICE_BINARY_DECL(name, float64, uint16, float64) \
461
+ CUDA_DEVICE_BINARY_DECL(name, float64, uint32, float64) \
462
+ CUDA_DEVICE_BINARY_DECL(name, float64, int8, float64) \
463
+ CUDA_DEVICE_BINARY_DECL(name, float64, int16, float64) \
464
+ CUDA_DEVICE_BINARY_DECL(name, float64, int32, float64) \
465
+ CUDA_DEVICE_BINARY_DECL(name, float64, bfloat16, float64) \
466
+ CUDA_DEVICE_BINARY_DECL(name, float64, float16, float64) \
467
+ CUDA_DEVICE_BINARY_DECL(name, float64, float32, float64) \
468
+ CUDA_DEVICE_BINARY_DECL(name, float64, float64, float64) \
469
+ CUDA_DEVICE_NOKERN_DECL(name, float64, complex32, complex128) \
470
+ CUDA_DEVICE_NOKERN_DECL(name, float64, complex64, complex128) \
471
+ CUDA_DEVICE_NOKERN_DECL(name, float64, complex128, complex128) \
472
+ \
473
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, uint8, complex32) \
474
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, uint16, complex64) \
475
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, uint32, complex128) \
476
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, int8, complex32) \
477
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, int16, complex64) \
478
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, int32, complex128) \
479
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, bfloat16, complex64) \
480
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, float16, complex32) \
481
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, float32, complex64) \
482
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, float64, complex128) \
483
+ CUDA_DEVICE_NOKERN_DECL(name, complex32, complex32, complex32) \
484
+ CUDA_DEVICE_NOKERN_DECL(name, complex32, complex64, complex64) \
485
+ CUDA_DEVICE_NOKERN_DECL(name, complex32, complex128, complex128) \
486
+ \
487
+ CUDA_DEVICE_BINARY_DECL(name, complex64, uint8, complex64) \
488
+ CUDA_DEVICE_BINARY_DECL(name, complex64, uint16, complex64) \
489
+ CUDA_DEVICE_BINARY_DECL(name, complex64, uint32, complex128) \
490
+ CUDA_DEVICE_BINARY_DECL(name, complex64, int8, complex64) \
491
+ CUDA_DEVICE_BINARY_DECL(name, complex64, int16, complex64) \
492
+ CUDA_DEVICE_BINARY_DECL(name, complex64, int32, complex128) \
493
+ CUDA_DEVICE_BINARY_DECL(name, complex64, bfloat16, complex64) \
494
+ CUDA_DEVICE_BINARY_DECL(name, complex64, float16, complex64) \
495
+ CUDA_DEVICE_BINARY_DECL(name, complex64, float32, complex64) \
496
+ CUDA_DEVICE_BINARY_DECL(name, complex64, float64, complex128) \
497
+ CUDA_DEVICE_NOKERN_DECL(name, complex64, complex32, complex64) \
498
+ CUDA_DEVICE_NOKERN_DECL(name, complex64, complex64, complex64) \
499
+ CUDA_DEVICE_NOKERN_DECL(name, complex64, complex128, complex128) \
500
+ \
501
+ CUDA_DEVICE_BINARY_DECL(name, complex128, uint8, complex128) \
502
+ CUDA_DEVICE_BINARY_DECL(name, complex128, uint16, complex128) \
503
+ CUDA_DEVICE_BINARY_DECL(name, complex128, uint32, complex128) \
504
+ CUDA_DEVICE_BINARY_DECL(name, complex128, int8, complex128) \
505
+ CUDA_DEVICE_BINARY_DECL(name, complex128, int16, complex128) \
506
+ CUDA_DEVICE_BINARY_DECL(name, complex128, int32, complex128) \
507
+ CUDA_DEVICE_BINARY_DECL(name, complex128, bfloat16, complex128) \
508
+ CUDA_DEVICE_BINARY_DECL(name, complex128, float16, complex128) \
509
+ CUDA_DEVICE_BINARY_DECL(name, complex128, float32, complex128) \
510
+ CUDA_DEVICE_BINARY_DECL(name, complex128, float64, complex128) \
511
+ CUDA_DEVICE_NOKERN_DECL(name, complex128, complex32, complex128) \
512
+ CUDA_DEVICE_NOKERN_DECL(name, complex128, complex64, complex128) \
513
+ CUDA_DEVICE_NOKERN_DECL(name, complex128, complex128, complex128)
514
+
515
+ #define CUDA_DEVICE_BINARY_ARITHMETIC_FLOAT_RETURN_DECL(name) /* Declaration table for `name`: one entry per listed pair of type arguments; integer/integer pairs declared via CUDA_DEVICE_BINARY_DECL carry a floating-point third type. Rows are grouped by first type argument, starting with uint8. */ \
516
+ CUDA_DEVICE_BINARY_DECL(name, uint8, uint8, float16) \
517
+ CUDA_DEVICE_BINARY_DECL(name, uint8, uint16, float32) \
518
+ CUDA_DEVICE_BINARY_DECL(name, uint8, uint32, float64) \
519
+ CUDA_DEVICE_NOKERN_DECL(name, uint8, uint64, uint64) \
520
+ CUDA_DEVICE_BINARY_DECL(name, uint8, int8, float16) \
521
+ CUDA_DEVICE_BINARY_DECL(name, uint8, int16, float32) \
522
+ CUDA_DEVICE_BINARY_DECL(name, uint8, int32, float64) \
523
+ CUDA_DEVICE_NOKERN_DECL(name, uint8, int64, int64) \
524
+ CUDA_DEVICE_BINARY_DECL(name, uint8, bfloat16, bfloat16) \
525
+ CUDA_DEVICE_BINARY_DECL(name, uint8, float16, float16) \
526
+ CUDA_DEVICE_BINARY_DECL(name, uint8, float32, float32) \
527
+ CUDA_DEVICE_BINARY_DECL(name, uint8, float64, float64) \
528
+ CUDA_DEVICE_NOIMPL_DECL(name, uint8, complex32, complex32) \
529
+ CUDA_DEVICE_BINARY_DECL(name, uint8, complex64, complex64) \
530
+ CUDA_DEVICE_BINARY_DECL(name, uint8, complex128, complex128) \
531
+ /* rows whose first type argument is uint16 */ \
532
+ CUDA_DEVICE_BINARY_DECL(name, uint16, uint8, float32) \
533
+ CUDA_DEVICE_BINARY_DECL(name, uint16, uint16, float32) \
534
+ CUDA_DEVICE_BINARY_DECL(name, uint16, uint32, float64) \
535
+ CUDA_DEVICE_NOKERN_DECL(name, uint16, uint64, uint64) \
536
+ CUDA_DEVICE_BINARY_DECL(name, uint16, int8, float32) \
537
+ CUDA_DEVICE_BINARY_DECL(name, uint16, int16, float32) \
538
+ CUDA_DEVICE_BINARY_DECL(name, uint16, int32, float64) \
539
+ CUDA_DEVICE_NOKERN_DECL(name, uint16, int64, int64) \
540
+ CUDA_DEVICE_BINARY_DECL(name, uint16, bfloat16, float32) \
541
+ CUDA_DEVICE_BINARY_DECL(name, uint16, float16, float32) \
542
+ CUDA_DEVICE_BINARY_DECL(name, uint16, float32, float32) \
543
+ CUDA_DEVICE_BINARY_DECL(name, uint16, float64, float64) \
544
+ CUDA_DEVICE_NOIMPL_DECL(name, uint16, complex32, complex64) \
545
+ CUDA_DEVICE_BINARY_DECL(name, uint16, complex64, complex64) \
546
+ CUDA_DEVICE_BINARY_DECL(name, uint16, complex128, complex128) \
547
+ /* rows whose first type argument is uint32 */ \
548
+ CUDA_DEVICE_BINARY_DECL(name, uint32, uint8, float64) \
549
+ CUDA_DEVICE_BINARY_DECL(name, uint32, uint16, float64) \
550
+ CUDA_DEVICE_BINARY_DECL(name, uint32, uint32, float64) \
551
+ CUDA_DEVICE_NOKERN_DECL(name, uint32, uint64, uint64) \
552
+ CUDA_DEVICE_BINARY_DECL(name, uint32, int8, float64) \
553
+ CUDA_DEVICE_BINARY_DECL(name, uint32, int16, float64) \
554
+ CUDA_DEVICE_BINARY_DECL(name, uint32, int32, float64) \
555
+ CUDA_DEVICE_NOKERN_DECL(name, uint32, int64, int64) \
556
+ CUDA_DEVICE_BINARY_DECL(name, uint32, bfloat16, float64) \
557
+ CUDA_DEVICE_BINARY_DECL(name, uint32, float16, float64) \
558
+ CUDA_DEVICE_BINARY_DECL(name, uint32, float32, float64) \
559
+ CUDA_DEVICE_BINARY_DECL(name, uint32, float64, float64) \
560
+ CUDA_DEVICE_NOIMPL_DECL(name, uint32, complex32, complex128) \
561
+ CUDA_DEVICE_BINARY_DECL(name, uint32, complex64, complex128) \
562
+ CUDA_DEVICE_BINARY_DECL(name, uint32, complex128, complex128) \
563
+ /* rows whose first type argument is uint64: only unsigned partners are listed, all via CUDA_DEVICE_NOKERN_DECL */ \
564
+ CUDA_DEVICE_NOKERN_DECL(name, uint64, uint8, uint64) \
565
+ CUDA_DEVICE_NOKERN_DECL(name, uint64, uint16, uint64) \
566
+ CUDA_DEVICE_NOKERN_DECL(name, uint64, uint32, uint64) \
567
+ CUDA_DEVICE_NOKERN_DECL(name, uint64, uint64, uint64) \
568
+ /* rows whose first type argument is int8 (no uint64 partner is listed) */ \
569
+ CUDA_DEVICE_BINARY_DECL(name, int8, uint8, float16) \
570
+ CUDA_DEVICE_BINARY_DECL(name, int8, uint16, float32) \
571
+ CUDA_DEVICE_BINARY_DECL(name, int8, uint32, float64) \
572
+ CUDA_DEVICE_BINARY_DECL(name, int8, int8, float16) \
573
+ CUDA_DEVICE_BINARY_DECL(name, int8, int16, float32) \
574
+ CUDA_DEVICE_BINARY_DECL(name, int8, int32, float64) \
575
+ CUDA_DEVICE_NOKERN_DECL(name, int8, int64, int64) \
576
+ CUDA_DEVICE_BINARY_DECL(name, int8, bfloat16, bfloat16) \
577
+ CUDA_DEVICE_BINARY_DECL(name, int8, float16, float16) \
578
+ CUDA_DEVICE_BINARY_DECL(name, int8, float32, float32) \
579
+ CUDA_DEVICE_BINARY_DECL(name, int8, float64, float64) \
580
+ CUDA_DEVICE_NOIMPL_DECL(name, int8, complex32, complex32) \
581
+ CUDA_DEVICE_BINARY_DECL(name, int8, complex64, complex64) \
582
+ CUDA_DEVICE_BINARY_DECL(name, int8, complex128, complex128) \
583
+ /* rows whose first type argument is int16 */ \
584
+ CUDA_DEVICE_BINARY_DECL(name, int16, uint8, float32) \
585
+ CUDA_DEVICE_BINARY_DECL(name, int16, uint16, float32) \
586
+ CUDA_DEVICE_BINARY_DECL(name, int16, uint32, float64) \
587
+ CUDA_DEVICE_BINARY_DECL(name, int16, int8, float32) \
588
+ CUDA_DEVICE_BINARY_DECL(name, int16, int16, float32) \
589
+ CUDA_DEVICE_BINARY_DECL(name, int16, int32, float64) \
590
+ CUDA_DEVICE_NOKERN_DECL(name, int16, int64, int64) \
591
+ CUDA_DEVICE_BINARY_DECL(name, int16, bfloat16, float32) \
592
+ CUDA_DEVICE_BINARY_DECL(name, int16, float16, float32) \
593
+ CUDA_DEVICE_BINARY_DECL(name, int16, float32, float32) \
594
+ CUDA_DEVICE_BINARY_DECL(name, int16, float64, float64) \
595
+ CUDA_DEVICE_NOIMPL_DECL(name, int16, complex32, complex64) \
596
+ CUDA_DEVICE_BINARY_DECL(name, int16, complex64, complex64) \
597
+ CUDA_DEVICE_BINARY_DECL(name, int16, complex128, complex128) \
598
+ /* rows whose first type argument is int32 */ \
599
+ CUDA_DEVICE_BINARY_DECL(name, int32, uint8, float64) \
600
+ CUDA_DEVICE_BINARY_DECL(name, int32, uint16, float64) \
601
+ CUDA_DEVICE_BINARY_DECL(name, int32, uint32, float64) \
602
+ CUDA_DEVICE_BINARY_DECL(name, int32, int8, float64) \
603
+ CUDA_DEVICE_BINARY_DECL(name, int32, int16, float64) \
604
+ CUDA_DEVICE_BINARY_DECL(name, int32, int32, float64) \
605
+ CUDA_DEVICE_NOKERN_DECL(name, int32, int64, int64) \
606
+ CUDA_DEVICE_BINARY_DECL(name, int32, bfloat16, float64) \
607
+ CUDA_DEVICE_BINARY_DECL(name, int32, float16, float64) \
608
+ CUDA_DEVICE_BINARY_DECL(name, int32, float32, float64) \
609
+ CUDA_DEVICE_BINARY_DECL(name, int32, float64, float64) \
610
+ CUDA_DEVICE_NOIMPL_DECL(name, int32, complex32, complex128) \
611
+ CUDA_DEVICE_BINARY_DECL(name, int32, complex64, complex128) \
612
+ CUDA_DEVICE_BINARY_DECL(name, int32, complex128, complex128) \
613
+ /* rows whose first type argument is int64: only integer partners up to 32 bits unsigned / 64 bits signed, all via CUDA_DEVICE_NOKERN_DECL */ \
614
+ CUDA_DEVICE_NOKERN_DECL(name, int64, uint8, int64) \
615
+ CUDA_DEVICE_NOKERN_DECL(name, int64, uint16, int64) \
616
+ CUDA_DEVICE_NOKERN_DECL(name, int64, uint32, int64) \
617
+ CUDA_DEVICE_NOKERN_DECL(name, int64, int8, int64) \
618
+ CUDA_DEVICE_NOKERN_DECL(name, int64, int16, int64) \
619
+ CUDA_DEVICE_NOKERN_DECL(name, int64, int32, int64) \
620
+ CUDA_DEVICE_NOKERN_DECL(name, int64, int64, int64) \
621
+ /* rows whose first type argument is bfloat16 (no 64-bit integer partners are listed) */ \
622
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, uint8, bfloat16) \
623
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, uint16, float32) \
624
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, uint32, float64) \
625
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, int8, bfloat16) \
626
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, int16, float32) \
627
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, int32, float64) \
628
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, bfloat16, bfloat16) \
629
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, float16, float32) \
630
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, float32, float32) \
631
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, float64, float64) \
632
+ CUDA_DEVICE_NOIMPL_DECL(name, bfloat16, complex32, complex64) \
633
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, complex64, complex64) \
634
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, complex128, complex128) \
635
+ /* rows whose first type argument is float16 */ \
636
+ CUDA_DEVICE_BINARY_DECL(name, float16, uint8, float16) \
637
+ CUDA_DEVICE_BINARY_DECL(name, float16, uint16, float32) \
638
+ CUDA_DEVICE_BINARY_DECL(name, float16, uint32, float64) \
639
+ CUDA_DEVICE_BINARY_DECL(name, float16, int8, float16) \
640
+ CUDA_DEVICE_BINARY_DECL(name, float16, int16, float32) \
641
+ CUDA_DEVICE_BINARY_DECL(name, float16, int32, float64) \
642
+ CUDA_DEVICE_BINARY_DECL(name, float16, bfloat16, float32) \
643
+ CUDA_DEVICE_BINARY_DECL(name, float16, float16, float16) \
644
+ CUDA_DEVICE_BINARY_DECL(name, float16, float32, float32) \
645
+ CUDA_DEVICE_BINARY_DECL(name, float16, float64, float64) \
646
+ CUDA_DEVICE_NOIMPL_DECL(name, float16, complex32, complex32) \
647
+ CUDA_DEVICE_BINARY_DECL(name, float16, complex64, complex64) \
648
+ CUDA_DEVICE_BINARY_DECL(name, float16, complex128, complex128) \
649
+ /* rows whose first type argument is float32 */ \
650
+ CUDA_DEVICE_BINARY_DECL(name, float32, uint8, float32) \
651
+ CUDA_DEVICE_BINARY_DECL(name, float32, uint16, float32) \
652
+ CUDA_DEVICE_BINARY_DECL(name, float32, uint32, float64) \
653
+ CUDA_DEVICE_BINARY_DECL(name, float32, int8, float32) \
654
+ CUDA_DEVICE_BINARY_DECL(name, float32, int16, float32) \
655
+ CUDA_DEVICE_BINARY_DECL(name, float32, int32, float64) \
656
+ CUDA_DEVICE_BINARY_DECL(name, float32, bfloat16, float32) \
657
+ CUDA_DEVICE_BINARY_DECL(name, float32, float16, float32) \
658
+ CUDA_DEVICE_BINARY_DECL(name, float32, float32, float32) \
659
+ CUDA_DEVICE_BINARY_DECL(name, float32, float64, float64) \
660
+ CUDA_DEVICE_NOIMPL_DECL(name, float32, complex32, complex64) \
661
+ CUDA_DEVICE_BINARY_DECL(name, float32, complex64, complex64) \
662
+ CUDA_DEVICE_BINARY_DECL(name, float32, complex128, complex128) \
663
+ /* rows whose first type argument is float64 */ \
664
+ CUDA_DEVICE_BINARY_DECL(name, float64, uint8, float64) \
665
+ CUDA_DEVICE_BINARY_DECL(name, float64, uint16, float64) \
666
+ CUDA_DEVICE_BINARY_DECL(name, float64, uint32, float64) \
667
+ CUDA_DEVICE_BINARY_DECL(name, float64, int8, float64) \
668
+ CUDA_DEVICE_BINARY_DECL(name, float64, int16, float64) \
669
+ CUDA_DEVICE_BINARY_DECL(name, float64, int32, float64) \
670
+ CUDA_DEVICE_BINARY_DECL(name, float64, bfloat16, float64) \
671
+ CUDA_DEVICE_BINARY_DECL(name, float64, float16, float64) \
672
+ CUDA_DEVICE_BINARY_DECL(name, float64, float32, float64) \
673
+ CUDA_DEVICE_BINARY_DECL(name, float64, float64, float64) \
674
+ CUDA_DEVICE_NOIMPL_DECL(name, float64, complex32, complex128) \
675
+ CUDA_DEVICE_BINARY_DECL(name, float64, complex64, complex128) \
676
+ CUDA_DEVICE_BINARY_DECL(name, float64, complex128, complex128) \
677
+ /* rows whose first type argument is complex32: every entry uses CUDA_DEVICE_NOIMPL_DECL */ \
678
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, uint8, complex32) \
679
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, uint16, complex64) \
680
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, uint32, complex128) \
681
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, int8, complex32) \
682
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, int16, complex64) \
683
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, int32, complex128) \
684
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, bfloat16, complex64) \
685
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, float16, complex32) \
686
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, float32, complex64) \
687
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, float64, complex128) \
688
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, complex32, complex32) \
689
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, complex64, complex64) \
690
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, complex128, complex128) \
691
+ /* rows whose first type argument is complex64 */ \
692
+ CUDA_DEVICE_BINARY_DECL(name, complex64, uint8, complex64) \
693
+ CUDA_DEVICE_BINARY_DECL(name, complex64, uint16, complex64) \
694
+ CUDA_DEVICE_BINARY_DECL(name, complex64, uint32, complex128) \
695
+ CUDA_DEVICE_BINARY_DECL(name, complex64, int8, complex64) \
696
+ CUDA_DEVICE_BINARY_DECL(name, complex64, int16, complex64) \
697
+ CUDA_DEVICE_BINARY_DECL(name, complex64, int32, complex128) \
698
+ CUDA_DEVICE_BINARY_DECL(name, complex64, bfloat16, complex64) \
699
+ CUDA_DEVICE_BINARY_DECL(name, complex64, float16, complex64) \
700
+ CUDA_DEVICE_BINARY_DECL(name, complex64, float32, complex64) \
701
+ CUDA_DEVICE_BINARY_DECL(name, complex64, float64, complex128) \
702
+ CUDA_DEVICE_NOIMPL_DECL(name, complex64, complex32, complex64) \
703
+ CUDA_DEVICE_BINARY_DECL(name, complex64, complex64, complex64) \
704
+ CUDA_DEVICE_BINARY_DECL(name, complex64, complex128, complex128) \
705
+ /* rows whose first type argument is complex128 */ \
706
+ CUDA_DEVICE_BINARY_DECL(name, complex128, uint8, complex128) \
707
+ CUDA_DEVICE_BINARY_DECL(name, complex128, uint16, complex128) \
708
+ CUDA_DEVICE_BINARY_DECL(name, complex128, uint32, complex128) \
709
+ CUDA_DEVICE_BINARY_DECL(name, complex128, int8, complex128) \
710
+ CUDA_DEVICE_BINARY_DECL(name, complex128, int16, complex128) \
711
+ CUDA_DEVICE_BINARY_DECL(name, complex128, int32, complex128) \
712
+ CUDA_DEVICE_BINARY_DECL(name, complex128, bfloat16, complex128) \
713
+ CUDA_DEVICE_BINARY_DECL(name, complex128, float16, complex128) \
714
+ CUDA_DEVICE_BINARY_DECL(name, complex128, float32, complex128) \
715
+ CUDA_DEVICE_BINARY_DECL(name, complex128, float64, complex128) \
716
+ CUDA_DEVICE_NOIMPL_DECL(name, complex128, complex32, complex128) \
717
+ CUDA_DEVICE_BINARY_DECL(name, complex128, complex64, complex128) \
718
+ CUDA_DEVICE_BINARY_DECL(name, complex128, complex128, complex128)
719
+
720
+
721
+ CUDA_DEVICE_BINARY_ARITHMETIC_DECL(add) /* each line below expands one declaration table for the named binary arithmetic function */
722
+ CUDA_DEVICE_BINARY_ARITHMETIC_DECL(subtract)
723
+ CUDA_DEVICE_BINARY_ARITHMETIC_DECL(multiply)
724
+ CUDA_DEVICE_BINARY_ARITHMETIC_NO_COMPLEX_DECL(floor_divide) /* NO_COMPLEX table variant; presumably has no complex kernels -- TODO confirm against its definition */
725
+ CUDA_DEVICE_BINARY_ARITHMETIC_NO_COMPLEX_DECL(remainder)
726
+ CUDA_DEVICE_BINARY_ARITHMETIC_FLOAT_RETURN_DECL(divide) /* FLOAT_RETURN table: integer/integer pairs declared with a floating-point third type */
727
+ CUDA_DEVICE_BINARY_ARITHMETIC_DECL(power)
728
+
729
+
730
+ /*****************************************************************************/
731
+ /* Comparison */
732
+ /*****************************************************************************/
733
+
734
+ #define CUDA_DEVICE_ALL_COMPARISON_DECL(name) /* Declaration table for comparison function `name`: every entry has bool as its last type argument; pairings involving complex32 use CUDA_DEVICE_NOIMPL_DECL. Rows are grouped by first type argument, starting with uint8. */ \
735
+ CUDA_DEVICE_BINARY_DECL(name, uint8, uint8, bool) \
736
+ CUDA_DEVICE_BINARY_DECL(name, uint8, uint16, bool) \
737
+ CUDA_DEVICE_BINARY_DECL(name, uint8, uint32, bool) \
738
+ CUDA_DEVICE_BINARY_DECL(name, uint8, uint64, bool) \
739
+ CUDA_DEVICE_BINARY_DECL(name, uint8, int8, bool) \
740
+ CUDA_DEVICE_BINARY_DECL(name, uint8, int16, bool) \
741
+ CUDA_DEVICE_BINARY_DECL(name, uint8, int32, bool) \
742
+ CUDA_DEVICE_BINARY_DECL(name, uint8, int64, bool) \
743
+ CUDA_DEVICE_BINARY_DECL(name, uint8, bfloat16, bool) \
744
+ CUDA_DEVICE_BINARY_DECL(name, uint8, float16, bool) \
745
+ CUDA_DEVICE_BINARY_DECL(name, uint8, float32, bool) \
746
+ CUDA_DEVICE_BINARY_DECL(name, uint8, float64, bool) \
747
+ CUDA_DEVICE_NOIMPL_DECL(name, uint8, complex32, bool) \
748
+ CUDA_DEVICE_BINARY_DECL(name, uint8, complex64, bool) \
749
+ CUDA_DEVICE_BINARY_DECL(name, uint8, complex128, bool) \
750
+ /* rows whose first type argument is uint16 */ \
751
+ CUDA_DEVICE_BINARY_DECL(name, uint16, uint8, bool) \
752
+ CUDA_DEVICE_BINARY_DECL(name, uint16, uint16, bool) \
753
+ CUDA_DEVICE_BINARY_DECL(name, uint16, uint32, bool) \
754
+ CUDA_DEVICE_BINARY_DECL(name, uint16, uint64, bool) \
755
+ CUDA_DEVICE_BINARY_DECL(name, uint16, int8, bool) \
756
+ CUDA_DEVICE_BINARY_DECL(name, uint16, int16, bool) \
757
+ CUDA_DEVICE_BINARY_DECL(name, uint16, int32, bool) \
758
+ CUDA_DEVICE_BINARY_DECL(name, uint16, int64, bool) \
759
+ CUDA_DEVICE_BINARY_DECL(name, uint16, bfloat16, bool) \
760
+ CUDA_DEVICE_BINARY_DECL(name, uint16, float16, bool) \
761
+ CUDA_DEVICE_BINARY_DECL(name, uint16, float32, bool) \
762
+ CUDA_DEVICE_BINARY_DECL(name, uint16, float64, bool) \
763
+ CUDA_DEVICE_NOIMPL_DECL(name, uint16, complex32, bool) \
764
+ CUDA_DEVICE_BINARY_DECL(name, uint16, complex64, bool) \
765
+ CUDA_DEVICE_BINARY_DECL(name, uint16, complex128, bool) \
766
+ /* rows whose first type argument is uint32 */ \
767
+ CUDA_DEVICE_BINARY_DECL(name, uint32, uint8, bool) \
768
+ CUDA_DEVICE_BINARY_DECL(name, uint32, uint16, bool) \
769
+ CUDA_DEVICE_BINARY_DECL(name, uint32, uint32, bool) \
770
+ CUDA_DEVICE_BINARY_DECL(name, uint32, uint64, bool) \
771
+ CUDA_DEVICE_BINARY_DECL(name, uint32, int8, bool) \
772
+ CUDA_DEVICE_BINARY_DECL(name, uint32, int16, bool) \
773
+ CUDA_DEVICE_BINARY_DECL(name, uint32, int32, bool) \
774
+ CUDA_DEVICE_BINARY_DECL(name, uint32, int64, bool) \
775
+ CUDA_DEVICE_BINARY_DECL(name, uint32, bfloat16, bool) \
776
+ CUDA_DEVICE_BINARY_DECL(name, uint32, float16, bool) \
777
+ CUDA_DEVICE_BINARY_DECL(name, uint32, float32, bool) \
778
+ CUDA_DEVICE_BINARY_DECL(name, uint32, float64, bool) \
779
+ CUDA_DEVICE_NOIMPL_DECL(name, uint32, complex32, bool) \
780
+ CUDA_DEVICE_BINARY_DECL(name, uint32, complex64, bool) \
781
+ CUDA_DEVICE_BINARY_DECL(name, uint32, complex128, bool) \
782
+ /* rows whose first type argument is uint64: only unsigned partners are listed */ \
783
+ CUDA_DEVICE_BINARY_DECL(name, uint64, uint8, bool) \
784
+ CUDA_DEVICE_BINARY_DECL(name, uint64, uint16, bool) \
785
+ CUDA_DEVICE_BINARY_DECL(name, uint64, uint32, bool) \
786
+ CUDA_DEVICE_BINARY_DECL(name, uint64, uint64, bool) \
787
+ /* rows whose first type argument is int8 (no uint64 partner is listed) */ \
788
+ CUDA_DEVICE_BINARY_DECL(name, int8, uint8, bool) \
789
+ CUDA_DEVICE_BINARY_DECL(name, int8, uint16, bool) \
790
+ CUDA_DEVICE_BINARY_DECL(name, int8, uint32, bool) \
791
+ CUDA_DEVICE_BINARY_DECL(name, int8, int8, bool) \
792
+ CUDA_DEVICE_BINARY_DECL(name, int8, int16, bool) \
793
+ CUDA_DEVICE_BINARY_DECL(name, int8, int32, bool) \
794
+ CUDA_DEVICE_BINARY_DECL(name, int8, int64, bool) \
795
+ CUDA_DEVICE_BINARY_DECL(name, int8, bfloat16, bool) \
796
+ CUDA_DEVICE_BINARY_DECL(name, int8, float16, bool) \
797
+ CUDA_DEVICE_BINARY_DECL(name, int8, float32, bool) \
798
+ CUDA_DEVICE_BINARY_DECL(name, int8, float64, bool) \
799
+ CUDA_DEVICE_NOIMPL_DECL(name, int8, complex32, bool) \
800
+ CUDA_DEVICE_BINARY_DECL(name, int8, complex64, bool) \
801
+ CUDA_DEVICE_BINARY_DECL(name, int8, complex128, bool) \
802
+ /* rows whose first type argument is int16 */ \
803
+ CUDA_DEVICE_BINARY_DECL(name, int16, uint8, bool) \
804
+ CUDA_DEVICE_BINARY_DECL(name, int16, uint16, bool) \
805
+ CUDA_DEVICE_BINARY_DECL(name, int16, uint32, bool) \
806
+ CUDA_DEVICE_BINARY_DECL(name, int16, int8, bool) \
807
+ CUDA_DEVICE_BINARY_DECL(name, int16, int16, bool) \
808
+ CUDA_DEVICE_BINARY_DECL(name, int16, int32, bool) \
809
+ CUDA_DEVICE_BINARY_DECL(name, int16, int64, bool) \
810
+ CUDA_DEVICE_BINARY_DECL(name, int16, bfloat16, bool) \
811
+ CUDA_DEVICE_BINARY_DECL(name, int16, float16, bool) \
812
+ CUDA_DEVICE_BINARY_DECL(name, int16, float32, bool) \
813
+ CUDA_DEVICE_BINARY_DECL(name, int16, float64, bool) \
814
+ CUDA_DEVICE_NOIMPL_DECL(name, int16, complex32, bool) \
815
+ CUDA_DEVICE_BINARY_DECL(name, int16, complex64, bool) \
816
+ CUDA_DEVICE_BINARY_DECL(name, int16, complex128, bool) \
817
+ /* rows whose first type argument is int32 */ \
818
+ CUDA_DEVICE_BINARY_DECL(name, int32, uint8, bool) \
819
+ CUDA_DEVICE_BINARY_DECL(name, int32, uint16, bool) \
820
+ CUDA_DEVICE_BINARY_DECL(name, int32, uint32, bool) \
821
+ CUDA_DEVICE_BINARY_DECL(name, int32, int8, bool) \
822
+ CUDA_DEVICE_BINARY_DECL(name, int32, int16, bool) \
823
+ CUDA_DEVICE_BINARY_DECL(name, int32, int32, bool) \
824
+ CUDA_DEVICE_BINARY_DECL(name, int32, int64, bool) \
825
+ CUDA_DEVICE_BINARY_DECL(name, int32, bfloat16, bool) \
826
+ CUDA_DEVICE_BINARY_DECL(name, int32, float16, bool) \
827
+ CUDA_DEVICE_BINARY_DECL(name, int32, float32, bool) \
828
+ CUDA_DEVICE_BINARY_DECL(name, int32, float64, bool) \
829
+ CUDA_DEVICE_NOIMPL_DECL(name, int32, complex32, bool) \
830
+ CUDA_DEVICE_BINARY_DECL(name, int32, complex64, bool) \
831
+ CUDA_DEVICE_BINARY_DECL(name, int32, complex128, bool) \
832
+ /* rows whose first type argument is int64: only integer partners are listed (no uint64) */ \
833
+ CUDA_DEVICE_BINARY_DECL(name, int64, uint8, bool) \
834
+ CUDA_DEVICE_BINARY_DECL(name, int64, uint16, bool) \
835
+ CUDA_DEVICE_BINARY_DECL(name, int64, uint32, bool) \
836
+ CUDA_DEVICE_BINARY_DECL(name, int64, int8, bool) \
837
+ CUDA_DEVICE_BINARY_DECL(name, int64, int16, bool) \
838
+ CUDA_DEVICE_BINARY_DECL(name, int64, int32, bool) \
839
+ CUDA_DEVICE_BINARY_DECL(name, int64, int64, bool) \
840
+ /* rows whose first type argument is bfloat16 (no 64-bit integer partners are listed) */ \
841
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, uint8, bool) \
842
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, uint16, bool) \
843
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, uint32, bool) \
844
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, int8, bool) \
845
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, int16, bool) \
846
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, int32, bool) \
847
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, bfloat16, bool) \
848
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, float16, bool) \
849
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, float32, bool) \
850
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, float64, bool) \
851
+ CUDA_DEVICE_NOIMPL_DECL(name, bfloat16, complex32, bool) \
852
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, complex64, bool) \
853
+ CUDA_DEVICE_BINARY_DECL(name, bfloat16, complex128, bool) \
854
+ /* rows whose first type argument is float16 */ \
855
+ CUDA_DEVICE_BINARY_DECL(name, float16, uint8, bool) \
856
+ CUDA_DEVICE_BINARY_DECL(name, float16, uint16, bool) \
857
+ CUDA_DEVICE_BINARY_DECL(name, float16, uint32, bool) \
858
+ CUDA_DEVICE_BINARY_DECL(name, float16, int8, bool) \
859
+ CUDA_DEVICE_BINARY_DECL(name, float16, int16, bool) \
860
+ CUDA_DEVICE_BINARY_DECL(name, float16, int32, bool) \
861
+ CUDA_DEVICE_BINARY_DECL(name, float16, bfloat16, bool) \
862
+ CUDA_DEVICE_BINARY_DECL(name, float16, float16, bool) \
863
+ CUDA_DEVICE_BINARY_DECL(name, float16, float32, bool) \
864
+ CUDA_DEVICE_BINARY_DECL(name, float16, float64, bool) \
865
+ CUDA_DEVICE_NOIMPL_DECL(name, float16, complex32, bool) \
866
+ CUDA_DEVICE_BINARY_DECL(name, float16, complex64, bool) \
867
+ CUDA_DEVICE_BINARY_DECL(name, float16, complex128, bool) \
868
+ /* rows whose first type argument is float32 */ \
869
+ CUDA_DEVICE_BINARY_DECL(name, float32, uint8, bool) \
870
+ CUDA_DEVICE_BINARY_DECL(name, float32, uint16, bool) \
871
+ CUDA_DEVICE_BINARY_DECL(name, float32, uint32, bool) \
872
+ CUDA_DEVICE_BINARY_DECL(name, float32, int8, bool) \
873
+ CUDA_DEVICE_BINARY_DECL(name, float32, int16, bool) \
874
+ CUDA_DEVICE_BINARY_DECL(name, float32, int32, bool) \
875
+ CUDA_DEVICE_BINARY_DECL(name, float32, bfloat16, bool) \
876
+ CUDA_DEVICE_BINARY_DECL(name, float32, float16, bool) \
877
+ CUDA_DEVICE_BINARY_DECL(name, float32, float32, bool) \
878
+ CUDA_DEVICE_BINARY_DECL(name, float32, float64, bool) \
879
+ CUDA_DEVICE_NOIMPL_DECL(name, float32, complex32, bool) \
880
+ CUDA_DEVICE_BINARY_DECL(name, float32, complex64, bool) \
881
+ CUDA_DEVICE_BINARY_DECL(name, float32, complex128, bool) \
882
+ /* rows whose first type argument is float64 */ \
883
+ CUDA_DEVICE_BINARY_DECL(name, float64, uint8, bool) \
884
+ CUDA_DEVICE_BINARY_DECL(name, float64, uint16, bool) \
885
+ CUDA_DEVICE_BINARY_DECL(name, float64, uint32, bool) \
886
+ CUDA_DEVICE_BINARY_DECL(name, float64, int8, bool) \
887
+ CUDA_DEVICE_BINARY_DECL(name, float64, int16, bool) \
888
+ CUDA_DEVICE_BINARY_DECL(name, float64, int32, bool) \
889
+ CUDA_DEVICE_BINARY_DECL(name, float64, bfloat16, bool) \
890
+ CUDA_DEVICE_BINARY_DECL(name, float64, float16, bool) \
891
+ CUDA_DEVICE_BINARY_DECL(name, float64, float32, bool) \
892
+ CUDA_DEVICE_BINARY_DECL(name, float64, float64, bool) \
893
+ CUDA_DEVICE_NOIMPL_DECL(name, float64, complex32, bool) \
894
+ CUDA_DEVICE_BINARY_DECL(name, float64, complex64, bool) \
895
+ CUDA_DEVICE_BINARY_DECL(name, float64, complex128, bool) \
896
+ /* rows whose first type argument is complex32: every entry uses CUDA_DEVICE_NOIMPL_DECL */ \
897
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, uint8, bool) \
898
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, uint16, bool) \
899
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, uint32, bool) \
900
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, int8, bool) \
901
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, int16, bool) \
902
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, int32, bool) \
903
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, bfloat16, bool) \
904
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, float16, bool) \
905
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, float32, bool) \
906
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, float64, bool) \
907
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, complex32, bool) \
908
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, complex64, bool) \
909
+ CUDA_DEVICE_NOIMPL_DECL(name, complex32, complex128, bool) \
910
+ /* rows whose first type argument is complex64 */ \
911
+ CUDA_DEVICE_BINARY_DECL(name, complex64, uint8, bool) \
912
+ CUDA_DEVICE_BINARY_DECL(name, complex64, uint16, bool) \
913
+ CUDA_DEVICE_BINARY_DECL(name, complex64, uint32, bool) \
914
+ CUDA_DEVICE_BINARY_DECL(name, complex64, int8, bool) \
915
+ CUDA_DEVICE_BINARY_DECL(name, complex64, int16, bool) \
916
+ CUDA_DEVICE_BINARY_DECL(name, complex64, int32, bool) \
917
+ CUDA_DEVICE_BINARY_DECL(name, complex64, bfloat16, bool) \
918
+ CUDA_DEVICE_BINARY_DECL(name, complex64, float16, bool) \
919
+ CUDA_DEVICE_BINARY_DECL(name, complex64, float32, bool) \
920
+ CUDA_DEVICE_BINARY_DECL(name, complex64, float64, bool) \
921
+ CUDA_DEVICE_NOIMPL_DECL(name, complex64, complex32, bool) \
922
+ CUDA_DEVICE_BINARY_DECL(name, complex64, complex64, bool) \
923
+ CUDA_DEVICE_BINARY_DECL(name, complex64, complex128, bool) \
924
+ /* rows whose first type argument is complex128 */ \
925
+ CUDA_DEVICE_BINARY_DECL(name, complex128, uint8, bool) \
926
+ CUDA_DEVICE_BINARY_DECL(name, complex128, uint16, bool) \
927
+ CUDA_DEVICE_BINARY_DECL(name, complex128, uint32, bool) \
928
+ CUDA_DEVICE_BINARY_DECL(name, complex128, int8, bool) \
929
+ CUDA_DEVICE_BINARY_DECL(name, complex128, int16, bool) \
930
+ CUDA_DEVICE_BINARY_DECL(name, complex128, int32, bool) \
931
+ CUDA_DEVICE_BINARY_DECL(name, complex128, bfloat16, bool) \
932
+ CUDA_DEVICE_BINARY_DECL(name, complex128, float16, bool) \
933
+ CUDA_DEVICE_BINARY_DECL(name, complex128, float32, bool) \
934
+ CUDA_DEVICE_BINARY_DECL(name, complex128, float64, bool) \
935
+ CUDA_DEVICE_NOIMPL_DECL(name, complex128, complex32, bool) \
936
+ CUDA_DEVICE_BINARY_DECL(name, complex128, complex64, bool) \
937
+ CUDA_DEVICE_BINARY_DECL(name, complex128, complex128, bool)
938
+
939
+
940
+ CUDA_DEVICE_ALL_COMPARISON_DECL(less) /* each line below expands the comparison table (bool as last type argument) for one function name */
941
+ CUDA_DEVICE_ALL_COMPARISON_DECL(less_equal)
942
+ CUDA_DEVICE_ALL_COMPARISON_DECL(greater_equal)
943
+ CUDA_DEVICE_ALL_COMPARISON_DECL(greater)
944
+ CUDA_DEVICE_ALL_COMPARISON_DECL(equal)
945
+ CUDA_DEVICE_ALL_COMPARISON_DECL(not_equal)
946
+ CUDA_DEVICE_ALL_COMPARISON_DECL(equaln) /* NOTE(review): `equaln` is presumably a NaN-aware equality -- confirm against the kernel source */
947
+
948
+
949
+ /*****************************************************************************/
950
+ /* Bitwise */
951
+ /*****************************************************************************/
952
+
953
+ #define CUDA_DEVICE_ALL_BITWISE_DECL(name) /* Declaration table for bitwise function `name`: only bool and integer type arguments appear; unsigned/signed pairings list a signed third type wide enough for both (e.g. uint8 with int8 lists int16). Rows are grouped by first type argument, starting with bool. */ \
954
+ CUDA_DEVICE_BINARY_DECL(name, bool, bool, bool) \
955
+ CUDA_DEVICE_BINARY_DECL(name, bool, uint8, uint8) \
956
+ CUDA_DEVICE_BINARY_DECL(name, bool, uint16, uint16) \
957
+ CUDA_DEVICE_BINARY_DECL(name, bool, uint32, uint32) \
958
+ CUDA_DEVICE_BINARY_DECL(name, bool, uint64, uint64) \
959
+ CUDA_DEVICE_BINARY_DECL(name, bool, int8, int8) \
960
+ CUDA_DEVICE_BINARY_DECL(name, bool, int16, int16) \
961
+ CUDA_DEVICE_BINARY_DECL(name, bool, int32, int32) \
962
+ CUDA_DEVICE_BINARY_DECL(name, bool, int64, int64) \
963
+ /* rows whose first type argument is uint8 */ \
964
+ CUDA_DEVICE_BINARY_DECL(name, uint8, bool, uint8) \
965
+ CUDA_DEVICE_BINARY_DECL(name, uint8, uint8, uint8) \
966
+ CUDA_DEVICE_BINARY_DECL(name, uint8, uint16, uint16) \
967
+ CUDA_DEVICE_BINARY_DECL(name, uint8, uint32, uint32) \
968
+ CUDA_DEVICE_BINARY_DECL(name, uint8, uint64, uint64) \
969
+ CUDA_DEVICE_BINARY_DECL(name, uint8, int8, int16) \
970
+ CUDA_DEVICE_BINARY_DECL(name, uint8, int16, int16) \
971
+ CUDA_DEVICE_BINARY_DECL(name, uint8, int32, int32) \
972
+ CUDA_DEVICE_BINARY_DECL(name, uint8, int64, int64) \
973
+ /* rows whose first type argument is uint16 */ \
974
+ CUDA_DEVICE_BINARY_DECL(name, uint16, bool, uint16) \
975
+ CUDA_DEVICE_BINARY_DECL(name, uint16, uint8, uint16) \
976
+ CUDA_DEVICE_BINARY_DECL(name, uint16, uint16, uint16) \
977
+ CUDA_DEVICE_BINARY_DECL(name, uint16, uint32, uint32) \
978
+ CUDA_DEVICE_BINARY_DECL(name, uint16, uint64, uint64) \
979
+ CUDA_DEVICE_BINARY_DECL(name, uint16, int8, int32) \
980
+ CUDA_DEVICE_BINARY_DECL(name, uint16, int16, int32) \
981
+ CUDA_DEVICE_BINARY_DECL(name, uint16, int32, int32) \
982
+ CUDA_DEVICE_BINARY_DECL(name, uint16, int64, int64) \
983
+ /* rows whose first type argument is uint32 */ \
984
+ CUDA_DEVICE_BINARY_DECL(name, uint32, bool, uint32) \
985
+ CUDA_DEVICE_BINARY_DECL(name, uint32, uint8, uint32) \
986
+ CUDA_DEVICE_BINARY_DECL(name, uint32, uint16, uint32) \
987
+ CUDA_DEVICE_BINARY_DECL(name, uint32, uint32, uint32) \
988
+ CUDA_DEVICE_BINARY_DECL(name, uint32, uint64, uint64) \
989
+ CUDA_DEVICE_BINARY_DECL(name, uint32, int8, int64) \
990
+ CUDA_DEVICE_BINARY_DECL(name, uint32, int16, int64) \
991
+ CUDA_DEVICE_BINARY_DECL(name, uint32, int32, int64) \
992
+ CUDA_DEVICE_BINARY_DECL(name, uint32, int64, int64) \
993
+ /* rows whose first type argument is uint64: only bool and unsigned partners are listed */ \
994
+ CUDA_DEVICE_BINARY_DECL(name, uint64, bool, uint64) \
995
+ CUDA_DEVICE_BINARY_DECL(name, uint64, uint8, uint64) \
996
+ CUDA_DEVICE_BINARY_DECL(name, uint64, uint16, uint64) \
997
+ CUDA_DEVICE_BINARY_DECL(name, uint64, uint32, uint64) \
998
+ CUDA_DEVICE_BINARY_DECL(name, uint64, uint64, uint64) \
999
+ /* rows whose first type argument is int8 (no uint64 partner is listed for signed types) */ \
1000
+ CUDA_DEVICE_BINARY_DECL(name, int8, bool, int8) \
1001
+ CUDA_DEVICE_BINARY_DECL(name, int8, uint8, int16) \
1002
+ CUDA_DEVICE_BINARY_DECL(name, int8, uint16, int32) \
1003
+ CUDA_DEVICE_BINARY_DECL(name, int8, uint32, int64) \
1004
+ CUDA_DEVICE_BINARY_DECL(name, int8, int8, int8) \
1005
+ CUDA_DEVICE_BINARY_DECL(name, int8, int16, int16) \
1006
+ CUDA_DEVICE_BINARY_DECL(name, int8, int32, int32) \
1007
+ CUDA_DEVICE_BINARY_DECL(name, int8, int64, int64) \
1008
+ /* rows whose first type argument is int16 */ \
1009
+ CUDA_DEVICE_BINARY_DECL(name, int16, bool, int16) \
1010
+ CUDA_DEVICE_BINARY_DECL(name, int16, uint8, int16) \
1011
+ CUDA_DEVICE_BINARY_DECL(name, int16, uint16, int32) \
1012
+ CUDA_DEVICE_BINARY_DECL(name, int16, uint32, int64) \
1013
+ CUDA_DEVICE_BINARY_DECL(name, int16, int8, int16) \
1014
+ CUDA_DEVICE_BINARY_DECL(name, int16, int16, int16) \
1015
+ CUDA_DEVICE_BINARY_DECL(name, int16, int32, int32) \
1016
+ CUDA_DEVICE_BINARY_DECL(name, int16, int64, int64) \
1017
+ /* rows whose first type argument is int32 */ \
1018
+ CUDA_DEVICE_BINARY_DECL(name, int32, bool, int32) \
1019
+ CUDA_DEVICE_BINARY_DECL(name, int32, uint8, int32) \
1020
+ CUDA_DEVICE_BINARY_DECL(name, int32, uint16, int32) \
1021
+ CUDA_DEVICE_BINARY_DECL(name, int32, uint32, int64) \
1022
+ CUDA_DEVICE_BINARY_DECL(name, int32, int8, int32) \
1023
+ CUDA_DEVICE_BINARY_DECL(name, int32, int16, int32) \
1024
+ CUDA_DEVICE_BINARY_DECL(name, int32, int32, int32) \
1025
+ CUDA_DEVICE_BINARY_DECL(name, int32, int64, int64) \
1026
+ /* rows whose first type argument is int64 */ \
1027
+ CUDA_DEVICE_BINARY_DECL(name, int64, bool, int64) \
1028
+ CUDA_DEVICE_BINARY_DECL(name, int64, uint8, int64) \
1029
+ CUDA_DEVICE_BINARY_DECL(name, int64, uint16, int64) \
1030
+ CUDA_DEVICE_BINARY_DECL(name, int64, uint32, int64) \
1031
+ CUDA_DEVICE_BINARY_DECL(name, int64, int8, int64) \
1032
+ CUDA_DEVICE_BINARY_DECL(name, int64, int16, int64) \
1033
+ CUDA_DEVICE_BINARY_DECL(name, int64, int32, int64) \
1034
+ CUDA_DEVICE_BINARY_DECL(name, int64, int64, int64)
1035
+
1036
+ CUDA_DEVICE_ALL_BITWISE_DECL(bitwise_and) /* one expansion of the bitwise declaration macro per operation: and, or, xor */
1037
+ CUDA_DEVICE_ALL_BITWISE_DECL(bitwise_or)
1038
+ CUDA_DEVICE_ALL_BITWISE_DECL(bitwise_xor)
1039
+
1040
+
1041
+ /*****************************************************************************/
1042
+ /* Two return values */
1043
+ /*****************************************************************************/
1044
+ /*
+  * CUDA_DEVICE_ALL_BINARY_MV_DECL(name): expands to one
+  * CUDA_DEVICE_BINARY_MV_DECL(name, T, T, T, T) entry for each of the eleven
+  * element types listed below (unsigned and signed 8/16/32/64-bit integers,
+  * bfloat16, float32 and float64). Every entry repeats the same type in all
+  * four type positions, so no mixed-type combinations are declared here.
+  *
+  * NOTE(review): CUDA_DEVICE_BINARY_MV_DECL is defined outside this section;
+  * going by the "Two return values" banner, the four types are presumably two
+  * inputs followed by two outputs -- confirm against its definition.
+  */
1044
+ #define CUDA_DEVICE_ALL_BINARY_MV_DECL(name) \
1045
+ CUDA_DEVICE_BINARY_MV_DECL(name, uint8, uint8, uint8, uint8) \
1046
+ CUDA_DEVICE_BINARY_MV_DECL(name, uint16, uint16, uint16, uint16) \
1047
+ CUDA_DEVICE_BINARY_MV_DECL(name, uint32, uint32, uint32, uint32) \
1048
+ CUDA_DEVICE_BINARY_MV_DECL(name, uint64, uint64, uint64, uint64) \
1049
+ CUDA_DEVICE_BINARY_MV_DECL(name, int8, int8, int8, int8) \
1050
+ CUDA_DEVICE_BINARY_MV_DECL(name, int16, int16, int16, int16) \
1051
+ CUDA_DEVICE_BINARY_MV_DECL(name, int32, int32, int32, int32) \
1052
+ CUDA_DEVICE_BINARY_MV_DECL(name, int64, int64, int64, int64) \
1053
+ CUDA_DEVICE_BINARY_MV_DECL(name, bfloat16, bfloat16, bfloat16, bfloat16) \
1054
+ CUDA_DEVICE_BINARY_MV_DECL(name, float32, float32, float32, float32) \
1055
+ CUDA_DEVICE_BINARY_MV_DECL(name, float64, float64, float64, float64)
1057
+
1058
+ CUDA_DEVICE_ALL_BINARY_MV_DECL(divmod) /* emits the divmod entry for every type row of the macro above */
1059
+
1060
+
1061
+ #endif /* CUDA_DEVICE_BINARY_H */