gumath 0.2.0dev5 → 0.2.0dev8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (99) hide show
  1. checksums.yaml +4 -4
  2. data/CONTRIBUTING.md +7 -2
  3. data/Gemfile +0 -3
  4. data/ext/ruby_gumath/GPATH +0 -0
  5. data/ext/ruby_gumath/GRTAGS +0 -0
  6. data/ext/ruby_gumath/GTAGS +0 -0
  7. data/ext/ruby_gumath/extconf.rb +0 -5
  8. data/ext/ruby_gumath/functions.c +10 -2
  9. data/ext/ruby_gumath/gufunc_object.c +15 -4
  10. data/ext/ruby_gumath/gufunc_object.h +9 -3
  11. data/ext/ruby_gumath/gumath/Makefile +63 -0
  12. data/ext/ruby_gumath/gumath/Makefile.in +1 -0
  13. data/ext/ruby_gumath/gumath/config.h +56 -0
  14. data/ext/ruby_gumath/gumath/config.h.in +3 -0
  15. data/ext/ruby_gumath/gumath/config.log +497 -0
  16. data/ext/ruby_gumath/gumath/config.status +1034 -0
  17. data/ext/ruby_gumath/gumath/configure +375 -4
  18. data/ext/ruby_gumath/gumath/configure.ac +47 -3
  19. data/ext/ruby_gumath/gumath/libgumath/Makefile +236 -0
  20. data/ext/ruby_gumath/gumath/libgumath/Makefile.in +90 -24
  21. data/ext/ruby_gumath/gumath/libgumath/Makefile.vc +54 -15
  22. data/ext/ruby_gumath/gumath/libgumath/apply.c +92 -28
  23. data/ext/ruby_gumath/gumath/libgumath/apply.o +0 -0
  24. data/ext/ruby_gumath/gumath/libgumath/common.o +0 -0
  25. data/ext/ruby_gumath/gumath/libgumath/cpu_device_binary.o +0 -0
  26. data/ext/ruby_gumath/gumath/libgumath/cpu_device_unary.o +0 -0
  27. data/ext/ruby_gumath/gumath/libgumath/cpu_host_binary.o +0 -0
  28. data/ext/ruby_gumath/gumath/libgumath/cpu_host_unary.o +0 -0
  29. data/ext/ruby_gumath/gumath/libgumath/examples.o +0 -0
  30. data/ext/ruby_gumath/gumath/libgumath/extending/graph.c +27 -20
  31. data/ext/ruby_gumath/gumath/libgumath/extending/pdist.c +1 -1
  32. data/ext/ruby_gumath/gumath/libgumath/func.c +13 -9
  33. data/ext/ruby_gumath/gumath/libgumath/func.o +0 -0
  34. data/ext/ruby_gumath/gumath/libgumath/graph.o +0 -0
  35. data/ext/ruby_gumath/gumath/libgumath/gumath.h +55 -14
  36. data/ext/ruby_gumath/gumath/libgumath/kernels/common.c +513 -0
  37. data/ext/ruby_gumath/gumath/libgumath/kernels/common.h +155 -0
  38. data/ext/ruby_gumath/gumath/libgumath/kernels/contrib/bfloat16.h +520 -0
  39. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.cc +1123 -0
  40. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.h +1062 -0
  41. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_msvc.cc +555 -0
  42. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.cc +368 -0
  43. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.h +335 -0
  44. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_binary.c +2952 -0
  45. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_unary.c +1100 -0
  46. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.cu +1143 -0
  47. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.h +1061 -0
  48. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.cu +528 -0
  49. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.h +463 -0
  50. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_binary.c +2817 -0
  51. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_unary.c +1331 -0
  52. data/ext/ruby_gumath/gumath/libgumath/kernels/device.hh +614 -0
  53. data/ext/ruby_gumath/gumath/libgumath/libgumath.a +0 -0
  54. data/ext/ruby_gumath/gumath/libgumath/libgumath.so +1 -0
  55. data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0 +1 -0
  56. data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0.2.0dev3 +0 -0
  57. data/ext/ruby_gumath/gumath/libgumath/nploops.o +0 -0
  58. data/ext/ruby_gumath/gumath/libgumath/pdist.o +0 -0
  59. data/ext/ruby_gumath/gumath/libgumath/quaternion.o +0 -0
  60. data/ext/ruby_gumath/gumath/libgumath/tbl.o +0 -0
  61. data/ext/ruby_gumath/gumath/libgumath/thread.c +17 -4
  62. data/ext/ruby_gumath/gumath/libgumath/thread.o +0 -0
  63. data/ext/ruby_gumath/gumath/libgumath/xndloops.c +110 -0
  64. data/ext/ruby_gumath/gumath/libgumath/xndloops.o +0 -0
  65. data/ext/ruby_gumath/gumath/python/gumath/__init__.py +150 -0
  66. data/ext/ruby_gumath/gumath/python/gumath/_gumath.c +446 -80
  67. data/ext/ruby_gumath/gumath/python/gumath/cuda.c +78 -0
  68. data/ext/ruby_gumath/gumath/python/gumath/examples.c +0 -5
  69. data/ext/ruby_gumath/gumath/python/gumath/functions.c +2 -2
  70. data/ext/ruby_gumath/gumath/python/gumath/gumath.h +246 -0
  71. data/ext/ruby_gumath/gumath/python/gumath/libgumath.a +0 -0
  72. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so +1 -0
  73. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0 +1 -0
  74. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0.2.0dev3 +0 -0
  75. data/ext/ruby_gumath/gumath/python/gumath/pygumath.h +31 -2
  76. data/ext/ruby_gumath/gumath/python/gumath_aux.py +767 -0
  77. data/ext/ruby_gumath/gumath/python/randdec.py +535 -0
  78. data/ext/ruby_gumath/gumath/python/randfloat.py +177 -0
  79. data/ext/ruby_gumath/gumath/python/test_gumath.py +1504 -24
  80. data/ext/ruby_gumath/gumath/python/test_xndarray.py +462 -0
  81. data/ext/ruby_gumath/gumath/setup.py +67 -6
  82. data/ext/ruby_gumath/gumath/tools/detect_cuda_arch.cc +35 -0
  83. data/ext/ruby_gumath/include/gumath.h +55 -14
  84. data/ext/ruby_gumath/include/ruby_gumath.h +4 -1
  85. data/ext/ruby_gumath/lib/libgumath.a +0 -0
  86. data/ext/ruby_gumath/lib/libgumath.so.0.2.0dev3 +0 -0
  87. data/ext/ruby_gumath/ruby_gumath.c +231 -70
  88. data/ext/ruby_gumath/ruby_gumath.h +4 -1
  89. data/ext/ruby_gumath/ruby_gumath_internal.h +25 -0
  90. data/ext/ruby_gumath/util.c +34 -0
  91. data/ext/ruby_gumath/util.h +9 -0
  92. data/gumath.gemspec +3 -2
  93. data/lib/gumath.rb +55 -1
  94. data/lib/gumath/version.rb +2 -2
  95. data/lib/ruby_gumath.so +0 -0
  96. metadata +63 -10
  97. data/ext/ruby_gumath/gumath/libgumath/extending/bfloat16.c +0 -130
  98. data/ext/ruby_gumath/gumath/libgumath/kernels/binary.c +0 -547
  99. data/ext/ruby_gumath/gumath/libgumath/kernels/unary.c +0 -449
@@ -0,0 +1,1123 @@
1
+ /*
2
+ * BSD 3-Clause License
3
+ *
4
+ * Copyright (c) 2017-2018, plures
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ *
10
+ * 1. Redistributions of source code must retain the above copyright notice,
11
+ * this list of conditions and the following disclaimer.
12
+ *
13
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
14
+ * this list of conditions and the following disclaimer in the documentation
15
+ * and/or other materials provided with the distribution.
16
+ *
17
+ * 3. Neither the name of the copyright holder nor the names of its
18
+ * contributors may be used to endorse or promote products derived from
19
+ * this software without specific prior written permission.
20
+ *
21
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ */
32
+
33
+
34
+ #include <cinttypes>
35
+ #include <complex>
36
+ #include <cmath>
37
+ #include "contrib/bfloat16.h"
38
+ #include "cpu_device_binary.h"
39
+ #include "device.hh"
40
+
41
+
42
+ /*****************************************************************************/
43
+ /* CPU device binary kernels */
44
+ /*****************************************************************************/
45
+
46
/*
 * CPU_DEVICE_BINARY(name, func, t0, t1, t2, common)
 *
 * Expands to three extern "C" kernels for the binary operation `name`, with
 * input element types t0##_t and t1##_t and output element type t2##_t:
 *
 *   gm_cpu_device_fixed_1D_C_<name>_<t0>_<t1>_<t2>(a0, a1, a2, N)
 *       Processes N elements, indexing all three arrays directly by i. The
 *       first loop handles eight elements per iteration while at least eight
 *       remain; the second loop handles whatever is left.
 *
 *   gm_cpu_device_fixed_1D_S_<name>_<t0>_<t1>_<t2>(a0, a1, a2, s0, s1, s2, N)
 *       Processes N elements, advancing the index into each array by its own
 *       step (s0, s1, s2). The steps are applied to the typed pointers, so
 *       they count elements, not bytes.
 *
 *   gm_cpu_device_0D_<name>_<t0>_<t1>_<t2>(a0, a1, a2)
 *       Processes a single element.
 *
 * In every kernel both operands are cast to common##_t before func is called,
 * and the result is assigned to a t2##_t element. The buffers arrive as
 * char pointers and are reinterpreted as the element types.
 * NOTE(review): this assumes callers pass buffers that really hold (and are
 * aligned for) those element types -- nothing here checks it.
 */
+ #define CPU_DEVICE_BINARY(name, func, t0, t1, t2, common) \
47
+ extern "C" void \
48
+ gm_cpu_device_fixed_1D_C_##name##_##t0##_##t1##_##t2( \
49
+ const char *a0, const char *a1, char *a2, \
50
+ const int64_t N) \
51
+ { \
52
+ const t0##_t *x0 = (const t0##_t *)a0; \
53
+ const t1##_t *x1 = (const t1##_t *)a1; \
54
+ t2##_t *x2 = (t2##_t *)a2; \
55
+ int64_t i; \
56
+ \
57
+ for (i = 0; i < N-7; i += 8) { \
58
+ x2[i] = func((common##_t)x0[i], (common##_t)x1[i]); \
59
+ x2[i+1] = func((common##_t)x0[i+1], (common##_t)x1[i+1]); \
60
+ x2[i+2] = func((common##_t)x0[i+2], (common##_t)x1[i+2]); \
61
+ x2[i+3] = func((common##_t)x0[i+3], (common##_t)x1[i+3]); \
62
+ x2[i+4] = func((common##_t)x0[i+4], (common##_t)x1[i+4]); \
63
+ x2[i+5] = func((common##_t)x0[i+5], (common##_t)x1[i+5]); \
64
+ x2[i+6] = func((common##_t)x0[i+6], (common##_t)x1[i+6]); \
65
+ x2[i+7] = func((common##_t)x0[i+7], (common##_t)x1[i+7]); \
66
+ } \
67
+ for (; i < N; i++) { \
68
+ x2[i] = func((common##_t)x0[i], (common##_t)x1[i]); \
69
+ } \
70
+ } \
71
+ \
72
+ extern "C" void \
73
+ gm_cpu_device_fixed_1D_S_##name##_##t0##_##t1##_##t2( \
74
+ const char *a0, const char *a1, char *a2, \
75
+ const int64_t s0, const int64_t s1, const int64_t s2, \
76
+ const int64_t N) \
77
+ { \
78
+ const t0##_t *x0 = (const t0##_t *)a0; \
79
+ const t1##_t *x1 = (const t1##_t *)a1; \
80
+ t2##_t *x2 = (t2##_t *)a2; \
81
+ int64_t i, k0, k1, k2; \
82
+ \
83
+ for (i=0, k0=0, k1=0, k2=0; i < N; i++, k0+=s0, k1+=s1, k2+=s2) { \
84
+ x2[k2] = func((common##_t)x0[k0], (common##_t)x1[k1]); \
85
+ } \
86
+ } \
87
+ \
88
+ extern "C" void \
89
+ gm_cpu_device_0D_##name##_##t0##_##t1##_##t2( \
90
+ const char *a0, const char *a1, char *a2) \
91
+ { \
92
+ const t0##_t x0 = *(const t0##_t *)a0; \
93
+ const t1##_t x1 = *(const t1##_t *)a1; \
94
+ t2##_t *x2 = (t2##_t *)a2; \
95
+ *x2 = func((common##_t)x0, (common##_t)x1); \
96
+ }
97
+
98
/*
 * CPU_DEVICE_BINARYC takes the same arguments as CPU_DEVICE_BINARY.
 * When _MSC_VER is defined it expands to nothing, so no kernel is emitted;
 * otherwise it forwards unchanged to CPU_DEVICE_BINARY.
 * NOTE(review): presumably the kernels suppressed here are supplied by a
 * separate MSVC-specific translation unit -- confirm against the build.
 */
+ #ifdef _MSC_VER
99
+ #define CPU_DEVICE_BINARYC(name, func, t0, t1, t2, common)
100
+ #else
101
+ #define CPU_DEVICE_BINARYC(name, func, t0, t1, t2, common) \
102
+ CPU_DEVICE_BINARY(name, func, t0, t1, t2, common)
103
+ #endif
104
+
105
/*
 * Placeholder rows for the signature tables. Both macros accept the same
 * arguments as CPU_DEVICE_BINARY and expand to nothing, so a row written
 * with either of them emits no kernel.
 * NOTE(review): the two names presumably distinguish "not implemented yet"
 * (NOIMPL) from "no kernel exists for this signature" (NOKERN) -- confirm.
 */
+ #define CPU_DEVICE_NOIMPL(name, func, t0, t1, t2, common)
106
+ #define CPU_DEVICE_NOKERN(name, func, t0, t1, t2, common)
107
+
108
+
109
+ /*****************************************************************************/
110
+ /* Arithmetic */
111
+ /*****************************************************************************/
112
+
113
/*
 * CPU_DEVICE_ALL_BINARY(name, func, hfunc)
 *
 * Signature table for the binary operation `name`. Each row has the form
 * (t0, t1, t2, common): input types t0 and t1, output type t2, and the type
 * both operands are cast to before func is applied.
 *
 *   - CPU_DEVICE_BINARY rows always emit kernels.
 *   - CPU_DEVICE_BINARYC rows emit kernels unless _MSC_VER is defined.
 *   - CPU_DEVICE_NOIMPL rows expand to nothing. Every row involving float16
 *     or complex32 is of this kind, so no kernel is emitted for those types.
 *   - hfunc is passed only to NOIMPL rows and therefore never appears in
 *     the expansion of this macro.
 *   - uint64 is paired only with unsigned integer types, and int64 only with
 *     integer types other than uint64; no other uint64/int64 rows are listed.
 *
 * NOTE(review): the (bfloat16, complex32) row lists complex32 as the output
 * type but complex64 as the common type, whereas the mirrored
 * (complex32, bfloat16) row uses complex64 for both. The row is NOIMPL, so
 * nothing is emitted today, but the signature should be reconciled before a
 * kernel is implemented for it.
 *
 * NOTE(review): the bfloat16/complex64 and bfloat16/complex128 rows (in both
 * operand orders) use CPU_DEVICE_BINARY, while every other complex64 and
 * complex128 row uses CPU_DEVICE_BINARYC. As a result these four signatures
 * are still emitted when _MSC_VER is defined -- confirm this is intended.
 */
+ #define CPU_DEVICE_ALL_BINARY(name, func, hfunc) \
114
+ CPU_DEVICE_BINARY(name, func, uint8, uint8, uint8, uint8) \
115
+ CPU_DEVICE_BINARY(name, func, uint8, uint16, uint16, uint16) \
116
+ CPU_DEVICE_BINARY(name, func, uint8, uint32, uint32, uint32) \
117
+ CPU_DEVICE_BINARY(name, func, uint8, uint64, uint64, uint64) \
118
+ CPU_DEVICE_BINARY(name, func, uint8, int8, int16, int16) \
119
+ CPU_DEVICE_BINARY(name, func, uint8, int16, int16, int16) \
120
+ CPU_DEVICE_BINARY(name, func, uint8, int32, int32, int32) \
121
+ CPU_DEVICE_BINARY(name, func, uint8, int64, int64, int64) \
122
+ CPU_DEVICE_BINARY(name, func, uint8, bfloat16, bfloat16, bfloat16) \
123
+ CPU_DEVICE_NOIMPL(name, hfunc, uint8, float16, float16, float16) \
124
+ CPU_DEVICE_BINARY(name, func, uint8, float32, float32, float32) \
125
+ CPU_DEVICE_BINARY(name, func, uint8, float64, float64, float64) \
126
+ CPU_DEVICE_NOIMPL(name, func, uint8, complex32, complex32, complex32) \
127
+ CPU_DEVICE_BINARYC(name, func, uint8, complex64, complex64, complex64) \
128
+ CPU_DEVICE_BINARYC(name, func, uint8, complex128, complex128, complex128) \
129
+ \
130
+ CPU_DEVICE_BINARY(name, func, uint16, uint8, uint16, uint16) \
131
+ CPU_DEVICE_BINARY(name, func, uint16, uint16, uint16, uint16) \
132
+ CPU_DEVICE_BINARY(name, func, uint16, uint32, uint32, uint32) \
133
+ CPU_DEVICE_BINARY(name, func, uint16, uint64, uint64, uint64) \
134
+ CPU_DEVICE_BINARY(name, func, uint16, int8, int32, int32) \
135
+ CPU_DEVICE_BINARY(name, func, uint16, int16, int32, int32) \
136
+ CPU_DEVICE_BINARY(name, func, uint16, int32, int32, int32) \
137
+ CPU_DEVICE_BINARY(name, func, uint16, int64, int64, int64) \
138
+ CPU_DEVICE_BINARY(name, func, uint16, bfloat16, float32, float32) \
139
+ CPU_DEVICE_NOIMPL(name, func, uint16, float16, float32, float32) \
140
+ CPU_DEVICE_BINARY(name, func, uint16, float32, float32, float32) \
141
+ CPU_DEVICE_BINARY(name, func, uint16, float64, float64, float64) \
142
+ CPU_DEVICE_NOIMPL(name, func, uint16, complex32, complex64, complex64) \
143
+ CPU_DEVICE_BINARYC(name, func, uint16, complex64, complex64, complex64) \
144
+ CPU_DEVICE_BINARYC(name, func, uint16, complex128, complex128, complex128) \
145
+ \
146
+ CPU_DEVICE_BINARY(name, func, uint32, uint8, uint32, uint32) \
147
+ CPU_DEVICE_BINARY(name, func, uint32, uint16, uint32, uint32) \
148
+ CPU_DEVICE_BINARY(name, func, uint32, uint32, uint32, uint32) \
149
+ CPU_DEVICE_BINARY(name, func, uint32, uint64, uint64, uint64) \
150
+ CPU_DEVICE_BINARY(name, func, uint32, int8, int64, int64) \
151
+ CPU_DEVICE_BINARY(name, func, uint32, int16, int64, int64) \
152
+ CPU_DEVICE_BINARY(name, func, uint32, int32, int64, int64) \
153
+ CPU_DEVICE_BINARY(name, func, uint32, int64, int64, int64) \
154
+ CPU_DEVICE_BINARY(name, func, uint32, bfloat16, float64, float64) \
155
+ CPU_DEVICE_NOIMPL(name, func, uint32, float16, float64, float64) \
156
+ CPU_DEVICE_BINARY(name, func, uint32, float32, float64, float64) \
157
+ CPU_DEVICE_BINARY(name, func, uint32, float64, float64, float64) \
158
+ CPU_DEVICE_NOIMPL(name, func, uint32, complex32, complex128, complex128) \
159
+ CPU_DEVICE_BINARYC(name, func, uint32, complex64, complex128, complex128) \
160
+ CPU_DEVICE_BINARYC(name, func, uint32, complex128, complex128, complex128) \
161
+ \
162
+ CPU_DEVICE_BINARY(name, func, uint64, uint8, uint64, uint64) \
163
+ CPU_DEVICE_BINARY(name, func, uint64, uint16, uint64, uint64) \
164
+ CPU_DEVICE_BINARY(name, func, uint64, uint32, uint64, uint64) \
165
+ CPU_DEVICE_BINARY(name, func, uint64, uint64, uint64, uint64) \
166
+ \
167
+ CPU_DEVICE_BINARY(name, func, int8, uint8, int16, int16) \
168
+ CPU_DEVICE_BINARY(name, func, int8, uint16, int32, int32) \
169
+ CPU_DEVICE_BINARY(name, func, int8, uint32, int64, int64) \
170
+ CPU_DEVICE_BINARY(name, func, int8, int8, int8, int8) \
171
+ CPU_DEVICE_BINARY(name, func, int8, int16, int16, int16) \
172
+ CPU_DEVICE_BINARY(name, func, int8, int32, int32, int32) \
173
+ CPU_DEVICE_BINARY(name, func, int8, int64, int64, int64) \
174
+ CPU_DEVICE_BINARY(name, func, int8, bfloat16, bfloat16, bfloat16) \
175
+ CPU_DEVICE_NOIMPL(name, hfunc, int8, float16, float16, float16) \
176
+ CPU_DEVICE_BINARY(name, func, int8, float32, float32, float32) \
177
+ CPU_DEVICE_BINARY(name, func, int8, float64, float64, float64) \
178
+ CPU_DEVICE_NOIMPL(name, func, int8, complex32, complex32, complex32) \
179
+ CPU_DEVICE_BINARYC(name, func, int8, complex64, complex64, complex64) \
180
+ CPU_DEVICE_BINARYC(name, func, int8, complex128, complex128, complex128) \
181
+ \
182
+ CPU_DEVICE_BINARY(name, func, int16, uint8, int16, int16) \
183
+ CPU_DEVICE_BINARY(name, func, int16, uint16, int32, int32) \
184
+ CPU_DEVICE_BINARY(name, func, int16, uint32, int64, int64) \
185
+ CPU_DEVICE_BINARY(name, func, int16, int8, int16, int16) \
186
+ CPU_DEVICE_BINARY(name, func, int16, int16, int16, int16) \
187
+ CPU_DEVICE_BINARY(name, func, int16, int32, int32, int32) \
188
+ CPU_DEVICE_BINARY(name, func, int16, int64, int64, int64) \
189
+ CPU_DEVICE_BINARY(name, func, int16, bfloat16, float32, float32) \
190
+ CPU_DEVICE_NOIMPL(name, func, int16, float16, float32, float32) \
191
+ CPU_DEVICE_BINARY(name, func, int16, float32, float32, float32) \
192
+ CPU_DEVICE_BINARY(name, func, int16, float64, float64, float64) \
193
+ CPU_DEVICE_NOIMPL(name, func, int16, complex32, complex64, complex64) \
194
+ CPU_DEVICE_BINARYC(name, func, int16, complex64, complex64, complex64) \
195
+ CPU_DEVICE_BINARYC(name, func, int16, complex128, complex128, complex128) \
196
+ \
197
+ CPU_DEVICE_BINARY(name, func, int32, uint8, int32, int32) \
198
+ CPU_DEVICE_BINARY(name, func, int32, uint16, int32, int32) \
199
+ CPU_DEVICE_BINARY(name, func, int32, uint32, int64, int64) \
200
+ CPU_DEVICE_BINARY(name, func, int32, int8, int32, int32) \
201
+ CPU_DEVICE_BINARY(name, func, int32, int16, int32, int32) \
202
+ CPU_DEVICE_BINARY(name, func, int32, int32, int32, int32) \
203
+ CPU_DEVICE_BINARY(name, func, int32, int64, int64, int64) \
204
+ CPU_DEVICE_BINARY(name, func, int32, bfloat16, float64, float64) \
205
+ CPU_DEVICE_NOIMPL(name, func, int32, float16, float64, float64) \
206
+ CPU_DEVICE_BINARY(name, func, int32, float32, float64, float64) \
207
+ CPU_DEVICE_BINARY(name, func, int32, float64, float64, float64) \
208
+ CPU_DEVICE_NOIMPL(name, func, int32, complex32, complex128, complex128) \
209
+ CPU_DEVICE_BINARYC(name, func, int32, complex64, complex128, complex128) \
210
+ CPU_DEVICE_BINARYC(name, func, int32, complex128, complex128, complex128) \
211
+ \
212
+ CPU_DEVICE_BINARY(name, func, int64, uint8, int64, int64) \
213
+ CPU_DEVICE_BINARY(name, func, int64, uint16, int64, int64) \
214
+ CPU_DEVICE_BINARY(name, func, int64, uint32, int64, int64) \
215
+ CPU_DEVICE_BINARY(name, func, int64, int8, int64, int64) \
216
+ CPU_DEVICE_BINARY(name, func, int64, int16, int64, int64) \
217
+ CPU_DEVICE_BINARY(name, func, int64, int32, int64, int64) \
218
+ CPU_DEVICE_BINARY(name, func, int64, int64, int64, int64) \
219
+ \
220
+ CPU_DEVICE_BINARY(name, func, bfloat16, uint8, bfloat16, bfloat16) \
221
+ CPU_DEVICE_BINARY(name, func, bfloat16, uint16, float32, float32) \
222
+ CPU_DEVICE_BINARY(name, func, bfloat16, uint32, float64, float64) \
223
+ CPU_DEVICE_BINARY(name, func, bfloat16, int8, bfloat16, bfloat16) \
224
+ CPU_DEVICE_BINARY(name, func, bfloat16, int16, float32, float32) \
225
+ CPU_DEVICE_BINARY(name, func, bfloat16, int32, float64, float64) \
226
+ CPU_DEVICE_BINARY(name, func, bfloat16, bfloat16, bfloat16, bfloat16) \
227
+ CPU_DEVICE_NOIMPL(name, func, bfloat16, float16, float32, float32) \
228
+ CPU_DEVICE_BINARY(name, func, bfloat16, float32, float32, float32) \
229
+ CPU_DEVICE_BINARY(name, func, bfloat16, float64, float64, float64) \
230
+ CPU_DEVICE_NOIMPL(name, func, bfloat16, complex32, complex32, complex64) \
231
+ CPU_DEVICE_BINARY(name, func, bfloat16, complex64, complex64, complex64) \
232
+ CPU_DEVICE_BINARY(name, func, bfloat16, complex128, complex128, complex128) \
233
+ \
234
+ CPU_DEVICE_NOIMPL(name, hfunc, float16, uint8, float16, float16) \
235
+ CPU_DEVICE_NOIMPL(name, func, float16, uint16, float32, float32) \
236
+ CPU_DEVICE_NOIMPL(name, func, float16, uint32, float64, float64) \
237
+ CPU_DEVICE_NOIMPL(name, hfunc, float16, int8, float16, float16) \
238
+ CPU_DEVICE_NOIMPL(name, func, float16, int16, float32, float32) \
239
+ CPU_DEVICE_NOIMPL(name, func, float16, int32, float64, float64) \
240
+ CPU_DEVICE_NOIMPL(name, func, float16, bfloat16, float32, float32) \
241
+ CPU_DEVICE_NOIMPL(name, hfunc, float16, float16, float16, float16) \
242
+ CPU_DEVICE_NOIMPL(name, func, float16, float32, float32, float32) \
243
+ CPU_DEVICE_NOIMPL(name, func, float16, float64, float64, float64) \
244
+ CPU_DEVICE_NOIMPL(name, func, float16, complex32, complex32, complex32) \
245
+ CPU_DEVICE_NOIMPL(name, func, float16, complex64, complex64, complex64) \
246
+ CPU_DEVICE_NOIMPL(name, func, float16, complex128, complex128, complex128) \
247
+ \
248
+ CPU_DEVICE_BINARY(name, func, float32, uint8, float32, float32) \
249
+ CPU_DEVICE_BINARY(name, func, float32, uint16, float32, float32) \
250
+ CPU_DEVICE_BINARY(name, func, float32, uint32, float64, float64) \
251
+ CPU_DEVICE_BINARY(name, func, float32, int8, float32, float32) \
252
+ CPU_DEVICE_BINARY(name, func, float32, int16, float32, float32) \
253
+ CPU_DEVICE_BINARY(name, func, float32, int32, float64, float64) \
254
+ CPU_DEVICE_BINARY(name, func, float32, bfloat16, float32, float32) \
255
+ CPU_DEVICE_NOIMPL(name, func, float32, float16, float32, float32) \
256
+ CPU_DEVICE_BINARY(name, func, float32, float32, float32, float32) \
257
+ CPU_DEVICE_BINARY(name, func, float32, float64, float64, float64) \
258
+ CPU_DEVICE_NOIMPL(name, func, float32, complex32, complex64, complex64) \
259
+ CPU_DEVICE_BINARYC(name, func, float32, complex64, complex64, complex64) \
260
+ CPU_DEVICE_BINARYC(name, func, float32, complex128, complex128, complex128) \
261
+ \
262
+ CPU_DEVICE_BINARY(name, func, float64, uint8, float64, float64) \
263
+ CPU_DEVICE_BINARY(name, func, float64, uint16, float64, float64) \
264
+ CPU_DEVICE_BINARY(name, func, float64, uint32, float64, float64) \
265
+ CPU_DEVICE_BINARY(name, func, float64, int8, float64, float64) \
266
+ CPU_DEVICE_BINARY(name, func, float64, int16, float64, float64) \
267
+ CPU_DEVICE_BINARY(name, func, float64, int32, float64, float64) \
268
+ CPU_DEVICE_BINARY(name, func, float64, bfloat16, float64, float64) \
269
+ CPU_DEVICE_NOIMPL(name, func, float64, float16, float64, float64) \
270
+ CPU_DEVICE_BINARY(name, func, float64, float32, float64, float64) \
271
+ CPU_DEVICE_BINARY(name, func, float64, float64, float64, float64) \
272
+ CPU_DEVICE_NOIMPL(name, func, float64, complex32, complex128, complex128) \
273
+ CPU_DEVICE_BINARYC(name, func, float64, complex64, complex128, complex128) \
274
+ CPU_DEVICE_BINARYC(name, func, float64, complex128, complex128, complex128) \
275
+ \
276
+ CPU_DEVICE_NOIMPL(name, func, complex32, uint8, complex32, complex32) \
277
+ CPU_DEVICE_NOIMPL(name, func, complex32, uint16, complex64, complex64) \
278
+ CPU_DEVICE_NOIMPL(name, func, complex32, uint32, complex128, complex128) \
279
+ CPU_DEVICE_NOIMPL(name, func, complex32, int8, complex32, complex32) \
280
+ CPU_DEVICE_NOIMPL(name, func, complex32, int16, complex64, complex64) \
281
+ CPU_DEVICE_NOIMPL(name, func, complex32, int32, complex128, complex128) \
282
+ CPU_DEVICE_NOIMPL(name, func, complex32, bfloat16, complex64, complex64) \
283
+ CPU_DEVICE_NOIMPL(name, func, complex32, float16, complex32, complex32) \
284
+ CPU_DEVICE_NOIMPL(name, func, complex32, float32, complex64, complex64) \
285
+ CPU_DEVICE_NOIMPL(name, func, complex32, float64, complex128, complex128) \
286
+ CPU_DEVICE_NOIMPL(name, func, complex32, complex32, complex32, complex32) \
287
+ CPU_DEVICE_NOIMPL(name, func, complex32, complex64, complex64, complex64) \
288
+ CPU_DEVICE_NOIMPL(name, func, complex32, complex128, complex128, complex128) \
289
+ \
290
+ CPU_DEVICE_BINARYC(name, func, complex64, uint8, complex64, complex64) \
291
+ CPU_DEVICE_BINARYC(name, func, complex64, uint16, complex64, complex64) \
292
+ CPU_DEVICE_BINARYC(name, func, complex64, uint32, complex128, complex128) \
293
+ CPU_DEVICE_BINARYC(name, func, complex64, int8, complex64, complex64) \
294
+ CPU_DEVICE_BINARYC(name, func, complex64, int16, complex64, complex64) \
295
+ CPU_DEVICE_BINARYC(name, func, complex64, int32, complex128, complex128) \
296
+ CPU_DEVICE_BINARY(name, func, complex64, bfloat16, complex64, complex64) \
297
+ CPU_DEVICE_NOIMPL(name, func, complex64, float16, complex64, complex64) \
298
+ CPU_DEVICE_BINARYC(name, func, complex64, float32, complex64, complex64) \
299
+ CPU_DEVICE_BINARYC(name, func, complex64, float64, complex128, complex128) \
300
+ CPU_DEVICE_NOIMPL(name, func, complex64, complex32, complex64, complex64) \
301
+ CPU_DEVICE_BINARYC(name, func, complex64, complex64, complex64, complex64) \
302
+ CPU_DEVICE_BINARYC(name, func, complex64, complex128, complex128, complex128) \
303
+ \
304
+ CPU_DEVICE_BINARYC(name, func, complex128, uint8, complex128, complex128) \
305
+ CPU_DEVICE_BINARYC(name, func, complex128, uint16, complex128, complex128) \
306
+ CPU_DEVICE_BINARYC(name, func, complex128, uint32, complex128, complex128) \
307
+ CPU_DEVICE_BINARYC(name, func, complex128, int8, complex128, complex128) \
308
+ CPU_DEVICE_BINARYC(name, func, complex128, int16, complex128, complex128) \
309
+ CPU_DEVICE_BINARYC(name, func, complex128, int32, complex128, complex128) \
310
+ CPU_DEVICE_BINARY(name, func, complex128, bfloat16, complex128, complex128) \
311
+ CPU_DEVICE_NOIMPL(name, func, complex128, float16, complex128, complex128) \
312
+ CPU_DEVICE_BINARYC(name, func, complex128, float32, complex128, complex128) \
313
+ CPU_DEVICE_BINARYC(name, func, complex128, float64, complex128, complex128) \
314
+ CPU_DEVICE_NOIMPL(name, func, complex128, complex32, complex128, complex128) \
315
+ CPU_DEVICE_BINARYC(name, func, complex128, complex64, complex128, complex128) \
316
+ CPU_DEVICE_BINARYC(name, func, complex128, complex128, complex128, complex128)
317
+
318
+ #define CPU_DEVICE_ALL_BINARY_NO_COMPLEX(name, func, hfunc) \
319
+ CPU_DEVICE_BINARY(name, func, uint8, uint8, uint8, uint8) \
320
+ CPU_DEVICE_BINARY(name, func, uint8, uint16, uint16, uint16) \
321
+ CPU_DEVICE_BINARY(name, func, uint8, uint32, uint32, uint32) \
322
+ CPU_DEVICE_BINARY(name, func, uint8, uint64, uint64, uint64) \
323
+ CPU_DEVICE_BINARY(name, func, uint8, int8, int16, int16) \
324
+ CPU_DEVICE_BINARY(name, func, uint8, int16, int16, int16) \
325
+ CPU_DEVICE_BINARY(name, func, uint8, int32, int32, int32) \
326
+ CPU_DEVICE_BINARY(name, func, uint8, int64, int64, int64) \
327
+ CPU_DEVICE_BINARY(name, func, uint8, bfloat16, bfloat16, bfloat16) \
328
+ CPU_DEVICE_NOIMPL(name, hfunc, uint8, float16, float16, float16) \
329
+ CPU_DEVICE_BINARY(name, func, uint8, float32, float32, float32) \
330
+ CPU_DEVICE_BINARY(name, func, uint8, float64, float64, float64) \
331
+ CPU_DEVICE_NOKERN(name, func, uint8, complex32, complex32, complex32) \
332
+ CPU_DEVICE_NOKERN(name, func, uint8, complex64, complex64, complex64) \
333
+ CPU_DEVICE_NOKERN(name, func, uint8, complex128, complex128, complex128) \
334
+ \
335
+ CPU_DEVICE_BINARY(name, func, uint16, uint8, uint16, uint16) \
336
+ CPU_DEVICE_BINARY(name, func, uint16, uint16, uint16, uint16) \
337
+ CPU_DEVICE_BINARY(name, func, uint16, uint32, uint32, uint32) \
338
+ CPU_DEVICE_BINARY(name, func, uint16, uint64, uint64, uint64) \
339
+ CPU_DEVICE_BINARY(name, func, uint16, int8, int32, int32) \
340
+ CPU_DEVICE_BINARY(name, func, uint16, int16, int32, int32) \
341
+ CPU_DEVICE_BINARY(name, func, uint16, int32, int32, int32) \
342
+ CPU_DEVICE_BINARY(name, func, uint16, int64, int64, int64) \
343
+ CPU_DEVICE_BINARY(name, func, uint16, bfloat16, float32, float32) \
344
+ CPU_DEVICE_NOIMPL(name, func, uint16, float16, float32, float32) \
345
+ CPU_DEVICE_BINARY(name, func, uint16, float32, float32, float32) \
346
+ CPU_DEVICE_BINARY(name, func, uint16, float64, float64, float64) \
347
+ CPU_DEVICE_NOKERN(name, func, uint16, complex32, complex64, complex64) \
348
+ CPU_DEVICE_NOKERN(name, func, uint16, complex64, complex64, complex64) \
349
+ CPU_DEVICE_NOKERN(name, func, uint16, complex128, complex128, complex128) \
350
+ \
351
+ CPU_DEVICE_BINARY(name, func, uint32, uint8, uint32, uint32) \
352
+ CPU_DEVICE_BINARY(name, func, uint32, uint16, uint32, uint32) \
353
+ CPU_DEVICE_BINARY(name, func, uint32, uint32, uint32, uint32) \
354
+ CPU_DEVICE_BINARY(name, func, uint32, uint64, uint64, uint64) \
355
+ CPU_DEVICE_BINARY(name, func, uint32, int8, int64, int64) \
356
+ CPU_DEVICE_BINARY(name, func, uint32, int16, int64, int64) \
357
+ CPU_DEVICE_BINARY(name, func, uint32, int32, int64, int64) \
358
+ CPU_DEVICE_BINARY(name, func, uint32, int64, int64, int64) \
359
+ CPU_DEVICE_BINARY(name, func, uint32, bfloat16, float64, float64) \
360
+ CPU_DEVICE_NOIMPL(name, func, uint32, float16, float64, float64) \
361
+ CPU_DEVICE_BINARY(name, func, uint32, float32, float64, float64) \
362
+ CPU_DEVICE_BINARY(name, func, uint32, float64, float64, float64) \
363
+ CPU_DEVICE_NOKERN(name, func, uint32, complex32, complex128, complex128) \
364
+ CPU_DEVICE_NOKERN(name, func, uint32, complex64, complex128, complex128) \
365
+ CPU_DEVICE_NOKERN(name, func, uint32, complex128, complex128, complex128) \
366
+ \
367
+ CPU_DEVICE_BINARY(name, func, uint64, uint8, uint64, uint64) \
368
+ CPU_DEVICE_BINARY(name, func, uint64, uint16, uint64, uint64) \
369
+ CPU_DEVICE_BINARY(name, func, uint64, uint32, uint64, uint64) \
370
+ CPU_DEVICE_BINARY(name, func, uint64, uint64, uint64, uint64) \
371
+ \
372
+ CPU_DEVICE_BINARY(name, func, int8, uint8, int16, int16) \
373
+ CPU_DEVICE_BINARY(name, func, int8, uint16, int32, int32) \
374
+ CPU_DEVICE_BINARY(name, func, int8, uint32, int64, int64) \
375
+ CPU_DEVICE_BINARY(name, func, int8, int8, int8, int8) \
376
+ CPU_DEVICE_BINARY(name, func, int8, int16, int16, int16) \
377
+ CPU_DEVICE_BINARY(name, func, int8, int32, int32, int32) \
378
+ CPU_DEVICE_BINARY(name, func, int8, int64, int64, int64) \
379
+ CPU_DEVICE_BINARY(name, func, int8, bfloat16, bfloat16, bfloat16) \
380
+ CPU_DEVICE_NOIMPL(name, hfunc, int8, float16, float16, float16) \
381
+ CPU_DEVICE_BINARY(name, func, int8, float32, float32, float32) \
382
+ CPU_DEVICE_BINARY(name, func, int8, float64, float64, float64) \
383
+ CPU_DEVICE_NOKERN(name, func, int8, complex32, complex32, complex32) \
384
+ CPU_DEVICE_NOKERN(name, func, int8, complex64, complex64, complex64) \
385
+ CPU_DEVICE_NOKERN(name, func, int8, complex128, complex128, complex128) \
386
+ \
387
+ CPU_DEVICE_BINARY(name, func, int16, uint8, int16, int16) \
388
+ CPU_DEVICE_BINARY(name, func, int16, uint16, int32, int32) \
389
+ CPU_DEVICE_BINARY(name, func, int16, uint32, int64, int64) \
390
+ CPU_DEVICE_BINARY(name, func, int16, int8, int16, int16) \
391
+ CPU_DEVICE_BINARY(name, func, int16, int16, int16, int16) \
392
+ CPU_DEVICE_BINARY(name, func, int16, int32, int32, int32) \
393
+ CPU_DEVICE_BINARY(name, func, int16, int64, int64, int64) \
394
+ CPU_DEVICE_BINARY(name, func, int16, bfloat16, float32, float32) \
395
+ CPU_DEVICE_NOIMPL(name, func, int16, float16, float32, float32) \
396
+ CPU_DEVICE_BINARY(name, func, int16, float32, float32, float32) \
397
+ CPU_DEVICE_BINARY(name, func, int16, float64, float64, float64) \
398
+ CPU_DEVICE_NOKERN(name, func, int16, complex32, complex64, complex64) \
399
+ CPU_DEVICE_NOKERN(name, func, int16, complex64, complex64, complex64) \
400
+ CPU_DEVICE_NOKERN(name, func, int16, complex128, complex128, complex128) \
401
+ \
402
+ CPU_DEVICE_BINARY(name, func, int32, uint8, int32, int32) \
403
+ CPU_DEVICE_BINARY(name, func, int32, uint16, int32, int32) \
404
+ CPU_DEVICE_BINARY(name, func, int32, uint32, int64, int64) \
405
+ CPU_DEVICE_BINARY(name, func, int32, int8, int32, int32) \
406
+ CPU_DEVICE_BINARY(name, func, int32, int16, int32, int32) \
407
+ CPU_DEVICE_BINARY(name, func, int32, int32, int32, int32) \
408
+ CPU_DEVICE_BINARY(name, func, int32, int64, int64, int64) \
409
+ CPU_DEVICE_BINARY(name, func, int32, bfloat16, float64, float64) \
410
+ CPU_DEVICE_NOIMPL(name, func, int32, float16, float64, float64) \
411
+ CPU_DEVICE_BINARY(name, func, int32, float32, float64, float64) \
412
+ CPU_DEVICE_BINARY(name, func, int32, float64, float64, float64) \
413
+ CPU_DEVICE_NOKERN(name, func, int32, complex32, complex128, complex128) \
414
+ CPU_DEVICE_NOKERN(name, func, int32, complex64, complex128, complex128) \
415
+ CPU_DEVICE_NOKERN(name, func, int32, complex128, complex128, complex128) \
416
+ \
417
+ CPU_DEVICE_BINARY(name, func, int64, uint8, int64, int64) \
418
+ CPU_DEVICE_BINARY(name, func, int64, uint16, int64, int64) \
419
+ CPU_DEVICE_BINARY(name, func, int64, uint32, int64, int64) \
420
+ CPU_DEVICE_BINARY(name, func, int64, int8, int64, int64) \
421
+ CPU_DEVICE_BINARY(name, func, int64, int16, int64, int64) \
422
+ CPU_DEVICE_BINARY(name, func, int64, int32, int64, int64) \
423
+ CPU_DEVICE_BINARY(name, func, int64, int64, int64, int64) \
424
+ \
425
+ CPU_DEVICE_BINARY(name, func, bfloat16, uint8, bfloat16, bfloat16) \
426
+ CPU_DEVICE_BINARY(name, func, bfloat16, uint16, float32, float32) \
427
+ CPU_DEVICE_BINARY(name, func, bfloat16, uint32, float64, float64) \
428
+ CPU_DEVICE_BINARY(name, func, bfloat16, int8, bfloat16, bfloat16) \
429
+ CPU_DEVICE_BINARY(name, func, bfloat16, int16, float32, float32) \
430
+ CPU_DEVICE_BINARY(name, func, bfloat16, int32, float64, float64) \
431
+ CPU_DEVICE_BINARY(name, func, bfloat16, bfloat16, bfloat16, bfloat16) \
432
+ CPU_DEVICE_NOIMPL(name, func, bfloat16, float16, float32, float32) \
433
+ CPU_DEVICE_BINARY(name, func, bfloat16, float32, float32, float32) \
434
+ CPU_DEVICE_BINARY(name, func, bfloat16, float64, float64, float64) \
435
+ CPU_DEVICE_NOKERN(name, func, bfloat16, complex32, complex32, complex32) \
436
+ CPU_DEVICE_NOKERN(name, func, bfloat16, complex64, complex64, complex64) \
437
+ CPU_DEVICE_NOKERN(name, func, bfloat16, complex128, complex128, complex128) \
438
+ \
439
+ CPU_DEVICE_NOIMPL(name, hfunc, float16, uint8, float16, float16) \
440
+ CPU_DEVICE_NOIMPL(name, func, float16, uint16, float32, float32) \
441
+ CPU_DEVICE_NOIMPL(name, func, float16, uint32, float64, float64) \
442
+ CPU_DEVICE_NOIMPL(name, hfunc, float16, int8, float16, float16) \
443
+ CPU_DEVICE_NOIMPL(name, func, float16, int16, float32, float32) \
444
+ CPU_DEVICE_NOIMPL(name, func, float16, int32, float64, float64) \
445
+ CPU_DEVICE_NOIMPL(name, func, float16, bfloat16, float32, float32) \
446
+ CPU_DEVICE_NOIMPL(name, hfunc, float16, float16, float16, float16) \
447
+ CPU_DEVICE_NOIMPL(name, func, float16, float32, float32, float32) \
448
+ CPU_DEVICE_NOIMPL(name, func, float16, float64, float64, float64) \
449
+ CPU_DEVICE_NOKERN(name, func, float16, complex32, complex32, complex32) \
450
+ CPU_DEVICE_NOKERN(name, func, float16, complex64, complex64, complex64) \
451
+ CPU_DEVICE_NOKERN(name, func, float16, complex128, complex128, complex128) \
452
+ \
453
+ CPU_DEVICE_BINARY(name, func, float32, uint8, float32, float32) \
454
+ CPU_DEVICE_BINARY(name, func, float32, uint16, float32, float32) \
455
+ CPU_DEVICE_BINARY(name, func, float32, uint32, float64, float64) \
456
+ CPU_DEVICE_BINARY(name, func, float32, int8, float32, float32) \
457
+ CPU_DEVICE_BINARY(name, func, float32, int16, float32, float32) \
458
+ CPU_DEVICE_BINARY(name, func, float32, int32, float64, float64) \
459
+ CPU_DEVICE_BINARY(name, func, float32, bfloat16, float32, float32) \
460
+ CPU_DEVICE_NOIMPL(name, func, float32, float16, float32, float32) \
461
+ CPU_DEVICE_BINARY(name, func, float32, float32, float32, float32) \
462
+ CPU_DEVICE_BINARY(name, func, float32, float64, float64, float64) \
463
+ CPU_DEVICE_NOKERN(name, func, float32, complex32, complex64, complex64) \
464
+ CPU_DEVICE_NOKERN(name, func, float32, complex64, complex64, complex64) \
465
+ CPU_DEVICE_NOKERN(name, func, float32, complex128, complex128, complex128) \
466
+ \
467
+ CPU_DEVICE_BINARY(name, func, float64, uint8, float64, float64) \
468
+ CPU_DEVICE_BINARY(name, func, float64, uint16, float64, float64) \
469
+ CPU_DEVICE_BINARY(name, func, float64, uint32, float64, float64) \
470
+ CPU_DEVICE_BINARY(name, func, float64, int8, float64, float64) \
471
+ CPU_DEVICE_BINARY(name, func, float64, int16, float64, float64) \
472
+ CPU_DEVICE_BINARY(name, func, float64, int32, float64, float64) \
473
+ CPU_DEVICE_BINARY(name, func, float64, bfloat16, float64, float64) \
474
+ CPU_DEVICE_NOIMPL(name, func, float64, float16, float64, float64) \
475
+ CPU_DEVICE_BINARY(name, func, float64, float32, float64, float64) \
476
+ CPU_DEVICE_BINARY(name, func, float64, float64, float64, float64) \
477
+ CPU_DEVICE_NOKERN(name, func, float64, complex32, complex128, complex128) \
478
+ CPU_DEVICE_NOKERN(name, func, float64, complex64, complex128, complex128) \
479
+ CPU_DEVICE_NOKERN(name, func, float64, complex128, complex128, complex128) \
480
+ \
481
+ CPU_DEVICE_NOKERN(name, func, complex32, uint8, complex32, complex32) \
482
+ CPU_DEVICE_NOKERN(name, func, complex32, uint16, complex64, complex64) \
483
+ CPU_DEVICE_NOKERN(name, func, complex32, uint32, complex128, complex128) \
484
+ CPU_DEVICE_NOKERN(name, func, complex32, int8, complex32, complex32) \
485
+ CPU_DEVICE_NOKERN(name, func, complex32, int16, complex64, complex64) \
486
+ CPU_DEVICE_NOKERN(name, func, complex32, int32, complex128, complex128) \
487
+ CPU_DEVICE_NOKERN(name, func, complex32, bfloat16, complex64, complex64) \
488
+ CPU_DEVICE_NOKERN(name, func, complex32, float16, complex32, complex32) \
489
+ CPU_DEVICE_NOKERN(name, func, complex32, float32, complex64, complex64) \
490
+ CPU_DEVICE_NOKERN(name, func, complex32, float64, complex128, complex128) \
491
+ CPU_DEVICE_NOKERN(name, func, complex32, complex32, complex32, complex32) \
492
+ CPU_DEVICE_NOKERN(name, func, complex32, complex64, complex64, complex64) \
493
+ CPU_DEVICE_NOKERN(name, func, complex32, complex128, complex128, complex128) \
494
+ \
495
+ CPU_DEVICE_NOKERN(name, func, complex64, uint8, complex64, complex64) \
496
+ CPU_DEVICE_NOKERN(name, func, complex64, uint16, complex64, complex64) \
497
+ CPU_DEVICE_NOKERN(name, func, complex64, uint32, complex128, complex128) \
498
+ CPU_DEVICE_NOKERN(name, func, complex64, int8, complex64, complex64) \
499
+ CPU_DEVICE_NOKERN(name, func, complex64, int16, complex64, complex64) \
500
+ CPU_DEVICE_NOKERN(name, func, complex64, int32, complex128, complex128) \
501
+ CPU_DEVICE_NOKERN(name, func, complex64, bfloat16, complex64, complex64) \
502
+ CPU_DEVICE_NOKERN(name, func, complex64, float16, complex64, complex64) \
503
+ CPU_DEVICE_NOKERN(name, func, complex64, float32, complex64, complex64) \
504
+ CPU_DEVICE_NOKERN(name, func, complex64, float64, complex128, complex128) \
505
+ CPU_DEVICE_NOKERN(name, func, complex64, complex32, complex64, complex64) \
506
+ CPU_DEVICE_NOKERN(name, func, complex64, complex64, complex64, complex64) \
507
+ CPU_DEVICE_NOKERN(name, func, complex64, complex128, complex128, complex128) \
508
+ \
509
+ CPU_DEVICE_NOKERN(name, func, complex128, uint8, complex128, complex128) \
510
+ CPU_DEVICE_NOKERN(name, func, complex128, uint16, complex128, complex128) \
511
+ CPU_DEVICE_NOKERN(name, func, complex128, uint32, complex128, complex128) \
512
+ CPU_DEVICE_NOKERN(name, func, complex128, int8, complex128, complex128) \
513
+ CPU_DEVICE_NOKERN(name, func, complex128, int16, complex128, complex128) \
514
+ CPU_DEVICE_NOKERN(name, func, complex128, int32, complex128, complex128) \
515
+ CPU_DEVICE_NOKERN(name, func, complex128, bfloat16, complex128, complex128) \
516
+ CPU_DEVICE_NOIMPL(name, func, complex128, float16, complex128, complex128) \
517
+ CPU_DEVICE_NOKERN(name, func, complex128, float32, complex128, complex128) \
518
+ CPU_DEVICE_NOKERN(name, func, complex128, float64, complex128, complex128) \
519
+ CPU_DEVICE_NOKERN(name, func, complex128, complex32, complex128, complex128) \
520
+ CPU_DEVICE_NOKERN(name, func, complex128, complex64, complex128, complex128) \
521
+ CPU_DEVICE_NOKERN(name, func, complex128, complex128, complex128, complex128) \
522
+
523
+ #define CPU_DEVICE_ALL_BINARY_FLOAT_RETURN(name, func, hfunc) /* Expands to one CPU_DEVICE_BINARY, _BINARYC, _NOIMPL or _NOKERN entry per listed pair of type arguments (rows grouped by the first type, starting with uint8). BINARY, BINARYC and NOIMPL entries name a floating-point or complex type in third position; NOKERN entries name a 64-bit integer type. hfunc is forwarded only to NOIMPL entries. NOTE(review): the four type arguments are presumably (in0, in1, out, common) -- confirm against the CPU_DEVICE_BINARY definition. */ \
524
+ CPU_DEVICE_NOIMPL(name, hfunc, uint8, uint8, float16, float16) \
525
+ CPU_DEVICE_BINARY(name, func, uint8, uint16, float32, float32) \
526
+ CPU_DEVICE_BINARY(name, func, uint8, uint32, float64, float64) \
527
+ CPU_DEVICE_NOKERN(name, func, uint8, uint64, uint64, uint64) \
528
+ CPU_DEVICE_NOIMPL(name, hfunc, uint8, int8, float16, float16) \
529
+ CPU_DEVICE_BINARY(name, func, uint8, int16, float32, float32) \
530
+ CPU_DEVICE_BINARY(name, func, uint8, int32, float64, float64) \
531
+ CPU_DEVICE_NOKERN(name, func, uint8, int64, int64, int64) \
532
+ CPU_DEVICE_BINARY(name, func, uint8, bfloat16, bfloat16, bfloat16) \
533
+ CPU_DEVICE_NOIMPL(name, hfunc, uint8, float16, float16, float16) \
534
+ CPU_DEVICE_BINARY(name, func, uint8, float32, float32, float32) \
535
+ CPU_DEVICE_BINARY(name, func, uint8, float64, float64, float64) \
536
+ CPU_DEVICE_NOIMPL(name, func, uint8, complex32, complex32, complex32) \
537
+ CPU_DEVICE_BINARYC(name, func, uint8, complex64, complex64, complex64) \
538
+ CPU_DEVICE_BINARYC(name, func, uint8, complex128, complex128, complex128) \
539
+ /* rows with uint16 as the first type argument */ \
540
+ CPU_DEVICE_BINARY(name, func, uint16, uint8, float32, float32) \
541
+ CPU_DEVICE_BINARY(name, func, uint16, uint16, float32, float32) \
542
+ CPU_DEVICE_BINARY(name, func, uint16, uint32, float64, float64) \
543
+ CPU_DEVICE_NOKERN(name, func, uint16, uint64, uint64, uint64) \
544
+ CPU_DEVICE_BINARY(name, func, uint16, int8, float32, float32) \
545
+ CPU_DEVICE_BINARY(name, func, uint16, int16, float32, float32) \
546
+ CPU_DEVICE_BINARY(name, func, uint16, int32, float64, float64) \
547
+ CPU_DEVICE_NOKERN(name, func, uint16, int64, int64, int64) \
548
+ CPU_DEVICE_BINARY(name, func, uint16, bfloat16, float32, float32) \
549
+ CPU_DEVICE_NOIMPL(name, func, uint16, float16, float32, float32) \
550
+ CPU_DEVICE_BINARY(name, func, uint16, float32, float32, float32) \
551
+ CPU_DEVICE_BINARY(name, func, uint16, float64, float64, float64) \
552
+ CPU_DEVICE_NOIMPL(name, func, uint16, complex32, complex64, complex64) \
553
+ CPU_DEVICE_BINARYC(name, func, uint16, complex64, complex64, complex64) \
554
+ CPU_DEVICE_BINARYC(name, func, uint16, complex128, complex128, complex128) \
555
+ /* rows with uint32 as the first type argument */ \
556
+ CPU_DEVICE_BINARY(name, func, uint32, uint8, float64, float64) \
557
+ CPU_DEVICE_BINARY(name, func, uint32, uint16, float64, float64) \
558
+ CPU_DEVICE_BINARY(name, func, uint32, uint32, float64, float64) \
559
+ CPU_DEVICE_NOKERN(name, func, uint32, uint64, uint64, uint64) \
560
+ CPU_DEVICE_BINARY(name, func, uint32, int8, float64, float64) \
561
+ CPU_DEVICE_BINARY(name, func, uint32, int16, float64, float64) \
562
+ CPU_DEVICE_BINARY(name, func, uint32, int32, float64, float64) \
563
+ CPU_DEVICE_NOKERN(name, func, uint32, int64, int64, int64) \
564
+ CPU_DEVICE_BINARY(name, func, uint32, bfloat16, float64, float64) \
565
+ CPU_DEVICE_NOIMPL(name, func, uint32, float16, float64, float64) \
566
+ CPU_DEVICE_BINARY(name, func, uint32, float32, float64, float64) \
567
+ CPU_DEVICE_BINARY(name, func, uint32, float64, float64, float64) \
568
+ CPU_DEVICE_NOIMPL(name, func, uint32, complex32, complex128, complex128) \
569
+ CPU_DEVICE_BINARYC(name, func, uint32, complex64, complex128, complex128) \
570
+ CPU_DEVICE_BINARYC(name, func, uint32, complex128, complex128, complex128) \
571
+ /* rows with uint64 as the first type argument (unsigned partners only, all NOKERN) */ \
572
+ CPU_DEVICE_NOKERN(name, func, uint64, uint8, uint64, uint64) \
573
+ CPU_DEVICE_NOKERN(name, func, uint64, uint16, uint64, uint64) \
574
+ CPU_DEVICE_NOKERN(name, func, uint64, uint32, uint64, uint64) \
575
+ CPU_DEVICE_NOKERN(name, func, uint64, uint64, uint64, uint64) \
576
+ /* rows with int8 as the first type argument */ \
577
+ CPU_DEVICE_NOIMPL(name, hfunc, int8, uint8, float16, float16) \
578
+ CPU_DEVICE_BINARY(name, func, int8, uint16, float32, float32) \
579
+ CPU_DEVICE_BINARY(name, func, int8, uint32, float64, float64) \
580
+ CPU_DEVICE_NOIMPL(name, hfunc, int8, int8, float16, float16) \
581
+ CPU_DEVICE_BINARY(name, func, int8, int16, float32, float32) \
582
+ CPU_DEVICE_BINARY(name, func, int8, int32, float64, float64) \
583
+ CPU_DEVICE_NOKERN(name, func, int8, int64, int64, int64) \
584
+ CPU_DEVICE_BINARY(name, func, int8, bfloat16, bfloat16, bfloat16) \
585
+ CPU_DEVICE_NOIMPL(name, hfunc, int8, float16, float16, float16) \
586
+ CPU_DEVICE_BINARY(name, func, int8, float32, float32, float32) \
587
+ CPU_DEVICE_BINARY(name, func, int8, float64, float64, float64) \
588
+ CPU_DEVICE_NOIMPL(name, func, int8, complex32, complex32, complex32) \
589
+ CPU_DEVICE_BINARYC(name, func, int8, complex64, complex64, complex64) \
590
+ CPU_DEVICE_BINARYC(name, func, int8, complex128, complex128, complex128) \
591
+ /* rows with int16 as the first type argument */ \
592
+ CPU_DEVICE_BINARY(name, func, int16, uint8, float32, float32) \
593
+ CPU_DEVICE_BINARY(name, func, int16, uint16, float32, float32) \
594
+ CPU_DEVICE_BINARY(name, func, int16, uint32, float64, float64) \
595
+ CPU_DEVICE_BINARY(name, func, int16, int8, float32, float32) \
596
+ CPU_DEVICE_BINARY(name, func, int16, int16, float32, float32) \
597
+ CPU_DEVICE_BINARY(name, func, int16, int32, float64, float64) \
598
+ CPU_DEVICE_NOKERN(name, func, int16, int64, int64, int64) \
599
+ CPU_DEVICE_BINARY(name, func, int16, bfloat16, float32, float32) \
600
+ CPU_DEVICE_NOIMPL(name, func, int16, float16, float32, float32) \
601
+ CPU_DEVICE_BINARY(name, func, int16, float32, float32, float32) \
602
+ CPU_DEVICE_BINARY(name, func, int16, float64, float64, float64) \
603
+ CPU_DEVICE_NOIMPL(name, func, int16, complex32, complex64, complex64) \
604
+ CPU_DEVICE_BINARYC(name, func, int16, complex64, complex64, complex64) \
605
+ CPU_DEVICE_BINARYC(name, func, int16, complex128, complex128, complex128) \
606
+ /* rows with int32 as the first type argument */ \
607
+ CPU_DEVICE_BINARY(name, func, int32, uint8, float64, float64) \
608
+ CPU_DEVICE_BINARY(name, func, int32, uint16, float64, float64) \
609
+ CPU_DEVICE_BINARY(name, func, int32, uint32, float64, float64) \
610
+ CPU_DEVICE_BINARY(name, func, int32, int8, float64, float64) \
611
+ CPU_DEVICE_BINARY(name, func, int32, int16, float64, float64) \
612
+ CPU_DEVICE_BINARY(name, func, int32, int32, float64, float64) \
613
+ CPU_DEVICE_NOKERN(name, func, int32, int64, int64, int64) \
614
+ CPU_DEVICE_BINARY(name, func, int32, bfloat16, float64, float64) \
615
+ CPU_DEVICE_NOIMPL(name, func, int32, float16, float64, float64) \
616
+ CPU_DEVICE_BINARY(name, func, int32, float32, float64, float64) \
617
+ CPU_DEVICE_BINARY(name, func, int32, float64, float64, float64) \
618
+ CPU_DEVICE_NOIMPL(name, func, int32, complex32, complex128, complex128) \
619
+ CPU_DEVICE_BINARYC(name, func, int32, complex64, complex128, complex128) \
620
+ CPU_DEVICE_BINARYC(name, func, int32, complex128, complex128, complex128) \
621
+ /* rows with int64 as the first type argument (integer partners only, all NOKERN) */ \
622
+ CPU_DEVICE_NOKERN(name, func, int64, uint8, int64, int64) \
623
+ CPU_DEVICE_NOKERN(name, func, int64, uint16, int64, int64) \
624
+ CPU_DEVICE_NOKERN(name, func, int64, uint32, int64, int64) \
625
+ CPU_DEVICE_NOKERN(name, func, int64, int8, int64, int64) \
626
+ CPU_DEVICE_NOKERN(name, func, int64, int16, int64, int64) \
627
+ CPU_DEVICE_NOKERN(name, func, int64, int32, int64, int64) \
628
+ CPU_DEVICE_NOKERN(name, func, int64, int64, int64, int64) \
629
+ /* rows with bfloat16 as the first type argument */ \
630
+ CPU_DEVICE_BINARY(name, func, bfloat16, uint8, bfloat16, bfloat16) \
631
+ CPU_DEVICE_BINARY(name, func, bfloat16, uint16, float32, float32) \
632
+ CPU_DEVICE_BINARY(name, func, bfloat16, uint32, float64, float64) \
633
+ CPU_DEVICE_BINARY(name, func, bfloat16, int8, bfloat16, bfloat16) \
634
+ CPU_DEVICE_BINARY(name, func, bfloat16, int16, float32, float32) \
635
+ CPU_DEVICE_BINARY(name, func, bfloat16, int32, float64, float64) \
636
+ CPU_DEVICE_BINARY(name, func, bfloat16, bfloat16, bfloat16, bfloat16) \
637
+ CPU_DEVICE_NOIMPL(name, func, bfloat16, float16, float32, float32) \
638
+ CPU_DEVICE_BINARY(name, func, bfloat16, float32, float32, float32) \
639
+ CPU_DEVICE_BINARY(name, func, bfloat16, float64, float64, float64) \
640
+ CPU_DEVICE_NOIMPL(name, func, bfloat16, complex32, complex64, complex64) \
641
+ CPU_DEVICE_BINARY(name, func, bfloat16, complex64, complex64, complex64) \
642
+ CPU_DEVICE_BINARY(name, func, bfloat16, complex128, complex128, complex128) \
643
+ /* rows with float16 as the first type argument (all NOIMPL) */ \
644
+ CPU_DEVICE_NOIMPL(name, hfunc, float16, uint8, float16, float16) \
645
+ CPU_DEVICE_NOIMPL(name, func, float16, uint16, float32, float32) \
646
+ CPU_DEVICE_NOIMPL(name, func, float16, uint32, float64, float64) \
647
+ CPU_DEVICE_NOIMPL(name, hfunc, float16, int8, float16, float16) \
648
+ CPU_DEVICE_NOIMPL(name, func, float16, int16, float32, float32) \
649
+ CPU_DEVICE_NOIMPL(name, func, float16, int32, float64, float64) \
650
+ CPU_DEVICE_NOIMPL(name, func, float16, bfloat16, float32, float32) \
651
+ CPU_DEVICE_NOIMPL(name, hfunc, float16, float16, float16, float16) \
652
+ CPU_DEVICE_NOIMPL(name, func, float16, float32, float32, float32) \
653
+ CPU_DEVICE_NOIMPL(name, func, float16, float64, float64, float64) \
654
+ CPU_DEVICE_NOIMPL(name, func, float16, complex32, complex32, complex32) \
655
+ CPU_DEVICE_NOIMPL(name, func, float16, complex64, complex64, complex64) \
656
+ CPU_DEVICE_NOIMPL(name, func, float16, complex128, complex128, complex128) \
657
+ /* rows with float32 as the first type argument */ \
658
+ CPU_DEVICE_BINARY(name, func, float32, uint8, float32, float32) \
659
+ CPU_DEVICE_BINARY(name, func, float32, uint16, float32, float32) \
660
+ CPU_DEVICE_BINARY(name, func, float32, uint32, float64, float64) \
661
+ CPU_DEVICE_BINARY(name, func, float32, int8, float32, float32) \
662
+ CPU_DEVICE_BINARY(name, func, float32, int16, float32, float32) \
663
+ CPU_DEVICE_BINARY(name, func, float32, int32, float64, float64) \
664
+ CPU_DEVICE_BINARY(name, func, float32, bfloat16, float32, float32) \
665
+ CPU_DEVICE_NOIMPL(name, func, float32, float16, float32, float32) \
666
+ CPU_DEVICE_BINARY(name, func, float32, float32, float32, float32) \
667
+ CPU_DEVICE_BINARY(name, func, float32, float64, float64, float64) \
668
+ CPU_DEVICE_NOIMPL(name, func, float32, complex32, complex64, complex64) \
669
+ CPU_DEVICE_BINARYC(name, func, float32, complex64, complex64, complex64) \
670
+ CPU_DEVICE_BINARYC(name, func, float32, complex128, complex128, complex128) \
671
+ /* rows with float64 as the first type argument */ \
672
+ CPU_DEVICE_BINARY(name, func, float64, uint8, float64, float64) \
673
+ CPU_DEVICE_BINARY(name, func, float64, uint16, float64, float64) \
674
+ CPU_DEVICE_BINARY(name, func, float64, uint32, float64, float64) \
675
+ CPU_DEVICE_BINARY(name, func, float64, int8, float64, float64) \
676
+ CPU_DEVICE_BINARY(name, func, float64, int16, float64, float64) \
677
+ CPU_DEVICE_BINARY(name, func, float64, int32, float64, float64) \
678
+ CPU_DEVICE_BINARY(name, func, float64, bfloat16, float64, float64) \
679
+ CPU_DEVICE_NOIMPL(name, func, float64, float16, float64, float64) \
680
+ CPU_DEVICE_BINARY(name, func, float64, float32, float64, float64) \
681
+ CPU_DEVICE_BINARY(name, func, float64, float64, float64, float64) \
682
+ CPU_DEVICE_NOIMPL(name, func, float64, complex32, complex128, complex128) \
683
+ CPU_DEVICE_BINARYC(name, func, float64, complex64, complex128, complex128) \
684
+ CPU_DEVICE_BINARYC(name, func, float64, complex128, complex128, complex128) \
685
+ /* rows with complex32 as the first type argument (all NOIMPL) */ \
686
+ CPU_DEVICE_NOIMPL(name, func, complex32, uint8, complex32, complex32) \
687
+ CPU_DEVICE_NOIMPL(name, func, complex32, uint16, complex64, complex64) \
688
+ CPU_DEVICE_NOIMPL(name, func, complex32, uint32, complex128, complex128) \
689
+ CPU_DEVICE_NOIMPL(name, func, complex32, int8, complex32, complex32) \
690
+ CPU_DEVICE_NOIMPL(name, func, complex32, int16, complex64, complex64) \
691
+ CPU_DEVICE_NOIMPL(name, func, complex32, int32, complex128, complex128) \
692
+ CPU_DEVICE_NOIMPL(name, func, complex32, bfloat16, complex64, complex64) \
693
+ CPU_DEVICE_NOIMPL(name, func, complex32, float16, complex32, complex32) \
694
+ CPU_DEVICE_NOIMPL(name, func, complex32, float32, complex64, complex64) \
695
+ CPU_DEVICE_NOIMPL(name, func, complex32, float64, complex128, complex128) \
696
+ CPU_DEVICE_NOIMPL(name, func, complex32, complex32, complex32, complex32) \
697
+ CPU_DEVICE_NOIMPL(name, func, complex32, complex64, complex64, complex64) \
698
+ CPU_DEVICE_NOIMPL(name, func, complex32, complex128, complex128, complex128) \
699
+ /* rows with complex64 as the first type argument */ \
700
+ CPU_DEVICE_BINARYC(name, func, complex64, uint8, complex64, complex64) \
701
+ CPU_DEVICE_BINARYC(name, func, complex64, uint16, complex64, complex64) \
702
+ CPU_DEVICE_BINARYC(name, func, complex64, uint32, complex128, complex128) \
703
+ CPU_DEVICE_BINARYC(name, func, complex64, int8, complex64, complex64) \
704
+ CPU_DEVICE_BINARYC(name, func, complex64, int16, complex64, complex64) \
705
+ CPU_DEVICE_BINARYC(name, func, complex64, int32, complex128, complex128) \
706
+ CPU_DEVICE_BINARY(name, func, complex64, bfloat16, complex64, complex64) \
707
+ CPU_DEVICE_NOIMPL(name, func, complex64, float16, complex64, complex64) \
708
+ CPU_DEVICE_BINARYC(name, func, complex64, float32, complex64, complex64) \
709
+ CPU_DEVICE_BINARYC(name, func, complex64, float64, complex128, complex128) \
710
+ CPU_DEVICE_NOIMPL(name, func, complex64, complex32, complex64, complex64) \
711
+ CPU_DEVICE_BINARYC(name, func, complex64, complex64, complex64, complex64) \
712
+ CPU_DEVICE_BINARYC(name, func, complex64, complex128, complex128, complex128) \
713
+ /* rows with complex128 as the first type argument */ \
714
+ CPU_DEVICE_BINARYC(name, func, complex128, uint8, complex128, complex128) \
715
+ CPU_DEVICE_BINARYC(name, func, complex128, uint16, complex128, complex128) \
716
+ CPU_DEVICE_BINARYC(name, func, complex128, uint32, complex128, complex128) \
717
+ CPU_DEVICE_BINARYC(name, func, complex128, int8, complex128, complex128) \
718
+ CPU_DEVICE_BINARYC(name, func, complex128, int16, complex128, complex128) \
719
+ CPU_DEVICE_BINARYC(name, func, complex128, int32, complex128, complex128) \
720
+ CPU_DEVICE_BINARY(name, func, complex128, bfloat16, complex128, complex128) \
721
+ CPU_DEVICE_NOIMPL(name, func, complex128, float16, complex128, complex128) \
722
+ CPU_DEVICE_BINARYC(name, func, complex128, float32, complex128, complex128) \
723
+ CPU_DEVICE_BINARYC(name, func, complex128, float64, complex128, complex128) \
724
+ CPU_DEVICE_NOIMPL(name, func, complex128, complex32, complex128, complex128) \
725
+ CPU_DEVICE_BINARYC(name, func, complex128, complex64, complex128, complex128) \
726
+ CPU_DEVICE_BINARYC(name, func, complex128, complex128, complex128, complex128)
727
+ /* Scalar operator macros and their kernel-table instantiations. Each function-like macro parenthesizes its arguments and its whole expansion so it stays correct for any operand expression and any surrounding context. */
728
+ #define add(x, y) ((x) + (y))
729
+ CPU_DEVICE_ALL_BINARY(add, add, add)
730
+ /* NOTE(review): the hfunc argument below is `sub`, while add/multiply pass their own names; in the visible tables hfunc only reaches CPU_DEVICE_NOIMPL entries -- confirm it is an intentional placeholder. */
731
+ #define subtract(x, y) ((x) - (y))
732
+ CPU_DEVICE_ALL_BINARY(subtract, subtract, sub)
733
+
734
+ #define multiply(x, y) ((x) * (y))
735
+ CPU_DEVICE_ALL_BINARY(multiply, multiply, multiply)
736
+ /* NOTE(review): the table below is instantiated with `_floor_divide`, not with this macro; the macro appears unused in this section and is kept only for compatibility. */
737
+ #define floor_divide(x, y) ((x) / (y))
738
+ CPU_DEVICE_ALL_BINARY_NO_COMPLEX(floor_divide, _floor_divide, _floor_divide)
739
+ /* NOTE(review): likewise the table below uses `_remainder`; this macro applies `%` and appears unused in this section. */
740
+ #define remainder(x, y) ((x) % (y))
741
+ CPU_DEVICE_ALL_BINARY_NO_COMPLEX(remainder, _remainder, _remainder)
742
+ /* divide is instantiated through the FLOAT_RETURN table and passes itself as func and hfunc. */
743
+ #define divide(x, y) ((x) / (y))
744
+ CPU_DEVICE_ALL_BINARY_FLOAT_RETURN(divide, divide, divide)
745
+ /* power passes `_pow` (defined outside this section) as both func and hfunc. */
746
+ CPU_DEVICE_ALL_BINARY(power, _pow, _pow)
747
+
748
+
749
+ /*****************************************************************************/
750
+ /* Comparison */
751
+ /*****************************************************************************/
752
+
753
+ #define CPU_DEVICE_ALL_COMPARISON(name, func, hfunc, cfunc) \
754
+ CPU_DEVICE_BINARY(name, func, uint8, uint8, bool, uint8) \
755
+ CPU_DEVICE_BINARY(name, func, uint8, uint16, bool, uint16) \
756
+ CPU_DEVICE_BINARY(name, func, uint8, uint32, bool, uint32) \
757
+ CPU_DEVICE_BINARY(name, func, uint8, uint64, bool, uint64) \
758
+ CPU_DEVICE_BINARY(name, func, uint8, int8, bool, int16) \
759
+ CPU_DEVICE_BINARY(name, func, uint8, int16, bool, int16) \
760
+ CPU_DEVICE_BINARY(name, func, uint8, int32, bool, int32) \
761
+ CPU_DEVICE_BINARY(name, func, uint8, int64, bool, int64) \
762
+ CPU_DEVICE_BINARY(name, func, uint8, bfloat16, bool, bfloat16) \
763
+ CPU_DEVICE_NOIMPL(name, func, uint8, float16, bool, float16) \
764
+ CPU_DEVICE_BINARY(name, func, uint8, float32, bool, float32) \
765
+ CPU_DEVICE_BINARY(name, func, uint8, float64, bool, float64) \
766
+ CPU_DEVICE_NOIMPL(name, cfunc, uint8, complex32, bool, complex32) \
767
+ CPU_DEVICE_BINARYC(name, cfunc, uint8, complex64, bool, complex64) \
768
+ CPU_DEVICE_BINARYC(name, cfunc, uint8, complex128, bool, complex128) \
769
+ \
770
+ CPU_DEVICE_BINARY(name, func, uint16, uint8, bool, uint16) \
771
+ CPU_DEVICE_BINARY(name, func, uint16, uint16, bool, uint16) \
772
+ CPU_DEVICE_BINARY(name, func, uint16, uint32, bool, uint32) \
773
+ CPU_DEVICE_BINARY(name, func, uint16, uint64, bool, uint64) \
774
+ CPU_DEVICE_BINARY(name, func, uint16, int8, bool, int32) \
775
+ CPU_DEVICE_BINARY(name, func, uint16, int16, bool, int32) \
776
+ CPU_DEVICE_BINARY(name, func, uint16, int32, bool, int32) \
777
+ CPU_DEVICE_BINARY(name, func, uint16, int64, bool, int64) \
778
+ CPU_DEVICE_BINARY(name, func, uint16, bfloat16, bool, float32) \
779
+ CPU_DEVICE_NOIMPL(name, func, uint16, float16, bool, float32) \
780
+ CPU_DEVICE_BINARY(name, func, uint16, float32, bool, float32) \
781
+ CPU_DEVICE_BINARY(name, func, uint16, float64, bool, float64) \
782
+ CPU_DEVICE_NOIMPL(name, cfunc, uint16, complex32, bool, complex64) \
783
+ CPU_DEVICE_BINARYC(name, cfunc, uint16, complex64, bool, complex64) \
784
+ CPU_DEVICE_BINARYC(name, cfunc, uint16, complex128, bool, complex128) \
785
+ \
786
+ CPU_DEVICE_BINARY(name, func, uint32, uint8, bool, uint32) \
787
+ CPU_DEVICE_BINARY(name, func, uint32, uint16, bool, uint32) \
788
+ CPU_DEVICE_BINARY(name, func, uint32, uint32, bool, uint32) \
789
+ CPU_DEVICE_BINARY(name, func, uint32, uint64, bool, uint64) \
790
+ CPU_DEVICE_BINARY(name, func, uint32, int8, bool, int64) \
791
+ CPU_DEVICE_BINARY(name, func, uint32, int16, bool, int64) \
792
+ CPU_DEVICE_BINARY(name, func, uint32, int32, bool, int64) \
793
+ CPU_DEVICE_BINARY(name, func, uint32, int64, bool, int64) \
794
+ CPU_DEVICE_BINARY(name, func, uint32, bfloat16, bool, float64) \
795
+ CPU_DEVICE_NOIMPL(name, func, uint32, float16, bool, float64) \
796
+ CPU_DEVICE_BINARY(name, func, uint32, float32, bool, float64) \
797
+ CPU_DEVICE_BINARY(name, func, uint32, float64, bool, float64) \
798
+ CPU_DEVICE_NOIMPL(name, cfunc, uint32, complex32, bool, complex128) \
799
+ CPU_DEVICE_BINARYC(name, cfunc, uint32, complex64, bool, complex128) \
800
+ CPU_DEVICE_BINARYC(name, cfunc, uint32, complex128, bool, complex128) \
801
+ \
802
+ CPU_DEVICE_BINARY(name, func, uint64, uint8, bool, uint64) \
803
+ CPU_DEVICE_BINARY(name, func, uint64, uint16, bool, uint64) \
804
+ CPU_DEVICE_BINARY(name, func, uint64, uint32, bool, uint64) \
805
+ CPU_DEVICE_BINARY(name, func, uint64, uint64, bool, uint64) \
806
+ \
807
+ CPU_DEVICE_BINARY(name, func, int8, uint8, bool, int16) \
808
+ CPU_DEVICE_BINARY(name, func, int8, uint16, bool, int32) \
809
+ CPU_DEVICE_BINARY(name, func, int8, uint32, bool, int64) \
810
+ CPU_DEVICE_BINARY(name, func, int8, int8, bool, int8) \
811
+ CPU_DEVICE_BINARY(name, func, int8, int16, bool, int16) \
812
+ CPU_DEVICE_BINARY(name, func, int8, int32, bool, int32) \
813
+ CPU_DEVICE_BINARY(name, func, int8, int64, bool, int64) \
814
+ CPU_DEVICE_BINARY(name, func, int8, bfloat16, bool, bfloat16) \
815
+ CPU_DEVICE_NOIMPL(name, func, int8, float16, bool, float16) \
816
+ CPU_DEVICE_BINARY(name, func, int8, float32, bool, float32) \
817
+ CPU_DEVICE_BINARY(name, func, int8, float64, bool, float64) \
818
+ CPU_DEVICE_NOIMPL(name, cfunc, int8, complex32, bool, complex32) \
819
+ CPU_DEVICE_BINARYC(name, cfunc, int8, complex64, bool, complex64) \
820
+ CPU_DEVICE_BINARYC(name, cfunc, int8, complex128, bool, complex128) \
821
+ \
822
+ CPU_DEVICE_BINARY(name, func, int16, uint8, bool, int16) \
823
+ CPU_DEVICE_BINARY(name, func, int16, uint16, bool, int32) \
824
+ CPU_DEVICE_BINARY(name, func, int16, uint32, bool, int64) \
825
+ CPU_DEVICE_BINARY(name, func, int16, int8, bool, int16) \
826
+ CPU_DEVICE_BINARY(name, func, int16, int16, bool, int16) \
827
+ CPU_DEVICE_BINARY(name, func, int16, int32, bool, int32) \
828
+ CPU_DEVICE_BINARY(name, func, int16, int64, bool, int64) \
829
+ CPU_DEVICE_BINARY(name, func, int16, bfloat16, bool, float32) \
830
+ CPU_DEVICE_NOIMPL(name, func, int16, float16, bool, float32) \
831
+ CPU_DEVICE_BINARY(name, func, int16, float32, bool, float32) \
832
+ CPU_DEVICE_BINARY(name, func, int16, float64, bool, float64) \
833
+ CPU_DEVICE_NOIMPL(name, cfunc, int16, complex32, bool, complex64) \
834
+ CPU_DEVICE_BINARYC(name, cfunc, int16, complex64, bool, complex64) \
835
+ CPU_DEVICE_BINARYC(name, cfunc, int16, complex128, bool, complex128) \
836
+ \
837
+ CPU_DEVICE_BINARY(name, func, int32, uint8, bool, int32) \
838
+ CPU_DEVICE_BINARY(name, func, int32, uint16, bool, int32) \
839
+ CPU_DEVICE_BINARY(name, func, int32, uint32, bool, int64) \
840
+ CPU_DEVICE_BINARY(name, func, int32, int8, bool, int32) \
841
+ CPU_DEVICE_BINARY(name, func, int32, int16, bool, int32) \
842
+ CPU_DEVICE_BINARY(name, func, int32, int32, bool, int32) \
843
+ CPU_DEVICE_BINARY(name, func, int32, int64, bool, int64) \
844
+ CPU_DEVICE_BINARY(name, func, int32, bfloat16, bool, float64) \
845
+ CPU_DEVICE_NOIMPL(name, func, int32, float16, bool, float64) \
846
+ CPU_DEVICE_BINARY(name, func, int32, float32, bool, float64) \
847
+ CPU_DEVICE_BINARY(name, func, int32, float64, bool, float64) \
848
+ CPU_DEVICE_NOIMPL(name, cfunc, int32, complex32, bool, complex128) \
849
+ CPU_DEVICE_BINARYC(name, cfunc, int32, complex64, bool, complex128) \
850
+ CPU_DEVICE_BINARYC(name, cfunc, int32, complex128, bool, complex128) \
851
+ \
852
+ CPU_DEVICE_BINARY(name, func, int64, uint8, bool, int64) \
853
+ CPU_DEVICE_BINARY(name, func, int64, uint16, bool, int64) \
854
+ CPU_DEVICE_BINARY(name, func, int64, uint32, bool, int64) \
855
+ CPU_DEVICE_BINARY(name, func, int64, int8, bool, int64) \
856
+ CPU_DEVICE_BINARY(name, func, int64, int16, bool, int64) \
857
+ CPU_DEVICE_BINARY(name, func, int64, int32, bool, int64) \
858
+ CPU_DEVICE_BINARY(name, func, int64, int64, bool, int64) \
859
+ \
860
+ CPU_DEVICE_BINARY(name, func, bfloat16, uint8, bool, bfloat16) \
861
+ CPU_DEVICE_BINARY(name, func, bfloat16, uint16, bool, float32) \
862
+ CPU_DEVICE_BINARY(name, func, bfloat16, uint32, bool, float64) \
863
+ CPU_DEVICE_BINARY(name, func, bfloat16, int8, bool, bfloat16) \
864
+ CPU_DEVICE_BINARY(name, func, bfloat16, int16, bool, float32) \
865
+ CPU_DEVICE_BINARY(name, func, bfloat16, int32, bool, float64) \
866
+ CPU_DEVICE_BINARY(name, func, bfloat16, bfloat16, bool, bfloat16) \
867
+ CPU_DEVICE_NOIMPL(name, func, bfloat16, float16, bool, float32) \
868
+ CPU_DEVICE_BINARY(name, func, bfloat16, float32, bool, float32) \
869
+ CPU_DEVICE_BINARY(name, func, bfloat16, float64, bool, float64) \
870
+ CPU_DEVICE_NOIMPL(name, cfunc, bfloat16, complex32, bool, complex64) \
871
+ CPU_DEVICE_BINARY(name, cfunc, bfloat16, complex64, bool, complex64) \
872
+ CPU_DEVICE_BINARY(name, cfunc, bfloat16, complex128, bool, complex128) \
873
+ \
874
+ CPU_DEVICE_NOIMPL(name, func, float16, uint8, bool, float16) \
875
+ CPU_DEVICE_NOIMPL(name, func, float16, uint16, bool, float32) \
876
+ CPU_DEVICE_NOIMPL(name, func, float16, uint32, bool, float64) \
877
+ CPU_DEVICE_NOIMPL(name, func, float16, int8, bool, float16) \
878
+ CPU_DEVICE_NOIMPL(name, func, float16, int16, bool, float32) \
879
+ CPU_DEVICE_NOIMPL(name, func, float16, int32, bool, float64) \
880
+ CPU_DEVICE_NOIMPL(name, func, float16, bfloat16, bool, float32) \
881
+ CPU_DEVICE_NOIMPL(name, func, float16, float16, bool, float16) \
882
+ CPU_DEVICE_NOIMPL(name, func, float16, float32, bool, float32) \
883
+ CPU_DEVICE_NOIMPL(name, func, float16, float64, bool, float64) \
884
+ CPU_DEVICE_NOIMPL(name, cfunc, float16, complex32, bool, complex32) \
885
+ CPU_DEVICE_NOIMPL(name, cfunc, float16, complex64, bool, complex64) \
886
+ CPU_DEVICE_NOIMPL(name, cfunc, float16, complex128, bool, complex128) \
887
+ \
888
+ CPU_DEVICE_BINARY(name, func, float32, uint8, bool, float32) \
889
+ CPU_DEVICE_BINARY(name, func, float32, uint16, bool, float32) \
890
+ CPU_DEVICE_BINARY(name, func, float32, uint32, bool, float64) \
891
+ CPU_DEVICE_BINARY(name, func, float32, int8, bool, float32) \
892
+ CPU_DEVICE_BINARY(name, func, float32, int16, bool, float32) \
893
+ CPU_DEVICE_BINARY(name, func, float32, int32, bool, float64) \
894
+ CPU_DEVICE_BINARY(name, func, float32, bfloat16, bool, float32) \
895
+ CPU_DEVICE_NOIMPL(name, func, float32, float16, bool, float32) \
896
+ CPU_DEVICE_BINARY(name, func, float32, float32, bool, float32) \
897
+ CPU_DEVICE_BINARY(name, func, float32, float64, bool, float64) \
898
+ CPU_DEVICE_NOIMPL(name, cfunc, float32, complex32, bool, complex64) \
899
+ CPU_DEVICE_BINARYC(name, cfunc, float32, complex64, bool, complex64) \
900
+ CPU_DEVICE_BINARYC(name, cfunc, float32, complex128, bool, complex128) \
901
+ \
902
+ CPU_DEVICE_BINARY(name, func, float64, uint8, bool, float64) \
903
+ CPU_DEVICE_BINARY(name, func, float64, uint16, bool, float64) \
904
+ CPU_DEVICE_BINARY(name, func, float64, uint32, bool, float64) \
905
+ CPU_DEVICE_BINARY(name, func, float64, int8, bool, float64) \
906
+ CPU_DEVICE_BINARY(name, func, float64, int16, bool, float64) \
907
+ CPU_DEVICE_BINARY(name, func, float64, int32, bool, float64) \
908
+ CPU_DEVICE_BINARY(name, func, float64, bfloat16, bool, float64) \
909
+ CPU_DEVICE_NOIMPL(name, func, float64, float16, bool, float64) \
910
+ CPU_DEVICE_BINARY(name, func, float64, float32, bool, float64) \
911
+ CPU_DEVICE_BINARY(name, func, float64, float64, bool, float64) \
912
+ CPU_DEVICE_NOIMPL(name, cfunc, float64, complex32, bool, complex128) \
913
+ CPU_DEVICE_BINARYC(name, cfunc, float64, complex64, bool, complex128) \
914
+ CPU_DEVICE_BINARYC(name, cfunc, float64, complex128, bool, complex128) \
915
+ \
916
+ CPU_DEVICE_NOIMPL(name, cfunc, complex32, uint8, bool, complex32) \
917
+ CPU_DEVICE_NOIMPL(name, cfunc, complex32, uint16, bool, complex64) \
918
+ CPU_DEVICE_NOIMPL(name, cfunc, complex32, uint32, bool, complex128) \
919
+ CPU_DEVICE_NOIMPL(name, cfunc, complex32, int8, bool, complex32) \
920
+ CPU_DEVICE_NOIMPL(name, cfunc, complex32, int16, bool, complex64) \
921
+ CPU_DEVICE_NOIMPL(name, cfunc, complex32, int32, bool, complex128) \
922
+ CPU_DEVICE_NOIMPL(name, cfunc, complex32, bfloat16, bool, complex64) \
923
+ CPU_DEVICE_NOIMPL(name, cfunc, complex32, float16, bool, complex32) \
924
+ CPU_DEVICE_NOIMPL(name, cfunc, complex32, float32, bool, complex64) \
925
+ CPU_DEVICE_NOIMPL(name, cfunc, complex32, float64, bool, complex128) \
926
+ CPU_DEVICE_NOIMPL(name, cfunc, complex32, complex32, bool, complex32) \
927
+ CPU_DEVICE_NOIMPL(name, cfunc, complex32, complex64, bool, complex64) \
928
+ CPU_DEVICE_NOIMPL(name, cfunc, complex32, complex128, bool, complex128) \
929
+ \
930
+ CPU_DEVICE_BINARYC(name, cfunc, complex64, uint8, bool, complex64) \
931
+ CPU_DEVICE_BINARYC(name, cfunc, complex64, uint16, bool, complex64) \
932
+ CPU_DEVICE_BINARYC(name, cfunc, complex64, uint32, bool, complex128) \
933
+ CPU_DEVICE_BINARYC(name, cfunc, complex64, int8, bool, complex64) \
934
+ CPU_DEVICE_BINARYC(name, cfunc, complex64, int16, bool, complex64) \
935
+ CPU_DEVICE_BINARYC(name, cfunc, complex64, int32, bool, complex128) \
936
+ CPU_DEVICE_BINARY(name, cfunc, complex64, bfloat16, bool, complex64) \
937
+ CPU_DEVICE_NOIMPL(name, cfunc, complex64, float16, bool, complex64) \
938
+ CPU_DEVICE_BINARYC(name, cfunc, complex64, float32, bool, complex64) \
939
+ CPU_DEVICE_BINARYC(name, cfunc, complex64, float64, bool, complex128) \
940
+ CPU_DEVICE_NOIMPL(name, cfunc, complex64, complex32, bool, complex64) \
941
+ CPU_DEVICE_BINARYC(name, cfunc, complex64, complex64, bool, complex64) \
942
+ CPU_DEVICE_BINARYC(name, cfunc, complex64, complex128, bool, complex128) \
943
+ \
944
+ CPU_DEVICE_BINARYC(name, cfunc, complex128, uint8, bool, complex128) \
945
+ CPU_DEVICE_BINARYC(name, cfunc, complex128, uint16, bool, complex128) \
946
+ CPU_DEVICE_BINARYC(name, cfunc, complex128, uint32, bool, complex128) \
947
+ CPU_DEVICE_BINARYC(name, cfunc, complex128, int8, bool, complex128) \
948
+ CPU_DEVICE_BINARYC(name, cfunc, complex128, int16, bool, complex128) \
949
+ CPU_DEVICE_BINARYC(name, cfunc, complex128, int32, bool, complex128) \
950
+ CPU_DEVICE_BINARY(name, cfunc, complex128, bfloat16, bool, complex128) \
951
+ CPU_DEVICE_NOIMPL(name, cfunc, complex128, float16, bool, complex128) \
952
+ CPU_DEVICE_BINARYC(name, cfunc, complex128, float32, bool, complex128) \
953
+ CPU_DEVICE_BINARYC(name, cfunc, complex128, float64, bool, complex128) \
954
+ CPU_DEVICE_NOIMPL(name, cfunc, complex128, complex32, bool, complex128) \
955
+ CPU_DEVICE_BINARYC(name, cfunc, complex128, complex64, bool, complex128) \
956
+ CPU_DEVICE_BINARYC(name, cfunc, complex128, complex128, bool, complex128) \
957
+
958
+
959
+ #define less(x, y) x < y
960
+ CPU_DEVICE_ALL_COMPARISON(less, less, less, lexorder_lt)
961
+
962
+ #define less_equal(x, y) x <= y
963
+ CPU_DEVICE_ALL_COMPARISON(less_equal, less_equal, less_equal, lexorder_le)
964
+
965
+ #define greater_equal(x, y) x >= y
966
+ CPU_DEVICE_ALL_COMPARISON(greater_equal, greater_equal, greater_equal, lexorder_ge)
967
+
968
+ #define greater(x, y) x > y
969
+ CPU_DEVICE_ALL_COMPARISON(greater, greater, greater, lexorder_gt)
970
+
971
+ #define equal(x, y) x == y
972
+ CPU_DEVICE_ALL_COMPARISON(equal, equal, equal, equal)
973
+
974
+ #define not_equal(x, y) x != y
975
+ CPU_DEVICE_ALL_COMPARISON(not_equal, not_equal, not_equal, not_equal)
976
+
977
+ #define equaln(x, y) (x == y || (x != x && y != y))
978
+ CPU_DEVICE_ALL_COMPARISON(equaln, equaln, equaln, lexorder_eqn)
979
+
980
+
981
+ /*****************************************************************************/
982
+ /* Bitwise */
983
+ /*****************************************************************************/
984
+
985
/*
 * Bitwise kernel table.
 *
 * CPU_DEVICE_ALL_BITWISE(name, func) expands to one CPU_DEVICE_BINARY
 * instantiation per supported pair of bool/integer operand types.  The
 * table is split into one helper macro per left-operand type; the helpers
 * are expanded in the order bool, uint8, uint16, uint32, uint64, int8,
 * int16, int32, int64.
 *
 * Each row reads (name, func, left type, right type, T, T).
 * NOTE(review): CPU_DEVICE_BINARY is defined outside this view; the last
 * two columns are presumably the output type and the type both operands
 * are converted to -- confirm against its definition.
 *
 * What the rows show:
 *   - bool paired with T yields T;
 *   - two unsigned (or two signed) types yield the wider of the two;
 *   - unsigned mixed with signed yields a signed type wide enough for
 *     both (uint8/int8 -> int16, uint16/int16 -> int32,
 *     uint32/int32 -> int64);
 *   - uint64 is never paired with a signed type.
 */

/* Left operand: bool. */
#define CPU_DEVICE_BITWISE_LHS_BOOL(name, func)                     \
    CPU_DEVICE_BINARY(name, func, bool, bool,   bool,   bool)       \
    CPU_DEVICE_BINARY(name, func, bool, uint8,  uint8,  uint8)      \
    CPU_DEVICE_BINARY(name, func, bool, uint16, uint16, uint16)     \
    CPU_DEVICE_BINARY(name, func, bool, uint32, uint32, uint32)     \
    CPU_DEVICE_BINARY(name, func, bool, uint64, uint64, uint64)     \
    CPU_DEVICE_BINARY(name, func, bool, int8,   int8,   int8)       \
    CPU_DEVICE_BINARY(name, func, bool, int16,  int16,  int16)      \
    CPU_DEVICE_BINARY(name, func, bool, int32,  int32,  int32)      \
    CPU_DEVICE_BINARY(name, func, bool, int64,  int64,  int64)

/* Left operand: uint8. */
#define CPU_DEVICE_BITWISE_LHS_UINT8(name, func)                    \
    CPU_DEVICE_BINARY(name, func, uint8, bool,   uint8,  uint8)     \
    CPU_DEVICE_BINARY(name, func, uint8, uint8,  uint8,  uint8)     \
    CPU_DEVICE_BINARY(name, func, uint8, uint16, uint16, uint16)    \
    CPU_DEVICE_BINARY(name, func, uint8, uint32, uint32, uint32)    \
    CPU_DEVICE_BINARY(name, func, uint8, uint64, uint64, uint64)    \
    CPU_DEVICE_BINARY(name, func, uint8, int8,   int16,  int16)     \
    CPU_DEVICE_BINARY(name, func, uint8, int16,  int16,  int16)     \
    CPU_DEVICE_BINARY(name, func, uint8, int32,  int32,  int32)     \
    CPU_DEVICE_BINARY(name, func, uint8, int64,  int64,  int64)

/* Left operand: uint16. */
#define CPU_DEVICE_BITWISE_LHS_UINT16(name, func)                   \
    CPU_DEVICE_BINARY(name, func, uint16, bool,   uint16, uint16)   \
    CPU_DEVICE_BINARY(name, func, uint16, uint8,  uint16, uint16)   \
    CPU_DEVICE_BINARY(name, func, uint16, uint16, uint16, uint16)   \
    CPU_DEVICE_BINARY(name, func, uint16, uint32, uint32, uint32)   \
    CPU_DEVICE_BINARY(name, func, uint16, uint64, uint64, uint64)   \
    CPU_DEVICE_BINARY(name, func, uint16, int8,   int32,  int32)    \
    CPU_DEVICE_BINARY(name, func, uint16, int16,  int32,  int32)    \
    CPU_DEVICE_BINARY(name, func, uint16, int32,  int32,  int32)    \
    CPU_DEVICE_BINARY(name, func, uint16, int64,  int64,  int64)

/* Left operand: uint32. */
#define CPU_DEVICE_BITWISE_LHS_UINT32(name, func)                   \
    CPU_DEVICE_BINARY(name, func, uint32, bool,   uint32, uint32)   \
    CPU_DEVICE_BINARY(name, func, uint32, uint8,  uint32, uint32)   \
    CPU_DEVICE_BINARY(name, func, uint32, uint16, uint32, uint32)   \
    CPU_DEVICE_BINARY(name, func, uint32, uint32, uint32, uint32)   \
    CPU_DEVICE_BINARY(name, func, uint32, uint64, uint64, uint64)   \
    CPU_DEVICE_BINARY(name, func, uint32, int8,   int64,  int64)    \
    CPU_DEVICE_BINARY(name, func, uint32, int16,  int64,  int64)    \
    CPU_DEVICE_BINARY(name, func, uint32, int32,  int64,  int64)    \
    CPU_DEVICE_BINARY(name, func, uint32, int64,  int64,  int64)

/* Left operand: uint64 (bool and unsigned right operands only). */
#define CPU_DEVICE_BITWISE_LHS_UINT64(name, func)                   \
    CPU_DEVICE_BINARY(name, func, uint64, bool,   uint64, uint64)   \
    CPU_DEVICE_BINARY(name, func, uint64, uint8,  uint64, uint64)   \
    CPU_DEVICE_BINARY(name, func, uint64, uint16, uint64, uint64)   \
    CPU_DEVICE_BINARY(name, func, uint64, uint32, uint64, uint64)   \
    CPU_DEVICE_BINARY(name, func, uint64, uint64, uint64, uint64)

/* Left operand: int8 (no uint64 right operand). */
#define CPU_DEVICE_BITWISE_LHS_INT8(name, func)                     \
    CPU_DEVICE_BINARY(name, func, int8, bool,   int8,  int8)        \
    CPU_DEVICE_BINARY(name, func, int8, uint8,  int16, int16)       \
    CPU_DEVICE_BINARY(name, func, int8, uint16, int32, int32)       \
    CPU_DEVICE_BINARY(name, func, int8, uint32, int64, int64)       \
    CPU_DEVICE_BINARY(name, func, int8, int8,   int8,  int8)        \
    CPU_DEVICE_BINARY(name, func, int8, int16,  int16, int16)       \
    CPU_DEVICE_BINARY(name, func, int8, int32,  int32, int32)       \
    CPU_DEVICE_BINARY(name, func, int8, int64,  int64, int64)

/* Left operand: int16 (no uint64 right operand). */
#define CPU_DEVICE_BITWISE_LHS_INT16(name, func)                    \
    CPU_DEVICE_BINARY(name, func, int16, bool,   int16, int16)      \
    CPU_DEVICE_BINARY(name, func, int16, uint8,  int16, int16)      \
    CPU_DEVICE_BINARY(name, func, int16, uint16, int32, int32)      \
    CPU_DEVICE_BINARY(name, func, int16, uint32, int64, int64)      \
    CPU_DEVICE_BINARY(name, func, int16, int8,   int16, int16)      \
    CPU_DEVICE_BINARY(name, func, int16, int16,  int16, int16)      \
    CPU_DEVICE_BINARY(name, func, int16, int32,  int32, int32)      \
    CPU_DEVICE_BINARY(name, func, int16, int64,  int64, int64)

/* Left operand: int32 (no uint64 right operand). */
#define CPU_DEVICE_BITWISE_LHS_INT32(name, func)                    \
    CPU_DEVICE_BINARY(name, func, int32, bool,   int32, int32)      \
    CPU_DEVICE_BINARY(name, func, int32, uint8,  int32, int32)      \
    CPU_DEVICE_BINARY(name, func, int32, uint16, int32, int32)      \
    CPU_DEVICE_BINARY(name, func, int32, uint32, int64, int64)      \
    CPU_DEVICE_BINARY(name, func, int32, int8,   int32, int32)      \
    CPU_DEVICE_BINARY(name, func, int32, int16,  int32, int32)      \
    CPU_DEVICE_BINARY(name, func, int32, int32,  int32, int32)      \
    CPU_DEVICE_BINARY(name, func, int32, int64,  int64, int64)

/* Left operand: int64 (no uint64 right operand). */
#define CPU_DEVICE_BITWISE_LHS_INT64(name, func)                    \
    CPU_DEVICE_BINARY(name, func, int64, bool,   int64, int64)      \
    CPU_DEVICE_BINARY(name, func, int64, uint8,  int64, int64)      \
    CPU_DEVICE_BINARY(name, func, int64, uint16, int64, int64)      \
    CPU_DEVICE_BINARY(name, func, int64, uint32, int64, int64)      \
    CPU_DEVICE_BINARY(name, func, int64, int8,   int64, int64)      \
    CPU_DEVICE_BINARY(name, func, int64, int16,  int64, int64)      \
    CPU_DEVICE_BINARY(name, func, int64, int32,  int64, int64)      \
    CPU_DEVICE_BINARY(name, func, int64, int64,  int64, int64)

/* The full table: every left-operand row, in the order listed above. */
#define CPU_DEVICE_ALL_BITWISE(name, func)          \
    CPU_DEVICE_BITWISE_LHS_BOOL(name, func)         \
    CPU_DEVICE_BITWISE_LHS_UINT8(name, func)        \
    CPU_DEVICE_BITWISE_LHS_UINT16(name, func)       \
    CPU_DEVICE_BITWISE_LHS_UINT32(name, func)       \
    CPU_DEVICE_BITWISE_LHS_UINT64(name, func)       \
    CPU_DEVICE_BITWISE_LHS_INT8(name, func)         \
    CPU_DEVICE_BITWISE_LHS_INT16(name, func)        \
    CPU_DEVICE_BITWISE_LHS_INT32(name, func)        \
    CPU_DEVICE_BITWISE_LHS_INT64(name, func)
1067
+
1068
+ #define bitwise_and(x, y) x & y
1069
+ CPU_DEVICE_ALL_BITWISE(bitwise_and, bitwise_and)
1070
+
1071
+ #define bitwise_or(x, y) x | y
1072
+ CPU_DEVICE_ALL_BITWISE(bitwise_or, bitwise_or)
1073
+
1074
+ #define bitwise_xor(x, y) x ^ y
1075
+ CPU_DEVICE_ALL_BITWISE(bitwise_xor, bitwise_xor)
1076
+
1077
+
1078
+ /*****************************************************************************/
1079
+ /* Two return values */
1080
+ /*****************************************************************************/
1081
+
1082
/*
 * CPU_DEVICE_BINARY_MV(name, func, t0, t1, t2, t3)
 *
 * Generates two extern "C" kernels for an operation that takes two inputs
 * and produces two outputs:
 *
 *   gm_cpu_device_fixed_1D_C_<name>_<t0>_<t1>_<t2>_<t3>(a0, a1, a2, a3, N)
 *       Reinterprets a0/a1 as arrays of t0_t/t1_t and a2/a3 as arrays of
 *       t2_t/t3_t, then for each index in [0, N) calls
 *       func(&out0[k], &out1[k], in0[k], in1[k]).
 *
 *   gm_cpu_device_0D_<name>_<t0>_<t1>_<t2>_<t3>(a0, a1, a2, a3)
 *       Reads a single t0_t from a0 and a single t1_t from a1 and calls
 *       func once with the two output pointers and the two values.
 *
 * func receives the output locations first and the input values last.
 */
#define CPU_DEVICE_BINARY_MV(name, func, t0, t1, t2, t3)                      \
extern "C" void                                                               \
gm_cpu_device_fixed_1D_C_##name##_##t0##_##t1##_##t2##_##t3(                  \
    const char *a0, const char *a1, char *a2, char *a3, int64_t N)            \
{                                                                             \
    const t0##_t *in0 = (const t0##_t *)a0;                                   \
    const t1##_t *in1 = (const t1##_t *)a1;                                   \
    t2##_t *out0 = (t2##_t *)a2;                                              \
    t3##_t *out1 = (t3##_t *)a3;                                              \
                                                                              \
    for (int64_t k = 0; k < N; k++) {                                         \
        func(&out0[k], &out1[k], in0[k], in1[k]);                             \
    }                                                                         \
}                                                                             \
                                                                              \
extern "C" void                                                               \
gm_cpu_device_0D_##name##_##t0##_##t1##_##t2##_##t3(                          \
    const char *a0, const char *a1, char *a2, char *a3)                       \
{                                                                             \
    const t0##_t lhs = *(const t0##_t *)a0;                                   \
    const t1##_t rhs = *(const t1##_t *)a1;                                   \
    t2##_t *out0 = (t2##_t *)a2;                                              \
    t3##_t *out1 = (t3##_t *)a3;                                              \
                                                                              \
    func(out0, out1, lhs, rhs);                                               \
}
1109
+
1110
+ #define CPU_DEVICE_ALL_BINARY_MV(name, func) \
1111
+ CPU_DEVICE_BINARY_MV(name, func, uint8, uint8, uint8, uint8) \
1112
+ CPU_DEVICE_BINARY_MV(name, func, uint16, uint16, uint16, uint16) \
1113
+ CPU_DEVICE_BINARY_MV(name, func, uint32, uint32, uint32, uint32) \
1114
+ CPU_DEVICE_BINARY_MV(name, func, uint64, uint64, uint64, uint64) \
1115
+ CPU_DEVICE_BINARY_MV(name, func, int8, int8, int8, int8) \
1116
+ CPU_DEVICE_BINARY_MV(name, func, int16, int16, int16, int16) \
1117
+ CPU_DEVICE_BINARY_MV(name, func, int32, int32, int32, int32) \
1118
+ CPU_DEVICE_BINARY_MV(name, func, int64, int64, int64, int64) \
1119
+ CPU_DEVICE_BINARY_MV(name, func, bfloat16, bfloat16, bfloat16, bfloat16) \
1120
+ CPU_DEVICE_BINARY_MV(name, func, float32, float32, float32, float32) \
1121
+ CPU_DEVICE_BINARY_MV(name, func, float64, float64, float64, float64)
1122
+
1123
+ CPU_DEVICE_ALL_BINARY_MV(divmod, _divmod)