gumath 0.2.0dev5 → 0.2.0dev8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (99) hide show
  1. checksums.yaml +4 -4
  2. data/CONTRIBUTING.md +7 -2
  3. data/Gemfile +0 -3
  4. data/ext/ruby_gumath/GPATH +0 -0
  5. data/ext/ruby_gumath/GRTAGS +0 -0
  6. data/ext/ruby_gumath/GTAGS +0 -0
  7. data/ext/ruby_gumath/extconf.rb +0 -5
  8. data/ext/ruby_gumath/functions.c +10 -2
  9. data/ext/ruby_gumath/gufunc_object.c +15 -4
  10. data/ext/ruby_gumath/gufunc_object.h +9 -3
  11. data/ext/ruby_gumath/gumath/Makefile +63 -0
  12. data/ext/ruby_gumath/gumath/Makefile.in +1 -0
  13. data/ext/ruby_gumath/gumath/config.h +56 -0
  14. data/ext/ruby_gumath/gumath/config.h.in +3 -0
  15. data/ext/ruby_gumath/gumath/config.log +497 -0
  16. data/ext/ruby_gumath/gumath/config.status +1034 -0
  17. data/ext/ruby_gumath/gumath/configure +375 -4
  18. data/ext/ruby_gumath/gumath/configure.ac +47 -3
  19. data/ext/ruby_gumath/gumath/libgumath/Makefile +236 -0
  20. data/ext/ruby_gumath/gumath/libgumath/Makefile.in +90 -24
  21. data/ext/ruby_gumath/gumath/libgumath/Makefile.vc +54 -15
  22. data/ext/ruby_gumath/gumath/libgumath/apply.c +92 -28
  23. data/ext/ruby_gumath/gumath/libgumath/apply.o +0 -0
  24. data/ext/ruby_gumath/gumath/libgumath/common.o +0 -0
  25. data/ext/ruby_gumath/gumath/libgumath/cpu_device_binary.o +0 -0
  26. data/ext/ruby_gumath/gumath/libgumath/cpu_device_unary.o +0 -0
  27. data/ext/ruby_gumath/gumath/libgumath/cpu_host_binary.o +0 -0
  28. data/ext/ruby_gumath/gumath/libgumath/cpu_host_unary.o +0 -0
  29. data/ext/ruby_gumath/gumath/libgumath/examples.o +0 -0
  30. data/ext/ruby_gumath/gumath/libgumath/extending/graph.c +27 -20
  31. data/ext/ruby_gumath/gumath/libgumath/extending/pdist.c +1 -1
  32. data/ext/ruby_gumath/gumath/libgumath/func.c +13 -9
  33. data/ext/ruby_gumath/gumath/libgumath/func.o +0 -0
  34. data/ext/ruby_gumath/gumath/libgumath/graph.o +0 -0
  35. data/ext/ruby_gumath/gumath/libgumath/gumath.h +55 -14
  36. data/ext/ruby_gumath/gumath/libgumath/kernels/common.c +513 -0
  37. data/ext/ruby_gumath/gumath/libgumath/kernels/common.h +155 -0
  38. data/ext/ruby_gumath/gumath/libgumath/kernels/contrib/bfloat16.h +520 -0
  39. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.cc +1123 -0
  40. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.h +1062 -0
  41. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_msvc.cc +555 -0
  42. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.cc +368 -0
  43. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.h +335 -0
  44. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_binary.c +2952 -0
  45. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_unary.c +1100 -0
  46. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.cu +1143 -0
  47. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.h +1061 -0
  48. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.cu +528 -0
  49. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.h +463 -0
  50. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_binary.c +2817 -0
  51. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_unary.c +1331 -0
  52. data/ext/ruby_gumath/gumath/libgumath/kernels/device.hh +614 -0
  53. data/ext/ruby_gumath/gumath/libgumath/libgumath.a +0 -0
  54. data/ext/ruby_gumath/gumath/libgumath/libgumath.so +1 -0
  55. data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0 +1 -0
  56. data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0.2.0dev3 +0 -0
  57. data/ext/ruby_gumath/gumath/libgumath/nploops.o +0 -0
  58. data/ext/ruby_gumath/gumath/libgumath/pdist.o +0 -0
  59. data/ext/ruby_gumath/gumath/libgumath/quaternion.o +0 -0
  60. data/ext/ruby_gumath/gumath/libgumath/tbl.o +0 -0
  61. data/ext/ruby_gumath/gumath/libgumath/thread.c +17 -4
  62. data/ext/ruby_gumath/gumath/libgumath/thread.o +0 -0
  63. data/ext/ruby_gumath/gumath/libgumath/xndloops.c +110 -0
  64. data/ext/ruby_gumath/gumath/libgumath/xndloops.o +0 -0
  65. data/ext/ruby_gumath/gumath/python/gumath/__init__.py +150 -0
  66. data/ext/ruby_gumath/gumath/python/gumath/_gumath.c +446 -80
  67. data/ext/ruby_gumath/gumath/python/gumath/cuda.c +78 -0
  68. data/ext/ruby_gumath/gumath/python/gumath/examples.c +0 -5
  69. data/ext/ruby_gumath/gumath/python/gumath/functions.c +2 -2
  70. data/ext/ruby_gumath/gumath/python/gumath/gumath.h +246 -0
  71. data/ext/ruby_gumath/gumath/python/gumath/libgumath.a +0 -0
  72. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so +1 -0
  73. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0 +1 -0
  74. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0.2.0dev3 +0 -0
  75. data/ext/ruby_gumath/gumath/python/gumath/pygumath.h +31 -2
  76. data/ext/ruby_gumath/gumath/python/gumath_aux.py +767 -0
  77. data/ext/ruby_gumath/gumath/python/randdec.py +535 -0
  78. data/ext/ruby_gumath/gumath/python/randfloat.py +177 -0
  79. data/ext/ruby_gumath/gumath/python/test_gumath.py +1504 -24
  80. data/ext/ruby_gumath/gumath/python/test_xndarray.py +462 -0
  81. data/ext/ruby_gumath/gumath/setup.py +67 -6
  82. data/ext/ruby_gumath/gumath/tools/detect_cuda_arch.cc +35 -0
  83. data/ext/ruby_gumath/include/gumath.h +55 -14
  84. data/ext/ruby_gumath/include/ruby_gumath.h +4 -1
  85. data/ext/ruby_gumath/lib/libgumath.a +0 -0
  86. data/ext/ruby_gumath/lib/libgumath.so.0.2.0dev3 +0 -0
  87. data/ext/ruby_gumath/ruby_gumath.c +231 -70
  88. data/ext/ruby_gumath/ruby_gumath.h +4 -1
  89. data/ext/ruby_gumath/ruby_gumath_internal.h +25 -0
  90. data/ext/ruby_gumath/util.c +34 -0
  91. data/ext/ruby_gumath/util.h +9 -0
  92. data/gumath.gemspec +3 -2
  93. data/lib/gumath.rb +55 -1
  94. data/lib/gumath/version.rb +2 -2
  95. data/lib/ruby_gumath.so +0 -0
  96. metadata +63 -10
  97. data/ext/ruby_gumath/gumath/libgumath/extending/bfloat16.c +0 -130
  98. data/ext/ruby_gumath/gumath/libgumath/kernels/binary.c +0 -547
  99. data/ext/ruby_gumath/gumath/libgumath/kernels/unary.c +0 -449
@@ -0,0 +1,2817 @@
1
+ /*
2
+ * BSD 3-Clause License
3
+ *
4
+ * Copyright (c) 2017-2018, plures
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ *
10
+ * 1. Redistributions of source code must retain the above copyright notice,
11
+ * this list of conditions and the following disclaimer.
12
+ *
13
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
14
+ * this list of conditions and the following disclaimer in the documentation
15
+ * and/or other materials provided with the distribution.
16
+ *
17
+ * 3. Neither the name of the copyright holder nor the names of its
18
+ * contributors may be used to endorse or promote products derived from
19
+ * this software without specific prior written permission.
20
+ *
21
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ */
32
+
33
+
34
+ #include <stdlib.h>
35
+ #include <stdint.h>
36
+ #include <string.h>
37
+ #include "ndtypes.h"
38
+ #include "xnd.h"
39
+ #include "gumath.h"
40
+ #include "common.h"
41
+ #include "cuda_device_binary.h"
42
+
43
+
44
+ /****************************************************************************/
45
+ /* Optimized dispatch (exact casting) */
46
+ /****************************************************************************/
47
+
48
+ /* Structured kernel locations for fast lookup: maps the dtype tags of in0 and in1 to a fixed location (multiples of 4 from 0 to 752); any pair not listed sets NDT_ValueError "invalid dtype" on ctx and yields -1. NOTE(review): presumably an offset into a kernel table with four entries per type pair -- confirm against the table, which is not visible here. */
49
+ static int
50
+ binary_kernel_location(const ndt_t *in0, const ndt_t *in1, ndt_context_t *ctx)
51
+ {
52
+ const ndt_t *t0 = ndt_dtype(in0);
53
+ const ndt_t *t1 = ndt_dtype(in1);
54
+
55
+ switch (t0->tag) { /* outer switch: first operand; inner switches: second operand */
56
+ case Uint8: {
57
+ switch (t1->tag) {
58
+ case Uint8: return 0;
59
+ case Uint16: return 4;
60
+ case Uint32: return 8;
61
+ case Uint64: return 12;
62
+
63
+ case Int8: return 16;
64
+ case Int16: return 20;
65
+ case Int32: return 24;
66
+ case Int64: return 28;
67
+
68
+ case BFloat16: return 32;
69
+ case Float16: return 36;
70
+ case Float32: return 40;
71
+ case Float64: return 44;
72
+
73
+ case Complex32: return 48;
74
+ case Complex64: return 52;
75
+ case Complex128: return 56;
76
+
77
+ default: goto invalid_combination;
78
+ }
79
+ }
80
+ case Uint16: {
81
+ switch (t1->tag) {
82
+ case Uint8: return 60;
83
+ case Uint16: return 64;
84
+ case Uint32: return 68;
85
+ case Uint64: return 72;
86
+
87
+ case Int8: return 76;
88
+ case Int16: return 80;
89
+ case Int32: return 84;
90
+ case Int64: return 88;
91
+
92
+ case BFloat16: return 92;
93
+ case Float16: return 96;
94
+ case Float32: return 100;
95
+ case Float64: return 104;
96
+
97
+ case Complex32: return 108;
98
+ case Complex64: return 112;
99
+ case Complex128: return 116;
100
+
101
+ default: goto invalid_combination;
102
+ }
103
+ }
104
+ case Uint32: {
105
+ switch (t1->tag) {
106
+ case Uint8: return 120;
107
+ case Uint16: return 124;
108
+ case Uint32: return 128;
109
+ case Uint64: return 132;
110
+
111
+ case Int8: return 136;
112
+ case Int16: return 140;
113
+ case Int32: return 144;
114
+ case Int64: return 148;
115
+
116
+ case BFloat16: return 152;
117
+ case Float16: return 156;
118
+ case Float32: return 160;
119
+ case Float64: return 164;
120
+
121
+ case Complex32: return 168;
122
+ case Complex64: return 172;
123
+ case Complex128: return 176;
124
+
125
+ default: goto invalid_combination;
126
+ }
127
+ }
128
+ case Uint64: { /* only unsigned partners are accepted for uint64 */
129
+ switch (t1->tag) {
130
+ case Uint8: return 180;
131
+ case Uint16: return 184;
132
+ case Uint32: return 188;
133
+ case Uint64: return 192;
134
+
135
+ default: goto invalid_combination;
136
+ }
137
+ }
138
+
139
+ case Int8: {
140
+ switch (t1->tag) {
141
+ case Uint8: return 196;
142
+ case Uint16: return 200;
143
+ case Uint32: return 204;
144
+
145
+ case Int8: return 208;
146
+ case Int16: return 212;
147
+ case Int32: return 216;
148
+ case Int64: return 220;
149
+
150
+ case BFloat16: return 224;
151
+ case Float16: return 228;
152
+ case Float32: return 232;
153
+ case Float64: return 236;
154
+
155
+ case Complex32: return 240;
156
+ case Complex64: return 244;
157
+ case Complex128: return 248;
158
+
159
+ default: goto invalid_combination;
160
+ }
161
+ }
162
+ case Int16: {
163
+ switch (t1->tag) {
164
+ case Uint8: return 252;
165
+ case Uint16: return 256;
166
+ case Uint32: return 260;
167
+
168
+ case Int8: return 264;
169
+ case Int16: return 268;
170
+ case Int32: return 272;
171
+ case Int64: return 276;
172
+
173
+ case BFloat16: return 280;
174
+ case Float16: return 284;
175
+ case Float32: return 288;
176
+ case Float64: return 292;
177
+
178
+ case Complex32: return 296;
179
+ case Complex64: return 300;
180
+ case Complex128: return 304;
181
+
182
+ default: goto invalid_combination;
183
+ }
184
+ }
185
+ case Int32: {
186
+ switch (t1->tag) {
187
+ case Uint8: return 308;
188
+ case Uint16: return 312;
189
+ case Uint32: return 316;
190
+
191
+ case Int8: return 320;
192
+ case Int16: return 324;
193
+ case Int32: return 328;
194
+ case Int64: return 332;
195
+
196
+ case BFloat16: return 336;
197
+ case Float16: return 340;
198
+ case Float32: return 344;
199
+ case Float64: return 348;
200
+
201
+ case Complex32: return 352;
202
+ case Complex64: return 356;
203
+ case Complex128: return 360;
204
+
205
+ default: goto invalid_combination;
206
+ }
207
+ }
208
+ case Int64: { /* uint64 and every float/complex partner are rejected for int64 */
209
+ switch (t1->tag) {
210
+ case Uint8: return 364;
211
+ case Uint16: return 368;
212
+ case Uint32: return 372;
213
+
214
+ case Int8: return 376;
215
+ case Int16: return 380;
216
+ case Int32: return 384;
217
+ case Int64: return 388;
218
+
219
+ default: goto invalid_combination;
220
+ }
221
+ }
222
+
223
+ case BFloat16: { /* float and complex first operands list no 64-bit integer partner */
224
+ switch (t1->tag) {
225
+ case Uint8: return 392;
226
+ case Uint16: return 396;
227
+ case Uint32: return 400;
228
+
229
+ case Int8: return 404;
230
+ case Int16: return 408;
231
+ case Int32: return 412;
232
+
233
+ case BFloat16: return 416;
234
+ case Float16: return 420;
235
+ case Float32: return 424;
236
+ case Float64: return 428;
237
+
238
+ case Complex32: return 432;
239
+ case Complex64: return 436;
240
+ case Complex128: return 440;
241
+
242
+ default: goto invalid_combination;
243
+ }
244
+ }
245
+
246
+ case Float16: {
247
+ switch (t1->tag) {
248
+ case Uint8: return 444;
249
+ case Uint16: return 448;
250
+ case Uint32: return 452;
251
+
252
+ case Int8: return 456;
253
+ case Int16: return 460;
254
+ case Int32: return 464;
255
+
256
+ case BFloat16: return 468;
257
+ case Float16: return 472;
258
+ case Float32: return 476;
259
+ case Float64: return 480;
260
+
261
+ case Complex32: return 484;
262
+ case Complex64: return 488;
263
+ case Complex128: return 492;
264
+
265
+ default: goto invalid_combination;
266
+ }
267
+ }
268
+
269
+ case Float32: {
270
+ switch (t1->tag) {
271
+ case Uint8: return 496;
272
+ case Uint16: return 500;
273
+ case Uint32: return 504;
274
+
275
+ case Int8: return 508;
276
+ case Int16: return 512;
277
+ case Int32: return 516;
278
+
279
+ case BFloat16: return 520;
280
+ case Float16: return 524;
281
+ case Float32: return 528;
282
+ case Float64: return 532;
283
+
284
+ case Complex32: return 536;
285
+ case Complex64: return 540;
286
+ case Complex128: return 544;
287
+
288
+ default: goto invalid_combination;
289
+ }
290
+ }
291
+
292
+ case Float64: {
293
+ switch (t1->tag) {
294
+ case Uint8: return 548;
295
+ case Uint16: return 552;
296
+ case Uint32: return 556;
297
+
298
+ case Int8: return 560;
299
+ case Int16: return 564;
300
+ case Int32: return 568;
301
+
302
+ case BFloat16: return 572;
303
+ case Float16: return 576;
304
+ case Float32: return 580;
305
+ case Float64: return 584;
306
+
307
+ case Complex32: return 588;
308
+ case Complex64: return 592;
309
+ case Complex128: return 596;
310
+
311
+ default: goto invalid_combination;
312
+ }
313
+ }
314
+
315
+ case Complex32: {
316
+ switch (t1->tag) {
317
+ case Uint8: return 600;
318
+ case Uint16: return 604;
319
+ case Uint32: return 608;
320
+
321
+ case Int8: return 612;
322
+ case Int16: return 616;
323
+ case Int32: return 620;
324
+
325
+ case BFloat16: return 624;
326
+ case Float16: return 628;
327
+ case Float32: return 632;
328
+ case Float64: return 636;
329
+
330
+ case Complex32: return 640;
331
+ case Complex64: return 644;
332
+ case Complex128: return 648;
333
+
334
+ default: goto invalid_combination;
335
+ }
336
+ }
337
+
338
+ case Complex64: {
339
+ switch (t1->tag) {
340
+ case Uint8: return 652;
341
+ case Uint16: return 656;
342
+ case Uint32: return 660;
343
+
344
+ case Int8: return 664;
345
+ case Int16: return 668;
346
+ case Int32: return 672;
347
+
348
+ case BFloat16: return 676;
349
+ case Float16: return 680;
350
+ case Float32: return 684;
351
+ case Float64: return 688;
352
+
353
+ case Complex32: return 692;
354
+ case Complex64: return 696;
355
+ case Complex128: return 700;
356
+
357
+ default: goto invalid_combination;
358
+ }
359
+ }
360
+
361
+ case Complex128: {
362
+ switch (t1->tag) {
363
+ case Uint8: return 704;
364
+ case Uint16: return 708;
365
+ case Uint32: return 712;
366
+
367
+ case Int8: return 716;
368
+ case Int16: return 720;
369
+ case Int32: return 724;
370
+
371
+ case BFloat16: return 728;
372
+ case Float16: return 732;
373
+ case Float32: return 736;
374
+ case Float64: return 740;
375
+
376
+ case Complex32: return 744;
377
+ case Complex64: return 748;
378
+ case Complex128: return 752;
379
+
380
+ default: goto invalid_combination;
381
+ }
382
+ }
383
+
384
+ default:
385
+ goto invalid_combination;
386
+ }
387
+
388
+ invalid_combination: /* shared error exit for every unsupported dtype pair */
389
+ ndt_err_format(ctx, NDT_ValueError, "invalid dtype");
390
+ return -1;
391
+ }
392
+ /* Kernel location for the bitwise functions: like binary_kernel_location, but only Bool and integer dtypes are accepted (locations 0 to 288 in steps of 4); any other pair sets NDT_ValueError "invalid dtype" on ctx and yields -1. */
393
+ static int
394
+ bitwise_kernel_location(const ndt_t *in0, const ndt_t *in1, ndt_context_t *ctx)
395
+ {
396
+ const ndt_t *t0 = ndt_dtype(in0);
397
+ const ndt_t *t1 = ndt_dtype(in1);
398
+
399
+ switch (t0->tag) { /* outer switch: first operand; inner switches: second operand */
400
+ case Bool: {
401
+ switch (t1->tag) {
402
+ case Bool: return 0;
403
+
404
+ case Uint8: return 4;
405
+ case Uint16: return 8;
406
+ case Uint32: return 12;
407
+ case Uint64: return 16;
408
+
409
+ case Int8: return 20;
410
+ case Int16: return 24;
411
+ case Int32: return 28;
412
+ case Int64: return 32;
413
+
414
+ default: goto invalid_combination;
415
+ }
416
+ }
417
+
418
+ case Uint8: {
419
+ switch (t1->tag) {
420
+ case Bool: return 36;
421
+
422
+ case Uint8: return 40;
423
+ case Uint16: return 44;
424
+ case Uint32: return 48;
425
+ case Uint64: return 52;
426
+
427
+ case Int8: return 56;
428
+ case Int16: return 60;
429
+ case Int32: return 64;
430
+ case Int64: return 68;
431
+
432
+ default: goto invalid_combination;
433
+ }
434
+ }
435
+ case Uint16: { /* NOTE(review): signed partners are numbered before unsigned ones here, unlike the sibling cases -- confirm the locations match the kernel table order */
436
+ switch (t1->tag) {
437
+ case Bool: return 72;
438
+
439
+ case Int8: return 76;
440
+ case Int16: return 80;
441
+ case Int32: return 84;
442
+ case Int64: return 88;
443
+
444
+ case Uint8: return 92;
445
+ case Uint16: return 96;
446
+ case Uint32: return 100;
447
+ case Uint64: return 104;
448
+
449
+ default: goto invalid_combination;
450
+ }
451
+ }
452
+ case Uint32: {
453
+ switch (t1->tag) {
454
+ case Bool: return 108;
455
+
456
+ case Uint8: return 112;
457
+ case Uint16: return 116;
458
+ case Uint32: return 120;
459
+ case Uint64: return 124;
460
+
461
+ case Int8: return 128;
462
+ case Int16: return 132;
463
+ case Int32: return 136;
464
+ case Int64: return 140;
465
+
466
+ default: goto invalid_combination;
467
+ }
468
+ }
469
+ case Uint64: { /* signed partners are rejected for uint64 */
470
+ switch (t1->tag) {
471
+ case Bool: return 144;
472
+
473
+ case Uint8: return 148;
474
+ case Uint16: return 152;
475
+ case Uint32: return 156;
476
+ case Uint64: return 160;
477
+
478
+ default: goto invalid_combination;
479
+ }
480
+ }
481
+
482
+ case Int8: { /* signed first operands list no uint64 partner */
483
+ switch (t1->tag) {
484
+ case Bool: return 164;
485
+
486
+ case Uint8: return 168;
487
+ case Uint16: return 172;
488
+ case Uint32: return 176;
489
+
490
+ case Int8: return 180;
491
+ case Int16: return 184;
492
+ case Int32: return 188;
493
+ case Int64: return 192;
494
+
495
+ default: goto invalid_combination;
496
+ }
497
+ }
498
+ case Int16: {
499
+ switch (t1->tag) {
500
+ case Bool: return 196;
501
+
502
+ case Uint8: return 200;
503
+ case Uint16: return 204;
504
+ case Uint32: return 208;
505
+
506
+ case Int8: return 212;
507
+ case Int16: return 216;
508
+ case Int32: return 220;
509
+ case Int64: return 224;
510
+
511
+ default: goto invalid_combination;
512
+ }
513
+ }
514
+ case Int32: {
515
+ switch (t1->tag) {
516
+ case Bool: return 228;
517
+
518
+ case Uint8: return 232;
519
+ case Uint16: return 236;
520
+ case Uint32: return 240;
521
+
522
+ case Int8: return 244;
523
+ case Int16: return 248;
524
+ case Int32: return 252;
525
+ case Int64: return 256;
526
+
527
+ default: goto invalid_combination;
528
+ }
529
+ }
530
+
531
+ case Int64: {
532
+ switch (t1->tag) {
533
+ case Bool: return 260;
534
+
535
+ case Uint8: return 264;
536
+ case Uint16: return 268;
537
+ case Uint32: return 272;
538
+
539
+ case Int8: return 276;
540
+ case Int16: return 280;
541
+ case Int32: return 284;
542
+ case Int64: return 288;
543
+
544
+ default: goto invalid_combination;
545
+ }
546
+ }
547
+
548
+ default:
549
+ goto invalid_combination;
550
+ }
551
+
552
+ invalid_combination: /* shared error exit for every unsupported dtype pair */
553
+ ndt_err_format(ctx, NDT_ValueError, "invalid dtype");
554
+ return -1;
555
+ }
556
+
557
+ /* Defines check_power_exp_<t1>(): reads one t1##_t value at a1; a negative value sets NDT_ValueError on ctx and returns -1, anything else returns 0. */
558
+ #define CUDA_CHECK_POWER_EXP(t1) \
559
+ static inline int \
560
+ check_power_exp_##t1(const char *a1, ndt_context_t *ctx) \
561
+ { \
562
+ const t1##_t exp = *(const t1##_t *)a1; \
563
+ if (exp < 0) { \
564
+ ndt_err_format(ctx, NDT_ValueError, \
565
+ "negative exponents are not allowed for integer powers"); \
566
+ return -1; \
567
+ } \
568
+ \
569
+ return 0; \
570
+ }
571
+ /* Defines a check_power_exp_<t1>() that ignores both arguments and always returns 0, so every type has a check function of the same name and signature. */
572
+ #define CUDA_CHECK_POWER_EXP_SUCCESS(t1) \
573
+ static inline int \
574
+ check_power_exp_##t1(const char *a1, ndt_context_t *ctx) \
575
+ { \
576
+ (void)a1; \
577
+ (void)ctx; \
578
+ \
579
+ return 0; \
580
+ }
581
+ /* Instantiate the exponent checks: the signed integer types reject negative exponents, every other type listed here always passes. */
582
+ CUDA_CHECK_POWER_EXP(int8)
583
+ CUDA_CHECK_POWER_EXP(int16)
584
+ CUDA_CHECK_POWER_EXP(int32)
585
+ CUDA_CHECK_POWER_EXP(int64)
586
+
587
+ CUDA_CHECK_POWER_EXP_SUCCESS(bool)
588
+
589
+ CUDA_CHECK_POWER_EXP_SUCCESS(uint8)
590
+ CUDA_CHECK_POWER_EXP_SUCCESS(uint16)
591
+ CUDA_CHECK_POWER_EXP_SUCCESS(uint32)
592
+ CUDA_CHECK_POWER_EXP_SUCCESS(uint64)
593
+
594
+ CUDA_CHECK_POWER_EXP_SUCCESS(bfloat16)
595
+ CUDA_CHECK_POWER_EXP_SUCCESS(float16)
596
+ CUDA_CHECK_POWER_EXP_SUCCESS(float32)
597
+ CUDA_CHECK_POWER_EXP_SUCCESS(float64)
598
+
599
+ CUDA_CHECK_POWER_EXP_SUCCESS(complex64)
600
+ CUDA_CHECK_POWER_EXP_SUCCESS(complex128)
601
+
602
+
603
+ /* Generates three host wrappers for one kernel (fixed_1D_C: passes only N; fixed_1D_S: also passes the three steps; 0D: raw pointers), each calling the gm_cuda_device_ function of the same suffix. NOTE(review): the power check reads only the element at a1, not all N exponents -- confirm this is intended. */
604
+ #define CUDA_HOST_BINARY(name, t0, t1, t2) \
605
+ static int \
606
+ gm_cuda_host_fixed_1D_C_##name##_##t0##_##t1##_##t2(xnd_t stack[], ndt_context_t *ctx) \
607
+ { \
608
+ const char *a0 = apply_index(&stack[0]); \
609
+ const char *a1 = apply_index(&stack[1]); \
610
+ char *a2 = apply_index(&stack[2]); \
611
+ const int64_t N = xnd_fixed_shape(&stack[0]); \
612
+ (void)ctx; \
613
+ \
614
+ if (strcmp(STRINGIZE(name), "power") == 0) { /* exponent check applies to the power kernels only */ \
615
+ if (check_power_exp_##t1(a1, ctx) < 0) { \
616
+ return -1; \
617
+ } \
618
+ } \
619
+ \
620
+ gm_cuda_device_fixed_1D_C_##name##_##t0##_##t1##_##t2(a0, a1, a2, N); \
621
+ \
622
+ if (ndt_is_optional(ndt_dtype(stack[2].type))) { /* optional result dtype: update the output bitmap */ \
623
+ binary_update_bitmap_1D_S(stack); \
624
+ } \
625
+ else if (strcmp(STRINGIZE(name), "equaln") == 0) { /* equaln: synchronize, then run the bool bitmap update */ \
626
+ if (xnd_cuda_device_synchronize(ctx) < 0) { \
627
+ return -1; \
628
+ } \
629
+ binary_update_bitmap_1D_S_bool(stack); \
630
+ } \
631
+ \
632
+ return 0; \
633
+ } \
634
+ \
635
+ static int \
636
+ gm_cuda_host_fixed_1D_S_##name##_##t0##_##t1##_##t2(xnd_t stack[], ndt_context_t *ctx) \
637
+ { \
638
+ const char *a0 = apply_index(&stack[0]); \
639
+ const char *a1 = apply_index(&stack[1]); \
640
+ char *a2 = apply_index(&stack[2]); \
641
+ const int64_t N = xnd_fixed_shape(&stack[0]); \
642
+ const int64_t s0 = xnd_fixed_step(&stack[0]); \
643
+ const int64_t s1 = xnd_fixed_step(&stack[1]); \
644
+ const int64_t s2 = xnd_fixed_step(&stack[2]); \
645
+ (void)ctx; \
646
+ \
647
+ if (strcmp(STRINGIZE(name), "power") == 0) { \
648
+ if (check_power_exp_##t1(a1, ctx) < 0) { \
649
+ return -1; \
650
+ } \
651
+ } \
652
+ \
653
+ gm_cuda_device_fixed_1D_S_##name##_##t0##_##t1##_##t2(a0, a1, a2, s0, s1, s2, N); \
654
+ \
655
+ if (ndt_is_optional(ndt_dtype(stack[2].type))) { \
656
+ binary_update_bitmap_1D_S(stack); \
657
+ } \
658
+ else if (strcmp(STRINGIZE(name), "equaln") == 0) { \
659
+ if (xnd_cuda_device_synchronize(ctx) < 0) { \
660
+ return -1; \
661
+ } \
662
+ binary_update_bitmap_1D_S_bool(stack); \
663
+ } \
664
+ \
665
+ return 0; \
666
+ } \
667
+ \
668
+ static int \
669
+ gm_cuda_host_0D_##name##_##t0##_##t1##_##t2(xnd_t stack[], ndt_context_t *ctx) \
670
+ { \
671
+ const char *a0 = stack[0].ptr; \
672
+ const char *a1 = stack[1].ptr; \
673
+ char *a2 = stack[2].ptr; \
674
+ (void)ctx; \
675
+ \
676
+ if (strcmp(STRINGIZE(name), "power") == 0) { \
677
+ if (check_power_exp_##t1(a1, ctx) < 0) { \
678
+ return -1; \
679
+ } \
680
+ } \
681
+ \
682
+ gm_cuda_device_0D_##name##_##t0##_##t1##_##t2(a0, a1, a2); \
683
+ \
684
+ if (ndt_is_optional(ndt_dtype(stack[2].type))) { \
685
+ binary_update_bitmap_0D(stack); \
686
+ } \
687
+ else if (strcmp(STRINGIZE(name), "equaln") == 0) { \
688
+ if (xnd_cuda_device_synchronize(ctx) < 0) { \
689
+ return -1; \
690
+ } \
691
+ binary_update_bitmap_0D_bool(stack); \
692
+ } \
693
+ \
694
+ return 0; \
695
+ }
696
+
697
+ /* Generates the same three wrapper names as CUDA_HOST_BINARY, but each one only sets NDT_NotImplementedError (message ends in "currently requires double rounding") and returns -1. */
698
+ #define CUDA_HOST_NOIMPL(name, t0, t1, t2) \
699
+ static int \
700
+ gm_cuda_host_fixed_1D_C_##name##_##t0##_##t1##_##t2(xnd_t stack[], ndt_context_t *ctx) \
701
+ { \
702
+ (void)stack; \
703
+ \
704
+ ndt_err_format(ctx, NDT_NotImplementedError, \
705
+ "implementation for " STRINGIZE(name) " : " \
706
+ STRINGIZE(t0) ", " STRINGIZE(t1) " -> " STRINGIZE(t2) \
707
+ " currently requires double rounding"); \
708
+ \
709
+ return -1; \
710
+ } \
711
+ \
712
+ static int \
713
+ gm_cuda_host_fixed_1D_S_##name##_##t0##_##t1##_##t2(xnd_t stack[], ndt_context_t *ctx) \
714
+ { \
715
+ (void)stack; \
716
+ \
717
+ ndt_err_format(ctx, NDT_NotImplementedError, \
718
+ "implementation for " STRINGIZE(name) " : " \
719
+ STRINGIZE(t0) ", " STRINGIZE(t1) " -> " STRINGIZE(t2) \
720
+ " currently requires double rounding"); \
721
+ \
722
+ return -1; \
723
+ } \
724
+ \
725
+ static int \
726
+ gm_cuda_host_0D_##name##_##t0##_##t1##_##t2(xnd_t stack[], ndt_context_t *ctx) \
727
+ { \
728
+ (void)stack; \
729
+ \
730
+ ndt_err_format(ctx, NDT_NotImplementedError, \
731
+ "implementation for " STRINGIZE(name) " : " \
732
+ STRINGIZE(t0) ", " STRINGIZE(t1) " -> " STRINGIZE(t2) \
733
+ " currently requires double rounding"); \
734
+ \
735
+ return -1; \
736
+ }
737
+ /* Generates the same three wrapper names as CUDA_HOST_BINARY, but each one only sets NDT_TypeError ("no kernel for <name> : <t0>, <t1> -> <t2>") and returns -1. */
738
+ #define CUDA_HOST_NOKERN(name, t0, t1, t2) \
739
+ static int \
740
+ gm_cuda_host_fixed_1D_C_##name##_##t0##_##t1##_##t2(xnd_t stack[], ndt_context_t *ctx) \
741
+ { \
742
+ (void)stack; \
743
+ \
744
+ ndt_err_format(ctx, NDT_TypeError, \
745
+ "no kernel for " STRINGIZE(name) " : " \
746
+ STRINGIZE(t0) ", " STRINGIZE(t1) " -> " STRINGIZE(t2)); \
747
+ \
748
+ return -1; \
749
+ } \
750
+ \
751
+ static int \
752
+ gm_cuda_host_fixed_1D_S_##name##_##t0##_##t1##_##t2(xnd_t stack[], ndt_context_t *ctx) \
753
+ { \
754
+ (void)stack; \
755
+ \
756
+ ndt_err_format(ctx, NDT_TypeError, \
757
+ "no kernel for " STRINGIZE(name) " : " \
758
+ STRINGIZE(t0) ", " STRINGIZE(t1) " -> " STRINGIZE(t2)); \
759
+ \
760
+ return -1; \
761
+ } \
762
+ \
763
+ static int \
764
+ gm_cuda_host_0D_##name##_##t0##_##t1##_##t2(xnd_t stack[], ndt_context_t *ctx) \
765
+ { \
766
+ (void)stack; \
767
+ \
768
+ ndt_err_format(ctx, NDT_TypeError, \
769
+ "no kernel for " STRINGIZE(name) " : " \
770
+ STRINGIZE(t0) ", " STRINGIZE(t1) " -> " STRINGIZE(t2)); \
771
+ \
772
+ return -1; \
773
+ }
774
+
775
+ /* Expands to four table initializers for one kernel, covering every optional/non-optional ("?") input combination; the result is optional whenever an input is. All four share the same OptC/OptS/Xnd wrappers. */
776
+ #define CUDA_HOST_BINARY_INIT(func, t0, t1, t2) \
777
+ { .name = STRINGIZE(func), \
778
+ .sig = "... * " STRINGIZE(t0) ", ... * " STRINGIZE(t1) " -> ... * " STRINGIZE(t2), \
779
+ .OptC = gm_cuda_host_fixed_1D_C_##func##_##t0##_##t1##_##t2, \
780
+ .OptS = gm_cuda_host_fixed_1D_S_##func##_##t0##_##t1##_##t2, \
781
+ .Xnd = gm_cuda_host_0D_##func##_##t0##_##t1##_##t2 }, \
782
+ \
783
+ { .name = STRINGIZE(func), \
784
+ .sig = "... * ?" STRINGIZE(t0) ", ... * " STRINGIZE(t1) " -> ... * ?" STRINGIZE(t2), \
785
+ .OptC = gm_cuda_host_fixed_1D_C_##func##_##t0##_##t1##_##t2, \
786
+ .OptS = gm_cuda_host_fixed_1D_S_##func##_##t0##_##t1##_##t2, \
787
+ .Xnd = gm_cuda_host_0D_##func##_##t0##_##t1##_##t2 }, \
788
+ \
789
+ { .name = STRINGIZE(func), \
790
+ .sig = "... * " STRINGIZE(t0) ", ... * ?" STRINGIZE(t1) " -> ... * ?" STRINGIZE(t2), \
791
+ .OptC = gm_cuda_host_fixed_1D_C_##func##_##t0##_##t1##_##t2, \
792
+ .OptS = gm_cuda_host_fixed_1D_S_##func##_##t0##_##t1##_##t2, \
793
+ .Xnd = gm_cuda_host_0D_##func##_##t0##_##t1##_##t2 }, \
794
+ \
795
+ { .name = STRINGIZE(func), \
796
+ .sig = "... * ?" STRINGIZE(t0) ", ... * ?" STRINGIZE(t1) " -> ... * ?" STRINGIZE(t2), \
797
+ .OptC = gm_cuda_host_fixed_1D_C_##func##_##t0##_##t1##_##t2, \
798
+ .OptS = gm_cuda_host_fixed_1D_S_##func##_##t0##_##t1##_##t2, \
799
+ .Xnd = gm_cuda_host_0D_##func##_##t0##_##t1##_##t2 }
800
+
801
+ /* Same four optional/non-optional input combinations as CUDA_HOST_BINARY_INIT, except that the result type in the signature is never optional. */
802
+ #define CUDA_HOST_EQUALN_INIT(func, t0, t1, t2) \
803
+ { .name = STRINGIZE(func), \
804
+ .sig = "... * " STRINGIZE(t0) ", ... * " STRINGIZE(t1) " -> ... * " STRINGIZE(t2), \
805
+ .OptC = gm_cuda_host_fixed_1D_C_##func##_##t0##_##t1##_##t2, \
806
+ .OptS = gm_cuda_host_fixed_1D_S_##func##_##t0##_##t1##_##t2, \
807
+ .Xnd = gm_cuda_host_0D_##func##_##t0##_##t1##_##t2 }, \
808
+ \
809
+ { .name = STRINGIZE(func), \
810
+ .sig = "... * ?" STRINGIZE(t0) ", ... * " STRINGIZE(t1) " -> ... * " STRINGIZE(t2), \
811
+ .OptC = gm_cuda_host_fixed_1D_C_##func##_##t0##_##t1##_##t2, \
812
+ .OptS = gm_cuda_host_fixed_1D_S_##func##_##t0##_##t1##_##t2, \
813
+ .Xnd = gm_cuda_host_0D_##func##_##t0##_##t1##_##t2 }, \
814
+ \
815
+ { .name = STRINGIZE(func), \
816
+ .sig = "... * " STRINGIZE(t0) ", ... * ?" STRINGIZE(t1) " -> ... * " STRINGIZE(t2), \
817
+ .OptC = gm_cuda_host_fixed_1D_C_##func##_##t0##_##t1##_##t2, \
818
+ .OptS = gm_cuda_host_fixed_1D_S_##func##_##t0##_##t1##_##t2, \
819
+ .Xnd = gm_cuda_host_0D_##func##_##t0##_##t1##_##t2 }, \
820
+ \
821
+ { .name = STRINGIZE(func), \
822
+ .sig = "... * ?" STRINGIZE(t0) ", ... * ?" STRINGIZE(t1) " -> ... * " STRINGIZE(t2), \
823
+ .OptC = gm_cuda_host_fixed_1D_C_##func##_##t0##_##t1##_##t2, \
824
+ .OptS = gm_cuda_host_fixed_1D_S_##func##_##t0##_##t1##_##t2, \
825
+ .Xnd = gm_cuda_host_0D_##func##_##t0##_##t1##_##t2 }
826
+
827
+ /* Remove any `bool` macro and name _Bool as bool_t. NOTE(review): presumably so that `bool` used as a macro argument is stringized and pasted literally rather than expanded first -- confirm. */
828
+ #undef bool
829
+ #define bool_t _Bool
830
+
831
+
832
+ /*****************************************************************************/
833
+ /* Arithmetic */
834
+ /*****************************************************************************/
835
+
836
+ #define CUDA_HOST_ALL_ARITHMETIC(name) \
837
+ CUDA_HOST_BINARY(name, uint8, uint8, uint8) \
838
+ CUDA_HOST_BINARY(name, uint8, uint16, uint16) \
839
+ CUDA_HOST_BINARY(name, uint8, uint32, uint32) \
840
+ CUDA_HOST_BINARY(name, uint8, uint64, uint64) \
841
+ CUDA_HOST_BINARY(name, uint8, int8, int16) \
842
+ CUDA_HOST_BINARY(name, uint8, int16, int16) \
843
+ CUDA_HOST_BINARY(name, uint8, int32, int32) \
844
+ CUDA_HOST_BINARY(name, uint8, int64, int64) \
845
+ CUDA_HOST_BINARY(name, uint8, bfloat16, bfloat16) \
846
+ CUDA_HOST_BINARY(name, uint8, float16, float16) \
847
+ CUDA_HOST_BINARY(name, uint8, float32, float32) \
848
+ CUDA_HOST_BINARY(name, uint8, float64, float64) \
849
+ CUDA_HOST_NOIMPL(name, uint8, complex32, complex32) \
850
+ CUDA_HOST_BINARY(name, uint8, complex64, complex64) \
851
+ CUDA_HOST_BINARY(name, uint8, complex128, complex128) \
852
+ \
853
+ CUDA_HOST_BINARY(name, uint16, uint8, uint16) \
854
+ CUDA_HOST_BINARY(name, uint16, uint16, uint16) \
855
+ CUDA_HOST_BINARY(name, uint16, uint32, uint32) \
856
+ CUDA_HOST_BINARY(name, uint16, uint64, uint64) \
857
+ CUDA_HOST_BINARY(name, uint16, int8, int32) \
858
+ CUDA_HOST_BINARY(name, uint16, int16, int32) \
859
+ CUDA_HOST_BINARY(name, uint16, int32, int32) \
860
+ CUDA_HOST_BINARY(name, uint16, int64, int64) \
861
+ CUDA_HOST_BINARY(name, uint16, bfloat16, float32) \
862
+ CUDA_HOST_BINARY(name, uint16, float16, float32) \
863
+ CUDA_HOST_BINARY(name, uint16, float32, float32) \
864
+ CUDA_HOST_BINARY(name, uint16, float64, float64) \
865
+ CUDA_HOST_NOIMPL(name, uint16, complex32, complex64) \
866
+ CUDA_HOST_BINARY(name, uint16, complex64, complex64) \
867
+ CUDA_HOST_BINARY(name, uint16, complex128, complex128) \
868
+ \
869
+ CUDA_HOST_BINARY(name, uint32, uint8, uint32) \
870
+ CUDA_HOST_BINARY(name, uint32, uint16, uint32) \
871
+ CUDA_HOST_BINARY(name, uint32, uint32, uint32) \
872
+ CUDA_HOST_BINARY(name, uint32, uint64, uint64) \
873
+ CUDA_HOST_BINARY(name, uint32, int8, int64) \
874
+ CUDA_HOST_BINARY(name, uint32, int16, int64) \
875
+ CUDA_HOST_BINARY(name, uint32, int32, int64) \
876
+ CUDA_HOST_BINARY(name, uint32, int64, int64) \
877
+ CUDA_HOST_BINARY(name, uint32, bfloat16, float64) \
878
+ CUDA_HOST_BINARY(name, uint32, float16, float64) \
879
+ CUDA_HOST_BINARY(name, uint32, float32, float64) \
880
+ CUDA_HOST_BINARY(name, uint32, float64, float64) \
881
+ CUDA_HOST_NOIMPL(name, uint32, complex32, complex128) \
882
+ CUDA_HOST_BINARY(name, uint32, complex64, complex128) \
883
+ CUDA_HOST_BINARY(name, uint32, complex128, complex128) \
884
+ \
885
+ CUDA_HOST_BINARY(name, uint64, uint8, uint64) \
886
+ CUDA_HOST_BINARY(name, uint64, uint16, uint64) \
887
+ CUDA_HOST_BINARY(name, uint64, uint32, uint64) \
888
+ CUDA_HOST_BINARY(name, uint64, uint64, uint64) \
889
+ \
890
+ CUDA_HOST_BINARY(name, int8, uint8, int16) \
891
+ CUDA_HOST_BINARY(name, int8, uint16, int32) \
892
+ CUDA_HOST_BINARY(name, int8, uint32, int64) \
893
+ CUDA_HOST_BINARY(name, int8, int8, int8) \
894
+ CUDA_HOST_BINARY(name, int8, int16, int16) \
895
+ CUDA_HOST_BINARY(name, int8, int32, int32) \
896
+ CUDA_HOST_BINARY(name, int8, int64, int64) \
897
+ CUDA_HOST_BINARY(name, int8, bfloat16, bfloat16) \
898
+ CUDA_HOST_BINARY(name, int8, float16, float16) \
899
+ CUDA_HOST_BINARY(name, int8, float32, float32) \
900
+ CUDA_HOST_BINARY(name, int8, float64, float64) \
901
+ CUDA_HOST_NOIMPL(name, int8, complex32, complex32) \
902
+ CUDA_HOST_BINARY(name, int8, complex64, complex64) \
903
+ CUDA_HOST_BINARY(name, int8, complex128, complex128) \
904
+ \
905
+ CUDA_HOST_BINARY(name, int16, uint8, int16) \
906
+ CUDA_HOST_BINARY(name, int16, uint16, int32) \
907
+ CUDA_HOST_BINARY(name, int16, uint32, int64) \
908
+ CUDA_HOST_BINARY(name, int16, int8, int16) \
909
+ CUDA_HOST_BINARY(name, int16, int16, int16) \
910
+ CUDA_HOST_BINARY(name, int16, int32, int32) \
911
+ CUDA_HOST_BINARY(name, int16, int64, int64) \
912
+ CUDA_HOST_BINARY(name, int16, bfloat16, float32) \
913
+ CUDA_HOST_BINARY(name, int16, float16, float32) \
914
+ CUDA_HOST_BINARY(name, int16, float32, float32) \
915
+ CUDA_HOST_BINARY(name, int16, float64, float64) \
916
+ CUDA_HOST_NOIMPL(name, int16, complex32, complex64) \
917
+ CUDA_HOST_BINARY(name, int16, complex64, complex64) \
918
+ CUDA_HOST_BINARY(name, int16, complex128, complex128) \
919
+ \
920
+ CUDA_HOST_BINARY(name, int32, uint8, int32) \
921
+ CUDA_HOST_BINARY(name, int32, uint16, int32) \
922
+ CUDA_HOST_BINARY(name, int32, uint32, int64) \
923
+ CUDA_HOST_BINARY(name, int32, int8, int32) \
924
+ CUDA_HOST_BINARY(name, int32, int16, int32) \
925
+ CUDA_HOST_BINARY(name, int32, int32, int32) \
926
+ CUDA_HOST_BINARY(name, int32, int64, int64) \
927
+ CUDA_HOST_BINARY(name, int32, bfloat16, float64) \
928
+ CUDA_HOST_BINARY(name, int32, float16, float64) \
929
+ CUDA_HOST_BINARY(name, int32, float32, float64) \
930
+ CUDA_HOST_BINARY(name, int32, float64, float64) \
931
+ CUDA_HOST_NOIMPL(name, int32, complex32, complex128) \
932
+ CUDA_HOST_BINARY(name, int32, complex64, complex128) \
933
+ CUDA_HOST_BINARY(name, int32, complex128, complex128) \
934
+ \
935
+ CUDA_HOST_BINARY(name, int64, uint8, int64) \
936
+ CUDA_HOST_BINARY(name, int64, uint16, int64) \
937
+ CUDA_HOST_BINARY(name, int64, uint32, int64) \
938
+ CUDA_HOST_BINARY(name, int64, int8, int64) \
939
+ CUDA_HOST_BINARY(name, int64, int16, int64) \
940
+ CUDA_HOST_BINARY(name, int64, int32, int64) \
941
+ CUDA_HOST_BINARY(name, int64, int64, int64) \
942
+ \
943
+ CUDA_HOST_BINARY(name, bfloat16, uint8, bfloat16) \
944
+ CUDA_HOST_BINARY(name, bfloat16, uint16, float32) \
945
+ CUDA_HOST_BINARY(name, bfloat16, uint32, float64) \
946
+ CUDA_HOST_BINARY(name, bfloat16, int8, bfloat16) \
947
+ CUDA_HOST_BINARY(name, bfloat16, int16, float32) \
948
+ CUDA_HOST_BINARY(name, bfloat16, int32, float64) \
949
+ CUDA_HOST_BINARY(name, bfloat16, bfloat16, bfloat16) \
950
+ CUDA_HOST_BINARY(name, bfloat16, float16, float32) \
951
+ CUDA_HOST_BINARY(name, bfloat16, float32, float32) \
952
+ CUDA_HOST_BINARY(name, bfloat16, float64, float64) \
953
+ CUDA_HOST_NOIMPL(name, bfloat16, complex32, complex64) \
954
+ CUDA_HOST_BINARY(name, bfloat16, complex64, complex64) \
955
+ CUDA_HOST_BINARY(name, bfloat16, complex128, complex128) \
956
+ \
957
+ CUDA_HOST_BINARY(name, float16, uint8, float16) \
958
+ CUDA_HOST_BINARY(name, float16, uint16, float32) \
959
+ CUDA_HOST_BINARY(name, float16, uint32, float64) \
960
+ CUDA_HOST_BINARY(name, float16, int8, float16) \
961
+ CUDA_HOST_BINARY(name, float16, int16, float32) \
962
+ CUDA_HOST_BINARY(name, float16, int32, float64) \
963
+ CUDA_HOST_BINARY(name, float16, bfloat16, float32) \
964
+ CUDA_HOST_BINARY(name, float16, float16, float16) \
965
+ CUDA_HOST_BINARY(name, float16, float32, float32) \
966
+ CUDA_HOST_BINARY(name, float16, float64, float64) \
967
+ CUDA_HOST_NOIMPL(name, float16, complex32, complex32) \
968
+ CUDA_HOST_BINARY(name, float16, complex64, complex64) \
969
+ CUDA_HOST_BINARY(name, float16, complex128, complex128) \
970
+ \
971
+ CUDA_HOST_BINARY(name, float32, uint8, float32) \
972
+ CUDA_HOST_BINARY(name, float32, uint16, float32) \
973
+ CUDA_HOST_BINARY(name, float32, uint32, float64) \
974
+ CUDA_HOST_BINARY(name, float32, int8, float32) \
975
+ CUDA_HOST_BINARY(name, float32, int16, float32) \
976
+ CUDA_HOST_BINARY(name, float32, int32, float64) \
977
+ CUDA_HOST_BINARY(name, float32, bfloat16, float32) \
978
+ CUDA_HOST_BINARY(name, float32, float16, float32) \
979
+ CUDA_HOST_BINARY(name, float32, float32, float32) \
980
+ CUDA_HOST_BINARY(name, float32, float64, float64) \
981
+ CUDA_HOST_NOIMPL(name, float32, complex32, complex64) \
982
+ CUDA_HOST_BINARY(name, float32, complex64, complex64) \
983
+ CUDA_HOST_BINARY(name, float32, complex128, complex128) \
984
+ \
985
+ CUDA_HOST_BINARY(name, float64, uint8, float64) \
986
+ CUDA_HOST_BINARY(name, float64, uint16, float64) \
987
+ CUDA_HOST_BINARY(name, float64, uint32, float64) \
988
+ CUDA_HOST_BINARY(name, float64, int8, float64) \
989
+ CUDA_HOST_BINARY(name, float64, int16, float64) \
990
+ CUDA_HOST_BINARY(name, float64, int32, float64) \
991
+ CUDA_HOST_BINARY(name, float64, bfloat16, float64) \
992
+ CUDA_HOST_BINARY(name, float64, float16, float64) \
993
+ CUDA_HOST_BINARY(name, float64, float32, float64) \
994
+ CUDA_HOST_BINARY(name, float64, float64, float64) \
995
+ CUDA_HOST_NOIMPL(name, float64, complex32, complex128) \
996
+ CUDA_HOST_BINARY(name, float64, complex64, complex128) \
997
+ CUDA_HOST_BINARY(name, float64, complex128, complex128) \
998
+ \
999
+ CUDA_HOST_NOIMPL(name, complex32, uint8, complex32) \
1000
+ CUDA_HOST_NOIMPL(name, complex32, uint16, complex64) \
1001
+ CUDA_HOST_NOIMPL(name, complex32, uint32, complex128) \
1002
+ CUDA_HOST_NOIMPL(name, complex32, int8, complex32) \
1003
+ CUDA_HOST_NOIMPL(name, complex32, int16, complex64) \
1004
+ CUDA_HOST_NOIMPL(name, complex32, int32, complex128) \
1005
+ CUDA_HOST_NOIMPL(name, complex32, bfloat16, complex64) \
1006
+ CUDA_HOST_NOIMPL(name, complex32, float16, complex32) \
1007
+ CUDA_HOST_NOIMPL(name, complex32, float32, complex64) \
1008
+ CUDA_HOST_NOIMPL(name, complex32, float64, complex128) \
1009
+ CUDA_HOST_NOIMPL(name, complex32, complex32, complex32) \
1010
+ CUDA_HOST_NOIMPL(name, complex32, complex64, complex64) \
1011
+ CUDA_HOST_NOIMPL(name, complex32, complex128, complex128) \
1012
+ \
1013
+ CUDA_HOST_BINARY(name, complex64, uint8, complex64) \
1014
+ CUDA_HOST_BINARY(name, complex64, uint16, complex64) \
1015
+ CUDA_HOST_BINARY(name, complex64, uint32, complex128) \
1016
+ CUDA_HOST_BINARY(name, complex64, int8, complex64) \
1017
+ CUDA_HOST_BINARY(name, complex64, int16, complex64) \
1018
+ CUDA_HOST_BINARY(name, complex64, int32, complex128) \
1019
+ CUDA_HOST_BINARY(name, complex64, bfloat16, complex64) \
1020
+ CUDA_HOST_BINARY(name, complex64, float16, complex64) \
1021
+ CUDA_HOST_BINARY(name, complex64, float32, complex64) \
1022
+ CUDA_HOST_BINARY(name, complex64, float64, complex128) \
1023
+ CUDA_HOST_NOIMPL(name, complex64, complex32, complex64) \
1024
+ CUDA_HOST_BINARY(name, complex64, complex64, complex64) \
1025
+ CUDA_HOST_BINARY(name, complex64, complex128, complex128) \
1026
+ \
1027
+ CUDA_HOST_BINARY(name, complex128, uint8, complex128) \
1028
+ CUDA_HOST_BINARY(name, complex128, uint16, complex128) \
1029
+ CUDA_HOST_BINARY(name, complex128, uint32, complex128) \
1030
+ CUDA_HOST_BINARY(name, complex128, int8, complex128) \
1031
+ CUDA_HOST_BINARY(name, complex128, int16, complex128) \
1032
+ CUDA_HOST_BINARY(name, complex128, int32, complex128) \
1033
+ CUDA_HOST_BINARY(name, complex128, bfloat16, complex128) \
1034
+ CUDA_HOST_BINARY(name, complex128, float16, complex128) \
1035
+ CUDA_HOST_BINARY(name, complex128, float32, complex128) \
1036
+ CUDA_HOST_BINARY(name, complex128, float64, complex128) \
1037
+ CUDA_HOST_NOIMPL(name, complex128, complex32, complex128) \
1038
+ CUDA_HOST_BINARY(name, complex128, complex64, complex128) \
1039
+ CUDA_HOST_BINARY(name, complex128, complex128, complex128)
1040
+
1041
/*
 * CUDA_HOST_ALL_ARITHMETIC_NO_COMPLEX(name): expands to one helper-macro
 * invocation per (t0, t1, t2) type triple, forwarding `name` to each.
 *
 * Selection visible in this table:
 *   - every triple that involves complex32, complex64 or complex128 is
 *     emitted through CUDA_HOST_NOKERN;
 *   - six triples with a float16 operand -- (uint8, float16),
 *     (int8, float16), (float16, uint8), (float16, int8),
 *     (float16, bfloat16) and (float16, float16) -- are emitted through
 *     CUDA_HOST_NOIMPL;
 *   - all remaining triples are emitted through CUDA_HOST_BINARY.
 *
 * CUDA_HOST_BINARY, CUDA_HOST_NOIMPL and CUDA_HOST_NOKERN are defined
 * outside this excerpt. Presumably t0/t1 are the operand types and t2 is
 * the result type -- TODO confirm against those definitions.
 */
+ #define CUDA_HOST_ALL_ARITHMETIC_NO_COMPLEX(name) \
1042
+ CUDA_HOST_BINARY(name, uint8, uint8, uint8) \
1043
+ CUDA_HOST_BINARY(name, uint8, uint16, uint16) \
1044
+ CUDA_HOST_BINARY(name, uint8, uint32, uint32) \
1045
+ CUDA_HOST_BINARY(name, uint8, uint64, uint64) \
1046
+ CUDA_HOST_BINARY(name, uint8, int8, int16) \
1047
+ CUDA_HOST_BINARY(name, uint8, int16, int16) \
1048
+ CUDA_HOST_BINARY(name, uint8, int32, int32) \
1049
+ CUDA_HOST_BINARY(name, uint8, int64, int64) \
1050
+ CUDA_HOST_BINARY(name, uint8, bfloat16, bfloat16) \
1051
+ CUDA_HOST_NOIMPL(name, uint8, float16, float16) \
1052
+ CUDA_HOST_BINARY(name, uint8, float32, float32) \
1053
+ CUDA_HOST_BINARY(name, uint8, float64, float64) \
1054
+ CUDA_HOST_NOKERN(name, uint8, complex32, complex32) \
1055
+ CUDA_HOST_NOKERN(name, uint8, complex64, complex64) \
1056
+ CUDA_HOST_NOKERN(name, uint8, complex128, complex128) \
1057
+ \
1058
+ CUDA_HOST_BINARY(name, uint16, uint8, uint16) \
1059
+ CUDA_HOST_BINARY(name, uint16, uint16, uint16) \
1060
+ CUDA_HOST_BINARY(name, uint16, uint32, uint32) \
1061
+ CUDA_HOST_BINARY(name, uint16, uint64, uint64) \
1062
+ CUDA_HOST_BINARY(name, uint16, int8, int32) \
1063
+ CUDA_HOST_BINARY(name, uint16, int16, int32) \
1064
+ CUDA_HOST_BINARY(name, uint16, int32, int32) \
1065
+ CUDA_HOST_BINARY(name, uint16, int64, int64) \
1066
+ CUDA_HOST_BINARY(name, uint16, bfloat16, float32) \
1067
+ CUDA_HOST_BINARY(name, uint16, float16, float32) \
1068
+ CUDA_HOST_BINARY(name, uint16, float32, float32) \
1069
+ CUDA_HOST_BINARY(name, uint16, float64, float64) \
1070
+ CUDA_HOST_NOKERN(name, uint16, complex32, complex64) \
1071
+ CUDA_HOST_NOKERN(name, uint16, complex64, complex64) \
1072
+ CUDA_HOST_NOKERN(name, uint16, complex128, complex128) \
1073
+ \
1074
+ CUDA_HOST_BINARY(name, uint32, uint8, uint32) \
1075
+ CUDA_HOST_BINARY(name, uint32, uint16, uint32) \
1076
+ CUDA_HOST_BINARY(name, uint32, uint32, uint32) \
1077
+ CUDA_HOST_BINARY(name, uint32, uint64, uint64) \
1078
+ CUDA_HOST_BINARY(name, uint32, int8, int64) \
1079
+ CUDA_HOST_BINARY(name, uint32, int16, int64) \
1080
+ CUDA_HOST_BINARY(name, uint32, int32, int64) \
1081
+ CUDA_HOST_BINARY(name, uint32, int64, int64) \
1082
+ CUDA_HOST_BINARY(name, uint32, bfloat16, float64) \
1083
+ CUDA_HOST_BINARY(name, uint32, float16, float64) \
1084
+ CUDA_HOST_BINARY(name, uint32, float32, float64) \
1085
+ CUDA_HOST_BINARY(name, uint32, float64, float64) \
1086
+ CUDA_HOST_NOKERN(name, uint32, complex32, complex128) \
1087
+ CUDA_HOST_NOKERN(name, uint32, complex64, complex128) \
1088
+ CUDA_HOST_NOKERN(name, uint32, complex128, complex128) \
1089
+ \
1090
+ CUDA_HOST_BINARY(name, uint64, uint8, uint64) \
1091
+ CUDA_HOST_BINARY(name, uint64, uint16, uint64) \
1092
+ CUDA_HOST_BINARY(name, uint64, uint32, uint64) \
1093
+ CUDA_HOST_BINARY(name, uint64, uint64, uint64) \
1094
+ \
1095
+ CUDA_HOST_BINARY(name, int8, uint8, int16) \
1096
+ CUDA_HOST_BINARY(name, int8, uint16, int32) \
1097
+ CUDA_HOST_BINARY(name, int8, uint32, int64) \
1098
+ CUDA_HOST_BINARY(name, int8, int8, int8) \
1099
+ CUDA_HOST_BINARY(name, int8, int16, int16) \
1100
+ CUDA_HOST_BINARY(name, int8, int32, int32) \
1101
+ CUDA_HOST_BINARY(name, int8, int64, int64) \
1102
+ CUDA_HOST_BINARY(name, int8, bfloat16, bfloat16) \
1103
+ CUDA_HOST_NOIMPL(name, int8, float16, float16) \
1104
+ CUDA_HOST_BINARY(name, int8, float32, float32) \
1105
+ CUDA_HOST_BINARY(name, int8, float64, float64) \
1106
+ CUDA_HOST_NOKERN(name, int8, complex32, complex32) \
1107
+ CUDA_HOST_NOKERN(name, int8, complex64, complex64) \
1108
+ CUDA_HOST_NOKERN(name, int8, complex128, complex128) \
1109
+ \
1110
+ CUDA_HOST_BINARY(name, int16, uint8, int16) \
1111
+ CUDA_HOST_BINARY(name, int16, uint16, int32) \
1112
+ CUDA_HOST_BINARY(name, int16, uint32, int64) \
1113
+ CUDA_HOST_BINARY(name, int16, int8, int16) \
1114
+ CUDA_HOST_BINARY(name, int16, int16, int16) \
1115
+ CUDA_HOST_BINARY(name, int16, int32, int32) \
1116
+ CUDA_HOST_BINARY(name, int16, int64, int64) \
1117
+ CUDA_HOST_BINARY(name, int16, bfloat16, float32) \
1118
+ CUDA_HOST_BINARY(name, int16, float16, float32) \
1119
+ CUDA_HOST_BINARY(name, int16, float32, float32) \
1120
+ CUDA_HOST_BINARY(name, int16, float64, float64) \
1121
+ CUDA_HOST_NOKERN(name, int16, complex32, complex64) \
1122
+ CUDA_HOST_NOKERN(name, int16, complex64, complex64) \
1123
+ CUDA_HOST_NOKERN(name, int16, complex128, complex128) \
1124
+ \
1125
+ CUDA_HOST_BINARY(name, int32, uint8, int32) \
1126
+ CUDA_HOST_BINARY(name, int32, uint16, int32) \
1127
+ CUDA_HOST_BINARY(name, int32, uint32, int64) \
1128
+ CUDA_HOST_BINARY(name, int32, int8, int32) \
1129
+ CUDA_HOST_BINARY(name, int32, int16, int32) \
1130
+ CUDA_HOST_BINARY(name, int32, int32, int32) \
1131
+ CUDA_HOST_BINARY(name, int32, int64, int64) \
1132
+ CUDA_HOST_BINARY(name, int32, bfloat16, float64) \
1133
+ CUDA_HOST_BINARY(name, int32, float16, float64) \
1134
+ CUDA_HOST_BINARY(name, int32, float32, float64) \
1135
+ CUDA_HOST_BINARY(name, int32, float64, float64) \
1136
+ CUDA_HOST_NOKERN(name, int32, complex32, complex128) \
1137
+ CUDA_HOST_NOKERN(name, int32, complex64, complex128) \
1138
+ CUDA_HOST_NOKERN(name, int32, complex128, complex128) \
1139
+ \
1140
+ CUDA_HOST_BINARY(name, int64, uint8, int64) \
1141
+ CUDA_HOST_BINARY(name, int64, uint16, int64) \
1142
+ CUDA_HOST_BINARY(name, int64, uint32, int64) \
1143
+ CUDA_HOST_BINARY(name, int64, int8, int64) \
1144
+ CUDA_HOST_BINARY(name, int64, int16, int64) \
1145
+ CUDA_HOST_BINARY(name, int64, int32, int64) \
1146
+ CUDA_HOST_BINARY(name, int64, int64, int64) \
1147
+ \
1148
+ CUDA_HOST_BINARY(name, bfloat16, uint8, bfloat16) \
1149
+ CUDA_HOST_BINARY(name, bfloat16, uint16, float32) \
1150
+ CUDA_HOST_BINARY(name, bfloat16, uint32, float64) \
1151
+ CUDA_HOST_BINARY(name, bfloat16, int8, bfloat16) \
1152
+ CUDA_HOST_BINARY(name, bfloat16, int16, float32) \
1153
+ CUDA_HOST_BINARY(name, bfloat16, int32, float64) \
1154
+ CUDA_HOST_BINARY(name, bfloat16, bfloat16, bfloat16) \
1155
+ CUDA_HOST_BINARY(name, bfloat16, float16, float32) \
1156
+ CUDA_HOST_BINARY(name, bfloat16, float32, float32) \
1157
+ CUDA_HOST_BINARY(name, bfloat16, float64, float64) \
1158
+ CUDA_HOST_NOKERN(name, bfloat16, complex32, complex64) \
1159
+ CUDA_HOST_NOKERN(name, bfloat16, complex64, complex64) \
1160
+ CUDA_HOST_NOKERN(name, bfloat16, complex128, complex128) \
1161
+ \
1162
+ CUDA_HOST_NOIMPL(name, float16, uint8, float16) \
1163
+ CUDA_HOST_BINARY(name, float16, uint16, float32) \
1164
+ CUDA_HOST_BINARY(name, float16, uint32, float64) \
1165
+ CUDA_HOST_NOIMPL(name, float16, int8, float16) \
1166
+ CUDA_HOST_BINARY(name, float16, int16, float32) \
1167
+ CUDA_HOST_BINARY(name, float16, int32, float64) \
1168
+ CUDA_HOST_NOIMPL(name, float16, bfloat16, float32) /* NOTE(review): the mirrored (bfloat16, float16, float32) triple above uses CUDA_HOST_BINARY -- confirm this asymmetry is intended */ \
1169
+ CUDA_HOST_NOIMPL(name, float16, float16, float16) \
1170
+ CUDA_HOST_BINARY(name, float16, float32, float32) \
1171
+ CUDA_HOST_BINARY(name, float16, float64, float64) \
1172
+ CUDA_HOST_NOKERN(name, float16, complex32, complex32) \
1173
+ CUDA_HOST_NOKERN(name, float16, complex64, complex64) \
1174
+ CUDA_HOST_NOKERN(name, float16, complex128, complex128) \
1175
+ \
1176
+ CUDA_HOST_BINARY(name, float32, uint8, float32) \
1177
+ CUDA_HOST_BINARY(name, float32, uint16, float32) \
1178
+ CUDA_HOST_BINARY(name, float32, uint32, float64) \
1179
+ CUDA_HOST_BINARY(name, float32, int8, float32) \
1180
+ CUDA_HOST_BINARY(name, float32, int16, float32) \
1181
+ CUDA_HOST_BINARY(name, float32, int32, float64) \
1182
+ CUDA_HOST_BINARY(name, float32, bfloat16, float32) \
1183
+ CUDA_HOST_BINARY(name, float32, float16, float32) \
1184
+ CUDA_HOST_BINARY(name, float32, float32, float32) \
1185
+ CUDA_HOST_BINARY(name, float32, float64, float64) \
1186
+ CUDA_HOST_NOKERN(name, float32, complex32, complex64) \
1187
+ CUDA_HOST_NOKERN(name, float32, complex64, complex64) \
1188
+ CUDA_HOST_NOKERN(name, float32, complex128, complex128) \
1189
+ \
1190
+ CUDA_HOST_BINARY(name, float64, uint8, float64) \
1191
+ CUDA_HOST_BINARY(name, float64, uint16, float64) \
1192
+ CUDA_HOST_BINARY(name, float64, uint32, float64) \
1193
+ CUDA_HOST_BINARY(name, float64, int8, float64) \
1194
+ CUDA_HOST_BINARY(name, float64, int16, float64) \
1195
+ CUDA_HOST_BINARY(name, float64, int32, float64) \
1196
+ CUDA_HOST_BINARY(name, float64, bfloat16, float64) \
1197
+ CUDA_HOST_BINARY(name, float64, float16, float64) \
1198
+ CUDA_HOST_BINARY(name, float64, float32, float64) \
1199
+ CUDA_HOST_BINARY(name, float64, float64, float64) \
1200
+ CUDA_HOST_NOKERN(name, float64, complex32, complex128) \
1201
+ CUDA_HOST_NOKERN(name, float64, complex64, complex128) \
1202
+ CUDA_HOST_NOKERN(name, float64, complex128, complex128) \
1203
+ \
1204
+ CUDA_HOST_NOKERN(name, complex32, uint8, complex32) \
1205
+ CUDA_HOST_NOKERN(name, complex32, uint16, complex64) \
1206
+ CUDA_HOST_NOKERN(name, complex32, uint32, complex128) \
1207
+ CUDA_HOST_NOKERN(name, complex32, int8, complex32) \
1208
+ CUDA_HOST_NOKERN(name, complex32, int16, complex64) \
1209
+ CUDA_HOST_NOKERN(name, complex32, int32, complex128) \
1210
+ CUDA_HOST_NOKERN(name, complex32, bfloat16, complex64) \
1211
+ CUDA_HOST_NOKERN(name, complex32, float16, complex32) \
1212
+ CUDA_HOST_NOKERN(name, complex32, float32, complex64) \
1213
+ CUDA_HOST_NOKERN(name, complex32, float64, complex128) \
1214
+ CUDA_HOST_NOKERN(name, complex32, complex32, complex32) \
1215
+ CUDA_HOST_NOKERN(name, complex32, complex64, complex64) \
1216
+ CUDA_HOST_NOKERN(name, complex32, complex128, complex128) \
1217
+ \
1218
+ CUDA_HOST_NOKERN(name, complex64, uint8, complex64) \
1219
+ CUDA_HOST_NOKERN(name, complex64, uint16, complex64) \
1220
+ CUDA_HOST_NOKERN(name, complex64, uint32, complex128) \
1221
+ CUDA_HOST_NOKERN(name, complex64, int8, complex64) \
1222
+ CUDA_HOST_NOKERN(name, complex64, int16, complex64) \
1223
+ CUDA_HOST_NOKERN(name, complex64, int32, complex128) \
1224
+ CUDA_HOST_NOKERN(name, complex64, bfloat16, complex64) \
1225
+ CUDA_HOST_NOKERN(name, complex64, float16, complex64) \
1226
+ CUDA_HOST_NOKERN(name, complex64, float32, complex64) \
1227
+ CUDA_HOST_NOKERN(name, complex64, float64, complex128) \
1228
+ CUDA_HOST_NOKERN(name, complex64, complex32, complex64) \
1229
+ CUDA_HOST_NOKERN(name, complex64, complex64, complex64) \
1230
+ CUDA_HOST_NOKERN(name, complex64, complex128, complex128) \
1231
+ \
1232
+ CUDA_HOST_NOKERN(name, complex128, uint8, complex128) \
1233
+ CUDA_HOST_NOKERN(name, complex128, uint16, complex128) \
1234
+ CUDA_HOST_NOKERN(name, complex128, uint32, complex128) \
1235
+ CUDA_HOST_NOKERN(name, complex128, int8, complex128) \
1236
+ CUDA_HOST_NOKERN(name, complex128, int16, complex128) \
1237
+ CUDA_HOST_NOKERN(name, complex128, int32, complex128) \
1238
+ CUDA_HOST_NOKERN(name, complex128, bfloat16, complex128) \
1239
+ CUDA_HOST_NOKERN(name, complex128, float16, complex128) \
1240
+ CUDA_HOST_NOKERN(name, complex128, float32, complex128) \
1241
+ CUDA_HOST_NOKERN(name, complex128, float64, complex128) \
1242
+ CUDA_HOST_NOKERN(name, complex128, complex32, complex128) \
1243
+ CUDA_HOST_NOKERN(name, complex128, complex64, complex128) \
1244
+ CUDA_HOST_NOKERN(name, complex128, complex128, complex128)
1245
+
1246
/*
 * CUDA_HOST_ALL_ARITHMETIC_FLOAT_RETURN(name): expands to one helper-macro
 * invocation per (t0, t1, t2) type triple, forwarding `name` to each.
 *
 * Selection visible in this table:
 *   - triples pairing two integer types of at most 32 bits name float16,
 *     float32 or float64 as their third type and are emitted through
 *     CUDA_HOST_BINARY;
 *   - every triple with a uint64 or int64 operand is emitted through
 *     CUDA_HOST_NOKERN (its third argument remains uint64/int64);
 *   - every triple with a complex32 operand is emitted through
 *     CUDA_HOST_NOIMPL;
 *   - all remaining triples are emitted through CUDA_HOST_BINARY.
 *
 * CUDA_HOST_BINARY, CUDA_HOST_NOIMPL and CUDA_HOST_NOKERN are defined
 * outside this excerpt. Presumably t0/t1 are the operand types and t2 is
 * the result type -- TODO confirm against those definitions.
 */
+ #define CUDA_HOST_ALL_ARITHMETIC_FLOAT_RETURN(name) \
1247
+ CUDA_HOST_BINARY(name, uint8, uint8, float16) \
1248
+ CUDA_HOST_BINARY(name, uint8, uint16, float32) \
1249
+ CUDA_HOST_BINARY(name, uint8, uint32, float64) \
1250
+ CUDA_HOST_NOKERN(name, uint8, uint64, uint64) \
1251
+ CUDA_HOST_BINARY(name, uint8, int8, float16) \
1252
+ CUDA_HOST_BINARY(name, uint8, int16, float32) \
1253
+ CUDA_HOST_BINARY(name, uint8, int32, float64) \
1254
+ CUDA_HOST_NOKERN(name, uint8, int64, int64) \
1255
+ CUDA_HOST_BINARY(name, uint8, bfloat16, bfloat16) \
1256
+ CUDA_HOST_BINARY(name, uint8, float16, float16) \
1257
+ CUDA_HOST_BINARY(name, uint8, float32, float32) \
1258
+ CUDA_HOST_BINARY(name, uint8, float64, float64) \
1259
+ CUDA_HOST_NOIMPL(name, uint8, complex32, complex32) \
1260
+ CUDA_HOST_BINARY(name, uint8, complex64, complex64) \
1261
+ CUDA_HOST_BINARY(name, uint8, complex128, complex128) \
1262
+ \
1263
+ CUDA_HOST_BINARY(name, uint16, uint8, float32) \
1264
+ CUDA_HOST_BINARY(name, uint16, uint16, float32) \
1265
+ CUDA_HOST_BINARY(name, uint16, uint32, float64) \
1266
+ CUDA_HOST_NOKERN(name, uint16, uint64, uint64) \
1267
+ CUDA_HOST_BINARY(name, uint16, int8, float32) \
1268
+ CUDA_HOST_BINARY(name, uint16, int16, float32) \
1269
+ CUDA_HOST_BINARY(name, uint16, int32, float64) \
1270
+ CUDA_HOST_NOKERN(name, uint16, int64, int64) \
1271
+ CUDA_HOST_BINARY(name, uint16, bfloat16, float32) \
1272
+ CUDA_HOST_BINARY(name, uint16, float16, float32) \
1273
+ CUDA_HOST_BINARY(name, uint16, float32, float32) \
1274
+ CUDA_HOST_BINARY(name, uint16, float64, float64) \
1275
+ CUDA_HOST_NOIMPL(name, uint16, complex32, complex64) \
1276
+ CUDA_HOST_BINARY(name, uint16, complex64, complex64) \
1277
+ CUDA_HOST_BINARY(name, uint16, complex128, complex128) \
1278
+ \
1279
+ CUDA_HOST_BINARY(name, uint32, uint8, float64) \
1280
+ CUDA_HOST_BINARY(name, uint32, uint16, float64) \
1281
+ CUDA_HOST_BINARY(name, uint32, uint32, float64) \
1282
+ CUDA_HOST_NOKERN(name, uint32, uint64, uint64) \
1283
+ CUDA_HOST_BINARY(name, uint32, int8, float64) \
1284
+ CUDA_HOST_BINARY(name, uint32, int16, float64) \
1285
+ CUDA_HOST_BINARY(name, uint32, int32, float64) \
1286
+ CUDA_HOST_NOKERN(name, uint32, int64, int64) \
1287
+ CUDA_HOST_BINARY(name, uint32, bfloat16, float64) \
1288
+ CUDA_HOST_BINARY(name, uint32, float16, float64) \
1289
+ CUDA_HOST_BINARY(name, uint32, float32, float64) \
1290
+ CUDA_HOST_BINARY(name, uint32, float64, float64) \
1291
+ CUDA_HOST_NOIMPL(name, uint32, complex32, complex128) \
1292
+ CUDA_HOST_BINARY(name, uint32, complex64, complex128) \
1293
+ CUDA_HOST_BINARY(name, uint32, complex128, complex128) \
1294
+ \
1295
+ CUDA_HOST_NOKERN(name, uint64, uint8, uint64) \
1296
+ CUDA_HOST_NOKERN(name, uint64, uint16, uint64) \
1297
+ CUDA_HOST_NOKERN(name, uint64, uint32, uint64) \
1298
+ CUDA_HOST_NOKERN(name, uint64, uint64, uint64) \
1299
+ \
1300
+ CUDA_HOST_BINARY(name, int8, uint8, float16) \
1301
+ CUDA_HOST_BINARY(name, int8, uint16, float32) \
1302
+ CUDA_HOST_BINARY(name, int8, uint32, float64) \
1303
+ CUDA_HOST_BINARY(name, int8, int8, float16) \
1304
+ CUDA_HOST_BINARY(name, int8, int16, float32) \
1305
+ CUDA_HOST_BINARY(name, int8, int32, float64) \
1306
+ CUDA_HOST_NOKERN(name, int8, int64, int64) \
1307
+ CUDA_HOST_BINARY(name, int8, bfloat16, bfloat16) \
1308
+ CUDA_HOST_BINARY(name, int8, float16, float16) \
1309
+ CUDA_HOST_BINARY(name, int8, float32, float32) \
1310
+ CUDA_HOST_BINARY(name, int8, float64, float64) \
1311
+ CUDA_HOST_NOIMPL(name, int8, complex32, complex32) \
1312
+ CUDA_HOST_BINARY(name, int8, complex64, complex64) \
1313
+ CUDA_HOST_BINARY(name, int8, complex128, complex128) \
1314
+ \
1315
+ CUDA_HOST_BINARY(name, int16, uint8, float32) \
1316
+ CUDA_HOST_BINARY(name, int16, uint16, float32) \
1317
+ CUDA_HOST_BINARY(name, int16, uint32, float64) \
1318
+ CUDA_HOST_BINARY(name, int16, int8, float32) \
1319
+ CUDA_HOST_BINARY(name, int16, int16, float32) \
1320
+ CUDA_HOST_BINARY(name, int16, int32, float64) \
1321
+ CUDA_HOST_NOKERN(name, int16, int64, int64) \
1322
+ CUDA_HOST_BINARY(name, int16, bfloat16, float32) \
1323
+ CUDA_HOST_BINARY(name, int16, float16, float32) \
1324
+ CUDA_HOST_BINARY(name, int16, float32, float32) \
1325
+ CUDA_HOST_BINARY(name, int16, float64, float64) \
1326
+ CUDA_HOST_NOIMPL(name, int16, complex32, complex64) \
1327
+ CUDA_HOST_BINARY(name, int16, complex64, complex64) \
1328
+ CUDA_HOST_BINARY(name, int16, complex128, complex128) \
1329
+ \
1330
+ CUDA_HOST_BINARY(name, int32, uint8, float64) \
1331
+ CUDA_HOST_BINARY(name, int32, uint16, float64) \
1332
+ CUDA_HOST_BINARY(name, int32, uint32, float64) \
1333
+ CUDA_HOST_BINARY(name, int32, int8, float64) \
1334
+ CUDA_HOST_BINARY(name, int32, int16, float64) \
1335
+ CUDA_HOST_BINARY(name, int32, int32, float64) \
1336
+ CUDA_HOST_NOKERN(name, int32, int64, int64) \
1337
+ CUDA_HOST_BINARY(name, int32, bfloat16, float64) \
1338
+ CUDA_HOST_BINARY(name, int32, float16, float64) \
1339
+ CUDA_HOST_BINARY(name, int32, float32, float64) \
1340
+ CUDA_HOST_BINARY(name, int32, float64, float64) \
1341
+ CUDA_HOST_NOIMPL(name, int32, complex32, complex128) \
1342
+ CUDA_HOST_BINARY(name, int32, complex64, complex128) \
1343
+ CUDA_HOST_BINARY(name, int32, complex128, complex128) \
1344
+ \
1345
+ CUDA_HOST_NOKERN(name, int64, uint8, int64) \
1346
+ CUDA_HOST_NOKERN(name, int64, uint16, int64) \
1347
+ CUDA_HOST_NOKERN(name, int64, uint32, int64) \
1348
+ CUDA_HOST_NOKERN(name, int64, int8, int64) \
1349
+ CUDA_HOST_NOKERN(name, int64, int16, int64) \
1350
+ CUDA_HOST_NOKERN(name, int64, int32, int64) \
1351
+ CUDA_HOST_NOKERN(name, int64, int64, int64) \
1352
+ \
1353
+ CUDA_HOST_BINARY(name, bfloat16, uint8, bfloat16) \
1354
+ CUDA_HOST_BINARY(name, bfloat16, uint16, float32) \
1355
+ CUDA_HOST_BINARY(name, bfloat16, uint32, float64) \
1356
+ CUDA_HOST_BINARY(name, bfloat16, int8, bfloat16) \
1357
+ CUDA_HOST_BINARY(name, bfloat16, int16, float32) \
1358
+ CUDA_HOST_BINARY(name, bfloat16, int32, float64) \
1359
+ CUDA_HOST_BINARY(name, bfloat16, bfloat16, bfloat16) \
1360
+ CUDA_HOST_BINARY(name, bfloat16, float16, float32) \
1361
+ CUDA_HOST_BINARY(name, bfloat16, float32, float32) \
1362
+ CUDA_HOST_BINARY(name, bfloat16, float64, float64) \
1363
+ CUDA_HOST_NOIMPL(name, bfloat16, complex32, complex64) \
1364
+ CUDA_HOST_BINARY(name, bfloat16, complex64, complex64) \
1365
+ CUDA_HOST_BINARY(name, bfloat16, complex128, complex128) \
1366
+ \
1367
+ CUDA_HOST_BINARY(name, float16, uint8, float16) \
1368
+ CUDA_HOST_BINARY(name, float16, uint16, float32) \
1369
+ CUDA_HOST_BINARY(name, float16, uint32, float64) \
1370
+ CUDA_HOST_BINARY(name, float16, int8, float16) \
1371
+ CUDA_HOST_BINARY(name, float16, int16, float32) \
1372
+ CUDA_HOST_BINARY(name, float16, int32, float64) \
1373
+ CUDA_HOST_BINARY(name, float16, bfloat16, float32) \
1374
+ CUDA_HOST_BINARY(name, float16, float16, float16) \
1375
+ CUDA_HOST_BINARY(name, float16, float32, float32) \
1376
+ CUDA_HOST_BINARY(name, float16, float64, float64) \
1377
+ CUDA_HOST_NOIMPL(name, float16, complex32, complex32) \
1378
+ CUDA_HOST_BINARY(name, float16, complex64, complex64) \
1379
+ CUDA_HOST_BINARY(name, float16, complex128, complex128) \
1380
+ \
1381
+ CUDA_HOST_BINARY(name, float32, uint8, float32) \
1382
+ CUDA_HOST_BINARY(name, float32, uint16, float32) \
1383
+ CUDA_HOST_BINARY(name, float32, uint32, float64) \
1384
+ CUDA_HOST_BINARY(name, float32, int8, float32) \
1385
+ CUDA_HOST_BINARY(name, float32, int16, float32) \
1386
+ CUDA_HOST_BINARY(name, float32, int32, float64) \
1387
+ CUDA_HOST_BINARY(name, float32, bfloat16, float32) \
1388
+ CUDA_HOST_BINARY(name, float32, float16, float32) \
1389
+ CUDA_HOST_BINARY(name, float32, float32, float32) \
1390
+ CUDA_HOST_BINARY(name, float32, float64, float64) \
1391
+ CUDA_HOST_NOIMPL(name, float32, complex32, complex64) \
1392
+ CUDA_HOST_BINARY(name, float32, complex64, complex64) \
1393
+ CUDA_HOST_BINARY(name, float32, complex128, complex128) \
1394
+ \
1395
+ CUDA_HOST_BINARY(name, float64, uint8, float64) \
1396
+ CUDA_HOST_BINARY(name, float64, uint16, float64) \
1397
+ CUDA_HOST_BINARY(name, float64, uint32, float64) \
1398
+ CUDA_HOST_BINARY(name, float64, int8, float64) \
1399
+ CUDA_HOST_BINARY(name, float64, int16, float64) \
1400
+ CUDA_HOST_BINARY(name, float64, int32, float64) \
1401
+ CUDA_HOST_BINARY(name, float64, bfloat16, float64) \
1402
+ CUDA_HOST_BINARY(name, float64, float16, float64) \
1403
+ CUDA_HOST_BINARY(name, float64, float32, float64) \
1404
+ CUDA_HOST_BINARY(name, float64, float64, float64) \
1405
+ CUDA_HOST_NOIMPL(name, float64, complex32, complex128) \
1406
+ CUDA_HOST_BINARY(name, float64, complex64, complex128) \
1407
+ CUDA_HOST_BINARY(name, float64, complex128, complex128) \
1408
+ \
1409
+ CUDA_HOST_NOIMPL(name, complex32, uint8, complex32) \
1410
+ CUDA_HOST_NOIMPL(name, complex32, uint16, complex64) \
1411
+ CUDA_HOST_NOIMPL(name, complex32, uint32, complex128) \
1412
+ CUDA_HOST_NOIMPL(name, complex32, int8, complex32) \
1413
+ CUDA_HOST_NOIMPL(name, complex32, int16, complex64) \
1414
+ CUDA_HOST_NOIMPL(name, complex32, int32, complex128) \
1415
+ CUDA_HOST_NOIMPL(name, complex32, bfloat16, complex64) \
1416
+ CUDA_HOST_NOIMPL(name, complex32, float16, complex32) \
1417
+ CUDA_HOST_NOIMPL(name, complex32, float32, complex64) \
1418
+ CUDA_HOST_NOIMPL(name, complex32, float64, complex128) \
1419
+ CUDA_HOST_NOIMPL(name, complex32, complex32, complex32) \
1420
+ CUDA_HOST_NOIMPL(name, complex32, complex64, complex64) \
1421
+ CUDA_HOST_NOIMPL(name, complex32, complex128, complex128) \
1422
+ \
1423
+ CUDA_HOST_BINARY(name, complex64, uint8, complex64) \
1424
+ CUDA_HOST_BINARY(name, complex64, uint16, complex64) \
1425
+ CUDA_HOST_BINARY(name, complex64, uint32, complex128) \
1426
+ CUDA_HOST_BINARY(name, complex64, int8, complex64) \
1427
+ CUDA_HOST_BINARY(name, complex64, int16, complex64) \
1428
+ CUDA_HOST_BINARY(name, complex64, int32, complex128) \
1429
+ CUDA_HOST_BINARY(name, complex64, bfloat16, complex64) \
1430
+ CUDA_HOST_BINARY(name, complex64, float16, complex64) \
1431
+ CUDA_HOST_BINARY(name, complex64, float32, complex64) \
1432
+ CUDA_HOST_BINARY(name, complex64, float64, complex128) \
1433
+ CUDA_HOST_NOIMPL(name, complex64, complex32, complex64) \
1434
+ CUDA_HOST_BINARY(name, complex64, complex64, complex64) \
1435
+ CUDA_HOST_BINARY(name, complex64, complex128, complex128) \
1436
+ \
1437
+ CUDA_HOST_BINARY(name, complex128, uint8, complex128) \
1438
+ CUDA_HOST_BINARY(name, complex128, uint16, complex128) \
1439
+ CUDA_HOST_BINARY(name, complex128, uint32, complex128) \
1440
+ CUDA_HOST_BINARY(name, complex128, int8, complex128) \
1441
+ CUDA_HOST_BINARY(name, complex128, int16, complex128) \
1442
+ CUDA_HOST_BINARY(name, complex128, int32, complex128) \
1443
+ CUDA_HOST_BINARY(name, complex128, bfloat16, complex128) \
1444
+ CUDA_HOST_BINARY(name, complex128, float16, complex128) \
1445
+ CUDA_HOST_BINARY(name, complex128, float32, complex128) \
1446
+ CUDA_HOST_BINARY(name, complex128, float64, complex128) \
1447
+ CUDA_HOST_NOIMPL(name, complex128, complex32, complex128) \
1448
+ CUDA_HOST_BINARY(name, complex128, complex64, complex128) \
1449
+ CUDA_HOST_BINARY(name, complex128, complex128, complex128)
1450
+
1451
/*
 * CUDA_HOST_ALL_ARITHMETIC_INIT(name)
 *
 * Expands to a comma-separated list of CUDA_HOST_BINARY_INIT(name, t0, t1, t2)
 * entries, one per operand type pair listed below. The final entry has no
 * trailing comma, so the caller supplies whatever follows the expansion.
 *
 * Layout of the table, as written here:
 *   - Entries are grouped by the first type argument, from uint8 through
 *     complex128, with a blank continuation line between groups.
 *   - The third type argument widens with the operands, e.g.
 *     (uint8, int8) -> int16 and (uint32, float32) -> float64.
 *   - uint64 is paired only with unsigned integer types; int64 is paired
 *     only with integer types other than uint64.
 *   - complex32 combinations are part of this list.
 *
 * NOTE(review): t2 is presumably the result type of the kernel, and the
 * expansion is presumably placed inside an initializer list. Neither
 * CUDA_HOST_BINARY_INIT nor the use site is visible in this section --
 * confirm against their definitions before relying on this.
 */
+ #define CUDA_HOST_ALL_ARITHMETIC_INIT(name) \
1452
+ CUDA_HOST_BINARY_INIT(name, uint8, uint8, uint8), \
1453
+ CUDA_HOST_BINARY_INIT(name, uint8, uint16, uint16), \
1454
+ CUDA_HOST_BINARY_INIT(name, uint8, uint32, uint32), \
1455
+ CUDA_HOST_BINARY_INIT(name, uint8, uint64, uint64), \
1456
+ CUDA_HOST_BINARY_INIT(name, uint8, int8, int16), \
1457
+ CUDA_HOST_BINARY_INIT(name, uint8, int16, int16), \
1458
+ CUDA_HOST_BINARY_INIT(name, uint8, int32, int32), \
1459
+ CUDA_HOST_BINARY_INIT(name, uint8, int64, int64), \
1460
+ CUDA_HOST_BINARY_INIT(name, uint8, bfloat16, bfloat16), \
1461
+ CUDA_HOST_BINARY_INIT(name, uint8, float16, float16), \
1462
+ CUDA_HOST_BINARY_INIT(name, uint8, float32, float32), \
1463
+ CUDA_HOST_BINARY_INIT(name, uint8, float64, float64), \
1464
+ CUDA_HOST_BINARY_INIT(name, uint8, complex32, complex32), \
1465
+ CUDA_HOST_BINARY_INIT(name, uint8, complex64, complex64), \
1466
+ CUDA_HOST_BINARY_INIT(name, uint8, complex128, complex128), \
1467
+ \
1468
+ CUDA_HOST_BINARY_INIT(name, uint16, uint8, uint16), \
1469
+ CUDA_HOST_BINARY_INIT(name, uint16, uint16, uint16), \
1470
+ CUDA_HOST_BINARY_INIT(name, uint16, uint32, uint32), \
1471
+ CUDA_HOST_BINARY_INIT(name, uint16, uint64, uint64), \
1472
+ CUDA_HOST_BINARY_INIT(name, uint16, int8, int32), \
1473
+ CUDA_HOST_BINARY_INIT(name, uint16, int16, int32), \
1474
+ CUDA_HOST_BINARY_INIT(name, uint16, int32, int32), \
1475
+ CUDA_HOST_BINARY_INIT(name, uint16, int64, int64), \
1476
+ CUDA_HOST_BINARY_INIT(name, uint16, bfloat16, float32), \
1477
+ CUDA_HOST_BINARY_INIT(name, uint16, float16, float32), \
1478
+ CUDA_HOST_BINARY_INIT(name, uint16, float32, float32), \
1479
+ CUDA_HOST_BINARY_INIT(name, uint16, float64, float64), \
1480
+ CUDA_HOST_BINARY_INIT(name, uint16, complex32, complex64), \
1481
+ CUDA_HOST_BINARY_INIT(name, uint16, complex64, complex64), \
1482
+ CUDA_HOST_BINARY_INIT(name, uint16, complex128, complex128), \
1483
+ \
1484
+ CUDA_HOST_BINARY_INIT(name, uint32, uint8, uint32), \
1485
+ CUDA_HOST_BINARY_INIT(name, uint32, uint16, uint32), \
1486
+ CUDA_HOST_BINARY_INIT(name, uint32, uint32, uint32), \
1487
+ CUDA_HOST_BINARY_INIT(name, uint32, uint64, uint64), \
1488
+ CUDA_HOST_BINARY_INIT(name, uint32, int8, int64), \
1489
+ CUDA_HOST_BINARY_INIT(name, uint32, int16, int64), \
1490
+ CUDA_HOST_BINARY_INIT(name, uint32, int32, int64), \
1491
+ CUDA_HOST_BINARY_INIT(name, uint32, int64, int64), \
1492
+ CUDA_HOST_BINARY_INIT(name, uint32, bfloat16, float64), \
1493
+ CUDA_HOST_BINARY_INIT(name, uint32, float16, float64), \
1494
+ CUDA_HOST_BINARY_INIT(name, uint32, float32, float64), \
1495
+ CUDA_HOST_BINARY_INIT(name, uint32, float64, float64), \
1496
+ CUDA_HOST_BINARY_INIT(name, uint32, complex32, complex128), \
1497
+ CUDA_HOST_BINARY_INIT(name, uint32, complex64, complex128), \
1498
+ CUDA_HOST_BINARY_INIT(name, uint32, complex128, complex128), \
1499
+ \
1500
+ CUDA_HOST_BINARY_INIT(name, uint64, uint8, uint64), \
1501
+ CUDA_HOST_BINARY_INIT(name, uint64, uint16, uint64), \
1502
+ CUDA_HOST_BINARY_INIT(name, uint64, uint32, uint64), \
1503
+ CUDA_HOST_BINARY_INIT(name, uint64, uint64, uint64), \
1504
+ \
1505
+ CUDA_HOST_BINARY_INIT(name, int8, uint8, int16), \
1506
+ CUDA_HOST_BINARY_INIT(name, int8, uint16, int32), \
1507
+ CUDA_HOST_BINARY_INIT(name, int8, uint32, int64), \
1508
+ CUDA_HOST_BINARY_INIT(name, int8, int8, int8), \
1509
+ CUDA_HOST_BINARY_INIT(name, int8, int16, int16), \
1510
+ CUDA_HOST_BINARY_INIT(name, int8, int32, int32), \
1511
+ CUDA_HOST_BINARY_INIT(name, int8, int64, int64), \
1512
+ CUDA_HOST_BINARY_INIT(name, int8, bfloat16, bfloat16), \
1513
+ CUDA_HOST_BINARY_INIT(name, int8, float16, float16), \
1514
+ CUDA_HOST_BINARY_INIT(name, int8, float32, float32), \
1515
+ CUDA_HOST_BINARY_INIT(name, int8, float64, float64), \
1516
+ CUDA_HOST_BINARY_INIT(name, int8, complex32, complex32), \
1517
+ CUDA_HOST_BINARY_INIT(name, int8, complex64, complex64), \
1518
+ CUDA_HOST_BINARY_INIT(name, int8, complex128, complex128), \
1519
+ \
1520
+ CUDA_HOST_BINARY_INIT(name, int16, uint8, int16), \
1521
+ CUDA_HOST_BINARY_INIT(name, int16, uint16, int32), \
1522
+ CUDA_HOST_BINARY_INIT(name, int16, uint32, int64), \
1523
+ CUDA_HOST_BINARY_INIT(name, int16, int8, int16), \
1524
+ CUDA_HOST_BINARY_INIT(name, int16, int16, int16), \
1525
+ CUDA_HOST_BINARY_INIT(name, int16, int32, int32), \
1526
+ CUDA_HOST_BINARY_INIT(name, int16, int64, int64), \
1527
+ CUDA_HOST_BINARY_INIT(name, int16, bfloat16, float32), \
1528
+ CUDA_HOST_BINARY_INIT(name, int16, float16, float32), \
1529
+ CUDA_HOST_BINARY_INIT(name, int16, float32, float32), \
1530
+ CUDA_HOST_BINARY_INIT(name, int16, float64, float64), \
1531
+ CUDA_HOST_BINARY_INIT(name, int16, complex32, complex64), \
1532
+ CUDA_HOST_BINARY_INIT(name, int16, complex64, complex64), \
1533
+ CUDA_HOST_BINARY_INIT(name, int16, complex128, complex128), \
1534
+ \
1535
+ CUDA_HOST_BINARY_INIT(name, int32, uint8, int32), \
1536
+ CUDA_HOST_BINARY_INIT(name, int32, uint16, int32), \
1537
+ CUDA_HOST_BINARY_INIT(name, int32, uint32, int64), \
1538
+ CUDA_HOST_BINARY_INIT(name, int32, int8, int32), \
1539
+ CUDA_HOST_BINARY_INIT(name, int32, int16, int32), \
1540
+ CUDA_HOST_BINARY_INIT(name, int32, int32, int32), \
1541
+ CUDA_HOST_BINARY_INIT(name, int32, int64, int64), \
1542
+ CUDA_HOST_BINARY_INIT(name, int32, bfloat16, float64), \
1543
+ CUDA_HOST_BINARY_INIT(name, int32, float16, float64), \
1544
+ CUDA_HOST_BINARY_INIT(name, int32, float32, float64), \
1545
+ CUDA_HOST_BINARY_INIT(name, int32, float64, float64), \
1546
+ CUDA_HOST_BINARY_INIT(name, int32, complex32, complex128), \
1547
+ CUDA_HOST_BINARY_INIT(name, int32, complex64, complex128), \
1548
+ CUDA_HOST_BINARY_INIT(name, int32, complex128, complex128), \
1549
+ \
1550
+ CUDA_HOST_BINARY_INIT(name, int64, uint8, int64), \
1551
+ CUDA_HOST_BINARY_INIT(name, int64, uint16, int64), \
1552
+ CUDA_HOST_BINARY_INIT(name, int64, uint32, int64), \
1553
+ CUDA_HOST_BINARY_INIT(name, int64, int8, int64), \
1554
+ CUDA_HOST_BINARY_INIT(name, int64, int16, int64), \
1555
+ CUDA_HOST_BINARY_INIT(name, int64, int32, int64), \
1556
+ CUDA_HOST_BINARY_INIT(name, int64, int64, int64), \
1557
+ \
1558
+ CUDA_HOST_BINARY_INIT(name, bfloat16, uint8, bfloat16), \
1559
+ CUDA_HOST_BINARY_INIT(name, bfloat16, uint16, float32), \
1560
+ CUDA_HOST_BINARY_INIT(name, bfloat16, uint32, float64), \
1561
+ CUDA_HOST_BINARY_INIT(name, bfloat16, int8, bfloat16), \
1562
+ CUDA_HOST_BINARY_INIT(name, bfloat16, int16, float32), \
1563
+ CUDA_HOST_BINARY_INIT(name, bfloat16, int32, float64), \
1564
+ CUDA_HOST_BINARY_INIT(name, bfloat16, bfloat16, bfloat16), \
1565
+ CUDA_HOST_BINARY_INIT(name, bfloat16, float16, float32), \
1566
+ CUDA_HOST_BINARY_INIT(name, bfloat16, float32, float32), \
1567
+ CUDA_HOST_BINARY_INIT(name, bfloat16, float64, float64), \
1568
+ CUDA_HOST_BINARY_INIT(name, bfloat16, complex32, complex64), \
1569
+ CUDA_HOST_BINARY_INIT(name, bfloat16, complex64, complex64), \
1570
+ CUDA_HOST_BINARY_INIT(name, bfloat16, complex128, complex128), \
1571
+ \
1572
+ CUDA_HOST_BINARY_INIT(name, float16, uint8, float16), \
1573
+ CUDA_HOST_BINARY_INIT(name, float16, uint16, float32), \
1574
+ CUDA_HOST_BINARY_INIT(name, float16, uint32, float64), \
1575
+ CUDA_HOST_BINARY_INIT(name, float16, int8, float16), \
1576
+ CUDA_HOST_BINARY_INIT(name, float16, int16, float32), \
1577
+ CUDA_HOST_BINARY_INIT(name, float16, int32, float64), \
1578
+ CUDA_HOST_BINARY_INIT(name, float16, bfloat16, float32), \
1579
+ CUDA_HOST_BINARY_INIT(name, float16, float16, float16), \
1580
+ CUDA_HOST_BINARY_INIT(name, float16, float32, float32), \
1581
+ CUDA_HOST_BINARY_INIT(name, float16, float64, float64), \
1582
+ CUDA_HOST_BINARY_INIT(name, float16, complex32, complex32), \
1583
+ CUDA_HOST_BINARY_INIT(name, float16, complex64, complex64), \
1584
+ CUDA_HOST_BINARY_INIT(name, float16, complex128, complex128), \
1585
+ \
1586
+ CUDA_HOST_BINARY_INIT(name, float32, uint8, float32), \
1587
+ CUDA_HOST_BINARY_INIT(name, float32, uint16, float32), \
1588
+ CUDA_HOST_BINARY_INIT(name, float32, uint32, float64), \
1589
+ CUDA_HOST_BINARY_INIT(name, float32, int8, float32), \
1590
+ CUDA_HOST_BINARY_INIT(name, float32, int16, float32), \
1591
+ CUDA_HOST_BINARY_INIT(name, float32, int32, float64), \
1592
+ CUDA_HOST_BINARY_INIT(name, float32, bfloat16, float32), \
1593
+ CUDA_HOST_BINARY_INIT(name, float32, float16, float32), \
1594
+ CUDA_HOST_BINARY_INIT(name, float32, float32, float32), \
1595
+ CUDA_HOST_BINARY_INIT(name, float32, float64, float64), \
1596
+ CUDA_HOST_BINARY_INIT(name, float32, complex32, complex64), \
1597
+ CUDA_HOST_BINARY_INIT(name, float32, complex64, complex64), \
1598
+ CUDA_HOST_BINARY_INIT(name, float32, complex128, complex128), \
1599
+ \
1600
+ CUDA_HOST_BINARY_INIT(name, float64, uint8, float64), \
1601
+ CUDA_HOST_BINARY_INIT(name, float64, uint16, float64), \
1602
+ CUDA_HOST_BINARY_INIT(name, float64, uint32, float64), \
1603
+ CUDA_HOST_BINARY_INIT(name, float64, int8, float64), \
1604
+ CUDA_HOST_BINARY_INIT(name, float64, int16, float64), \
1605
+ CUDA_HOST_BINARY_INIT(name, float64, int32, float64), \
1606
+ CUDA_HOST_BINARY_INIT(name, float64, bfloat16, float64), \
1607
+ CUDA_HOST_BINARY_INIT(name, float64, float16, float64), \
1608
+ CUDA_HOST_BINARY_INIT(name, float64, float32, float64), \
1609
+ CUDA_HOST_BINARY_INIT(name, float64, float64, float64), \
1610
+ CUDA_HOST_BINARY_INIT(name, float64, complex32, complex128), \
1611
+ CUDA_HOST_BINARY_INIT(name, float64, complex64, complex128), \
1612
+ CUDA_HOST_BINARY_INIT(name, float64, complex128, complex128), \
1613
+ \
1614
+ CUDA_HOST_BINARY_INIT(name, complex32, uint8, complex32), \
1615
+ CUDA_HOST_BINARY_INIT(name, complex32, uint16, complex64), \
1616
+ CUDA_HOST_BINARY_INIT(name, complex32, uint32, complex128), \
1617
+ CUDA_HOST_BINARY_INIT(name, complex32, int8, complex32), \
1618
+ CUDA_HOST_BINARY_INIT(name, complex32, int16, complex64), \
1619
+ CUDA_HOST_BINARY_INIT(name, complex32, int32, complex128), \
1620
+ CUDA_HOST_BINARY_INIT(name, complex32, bfloat16, complex64), \
1621
+ CUDA_HOST_BINARY_INIT(name, complex32, float16, complex32), \
1622
+ CUDA_HOST_BINARY_INIT(name, complex32, float32, complex64), \
1623
+ CUDA_HOST_BINARY_INIT(name, complex32, float64, complex128), \
1624
+ CUDA_HOST_BINARY_INIT(name, complex32, complex32, complex32), \
1625
+ CUDA_HOST_BINARY_INIT(name, complex32, complex64, complex64), \
1626
+ CUDA_HOST_BINARY_INIT(name, complex32, complex128, complex128), \
1627
+ \
1628
+ CUDA_HOST_BINARY_INIT(name, complex64, uint8, complex64), \
1629
+ CUDA_HOST_BINARY_INIT(name, complex64, uint16, complex64), \
1630
+ CUDA_HOST_BINARY_INIT(name, complex64, uint32, complex128), \
1631
+ CUDA_HOST_BINARY_INIT(name, complex64, int8, complex64), \
1632
+ CUDA_HOST_BINARY_INIT(name, complex64, int16, complex64), \
1633
+ CUDA_HOST_BINARY_INIT(name, complex64, int32, complex128), \
1634
+ CUDA_HOST_BINARY_INIT(name, complex64, bfloat16, complex64), \
1635
+ CUDA_HOST_BINARY_INIT(name, complex64, float16, complex64), \
1636
+ CUDA_HOST_BINARY_INIT(name, complex64, float32, complex64), \
1637
+ CUDA_HOST_BINARY_INIT(name, complex64, float64, complex128), \
1638
+ CUDA_HOST_BINARY_INIT(name, complex64, complex32, complex64), \
1639
+ CUDA_HOST_BINARY_INIT(name, complex64, complex64, complex64), \
1640
+ CUDA_HOST_BINARY_INIT(name, complex64, complex128, complex128), \
1641
+ \
1642
+ CUDA_HOST_BINARY_INIT(name, complex128, uint8, complex128), \
1643
+ CUDA_HOST_BINARY_INIT(name, complex128, uint16, complex128), \
1644
+ CUDA_HOST_BINARY_INIT(name, complex128, uint32, complex128), \
1645
+ CUDA_HOST_BINARY_INIT(name, complex128, int8, complex128), \
1646
+ CUDA_HOST_BINARY_INIT(name, complex128, int16, complex128), \
1647
+ CUDA_HOST_BINARY_INIT(name, complex128, int32, complex128), \
1648
+ CUDA_HOST_BINARY_INIT(name, complex128, bfloat16, complex128), \
1649
+ CUDA_HOST_BINARY_INIT(name, complex128, float16, complex128), \
1650
+ CUDA_HOST_BINARY_INIT(name, complex128, float32, complex128), \
1651
+ CUDA_HOST_BINARY_INIT(name, complex128, float64, complex128), \
1652
+ CUDA_HOST_BINARY_INIT(name, complex128, complex32, complex128), \
1653
+ CUDA_HOST_BINARY_INIT(name, complex128, complex64, complex128), \
1654
+ CUDA_HOST_BINARY_INIT(name, complex128, complex128, complex128)
1655
+
1656
/*
 * CUDA_HOST_ALL_ARITHMETIC_FLOAT_RETURN_INIT(name)
 *
 * Expands to a comma-separated list of CUDA_HOST_BINARY_INIT(name, t0, t1, t2)
 * entries, grouped by the first type argument, with no trailing comma after
 * the final entry.
 *
 * How this table differs from CUDA_HOST_ALL_ARITHMETIC_INIT:
 *   - Pairs of integer types of at most 32 bits carry a floating-point third
 *     type: float16 when both are 8-bit, float32 once a 16-bit type is
 *     involved, float64 once a 32-bit type is involved.
 *   - Pairs involving uint64 or int64 keep an integer third type
 *     (uint64 or int64 respectively).
 *   - Entries with a floating-point or complex operand are the same as in
 *     CUDA_HOST_ALL_ARITHMETIC_INIT.
 *
 * NOTE(review): t2 is presumably the result type. This list presumably pairs
 * with CUDA_HOST_ALL_ARITHMETIC_FLOAT_RETURN, which this file instantiates
 * for divide; the use site of this INIT macro is not visible in this
 * section -- confirm.
 */
+ #define CUDA_HOST_ALL_ARITHMETIC_FLOAT_RETURN_INIT(name) \
1657
+ CUDA_HOST_BINARY_INIT(name, uint8, uint8, float16), \
1658
+ CUDA_HOST_BINARY_INIT(name, uint8, uint16, float32), \
1659
+ CUDA_HOST_BINARY_INIT(name, uint8, uint32, float64), \
1660
+ CUDA_HOST_BINARY_INIT(name, uint8, uint64, uint64), \
1661
+ CUDA_HOST_BINARY_INIT(name, uint8, int8, float16), \
1662
+ CUDA_HOST_BINARY_INIT(name, uint8, int16, float32), \
1663
+ CUDA_HOST_BINARY_INIT(name, uint8, int32, float64), \
1664
+ CUDA_HOST_BINARY_INIT(name, uint8, int64, int64), \
1665
+ CUDA_HOST_BINARY_INIT(name, uint8, bfloat16, bfloat16), \
1666
+ CUDA_HOST_BINARY_INIT(name, uint8, float16, float16), \
1667
+ CUDA_HOST_BINARY_INIT(name, uint8, float32, float32), \
1668
+ CUDA_HOST_BINARY_INIT(name, uint8, float64, float64), \
1669
+ CUDA_HOST_BINARY_INIT(name, uint8, complex32, complex32), \
1670
+ CUDA_HOST_BINARY_INIT(name, uint8, complex64, complex64), \
1671
+ CUDA_HOST_BINARY_INIT(name, uint8, complex128, complex128), \
1672
+ \
1673
+ CUDA_HOST_BINARY_INIT(name, uint16, uint8, float32), \
1674
+ CUDA_HOST_BINARY_INIT(name, uint16, uint16, float32), \
1675
+ CUDA_HOST_BINARY_INIT(name, uint16, uint32, float64), \
1676
+ CUDA_HOST_BINARY_INIT(name, uint16, uint64, uint64), \
1677
+ CUDA_HOST_BINARY_INIT(name, uint16, int8, float32), \
1678
+ CUDA_HOST_BINARY_INIT(name, uint16, int16, float32), \
1679
+ CUDA_HOST_BINARY_INIT(name, uint16, int32, float64), \
1680
+ CUDA_HOST_BINARY_INIT(name, uint16, int64, int64), \
1681
+ CUDA_HOST_BINARY_INIT(name, uint16, bfloat16, float32), \
1682
+ CUDA_HOST_BINARY_INIT(name, uint16, float16, float32), \
1683
+ CUDA_HOST_BINARY_INIT(name, uint16, float32, float32), \
1684
+ CUDA_HOST_BINARY_INIT(name, uint16, float64, float64), \
1685
+ CUDA_HOST_BINARY_INIT(name, uint16, complex32, complex64), \
1686
+ CUDA_HOST_BINARY_INIT(name, uint16, complex64, complex64), \
1687
+ CUDA_HOST_BINARY_INIT(name, uint16, complex128, complex128), \
1688
+ \
1689
+ CUDA_HOST_BINARY_INIT(name, uint32, uint8, float64), \
1690
+ CUDA_HOST_BINARY_INIT(name, uint32, uint16, float64), \
1691
+ CUDA_HOST_BINARY_INIT(name, uint32, uint32, float64), \
1692
+ CUDA_HOST_BINARY_INIT(name, uint32, uint64, uint64), \
1693
+ CUDA_HOST_BINARY_INIT(name, uint32, int8, float64), \
1694
+ CUDA_HOST_BINARY_INIT(name, uint32, int16, float64), \
1695
+ CUDA_HOST_BINARY_INIT(name, uint32, int32, float64), \
1696
+ CUDA_HOST_BINARY_INIT(name, uint32, int64, int64), \
1697
+ CUDA_HOST_BINARY_INIT(name, uint32, bfloat16, float64), \
1698
+ CUDA_HOST_BINARY_INIT(name, uint32, float16, float64), \
1699
+ CUDA_HOST_BINARY_INIT(name, uint32, float32, float64), \
1700
+ CUDA_HOST_BINARY_INIT(name, uint32, float64, float64), \
1701
+ CUDA_HOST_BINARY_INIT(name, uint32, complex32, complex128), \
1702
+ CUDA_HOST_BINARY_INIT(name, uint32, complex64, complex128), \
1703
+ CUDA_HOST_BINARY_INIT(name, uint32, complex128, complex128), \
1704
+ \
1705
+ CUDA_HOST_BINARY_INIT(name, uint64, uint8, uint64), \
1706
+ CUDA_HOST_BINARY_INIT(name, uint64, uint16, uint64), \
1707
+ CUDA_HOST_BINARY_INIT(name, uint64, uint32, uint64), \
1708
+ CUDA_HOST_BINARY_INIT(name, uint64, uint64, uint64), \
1709
+ \
1710
+ CUDA_HOST_BINARY_INIT(name, int8, uint8, float16), \
1711
+ CUDA_HOST_BINARY_INIT(name, int8, uint16, float32), \
1712
+ CUDA_HOST_BINARY_INIT(name, int8, uint32, float64), \
1713
+ CUDA_HOST_BINARY_INIT(name, int8, int8, float16), \
1714
+ CUDA_HOST_BINARY_INIT(name, int8, int16, float32), \
1715
+ CUDA_HOST_BINARY_INIT(name, int8, int32, float64), \
1716
+ CUDA_HOST_BINARY_INIT(name, int8, int64, int64), \
1717
+ CUDA_HOST_BINARY_INIT(name, int8, bfloat16, bfloat16), \
1718
+ CUDA_HOST_BINARY_INIT(name, int8, float16, float16), \
1719
+ CUDA_HOST_BINARY_INIT(name, int8, float32, float32), \
1720
+ CUDA_HOST_BINARY_INIT(name, int8, float64, float64), \
1721
+ CUDA_HOST_BINARY_INIT(name, int8, complex32, complex32), \
1722
+ CUDA_HOST_BINARY_INIT(name, int8, complex64, complex64), \
1723
+ CUDA_HOST_BINARY_INIT(name, int8, complex128, complex128), \
1724
+ \
1725
+ CUDA_HOST_BINARY_INIT(name, int16, uint8, float32), \
1726
+ CUDA_HOST_BINARY_INIT(name, int16, uint16, float32), \
1727
+ CUDA_HOST_BINARY_INIT(name, int16, uint32, float64), \
1728
+ CUDA_HOST_BINARY_INIT(name, int16, int8, float32), \
1729
+ CUDA_HOST_BINARY_INIT(name, int16, int16, float32), \
1730
+ CUDA_HOST_BINARY_INIT(name, int16, int32, float64), \
1731
+ CUDA_HOST_BINARY_INIT(name, int16, int64, int64), \
1732
+ CUDA_HOST_BINARY_INIT(name, int16, bfloat16, float32), \
1733
+ CUDA_HOST_BINARY_INIT(name, int16, float16, float32), \
1734
+ CUDA_HOST_BINARY_INIT(name, int16, float32, float32), \
1735
+ CUDA_HOST_BINARY_INIT(name, int16, float64, float64), \
1736
+ CUDA_HOST_BINARY_INIT(name, int16, complex32, complex64), \
1737
+ CUDA_HOST_BINARY_INIT(name, int16, complex64, complex64), \
1738
+ CUDA_HOST_BINARY_INIT(name, int16, complex128, complex128), \
1739
+ \
1740
+ CUDA_HOST_BINARY_INIT(name, int32, uint8, float64), \
1741
+ CUDA_HOST_BINARY_INIT(name, int32, uint16, float64), \
1742
+ CUDA_HOST_BINARY_INIT(name, int32, uint32, float64), \
1743
+ CUDA_HOST_BINARY_INIT(name, int32, int8, float64), \
1744
+ CUDA_HOST_BINARY_INIT(name, int32, int16, float64), \
1745
+ CUDA_HOST_BINARY_INIT(name, int32, int32, float64), \
1746
+ CUDA_HOST_BINARY_INIT(name, int32, int64, int64), \
1747
+ CUDA_HOST_BINARY_INIT(name, int32, bfloat16, float64), \
1748
+ CUDA_HOST_BINARY_INIT(name, int32, float16, float64), \
1749
+ CUDA_HOST_BINARY_INIT(name, int32, float32, float64), \
1750
+ CUDA_HOST_BINARY_INIT(name, int32, float64, float64), \
1751
+ CUDA_HOST_BINARY_INIT(name, int32, complex32, complex128), \
1752
+ CUDA_HOST_BINARY_INIT(name, int32, complex64, complex128), \
1753
+ CUDA_HOST_BINARY_INIT(name, int32, complex128, complex128), \
1754
+ \
1755
+ CUDA_HOST_BINARY_INIT(name, int64, uint8, int64), \
1756
+ CUDA_HOST_BINARY_INIT(name, int64, uint16, int64), \
1757
+ CUDA_HOST_BINARY_INIT(name, int64, uint32, int64), \
1758
+ CUDA_HOST_BINARY_INIT(name, int64, int8, int64), \
1759
+ CUDA_HOST_BINARY_INIT(name, int64, int16, int64), \
1760
+ CUDA_HOST_BINARY_INIT(name, int64, int32, int64), \
1761
+ CUDA_HOST_BINARY_INIT(name, int64, int64, int64), \
1762
+ \
1763
+ CUDA_HOST_BINARY_INIT(name, bfloat16, uint8, bfloat16), \
1764
+ CUDA_HOST_BINARY_INIT(name, bfloat16, uint16, float32), \
1765
+ CUDA_HOST_BINARY_INIT(name, bfloat16, uint32, float64), \
1766
+ CUDA_HOST_BINARY_INIT(name, bfloat16, int8, bfloat16), \
1767
+ CUDA_HOST_BINARY_INIT(name, bfloat16, int16, float32), \
1768
+ CUDA_HOST_BINARY_INIT(name, bfloat16, int32, float64), \
1769
+ CUDA_HOST_BINARY_INIT(name, bfloat16, bfloat16, bfloat16), \
1770
+ CUDA_HOST_BINARY_INIT(name, bfloat16, float16, float32), \
1771
+ CUDA_HOST_BINARY_INIT(name, bfloat16, float32, float32), \
1772
+ CUDA_HOST_BINARY_INIT(name, bfloat16, float64, float64), \
1773
+ CUDA_HOST_BINARY_INIT(name, bfloat16, complex32, complex64), \
1774
+ CUDA_HOST_BINARY_INIT(name, bfloat16, complex64, complex64), \
1775
+ CUDA_HOST_BINARY_INIT(name, bfloat16, complex128, complex128), \
1776
+ \
1777
+ CUDA_HOST_BINARY_INIT(name, float16, uint8, float16), \
1778
+ CUDA_HOST_BINARY_INIT(name, float16, uint16, float32), \
1779
+ CUDA_HOST_BINARY_INIT(name, float16, uint32, float64), \
1780
+ CUDA_HOST_BINARY_INIT(name, float16, int8, float16), \
1781
+ CUDA_HOST_BINARY_INIT(name, float16, int16, float32), \
1782
+ CUDA_HOST_BINARY_INIT(name, float16, int32, float64), \
1783
+ CUDA_HOST_BINARY_INIT(name, float16, bfloat16, float32), \
1784
+ CUDA_HOST_BINARY_INIT(name, float16, float16, float16), \
1785
+ CUDA_HOST_BINARY_INIT(name, float16, float32, float32), \
1786
+ CUDA_HOST_BINARY_INIT(name, float16, float64, float64), \
1787
+ CUDA_HOST_BINARY_INIT(name, float16, complex32, complex32), \
1788
+ CUDA_HOST_BINARY_INIT(name, float16, complex64, complex64), \
1789
+ CUDA_HOST_BINARY_INIT(name, float16, complex128, complex128), \
1790
+ \
1791
+ CUDA_HOST_BINARY_INIT(name, float32, uint8, float32), \
1792
+ CUDA_HOST_BINARY_INIT(name, float32, uint16, float32), \
1793
+ CUDA_HOST_BINARY_INIT(name, float32, uint32, float64), \
1794
+ CUDA_HOST_BINARY_INIT(name, float32, int8, float32), \
1795
+ CUDA_HOST_BINARY_INIT(name, float32, int16, float32), \
1796
+ CUDA_HOST_BINARY_INIT(name, float32, int32, float64), \
1797
+ CUDA_HOST_BINARY_INIT(name, float32, bfloat16, float32), \
1798
+ CUDA_HOST_BINARY_INIT(name, float32, float16, float32), \
1799
+ CUDA_HOST_BINARY_INIT(name, float32, float32, float32), \
1800
+ CUDA_HOST_BINARY_INIT(name, float32, float64, float64), \
1801
+ CUDA_HOST_BINARY_INIT(name, float32, complex32, complex64), \
1802
+ CUDA_HOST_BINARY_INIT(name, float32, complex64, complex64), \
1803
+ CUDA_HOST_BINARY_INIT(name, float32, complex128, complex128), \
1804
+ \
1805
+ CUDA_HOST_BINARY_INIT(name, float64, uint8, float64), \
1806
+ CUDA_HOST_BINARY_INIT(name, float64, uint16, float64), \
1807
+ CUDA_HOST_BINARY_INIT(name, float64, uint32, float64), \
1808
+ CUDA_HOST_BINARY_INIT(name, float64, int8, float64), \
1809
+ CUDA_HOST_BINARY_INIT(name, float64, int16, float64), \
1810
+ CUDA_HOST_BINARY_INIT(name, float64, int32, float64), \
1811
+ CUDA_HOST_BINARY_INIT(name, float64, bfloat16, float64), \
1812
+ CUDA_HOST_BINARY_INIT(name, float64, float16, float64), \
1813
+ CUDA_HOST_BINARY_INIT(name, float64, float32, float64), \
1814
+ CUDA_HOST_BINARY_INIT(name, float64, float64, float64), \
1815
+ CUDA_HOST_BINARY_INIT(name, float64, complex32, complex128), \
1816
+ CUDA_HOST_BINARY_INIT(name, float64, complex64, complex128), \
1817
+ CUDA_HOST_BINARY_INIT(name, float64, complex128, complex128), \
1818
+ \
1819
+ CUDA_HOST_BINARY_INIT(name, complex32, uint8, complex32), \
1820
+ CUDA_HOST_BINARY_INIT(name, complex32, uint16, complex64), \
1821
+ CUDA_HOST_BINARY_INIT(name, complex32, uint32, complex128), \
1822
+ CUDA_HOST_BINARY_INIT(name, complex32, int8, complex32), \
1823
+ CUDA_HOST_BINARY_INIT(name, complex32, int16, complex64), \
1824
+ CUDA_HOST_BINARY_INIT(name, complex32, int32, complex128), \
1825
+ CUDA_HOST_BINARY_INIT(name, complex32, bfloat16, complex64), \
1826
+ CUDA_HOST_BINARY_INIT(name, complex32, float16, complex32), \
1827
+ CUDA_HOST_BINARY_INIT(name, complex32, float32, complex64), \
1828
+ CUDA_HOST_BINARY_INIT(name, complex32, float64, complex128), \
1829
+ CUDA_HOST_BINARY_INIT(name, complex32, complex32, complex32), \
1830
+ CUDA_HOST_BINARY_INIT(name, complex32, complex64, complex64), \
1831
+ CUDA_HOST_BINARY_INIT(name, complex32, complex128, complex128), \
1832
+ \
1833
+ CUDA_HOST_BINARY_INIT(name, complex64, uint8, complex64), \
1834
+ CUDA_HOST_BINARY_INIT(name, complex64, uint16, complex64), \
1835
+ CUDA_HOST_BINARY_INIT(name, complex64, uint32, complex128), \
1836
+ CUDA_HOST_BINARY_INIT(name, complex64, int8, complex64), \
1837
+ CUDA_HOST_BINARY_INIT(name, complex64, int16, complex64), \
1838
+ CUDA_HOST_BINARY_INIT(name, complex64, int32, complex128), \
1839
+ CUDA_HOST_BINARY_INIT(name, complex64, bfloat16, complex64), \
1840
+ CUDA_HOST_BINARY_INIT(name, complex64, float16, complex64), \
1841
+ CUDA_HOST_BINARY_INIT(name, complex64, float32, complex64), \
1842
+ CUDA_HOST_BINARY_INIT(name, complex64, float64, complex128), \
1843
+ CUDA_HOST_BINARY_INIT(name, complex64, complex32, complex64), \
1844
+ CUDA_HOST_BINARY_INIT(name, complex64, complex64, complex64), \
1845
+ CUDA_HOST_BINARY_INIT(name, complex64, complex128, complex128), \
1846
+ \
1847
+ CUDA_HOST_BINARY_INIT(name, complex128, uint8, complex128), \
1848
+ CUDA_HOST_BINARY_INIT(name, complex128, uint16, complex128), \
1849
+ CUDA_HOST_BINARY_INIT(name, complex128, uint32, complex128), \
1850
+ CUDA_HOST_BINARY_INIT(name, complex128, int8, complex128), \
1851
+ CUDA_HOST_BINARY_INIT(name, complex128, int16, complex128), \
1852
+ CUDA_HOST_BINARY_INIT(name, complex128, int32, complex128), \
1853
+ CUDA_HOST_BINARY_INIT(name, complex128, bfloat16, complex128), \
1854
+ CUDA_HOST_BINARY_INIT(name, complex128, float16, complex128), \
1855
+ CUDA_HOST_BINARY_INIT(name, complex128, float32, complex128), \
1856
+ CUDA_HOST_BINARY_INIT(name, complex128, float64, complex128), \
1857
+ CUDA_HOST_BINARY_INIT(name, complex128, complex32, complex128), \
1858
+ CUDA_HOST_BINARY_INIT(name, complex128, complex64, complex128), \
1859
+ CUDA_HOST_BINARY_INIT(name, complex128, complex128, complex128)
1860
+
1861
+
1862
/*
 * Instantiate the arithmetic macro families, one invocation per operation:
 *   - add, subtract, multiply, power: CUDA_HOST_ALL_ARITHMETIC
 *   - floor_divide, remainder:        CUDA_HOST_ALL_ARITHMETIC_NO_COMPLEX
 *   - divide:                         CUDA_HOST_ALL_ARITHMETIC_FLOAT_RETURN
 *
 * NOTE(review): each invocation presumably emits the host-side functions for
 * every type combination in the corresponding table; the macro bodies are
 * defined outside this section -- confirm there.
 */
+ CUDA_HOST_ALL_ARITHMETIC(add)
1863
+ CUDA_HOST_ALL_ARITHMETIC(subtract)
1864
+ CUDA_HOST_ALL_ARITHMETIC(multiply)
1865
+ CUDA_HOST_ALL_ARITHMETIC_NO_COMPLEX(floor_divide)
1866
+ CUDA_HOST_ALL_ARITHMETIC_NO_COMPLEX(remainder)
1867
+ CUDA_HOST_ALL_ARITHMETIC_FLOAT_RETURN(divide)
1868
+ CUDA_HOST_ALL_ARITHMETIC(power)
1869
+
1870
+
1871
+ /*****************************************************************************/
1872
+ /* Comparison */
1873
+ /*****************************************************************************/
1874
+
1875
+ #define CUDA_HOST_ALL_COMPARISON(name) \
1876
+ CUDA_HOST_BINARY(name, uint8, uint8, bool) \
1877
+ CUDA_HOST_BINARY(name, uint8, uint16, bool) \
1878
+ CUDA_HOST_BINARY(name, uint8, uint32, bool) \
1879
+ CUDA_HOST_BINARY(name, uint8, uint64, bool) \
1880
+ CUDA_HOST_BINARY(name, uint8, int8, bool) \
1881
+ CUDA_HOST_BINARY(name, uint8, int16, bool) \
1882
+ CUDA_HOST_BINARY(name, uint8, int32, bool) \
1883
+ CUDA_HOST_BINARY(name, uint8, int64, bool) \
1884
+ CUDA_HOST_BINARY(name, uint8, bfloat16, bool) \
1885
+ CUDA_HOST_BINARY(name, uint8, float16, bool) \
1886
+ CUDA_HOST_BINARY(name, uint8, float32, bool) \
1887
+ CUDA_HOST_BINARY(name, uint8, float64, bool) \
1888
+ CUDA_HOST_NOIMPL(name, uint8, complex32, bool) \
1889
+ CUDA_HOST_BINARY(name, uint8, complex64, bool) \
1890
+ CUDA_HOST_BINARY(name, uint8, complex128, bool) \
1891
+ \
1892
+ CUDA_HOST_BINARY(name, uint16, uint8, bool) \
1893
+ CUDA_HOST_BINARY(name, uint16, uint16, bool) \
1894
+ CUDA_HOST_BINARY(name, uint16, uint32, bool) \
1895
+ CUDA_HOST_BINARY(name, uint16, uint64, bool) \
1896
+ CUDA_HOST_BINARY(name, uint16, int8, bool) \
1897
+ CUDA_HOST_BINARY(name, uint16, int16, bool) \
1898
+ CUDA_HOST_BINARY(name, uint16, int32, bool) \
1899
+ CUDA_HOST_BINARY(name, uint16, int64, bool) \
1900
+ CUDA_HOST_BINARY(name, uint16, bfloat16, bool) \
1901
+ CUDA_HOST_BINARY(name, uint16, float16, bool) \
1902
+ CUDA_HOST_BINARY(name, uint16, float32, bool) \
1903
+ CUDA_HOST_BINARY(name, uint16, float64, bool) \
1904
+ CUDA_HOST_NOIMPL(name, uint16, complex32, bool) \
1905
+ CUDA_HOST_BINARY(name, uint16, complex64, bool) \
1906
+ CUDA_HOST_BINARY(name, uint16, complex128, bool) \
1907
+ \
1908
+ CUDA_HOST_BINARY(name, uint32, uint8, bool) \
1909
+ CUDA_HOST_BINARY(name, uint32, uint16, bool) \
1910
+ CUDA_HOST_BINARY(name, uint32, uint32, bool) \
1911
+ CUDA_HOST_BINARY(name, uint32, uint64, bool) \
1912
+ CUDA_HOST_BINARY(name, uint32, int8, bool) \
1913
+ CUDA_HOST_BINARY(name, uint32, int16, bool) \
1914
+ CUDA_HOST_BINARY(name, uint32, int32, bool) \
1915
+ CUDA_HOST_BINARY(name, uint32, int64, bool) \
1916
+ CUDA_HOST_BINARY(name, uint32, bfloat16, bool) \
1917
+ CUDA_HOST_BINARY(name, uint32, float16, bool) \
1918
+ CUDA_HOST_BINARY(name, uint32, float32, bool) \
1919
+ CUDA_HOST_BINARY(name, uint32, float64, bool) \
1920
+ CUDA_HOST_NOIMPL(name, uint32, complex32, bool) \
1921
+ CUDA_HOST_BINARY(name, uint32, complex64, bool) \
1922
+ CUDA_HOST_BINARY(name, uint32, complex128, bool) \
1923
+ \
1924
+ CUDA_HOST_BINARY(name, uint64, uint8, bool) \
1925
+ CUDA_HOST_BINARY(name, uint64, uint16, bool) \
1926
+ CUDA_HOST_BINARY(name, uint64, uint32, bool) \
1927
+ CUDA_HOST_BINARY(name, uint64, uint64, bool) \
1928
+ \
1929
+ CUDA_HOST_BINARY(name, int8, uint8, bool) \
1930
+ CUDA_HOST_BINARY(name, int8, uint16, bool) \
1931
+ CUDA_HOST_BINARY(name, int8, uint32, bool) \
1932
+ CUDA_HOST_BINARY(name, int8, int8, bool) \
1933
+ CUDA_HOST_BINARY(name, int8, int16, bool) \
1934
+ CUDA_HOST_BINARY(name, int8, int32, bool) \
1935
+ CUDA_HOST_BINARY(name, int8, int64, bool) \
1936
+ CUDA_HOST_BINARY(name, int8, bfloat16, bool) \
1937
+ CUDA_HOST_BINARY(name, int8, float16, bool) \
1938
+ CUDA_HOST_BINARY(name, int8, float32, bool) \
1939
+ CUDA_HOST_BINARY(name, int8, float64, bool) \
1940
+ CUDA_HOST_NOIMPL(name, int8, complex32, bool) \
1941
+ CUDA_HOST_BINARY(name, int8, complex64, bool) \
1942
+ CUDA_HOST_BINARY(name, int8, complex128, bool) \
1943
+ \
1944
+ CUDA_HOST_BINARY(name, int16, uint8, bool) \
1945
+ CUDA_HOST_BINARY(name, int16, uint16, bool) \
1946
+ CUDA_HOST_BINARY(name, int16, uint32, bool) \
1947
+ CUDA_HOST_BINARY(name, int16, int8, bool) \
1948
+ CUDA_HOST_BINARY(name, int16, int16, bool) \
1949
+ CUDA_HOST_BINARY(name, int16, int32, bool) \
1950
+ CUDA_HOST_BINARY(name, int16, int64, bool) \
1951
+ CUDA_HOST_BINARY(name, int16, bfloat16, bool) \
1952
+ CUDA_HOST_BINARY(name, int16, float16, bool) \
1953
+ CUDA_HOST_BINARY(name, int16, float32, bool) \
1954
+ CUDA_HOST_BINARY(name, int16, float64, bool) \
1955
+ CUDA_HOST_NOIMPL(name, int16, complex32, bool) \
1956
+ CUDA_HOST_BINARY(name, int16, complex64, bool) \
1957
+ CUDA_HOST_BINARY(name, int16, complex128, bool) \
1958
+ \
1959
+ CUDA_HOST_BINARY(name, int32, uint8, bool) \
1960
+ CUDA_HOST_BINARY(name, int32, uint16, bool) \
1961
+ CUDA_HOST_BINARY(name, int32, uint32, bool) \
1962
+ CUDA_HOST_BINARY(name, int32, int8, bool) \
1963
+ CUDA_HOST_BINARY(name, int32, int16, bool) \
1964
+ CUDA_HOST_BINARY(name, int32, int32, bool) \
1965
+ CUDA_HOST_BINARY(name, int32, int64, bool) \
1966
+ CUDA_HOST_BINARY(name, int32, bfloat16, bool) \
1967
+ CUDA_HOST_BINARY(name, int32, float16, bool) \
1968
+ CUDA_HOST_BINARY(name, int32, float32, bool) \
1969
+ CUDA_HOST_BINARY(name, int32, float64, bool) \
1970
+ CUDA_HOST_NOIMPL(name, int32, complex32, bool) \
1971
+ CUDA_HOST_BINARY(name, int32, complex64, bool) \
1972
+ CUDA_HOST_BINARY(name, int32, complex128, bool) \
1973
+ \
1974
+ CUDA_HOST_BINARY(name, int64, uint8, bool) \
1975
+ CUDA_HOST_BINARY(name, int64, uint16, bool) \
1976
+ CUDA_HOST_BINARY(name, int64, uint32, bool) \
1977
+ CUDA_HOST_BINARY(name, int64, int8, bool) \
1978
+ CUDA_HOST_BINARY(name, int64, int16, bool) \
1979
+ CUDA_HOST_BINARY(name, int64, int32, bool) \
1980
+ CUDA_HOST_BINARY(name, int64, int64, bool) \
1981
+ \
1982
+ CUDA_HOST_BINARY(name, bfloat16, uint8, bool) \
1983
+ CUDA_HOST_BINARY(name, bfloat16, uint16, bool) \
1984
+ CUDA_HOST_BINARY(name, bfloat16, uint32, bool) \
1985
+ CUDA_HOST_BINARY(name, bfloat16, int8, bool) \
1986
+ CUDA_HOST_BINARY(name, bfloat16, int16, bool) \
1987
+ CUDA_HOST_BINARY(name, bfloat16, int32, bool) \
1988
+ CUDA_HOST_BINARY(name, bfloat16, bfloat16, bool) \
1989
+ CUDA_HOST_BINARY(name, bfloat16, float16, bool) \
1990
+ CUDA_HOST_BINARY(name, bfloat16, float32, bool) \
1991
+ CUDA_HOST_BINARY(name, bfloat16, float64, bool) \
1992
+ CUDA_HOST_NOIMPL(name, bfloat16, complex32, bool) \
1993
+ CUDA_HOST_BINARY(name, bfloat16, complex64, bool) \
1994
+ CUDA_HOST_BINARY(name, bfloat16, complex128, bool) \
1995
+ \
1996
+ CUDA_HOST_BINARY(name, float16, uint8, bool) \
1997
+ CUDA_HOST_BINARY(name, float16, uint16, bool) \
1998
+ CUDA_HOST_BINARY(name, float16, uint32, bool) \
1999
+ CUDA_HOST_BINARY(name, float16, int8, bool) \
2000
+ CUDA_HOST_BINARY(name, float16, int16, bool) \
2001
+ CUDA_HOST_BINARY(name, float16, int32, bool) \
2002
+ CUDA_HOST_BINARY(name, float16, bfloat16, bool) \
2003
+ CUDA_HOST_BINARY(name, float16, float16, bool) \
2004
+ CUDA_HOST_BINARY(name, float16, float32, bool) \
2005
+ CUDA_HOST_BINARY(name, float16, float64, bool) \
2006
+ CUDA_HOST_NOIMPL(name, float16, complex32, bool) \
2007
+ CUDA_HOST_BINARY(name, float16, complex64, bool) \
2008
+ CUDA_HOST_BINARY(name, float16, complex128, bool) \
2009
+ \
2010
+ CUDA_HOST_BINARY(name, float32, uint8, bool) \
2011
+ CUDA_HOST_BINARY(name, float32, uint16, bool) \
2012
+ CUDA_HOST_BINARY(name, float32, uint32, bool) \
2013
+ CUDA_HOST_BINARY(name, float32, int8, bool) \
2014
+ CUDA_HOST_BINARY(name, float32, int16, bool) \
2015
+ CUDA_HOST_BINARY(name, float32, int32, bool) \
2016
+ CUDA_HOST_BINARY(name, float32, bfloat16, bool) \
2017
+ CUDA_HOST_BINARY(name, float32, float16, bool) \
2018
+ CUDA_HOST_BINARY(name, float32, float32, bool) \
2019
+ CUDA_HOST_BINARY(name, float32, float64, bool) \
2020
+ CUDA_HOST_NOIMPL(name, float32, complex32, bool) \
2021
+ CUDA_HOST_BINARY(name, float32, complex64, bool) \
2022
+ CUDA_HOST_BINARY(name, float32, complex128, bool) \
2023
+ \
2024
+ CUDA_HOST_BINARY(name, float64, uint8, bool) \
2025
+ CUDA_HOST_BINARY(name, float64, uint16, bool) \
2026
+ CUDA_HOST_BINARY(name, float64, uint32, bool) \
2027
+ CUDA_HOST_BINARY(name, float64, int8, bool) \
2028
+ CUDA_HOST_BINARY(name, float64, int16, bool) \
2029
+ CUDA_HOST_BINARY(name, float64, int32, bool) \
2030
+ CUDA_HOST_BINARY(name, float64, bfloat16, bool) \
2031
+ CUDA_HOST_BINARY(name, float64, float16, bool) \
2032
+ CUDA_HOST_BINARY(name, float64, float32, bool) \
2033
+ CUDA_HOST_BINARY(name, float64, float64, bool) \
2034
+ CUDA_HOST_NOIMPL(name, float64, complex32, bool) \
2035
+ CUDA_HOST_BINARY(name, float64, complex64, bool) \
2036
+ CUDA_HOST_BINARY(name, float64, complex128, bool) \
2037
+ \
2038
+ CUDA_HOST_NOIMPL(name, complex32, uint8, bool) \
2039
+ CUDA_HOST_NOIMPL(name, complex32, uint16, bool) \
2040
+ CUDA_HOST_NOIMPL(name, complex32, uint32, bool) \
2041
+ CUDA_HOST_NOIMPL(name, complex32, int8, bool) \
2042
+ CUDA_HOST_NOIMPL(name, complex32, int16, bool) \
2043
+ CUDA_HOST_NOIMPL(name, complex32, int32, bool) \
2044
+ CUDA_HOST_NOIMPL(name, complex32, bfloat16, bool) \
2045
+ CUDA_HOST_NOIMPL(name, complex32, float16, bool) \
2046
+ CUDA_HOST_NOIMPL(name, complex32, float32, bool) \
2047
+ CUDA_HOST_NOIMPL(name, complex32, float64, bool) \
2048
+ CUDA_HOST_NOIMPL(name, complex32, complex32, bool) \
2049
+ CUDA_HOST_NOIMPL(name, complex32, complex64, bool) \
2050
+ CUDA_HOST_NOIMPL(name, complex32, complex128, bool) \
2051
+ \
2052
+ CUDA_HOST_BINARY(name, complex64, uint8, bool) \
2053
+ CUDA_HOST_BINARY(name, complex64, uint16, bool) \
2054
+ CUDA_HOST_BINARY(name, complex64, uint32, bool) \
2055
+ CUDA_HOST_BINARY(name, complex64, int8, bool) \
2056
+ CUDA_HOST_BINARY(name, complex64, int16, bool) \
2057
+ CUDA_HOST_BINARY(name, complex64, int32, bool) \
2058
+ CUDA_HOST_BINARY(name, complex64, bfloat16, bool) \
2059
+ CUDA_HOST_BINARY(name, complex64, float16, bool) \
2060
+ CUDA_HOST_BINARY(name, complex64, float32, bool) \
2061
+ CUDA_HOST_BINARY(name, complex64, float64, bool) \
2062
+ CUDA_HOST_NOIMPL(name, complex64, complex32, bool) \
2063
+ CUDA_HOST_BINARY(name, complex64, complex64, bool) \
2064
+ CUDA_HOST_BINARY(name, complex64, complex128, bool) \
2065
+ \
2066
+ CUDA_HOST_BINARY(name, complex128, uint8, bool) \
2067
+ CUDA_HOST_BINARY(name, complex128, uint16, bool) \
2068
+ CUDA_HOST_BINARY(name, complex128, uint32, bool) \
2069
+ CUDA_HOST_BINARY(name, complex128, int8, bool) \
2070
+ CUDA_HOST_BINARY(name, complex128, int16, bool) \
2071
+ CUDA_HOST_BINARY(name, complex128, int32, bool) \
2072
+ CUDA_HOST_BINARY(name, complex128, bfloat16, bool) \
2073
+ CUDA_HOST_BINARY(name, complex128, float16, bool) \
2074
+ CUDA_HOST_BINARY(name, complex128, float32, bool) \
2075
+ CUDA_HOST_BINARY(name, complex128, float64, bool) \
2076
+ CUDA_HOST_NOIMPL(name, complex128, complex32, bool) \
2077
+ CUDA_HOST_BINARY(name, complex128, complex64, bool) \
2078
+ CUDA_HOST_BINARY(name, complex128, complex128, bool)
2079
+
2080
/*
 * CUDA_HOST_ALL_COMPARISON_INIT(name)
 *
 * Expands to a comma-separated run of
 *     CUDA_HOST_BINARY_INIT(name, t0, t1, bool)
 * entries, one per (t0, t1) type pair listed below.  The third type is
 * bool for every entry.  The expansion carries no trailing comma; the use
 * sites in the binary_kernels[] initializer supply it.
 *
 * Pair coverage, grouped by first type t0:
 *   uint8, uint16, uint32   : all 15 second types
 *   uint64                  : uint8, uint16, uint32, uint64 only
 *   int8, int16, int32      : every type except uint64
 *   int64                   : uint8..uint32 and int8..int64 only
 *   bfloat16 .. complex128  : every type except uint64 and int64
 *
 * NOTE(review): CUDA_HOST_BINARY_INIT is defined outside this excerpt.
 * complex32 pairs are listed here like any other pair, while the
 * kernel-definition list just above this macro routes complex32 pairs
 * through CUDA_HOST_NOIMPL; presumably NOIMPL still provides a symbol for
 * these entries to reference -- confirm.
 */
+ #define CUDA_HOST_ALL_COMPARISON_INIT(name) \
2081
+ CUDA_HOST_BINARY_INIT(name, uint8, uint8, bool), \
2082
+ CUDA_HOST_BINARY_INIT(name, uint8, uint16, bool), \
2083
+ CUDA_HOST_BINARY_INIT(name, uint8, uint32, bool), \
2084
+ CUDA_HOST_BINARY_INIT(name, uint8, uint64, bool), \
2085
+ CUDA_HOST_BINARY_INIT(name, uint8, int8, bool), \
2086
+ CUDA_HOST_BINARY_INIT(name, uint8, int16, bool), \
2087
+ CUDA_HOST_BINARY_INIT(name, uint8, int32, bool), \
2088
+ CUDA_HOST_BINARY_INIT(name, uint8, int64, bool), \
2089
+ CUDA_HOST_BINARY_INIT(name, uint8, bfloat16, bool), \
2090
+ CUDA_HOST_BINARY_INIT(name, uint8, float16, bool), \
2091
+ CUDA_HOST_BINARY_INIT(name, uint8, float32, bool), \
2092
+ CUDA_HOST_BINARY_INIT(name, uint8, float64, bool), \
2093
+ CUDA_HOST_BINARY_INIT(name, uint8, complex32, bool), \
2094
+ CUDA_HOST_BINARY_INIT(name, uint8, complex64, bool), \
2095
+ CUDA_HOST_BINARY_INIT(name, uint8, complex128, bool), \
2096
+ \
2097
+ CUDA_HOST_BINARY_INIT(name, uint16, uint8, bool), \
2098
+ CUDA_HOST_BINARY_INIT(name, uint16, uint16, bool), \
2099
+ CUDA_HOST_BINARY_INIT(name, uint16, uint32, bool), \
2100
+ CUDA_HOST_BINARY_INIT(name, uint16, uint64, bool), \
2101
+ CUDA_HOST_BINARY_INIT(name, uint16, int8, bool), \
2102
+ CUDA_HOST_BINARY_INIT(name, uint16, int16, bool), \
2103
+ CUDA_HOST_BINARY_INIT(name, uint16, int32, bool), \
2104
+ CUDA_HOST_BINARY_INIT(name, uint16, int64, bool), \
2105
+ CUDA_HOST_BINARY_INIT(name, uint16, bfloat16, bool), \
2106
+ CUDA_HOST_BINARY_INIT(name, uint16, float16, bool), \
2107
+ CUDA_HOST_BINARY_INIT(name, uint16, float32, bool), \
2108
+ CUDA_HOST_BINARY_INIT(name, uint16, float64, bool), \
2109
+ CUDA_HOST_BINARY_INIT(name, uint16, complex32, bool), \
2110
+ CUDA_HOST_BINARY_INIT(name, uint16, complex64, bool), \
2111
+ CUDA_HOST_BINARY_INIT(name, uint16, complex128, bool), \
2112
+ \
2113
+ CUDA_HOST_BINARY_INIT(name, uint32, uint8, bool), \
2114
+ CUDA_HOST_BINARY_INIT(name, uint32, uint16, bool), \
2115
+ CUDA_HOST_BINARY_INIT(name, uint32, uint32, bool), \
2116
+ CUDA_HOST_BINARY_INIT(name, uint32, uint64, bool), \
2117
+ CUDA_HOST_BINARY_INIT(name, uint32, int8, bool), \
2118
+ CUDA_HOST_BINARY_INIT(name, uint32, int16, bool), \
2119
+ CUDA_HOST_BINARY_INIT(name, uint32, int32, bool), \
2120
+ CUDA_HOST_BINARY_INIT(name, uint32, int64, bool), \
2121
+ CUDA_HOST_BINARY_INIT(name, uint32, bfloat16, bool), \
2122
+ CUDA_HOST_BINARY_INIT(name, uint32, float16, bool), \
2123
+ CUDA_HOST_BINARY_INIT(name, uint32, float32, bool), \
2124
+ CUDA_HOST_BINARY_INIT(name, uint32, float64, bool), \
2125
+ CUDA_HOST_BINARY_INIT(name, uint32, complex32, bool), \
2126
+ CUDA_HOST_BINARY_INIT(name, uint32, complex64, bool), \
2127
+ CUDA_HOST_BINARY_INIT(name, uint32, complex128, bool), \
2128
+ \
2129
+ CUDA_HOST_BINARY_INIT(name, uint64, uint8, bool), \
2130
+ CUDA_HOST_BINARY_INIT(name, uint64, uint16, bool), \
2131
+ CUDA_HOST_BINARY_INIT(name, uint64, uint32, bool), \
2132
+ CUDA_HOST_BINARY_INIT(name, uint64, uint64, bool), \
2133
+ \
2134
+ CUDA_HOST_BINARY_INIT(name, int8, uint8, bool), \
2135
+ CUDA_HOST_BINARY_INIT(name, int8, uint16, bool), \
2136
+ CUDA_HOST_BINARY_INIT(name, int8, uint32, bool), \
2137
+ CUDA_HOST_BINARY_INIT(name, int8, int8, bool), \
2138
+ CUDA_HOST_BINARY_INIT(name, int8, int16, bool), \
2139
+ CUDA_HOST_BINARY_INIT(name, int8, int32, bool), \
2140
+ CUDA_HOST_BINARY_INIT(name, int8, int64, bool), \
2141
+ CUDA_HOST_BINARY_INIT(name, int8, bfloat16, bool), \
2142
+ CUDA_HOST_BINARY_INIT(name, int8, float16, bool), \
2143
+ CUDA_HOST_BINARY_INIT(name, int8, float32, bool), \
2144
+ CUDA_HOST_BINARY_INIT(name, int8, float64, bool), \
2145
+ CUDA_HOST_BINARY_INIT(name, int8, complex32, bool), \
2146
+ CUDA_HOST_BINARY_INIT(name, int8, complex64, bool), \
2147
+ CUDA_HOST_BINARY_INIT(name, int8, complex128, bool), \
2148
+ \
2149
+ CUDA_HOST_BINARY_INIT(name, int16, uint8, bool), \
2150
+ CUDA_HOST_BINARY_INIT(name, int16, uint16, bool), \
2151
+ CUDA_HOST_BINARY_INIT(name, int16, uint32, bool), \
2152
+ CUDA_HOST_BINARY_INIT(name, int16, int8, bool), \
2153
+ CUDA_HOST_BINARY_INIT(name, int16, int16, bool), \
2154
+ CUDA_HOST_BINARY_INIT(name, int16, int32, bool), \
2155
+ CUDA_HOST_BINARY_INIT(name, int16, int64, bool), \
2156
+ CUDA_HOST_BINARY_INIT(name, int16, bfloat16, bool), \
2157
+ CUDA_HOST_BINARY_INIT(name, int16, float16, bool), \
2158
+ CUDA_HOST_BINARY_INIT(name, int16, float32, bool), \
2159
+ CUDA_HOST_BINARY_INIT(name, int16, float64, bool), \
2160
+ CUDA_HOST_BINARY_INIT(name, int16, complex32, bool), \
2161
+ CUDA_HOST_BINARY_INIT(name, int16, complex64, bool), \
2162
+ CUDA_HOST_BINARY_INIT(name, int16, complex128, bool), \
2163
+ \
2164
+ CUDA_HOST_BINARY_INIT(name, int32, uint8, bool), \
2165
+ CUDA_HOST_BINARY_INIT(name, int32, uint16, bool), \
2166
+ CUDA_HOST_BINARY_INIT(name, int32, uint32, bool), \
2167
+ CUDA_HOST_BINARY_INIT(name, int32, int8, bool), \
2168
+ CUDA_HOST_BINARY_INIT(name, int32, int16, bool), \
2169
+ CUDA_HOST_BINARY_INIT(name, int32, int32, bool), \
2170
+ CUDA_HOST_BINARY_INIT(name, int32, int64, bool), \
2171
+ CUDA_HOST_BINARY_INIT(name, int32, bfloat16, bool), \
2172
+ CUDA_HOST_BINARY_INIT(name, int32, float16, bool), \
2173
+ CUDA_HOST_BINARY_INIT(name, int32, float32, bool), \
2174
+ CUDA_HOST_BINARY_INIT(name, int32, float64, bool), \
2175
+ CUDA_HOST_BINARY_INIT(name, int32, complex32, bool), \
2176
+ CUDA_HOST_BINARY_INIT(name, int32, complex64, bool), \
2177
+ CUDA_HOST_BINARY_INIT(name, int32, complex128, bool), \
2178
+ \
2179
+ CUDA_HOST_BINARY_INIT(name, int64, uint8, bool), \
2180
+ CUDA_HOST_BINARY_INIT(name, int64, uint16, bool), \
2181
+ CUDA_HOST_BINARY_INIT(name, int64, uint32, bool), \
2182
+ CUDA_HOST_BINARY_INIT(name, int64, int8, bool), \
2183
+ CUDA_HOST_BINARY_INIT(name, int64, int16, bool), \
2184
+ CUDA_HOST_BINARY_INIT(name, int64, int32, bool), \
2185
+ CUDA_HOST_BINARY_INIT(name, int64, int64, bool), \
2186
+ \
2187
+ CUDA_HOST_BINARY_INIT(name, bfloat16, uint8, bool), \
2188
+ CUDA_HOST_BINARY_INIT(name, bfloat16, uint16, bool), \
2189
+ CUDA_HOST_BINARY_INIT(name, bfloat16, uint32, bool), \
2190
+ CUDA_HOST_BINARY_INIT(name, bfloat16, int8, bool), \
2191
+ CUDA_HOST_BINARY_INIT(name, bfloat16, int16, bool), \
2192
+ CUDA_HOST_BINARY_INIT(name, bfloat16, int32, bool), \
2193
+ CUDA_HOST_BINARY_INIT(name, bfloat16, bfloat16, bool), \
2194
+ CUDA_HOST_BINARY_INIT(name, bfloat16, float16, bool), \
2195
+ CUDA_HOST_BINARY_INIT(name, bfloat16, float32, bool), \
2196
+ CUDA_HOST_BINARY_INIT(name, bfloat16, float64, bool), \
2197
+ CUDA_HOST_BINARY_INIT(name, bfloat16, complex32, bool), \
2198
+ CUDA_HOST_BINARY_INIT(name, bfloat16, complex64, bool), \
2199
+ CUDA_HOST_BINARY_INIT(name, bfloat16, complex128, bool), \
2200
+ \
2201
+ CUDA_HOST_BINARY_INIT(name, float16, uint8, bool), \
2202
+ CUDA_HOST_BINARY_INIT(name, float16, uint16, bool), \
2203
+ CUDA_HOST_BINARY_INIT(name, float16, uint32, bool), \
2204
+ CUDA_HOST_BINARY_INIT(name, float16, int8, bool), \
2205
+ CUDA_HOST_BINARY_INIT(name, float16, int16, bool), \
2206
+ CUDA_HOST_BINARY_INIT(name, float16, int32, bool), \
2207
+ CUDA_HOST_BINARY_INIT(name, float16, bfloat16, bool), \
2208
+ CUDA_HOST_BINARY_INIT(name, float16, float16, bool), \
2209
+ CUDA_HOST_BINARY_INIT(name, float16, float32, bool), \
2210
+ CUDA_HOST_BINARY_INIT(name, float16, float64, bool), \
2211
+ CUDA_HOST_BINARY_INIT(name, float16, complex32, bool), \
2212
+ CUDA_HOST_BINARY_INIT(name, float16, complex64, bool), \
2213
+ CUDA_HOST_BINARY_INIT(name, float16, complex128, bool), \
2214
+ \
2215
+ CUDA_HOST_BINARY_INIT(name, float32, uint8, bool), \
2216
+ CUDA_HOST_BINARY_INIT(name, float32, uint16, bool), \
2217
+ CUDA_HOST_BINARY_INIT(name, float32, uint32, bool), \
2218
+ CUDA_HOST_BINARY_INIT(name, float32, int8, bool), \
2219
+ CUDA_HOST_BINARY_INIT(name, float32, int16, bool), \
2220
+ CUDA_HOST_BINARY_INIT(name, float32, int32, bool), \
2221
+ CUDA_HOST_BINARY_INIT(name, float32, bfloat16, bool), \
2222
+ CUDA_HOST_BINARY_INIT(name, float32, float16, bool), \
2223
+ CUDA_HOST_BINARY_INIT(name, float32, float32, bool), \
2224
+ CUDA_HOST_BINARY_INIT(name, float32, float64, bool), \
2225
+ CUDA_HOST_BINARY_INIT(name, float32, complex32, bool), \
2226
+ CUDA_HOST_BINARY_INIT(name, float32, complex64, bool), \
2227
+ CUDA_HOST_BINARY_INIT(name, float32, complex128, bool), \
2228
+ \
2229
+ CUDA_HOST_BINARY_INIT(name, float64, uint8, bool), \
2230
+ CUDA_HOST_BINARY_INIT(name, float64, uint16, bool), \
2231
+ CUDA_HOST_BINARY_INIT(name, float64, uint32, bool), \
2232
+ CUDA_HOST_BINARY_INIT(name, float64, int8, bool), \
2233
+ CUDA_HOST_BINARY_INIT(name, float64, int16, bool), \
2234
+ CUDA_HOST_BINARY_INIT(name, float64, int32, bool), \
2235
+ CUDA_HOST_BINARY_INIT(name, float64, bfloat16, bool), \
2236
+ CUDA_HOST_BINARY_INIT(name, float64, float16, bool), \
2237
+ CUDA_HOST_BINARY_INIT(name, float64, float32, bool), \
2238
+ CUDA_HOST_BINARY_INIT(name, float64, float64, bool), \
2239
+ CUDA_HOST_BINARY_INIT(name, float64, complex32, bool), \
2240
+ CUDA_HOST_BINARY_INIT(name, float64, complex64, bool), \
2241
+ CUDA_HOST_BINARY_INIT(name, float64, complex128, bool), \
2242
+ \
2243
+ CUDA_HOST_BINARY_INIT(name, complex32, uint8, bool), \
2244
+ CUDA_HOST_BINARY_INIT(name, complex32, uint16, bool), \
2245
+ CUDA_HOST_BINARY_INIT(name, complex32, uint32, bool), \
2246
+ CUDA_HOST_BINARY_INIT(name, complex32, int8, bool), \
2247
+ CUDA_HOST_BINARY_INIT(name, complex32, int16, bool), \
2248
+ CUDA_HOST_BINARY_INIT(name, complex32, int32, bool), \
2249
+ CUDA_HOST_BINARY_INIT(name, complex32, bfloat16, bool), \
2250
+ CUDA_HOST_BINARY_INIT(name, complex32, float16, bool), \
2251
+ CUDA_HOST_BINARY_INIT(name, complex32, float32, bool), \
2252
+ CUDA_HOST_BINARY_INIT(name, complex32, float64, bool), \
2253
+ CUDA_HOST_BINARY_INIT(name, complex32, complex32, bool), \
2254
+ CUDA_HOST_BINARY_INIT(name, complex32, complex64, bool), \
2255
+ CUDA_HOST_BINARY_INIT(name, complex32, complex128, bool), \
2256
+ \
2257
+ CUDA_HOST_BINARY_INIT(name, complex64, uint8, bool), \
2258
+ CUDA_HOST_BINARY_INIT(name, complex64, uint16, bool), \
2259
+ CUDA_HOST_BINARY_INIT(name, complex64, uint32, bool), \
2260
+ CUDA_HOST_BINARY_INIT(name, complex64, int8, bool), \
2261
+ CUDA_HOST_BINARY_INIT(name, complex64, int16, bool), \
2262
+ CUDA_HOST_BINARY_INIT(name, complex64, int32, bool), \
2263
+ CUDA_HOST_BINARY_INIT(name, complex64, bfloat16, bool), \
2264
+ CUDA_HOST_BINARY_INIT(name, complex64, float16, bool), \
2265
+ CUDA_HOST_BINARY_INIT(name, complex64, float32, bool), \
2266
+ CUDA_HOST_BINARY_INIT(name, complex64, float64, bool), \
2267
+ CUDA_HOST_BINARY_INIT(name, complex64, complex32, bool), \
2268
+ CUDA_HOST_BINARY_INIT(name, complex64, complex64, bool), \
2269
+ CUDA_HOST_BINARY_INIT(name, complex64, complex128, bool), \
2270
+ \
2271
+ CUDA_HOST_BINARY_INIT(name, complex128, uint8, bool), \
2272
+ CUDA_HOST_BINARY_INIT(name, complex128, uint16, bool), \
2273
+ CUDA_HOST_BINARY_INIT(name, complex128, uint32, bool), \
2274
+ CUDA_HOST_BINARY_INIT(name, complex128, int8, bool), \
2275
+ CUDA_HOST_BINARY_INIT(name, complex128, int16, bool), \
2276
+ CUDA_HOST_BINARY_INIT(name, complex128, int32, bool), \
2277
+ CUDA_HOST_BINARY_INIT(name, complex128, bfloat16, bool), \
2278
+ CUDA_HOST_BINARY_INIT(name, complex128, float16, bool), \
2279
+ CUDA_HOST_BINARY_INIT(name, complex128, float32, bool), \
2280
+ CUDA_HOST_BINARY_INIT(name, complex128, float64, bool), \
2281
+ CUDA_HOST_BINARY_INIT(name, complex128, complex32, bool), \
2282
+ CUDA_HOST_BINARY_INIT(name, complex128, complex64, bool), \
2283
+ CUDA_HOST_BINARY_INIT(name, complex128, complex128, bool)
2284
+
2285
/*
 * CUDA_HOST_ALL_EQUALN_INIT(name)
 *
 * Expands to a comma-separated run of
 *     CUDA_HOST_EQUALN_INIT(name, t0, t1, bool)
 * entries, one per (t0, t1) type pair listed below.  The third type is
 * bool for every entry and the expansion carries no trailing comma.
 * The pair list and its order are the same as in
 * CUDA_HOST_ALL_COMPARISON_INIT; only the per-entry macro differs.
 * In this file it is used once, as CUDA_HOST_ALL_EQUALN_INIT(equaln)
 * inside the binary_kernels[] initializer.
 *
 * Pair coverage, grouped by first type t0:
 *   uint8, uint16, uint32   : all 15 second types
 *   uint64                  : uint8, uint16, uint32, uint64 only
 *   int8, int16, int32      : every type except uint64
 *   int64                   : uint8..uint32 and int8..int64 only
 *   bfloat16 .. complex128  : every type except uint64 and int64
 *
 * NOTE(review): CUDA_HOST_EQUALN_INIT is defined outside this excerpt, so
 * how its table entry differs from a CUDA_HOST_BINARY_INIT entry cannot be
 * determined from here -- check its definition.
 */
+ #define CUDA_HOST_ALL_EQUALN_INIT(name) \
2286
+ CUDA_HOST_EQUALN_INIT(name, uint8, uint8, bool), \
2287
+ CUDA_HOST_EQUALN_INIT(name, uint8, uint16, bool), \
2288
+ CUDA_HOST_EQUALN_INIT(name, uint8, uint32, bool), \
2289
+ CUDA_HOST_EQUALN_INIT(name, uint8, uint64, bool), \
2290
+ CUDA_HOST_EQUALN_INIT(name, uint8, int8, bool), \
2291
+ CUDA_HOST_EQUALN_INIT(name, uint8, int16, bool), \
2292
+ CUDA_HOST_EQUALN_INIT(name, uint8, int32, bool), \
2293
+ CUDA_HOST_EQUALN_INIT(name, uint8, int64, bool), \
2294
+ CUDA_HOST_EQUALN_INIT(name, uint8, bfloat16, bool), \
2295
+ CUDA_HOST_EQUALN_INIT(name, uint8, float16, bool), \
2296
+ CUDA_HOST_EQUALN_INIT(name, uint8, float32, bool), \
2297
+ CUDA_HOST_EQUALN_INIT(name, uint8, float64, bool), \
2298
+ CUDA_HOST_EQUALN_INIT(name, uint8, complex32, bool), \
2299
+ CUDA_HOST_EQUALN_INIT(name, uint8, complex64, bool), \
2300
+ CUDA_HOST_EQUALN_INIT(name, uint8, complex128, bool), \
2301
+ \
2302
+ CUDA_HOST_EQUALN_INIT(name, uint16, uint8, bool), \
2303
+ CUDA_HOST_EQUALN_INIT(name, uint16, uint16, bool), \
2304
+ CUDA_HOST_EQUALN_INIT(name, uint16, uint32, bool), \
2305
+ CUDA_HOST_EQUALN_INIT(name, uint16, uint64, bool), \
2306
+ CUDA_HOST_EQUALN_INIT(name, uint16, int8, bool), \
2307
+ CUDA_HOST_EQUALN_INIT(name, uint16, int16, bool), \
2308
+ CUDA_HOST_EQUALN_INIT(name, uint16, int32, bool), \
2309
+ CUDA_HOST_EQUALN_INIT(name, uint16, int64, bool), \
2310
+ CUDA_HOST_EQUALN_INIT(name, uint16, bfloat16, bool), \
2311
+ CUDA_HOST_EQUALN_INIT(name, uint16, float16, bool), \
2312
+ CUDA_HOST_EQUALN_INIT(name, uint16, float32, bool), \
2313
+ CUDA_HOST_EQUALN_INIT(name, uint16, float64, bool), \
2314
+ CUDA_HOST_EQUALN_INIT(name, uint16, complex32, bool), \
2315
+ CUDA_HOST_EQUALN_INIT(name, uint16, complex64, bool), \
2316
+ CUDA_HOST_EQUALN_INIT(name, uint16, complex128, bool), \
2317
+ \
2318
+ CUDA_HOST_EQUALN_INIT(name, uint32, uint8, bool), \
2319
+ CUDA_HOST_EQUALN_INIT(name, uint32, uint16, bool), \
2320
+ CUDA_HOST_EQUALN_INIT(name, uint32, uint32, bool), \
2321
+ CUDA_HOST_EQUALN_INIT(name, uint32, uint64, bool), \
2322
+ CUDA_HOST_EQUALN_INIT(name, uint32, int8, bool), \
2323
+ CUDA_HOST_EQUALN_INIT(name, uint32, int16, bool), \
2324
+ CUDA_HOST_EQUALN_INIT(name, uint32, int32, bool), \
2325
+ CUDA_HOST_EQUALN_INIT(name, uint32, int64, bool), \
2326
+ CUDA_HOST_EQUALN_INIT(name, uint32, bfloat16, bool), \
2327
+ CUDA_HOST_EQUALN_INIT(name, uint32, float16, bool), \
2328
+ CUDA_HOST_EQUALN_INIT(name, uint32, float32, bool), \
2329
+ CUDA_HOST_EQUALN_INIT(name, uint32, float64, bool), \
2330
+ CUDA_HOST_EQUALN_INIT(name, uint32, complex32, bool), \
2331
+ CUDA_HOST_EQUALN_INIT(name, uint32, complex64, bool), \
2332
+ CUDA_HOST_EQUALN_INIT(name, uint32, complex128, bool), \
2333
+ \
2334
+ CUDA_HOST_EQUALN_INIT(name, uint64, uint8, bool), \
2335
+ CUDA_HOST_EQUALN_INIT(name, uint64, uint16, bool), \
2336
+ CUDA_HOST_EQUALN_INIT(name, uint64, uint32, bool), \
2337
+ CUDA_HOST_EQUALN_INIT(name, uint64, uint64, bool), \
2338
+ \
2339
+ CUDA_HOST_EQUALN_INIT(name, int8, uint8, bool), \
2340
+ CUDA_HOST_EQUALN_INIT(name, int8, uint16, bool), \
2341
+ CUDA_HOST_EQUALN_INIT(name, int8, uint32, bool), \
2342
+ CUDA_HOST_EQUALN_INIT(name, int8, int8, bool), \
2343
+ CUDA_HOST_EQUALN_INIT(name, int8, int16, bool), \
2344
+ CUDA_HOST_EQUALN_INIT(name, int8, int32, bool), \
2345
+ CUDA_HOST_EQUALN_INIT(name, int8, int64, bool), \
2346
+ CUDA_HOST_EQUALN_INIT(name, int8, bfloat16, bool), \
2347
+ CUDA_HOST_EQUALN_INIT(name, int8, float16, bool), \
2348
+ CUDA_HOST_EQUALN_INIT(name, int8, float32, bool), \
2349
+ CUDA_HOST_EQUALN_INIT(name, int8, float64, bool), \
2350
+ CUDA_HOST_EQUALN_INIT(name, int8, complex32, bool), \
2351
+ CUDA_HOST_EQUALN_INIT(name, int8, complex64, bool), \
2352
+ CUDA_HOST_EQUALN_INIT(name, int8, complex128, bool), \
2353
+ \
2354
+ CUDA_HOST_EQUALN_INIT(name, int16, uint8, bool), \
2355
+ CUDA_HOST_EQUALN_INIT(name, int16, uint16, bool), \
2356
+ CUDA_HOST_EQUALN_INIT(name, int16, uint32, bool), \
2357
+ CUDA_HOST_EQUALN_INIT(name, int16, int8, bool), \
2358
+ CUDA_HOST_EQUALN_INIT(name, int16, int16, bool), \
2359
+ CUDA_HOST_EQUALN_INIT(name, int16, int32, bool), \
2360
+ CUDA_HOST_EQUALN_INIT(name, int16, int64, bool), \
2361
+ CUDA_HOST_EQUALN_INIT(name, int16, bfloat16, bool), \
2362
+ CUDA_HOST_EQUALN_INIT(name, int16, float16, bool), \
2363
+ CUDA_HOST_EQUALN_INIT(name, int16, float32, bool), \
2364
+ CUDA_HOST_EQUALN_INIT(name, int16, float64, bool), \
2365
+ CUDA_HOST_EQUALN_INIT(name, int16, complex32, bool), \
2366
+ CUDA_HOST_EQUALN_INIT(name, int16, complex64, bool), \
2367
+ CUDA_HOST_EQUALN_INIT(name, int16, complex128, bool), \
2368
+ \
2369
+ CUDA_HOST_EQUALN_INIT(name, int32, uint8, bool), \
2370
+ CUDA_HOST_EQUALN_INIT(name, int32, uint16, bool), \
2371
+ CUDA_HOST_EQUALN_INIT(name, int32, uint32, bool), \
2372
+ CUDA_HOST_EQUALN_INIT(name, int32, int8, bool), \
2373
+ CUDA_HOST_EQUALN_INIT(name, int32, int16, bool), \
2374
+ CUDA_HOST_EQUALN_INIT(name, int32, int32, bool), \
2375
+ CUDA_HOST_EQUALN_INIT(name, int32, int64, bool), \
2376
+ CUDA_HOST_EQUALN_INIT(name, int32, bfloat16, bool), \
2377
+ CUDA_HOST_EQUALN_INIT(name, int32, float16, bool), \
2378
+ CUDA_HOST_EQUALN_INIT(name, int32, float32, bool), \
2379
+ CUDA_HOST_EQUALN_INIT(name, int32, float64, bool), \
2380
+ CUDA_HOST_EQUALN_INIT(name, int32, complex32, bool), \
2381
+ CUDA_HOST_EQUALN_INIT(name, int32, complex64, bool), \
2382
+ CUDA_HOST_EQUALN_INIT(name, int32, complex128, bool), \
2383
+ \
2384
+ CUDA_HOST_EQUALN_INIT(name, int64, uint8, bool), \
2385
+ CUDA_HOST_EQUALN_INIT(name, int64, uint16, bool), \
2386
+ CUDA_HOST_EQUALN_INIT(name, int64, uint32, bool), \
2387
+ CUDA_HOST_EQUALN_INIT(name, int64, int8, bool), \
2388
+ CUDA_HOST_EQUALN_INIT(name, int64, int16, bool), \
2389
+ CUDA_HOST_EQUALN_INIT(name, int64, int32, bool), \
2390
+ CUDA_HOST_EQUALN_INIT(name, int64, int64, bool), \
2391
+ \
2392
+ CUDA_HOST_EQUALN_INIT(name, bfloat16, uint8, bool), \
2393
+ CUDA_HOST_EQUALN_INIT(name, bfloat16, uint16, bool), \
2394
+ CUDA_HOST_EQUALN_INIT(name, bfloat16, uint32, bool), \
2395
+ CUDA_HOST_EQUALN_INIT(name, bfloat16, int8, bool), \
2396
+ CUDA_HOST_EQUALN_INIT(name, bfloat16, int16, bool), \
2397
+ CUDA_HOST_EQUALN_INIT(name, bfloat16, int32, bool), \
2398
+ CUDA_HOST_EQUALN_INIT(name, bfloat16, bfloat16, bool), \
2399
+ CUDA_HOST_EQUALN_INIT(name, bfloat16, float16, bool), \
2400
+ CUDA_HOST_EQUALN_INIT(name, bfloat16, float32, bool), \
2401
+ CUDA_HOST_EQUALN_INIT(name, bfloat16, float64, bool), \
2402
+ CUDA_HOST_EQUALN_INIT(name, bfloat16, complex32, bool), \
2403
+ CUDA_HOST_EQUALN_INIT(name, bfloat16, complex64, bool), \
2404
+ CUDA_HOST_EQUALN_INIT(name, bfloat16, complex128, bool), \
2405
+ \
2406
+ CUDA_HOST_EQUALN_INIT(name, float16, uint8, bool), \
2407
+ CUDA_HOST_EQUALN_INIT(name, float16, uint16, bool), \
2408
+ CUDA_HOST_EQUALN_INIT(name, float16, uint32, bool), \
2409
+ CUDA_HOST_EQUALN_INIT(name, float16, int8, bool), \
2410
+ CUDA_HOST_EQUALN_INIT(name, float16, int16, bool), \
2411
+ CUDA_HOST_EQUALN_INIT(name, float16, int32, bool), \
2412
+ CUDA_HOST_EQUALN_INIT(name, float16, bfloat16, bool), \
2413
+ CUDA_HOST_EQUALN_INIT(name, float16, float16, bool), \
2414
+ CUDA_HOST_EQUALN_INIT(name, float16, float32, bool), \
2415
+ CUDA_HOST_EQUALN_INIT(name, float16, float64, bool), \
2416
+ CUDA_HOST_EQUALN_INIT(name, float16, complex32, bool), \
2417
+ CUDA_HOST_EQUALN_INIT(name, float16, complex64, bool), \
2418
+ CUDA_HOST_EQUALN_INIT(name, float16, complex128, bool), \
2419
+ \
2420
+ CUDA_HOST_EQUALN_INIT(name, float32, uint8, bool), \
2421
+ CUDA_HOST_EQUALN_INIT(name, float32, uint16, bool), \
2422
+ CUDA_HOST_EQUALN_INIT(name, float32, uint32, bool), \
2423
+ CUDA_HOST_EQUALN_INIT(name, float32, int8, bool), \
2424
+ CUDA_HOST_EQUALN_INIT(name, float32, int16, bool), \
2425
+ CUDA_HOST_EQUALN_INIT(name, float32, int32, bool), \
2426
+ CUDA_HOST_EQUALN_INIT(name, float32, bfloat16, bool), \
2427
+ CUDA_HOST_EQUALN_INIT(name, float32, float16, bool), \
2428
+ CUDA_HOST_EQUALN_INIT(name, float32, float32, bool), \
2429
+ CUDA_HOST_EQUALN_INIT(name, float32, float64, bool), \
2430
+ CUDA_HOST_EQUALN_INIT(name, float32, complex32, bool), \
2431
+ CUDA_HOST_EQUALN_INIT(name, float32, complex64, bool), \
2432
+ CUDA_HOST_EQUALN_INIT(name, float32, complex128, bool), \
2433
+ \
2434
+ CUDA_HOST_EQUALN_INIT(name, float64, uint8, bool), \
2435
+ CUDA_HOST_EQUALN_INIT(name, float64, uint16, bool), \
2436
+ CUDA_HOST_EQUALN_INIT(name, float64, uint32, bool), \
2437
+ CUDA_HOST_EQUALN_INIT(name, float64, int8, bool), \
2438
+ CUDA_HOST_EQUALN_INIT(name, float64, int16, bool), \
2439
+ CUDA_HOST_EQUALN_INIT(name, float64, int32, bool), \
2440
+ CUDA_HOST_EQUALN_INIT(name, float64, bfloat16, bool), \
2441
+ CUDA_HOST_EQUALN_INIT(name, float64, float16, bool), \
2442
+ CUDA_HOST_EQUALN_INIT(name, float64, float32, bool), \
2443
+ CUDA_HOST_EQUALN_INIT(name, float64, float64, bool), \
2444
+ CUDA_HOST_EQUALN_INIT(name, float64, complex32, bool), \
2445
+ CUDA_HOST_EQUALN_INIT(name, float64, complex64, bool), \
2446
+ CUDA_HOST_EQUALN_INIT(name, float64, complex128, bool), \
2447
+ \
2448
+ CUDA_HOST_EQUALN_INIT(name, complex32, uint8, bool), \
2449
+ CUDA_HOST_EQUALN_INIT(name, complex32, uint16, bool), \
2450
+ CUDA_HOST_EQUALN_INIT(name, complex32, uint32, bool), \
2451
+ CUDA_HOST_EQUALN_INIT(name, complex32, int8, bool), \
2452
+ CUDA_HOST_EQUALN_INIT(name, complex32, int16, bool), \
2453
+ CUDA_HOST_EQUALN_INIT(name, complex32, int32, bool), \
2454
+ CUDA_HOST_EQUALN_INIT(name, complex32, bfloat16, bool), \
2455
+ CUDA_HOST_EQUALN_INIT(name, complex32, float16, bool), \
2456
+ CUDA_HOST_EQUALN_INIT(name, complex32, float32, bool), \
2457
+ CUDA_HOST_EQUALN_INIT(name, complex32, float64, bool), \
2458
+ CUDA_HOST_EQUALN_INIT(name, complex32, complex32, bool), \
2459
+ CUDA_HOST_EQUALN_INIT(name, complex32, complex64, bool), \
2460
+ CUDA_HOST_EQUALN_INIT(name, complex32, complex128, bool), \
2461
+ \
2462
+ CUDA_HOST_EQUALN_INIT(name, complex64, uint8, bool), \
2463
+ CUDA_HOST_EQUALN_INIT(name, complex64, uint16, bool), \
2464
+ CUDA_HOST_EQUALN_INIT(name, complex64, uint32, bool), \
2465
+ CUDA_HOST_EQUALN_INIT(name, complex64, int8, bool), \
2466
+ CUDA_HOST_EQUALN_INIT(name, complex64, int16, bool), \
2467
+ CUDA_HOST_EQUALN_INIT(name, complex64, int32, bool), \
2468
+ CUDA_HOST_EQUALN_INIT(name, complex64, bfloat16, bool), \
2469
+ CUDA_HOST_EQUALN_INIT(name, complex64, float16, bool), \
2470
+ CUDA_HOST_EQUALN_INIT(name, complex64, float32, bool), \
2471
+ CUDA_HOST_EQUALN_INIT(name, complex64, float64, bool), \
2472
+ CUDA_HOST_EQUALN_INIT(name, complex64, complex32, bool), \
2473
+ CUDA_HOST_EQUALN_INIT(name, complex64, complex64, bool), \
2474
+ CUDA_HOST_EQUALN_INIT(name, complex64, complex128, bool), \
2475
+ \
2476
+ CUDA_HOST_EQUALN_INIT(name, complex128, uint8, bool), \
2477
+ CUDA_HOST_EQUALN_INIT(name, complex128, uint16, bool), \
2478
+ CUDA_HOST_EQUALN_INIT(name, complex128, uint32, bool), \
2479
+ CUDA_HOST_EQUALN_INIT(name, complex128, int8, bool), \
2480
+ CUDA_HOST_EQUALN_INIT(name, complex128, int16, bool), \
2481
+ CUDA_HOST_EQUALN_INIT(name, complex128, int32, bool), \
2482
+ CUDA_HOST_EQUALN_INIT(name, complex128, bfloat16, bool), \
2483
+ CUDA_HOST_EQUALN_INIT(name, complex128, float16, bool), \
2484
+ CUDA_HOST_EQUALN_INIT(name, complex128, float32, bool), \
2485
+ CUDA_HOST_EQUALN_INIT(name, complex128, float64, bool), \
2486
+ CUDA_HOST_EQUALN_INIT(name, complex128, complex32, bool), \
2487
+ CUDA_HOST_EQUALN_INIT(name, complex128, complex64, bool), \
2488
+ CUDA_HOST_EQUALN_INIT(name, complex128, complex128, bool)
2489
+
2490
+
2491
/*
 * Expand CUDA_HOST_ALL_COMPARISON once for each comparison name.  Every
 * name expanded here also appears in the binary_kernels[] initializer
 * that follows.
 * NOTE(review): the head of the CUDA_HOST_ALL_COMPARISON definition lies
 * outside this excerpt; presumably each expansion defines the host kernel
 * functions for one operation across all listed type pairs -- confirm.
 */
+ CUDA_HOST_ALL_COMPARISON(less)
2492
+ CUDA_HOST_ALL_COMPARISON(less_equal)
2493
+ CUDA_HOST_ALL_COMPARISON(greater_equal)
2494
+ CUDA_HOST_ALL_COMPARISON(greater)
2495
+ CUDA_HOST_ALL_COMPARISON(equal)
2496
+ CUDA_HOST_ALL_COMPARISON(not_equal)
2497
+ CUDA_HOST_ALL_COMPARISON(equaln) /* table entries for equaln come from CUDA_HOST_ALL_EQUALN_INIT, not CUDA_HOST_ALL_COMPARISON_INIT */
2498
+
2499
+
2500
/*
 * binary_kernels: file-local, read-only table of gm_kernel_init_t entries
 * for the binary operations of this file.  Each *_INIT macro expands to a
 * comma-separated list of entries for one operation name:
 *   - add, subtract, multiply, floor_divide, remainder, power use
 *     CUDA_HOST_ALL_ARITHMETIC_INIT;
 *   - divide uses CUDA_HOST_ALL_ARITHMETIC_FLOAT_RETURN_INIT;
 *   - less, less_equal, greater_equal, greater, equal, not_equal use
 *     CUDA_HOST_ALL_COMPARISON_INIT;
 *   - equaln uses CUDA_HOST_ALL_EQUALN_INIT.
 * The last element has .name and .sig set to NULL.
 * NOTE(review): the code that walks this table is outside this excerpt;
 * presumably it stops at the all-NULL entry -- confirm before reordering
 * or appending entries after it.
 */
+ static const gm_kernel_init_t binary_kernels[] = {
2501
+ CUDA_HOST_ALL_ARITHMETIC_INIT(add),
2502
+ CUDA_HOST_ALL_ARITHMETIC_INIT(subtract),
2503
+ CUDA_HOST_ALL_ARITHMETIC_INIT(multiply),
2504
+ CUDA_HOST_ALL_ARITHMETIC_INIT(floor_divide),
2505
+ CUDA_HOST_ALL_ARITHMETIC_INIT(remainder),
2506
+ CUDA_HOST_ALL_ARITHMETIC_FLOAT_RETURN_INIT(divide),
2507
+ CUDA_HOST_ALL_ARITHMETIC_INIT(power),
2508
+ CUDA_HOST_ALL_COMPARISON_INIT(less),
2509
+ CUDA_HOST_ALL_COMPARISON_INIT(less_equal),
2510
+ CUDA_HOST_ALL_COMPARISON_INIT(greater_equal),
2511
+ CUDA_HOST_ALL_COMPARISON_INIT(greater),
2512
+ CUDA_HOST_ALL_COMPARISON_INIT(equal),
2513
+ CUDA_HOST_ALL_COMPARISON_INIT(not_equal),
2514
+ CUDA_HOST_ALL_EQUALN_INIT(equaln),
2515
+
2516
+ { .name = NULL, .sig = NULL } /* all-NULL final entry */
2517
+ };
2518
+
2519
+
2520
+ /*****************************************************************************/
2521
+ /* Bitwise */
2522
+ /*****************************************************************************/
2523
+
2524
/*
 * CUDA_HOST_ALL_BITWISE(name)
 *
 * Expands to one CUDA_HOST_BINARY(name, t0, t1, t2) invocation per listed
 * pair of bool/integer types, with nothing between the invocations.
 * Only bool, uint8..uint64 and int8..int64 appear; there are no float or
 * complex pairs.
 *
 * Third argument t2, as listed below:
 *   - bool paired with X (either order): X (bool,bool gives bool);
 *   - both unsigned or both signed: the wider of the two types;
 *   - unsigned paired with signed: a signed type wider than the unsigned
 *     operand and at least as wide as the signed one, e.g.
 *     uint8,int8 -> int16; uint16,int16 -> int32; uint32,int32 -> int64;
 *   - uint64 is paired only with bool and unsigned types; no pair mixes
 *     uint64 with a signed type.
 *
 * NOTE(review): CUDA_HOST_BINARY is defined outside this excerpt; t2 is
 * presumably the result type of the generated kernel -- confirm.
 */
+ #define CUDA_HOST_ALL_BITWISE(name) \
2525
+ CUDA_HOST_BINARY(name, bool, bool, bool) \
2526
+ CUDA_HOST_BINARY(name, bool, uint8, uint8) \
2527
+ CUDA_HOST_BINARY(name, bool, uint16, uint16) \
2528
+ CUDA_HOST_BINARY(name, bool, uint32, uint32) \
2529
+ CUDA_HOST_BINARY(name, bool, uint64, uint64) \
2530
+ CUDA_HOST_BINARY(name, bool, int8, int8) \
2531
+ CUDA_HOST_BINARY(name, bool, int16, int16) \
2532
+ CUDA_HOST_BINARY(name, bool, int32, int32) \
2533
+ CUDA_HOST_BINARY(name, bool, int64, int64) \
2534
+ \
2535
+ CUDA_HOST_BINARY(name, uint8, bool, uint8) \
2536
+ CUDA_HOST_BINARY(name, uint8, uint8, uint8) \
2537
+ CUDA_HOST_BINARY(name, uint8, uint16, uint16) \
2538
+ CUDA_HOST_BINARY(name, uint8, uint32, uint32) \
2539
+ CUDA_HOST_BINARY(name, uint8, uint64, uint64) \
2540
+ CUDA_HOST_BINARY(name, uint8, int8, int16) \
2541
+ CUDA_HOST_BINARY(name, uint8, int16, int16) \
2542
+ CUDA_HOST_BINARY(name, uint8, int32, int32) \
2543
+ CUDA_HOST_BINARY(name, uint8, int64, int64) \
2544
+ \
2545
+ CUDA_HOST_BINARY(name, uint16, bool, uint16) \
2546
+ CUDA_HOST_BINARY(name, uint16, uint8, uint16) \
2547
+ CUDA_HOST_BINARY(name, uint16, uint16, uint16) \
2548
+ CUDA_HOST_BINARY(name, uint16, uint32, uint32) \
2549
+ CUDA_HOST_BINARY(name, uint16, uint64, uint64) \
2550
+ CUDA_HOST_BINARY(name, uint16, int8, int32) \
2551
+ CUDA_HOST_BINARY(name, uint16, int16, int32) \
2552
+ CUDA_HOST_BINARY(name, uint16, int32, int32) \
2553
+ CUDA_HOST_BINARY(name, uint16, int64, int64) \
2554
+ \
2555
+ CUDA_HOST_BINARY(name, uint32, bool, uint32) \
2556
+ CUDA_HOST_BINARY(name, uint32, uint8, uint32) \
2557
+ CUDA_HOST_BINARY(name, uint32, uint16, uint32) \
2558
+ CUDA_HOST_BINARY(name, uint32, uint32, uint32) \
2559
+ CUDA_HOST_BINARY(name, uint32, uint64, uint64) \
2560
+ CUDA_HOST_BINARY(name, uint32, int8, int64) \
2561
+ CUDA_HOST_BINARY(name, uint32, int16, int64) \
2562
+ CUDA_HOST_BINARY(name, uint32, int32, int64) \
2563
+ CUDA_HOST_BINARY(name, uint32, int64, int64) \
2564
+ \
2565
+ CUDA_HOST_BINARY(name, uint64, bool, uint64) \
2566
+ CUDA_HOST_BINARY(name, uint64, uint8, uint64) \
2567
+ CUDA_HOST_BINARY(name, uint64, uint16, uint64) \
2568
+ CUDA_HOST_BINARY(name, uint64, uint32, uint64) \
2569
+ CUDA_HOST_BINARY(name, uint64, uint64, uint64) \
2570
+ \
2571
+ CUDA_HOST_BINARY(name, int8, bool, int8) \
2572
+ CUDA_HOST_BINARY(name, int8, uint8, int16) \
2573
+ CUDA_HOST_BINARY(name, int8, uint16, int32) \
2574
+ CUDA_HOST_BINARY(name, int8, uint32, int64) \
2575
+ CUDA_HOST_BINARY(name, int8, int8, int8) \
2576
+ CUDA_HOST_BINARY(name, int8, int16, int16) \
2577
+ CUDA_HOST_BINARY(name, int8, int32, int32) \
2578
+ CUDA_HOST_BINARY(name, int8, int64, int64) \
2579
+ \
2580
+ CUDA_HOST_BINARY(name, int16, bool, int16) \
2581
+ CUDA_HOST_BINARY(name, int16, uint8, int16) \
2582
+ CUDA_HOST_BINARY(name, int16, uint16, int32) \
2583
+ CUDA_HOST_BINARY(name, int16, uint32, int64) \
2584
+ CUDA_HOST_BINARY(name, int16, int8, int16) \
2585
+ CUDA_HOST_BINARY(name, int16, int16, int16) \
2586
+ CUDA_HOST_BINARY(name, int16, int32, int32) \
2587
+ CUDA_HOST_BINARY(name, int16, int64, int64) \
2588
+ \
2589
+ CUDA_HOST_BINARY(name, int32, bool, int32) \
2590
+ CUDA_HOST_BINARY(name, int32, uint8, int32) \
2591
+ CUDA_HOST_BINARY(name, int32, uint16, int32) \
2592
+ CUDA_HOST_BINARY(name, int32, uint32, int64) \
2593
+ CUDA_HOST_BINARY(name, int32, int8, int32) \
2594
+ CUDA_HOST_BINARY(name, int32, int16, int32) \
2595
+ CUDA_HOST_BINARY(name, int32, int32, int32) \
2596
+ CUDA_HOST_BINARY(name, int32, int64, int64) \
2597
+ \
2598
+ CUDA_HOST_BINARY(name, int64, bool, int64) \
2599
+ CUDA_HOST_BINARY(name, int64, uint8, int64) \
2600
+ CUDA_HOST_BINARY(name, int64, uint16, int64) \
2601
+ CUDA_HOST_BINARY(name, int64, uint32, int64) \
2602
+ CUDA_HOST_BINARY(name, int64, int8, int64) \
2603
+ CUDA_HOST_BINARY(name, int64, int16, int64) \
2604
+ CUDA_HOST_BINARY(name, int64, int32, int64) \
2605
+ CUDA_HOST_BINARY(name, int64, int64, int64)
2606
+
2607
/*
 * Expands to a comma-separated sequence of
 * CUDA_HOST_BINARY_INIT(name, t0, t1, t2) invocations, one per type triple
 * listed for the bitwise kernels.
 *
 * The first two types range over bool and the 8/16/32/64-bit unsigned and
 * signed integers.  uint64 is combined only with bool and the unsigned
 * types; no signed type is combined with uint64.  The third type is the one
 * listed next to each pair (presumably the result type -- confirm against
 * CUDA_HOST_BINARY_INIT).
 *
 * The last invocation carries no trailing comma; the user of this macro
 * supplies it.
 *
 * NOTE(review): the order of the entries may be significant for the kernel
 * lookup (bitwise_kernel_location) -- confirm before reordering.
 */
+ #define CUDA_HOST_ALL_BITWISE_INIT(name) \
2608
+ CUDA_HOST_BINARY_INIT(name, bool, bool, bool), \
2609
+ CUDA_HOST_BINARY_INIT(name, bool, uint8, uint8), \
2610
+ CUDA_HOST_BINARY_INIT(name, bool, uint16, uint16), \
2611
+ CUDA_HOST_BINARY_INIT(name, bool, uint32, uint32), \
2612
+ CUDA_HOST_BINARY_INIT(name, bool, uint64, uint64), \
2613
+ CUDA_HOST_BINARY_INIT(name, bool, int8, int8), \
2614
+ CUDA_HOST_BINARY_INIT(name, bool, int16, int16), \
2615
+ CUDA_HOST_BINARY_INIT(name, bool, int32, int32), \
2616
+ CUDA_HOST_BINARY_INIT(name, bool, int64, int64), \
2617
+ \
2618
+ CUDA_HOST_BINARY_INIT(name, uint8, bool, uint8), \
2619
+ CUDA_HOST_BINARY_INIT(name, uint8, uint8, uint8), \
2620
+ CUDA_HOST_BINARY_INIT(name, uint8, uint16, uint16), \
2621
+ CUDA_HOST_BINARY_INIT(name, uint8, uint32, uint32), \
2622
+ CUDA_HOST_BINARY_INIT(name, uint8, uint64, uint64), \
2623
+ CUDA_HOST_BINARY_INIT(name, uint8, int8, int16), \
2624
+ CUDA_HOST_BINARY_INIT(name, uint8, int16, int16), \
2625
+ CUDA_HOST_BINARY_INIT(name, uint8, int32, int32), \
2626
+ CUDA_HOST_BINARY_INIT(name, uint8, int64, int64), \
2627
+ \
2628
+ CUDA_HOST_BINARY_INIT(name, uint16, bool, uint16), \
2629
+ CUDA_HOST_BINARY_INIT(name, uint16, uint8, uint16), \
2630
+ CUDA_HOST_BINARY_INIT(name, uint16, uint16, uint16), \
2631
+ CUDA_HOST_BINARY_INIT(name, uint16, uint32, uint32), \
2632
+ CUDA_HOST_BINARY_INIT(name, uint16, uint64, uint64), \
2633
+ CUDA_HOST_BINARY_INIT(name, uint16, int8, int32), \
2634
+ CUDA_HOST_BINARY_INIT(name, uint16, int16, int32), \
2635
+ CUDA_HOST_BINARY_INIT(name, uint16, int32, int32), \
2636
+ CUDA_HOST_BINARY_INIT(name, uint16, int64, int64), \
2637
+ \
2638
+ CUDA_HOST_BINARY_INIT(name, uint32, bool, uint32), \
2639
+ CUDA_HOST_BINARY_INIT(name, uint32, uint8, uint32), \
2640
+ CUDA_HOST_BINARY_INIT(name, uint32, uint16, uint32), \
2641
+ CUDA_HOST_BINARY_INIT(name, uint32, uint32, uint32), \
2642
+ CUDA_HOST_BINARY_INIT(name, uint32, uint64, uint64), \
2643
+ CUDA_HOST_BINARY_INIT(name, uint32, int8, int64), \
2644
+ CUDA_HOST_BINARY_INIT(name, uint32, int16, int64), \
2645
+ CUDA_HOST_BINARY_INIT(name, uint32, int32, int64), \
2646
+ CUDA_HOST_BINARY_INIT(name, uint32, int64, int64), \
2647
+ \
2648
+ CUDA_HOST_BINARY_INIT(name, uint64, bool, uint64), \
2649
+ CUDA_HOST_BINARY_INIT(name, uint64, uint8, uint64), \
2650
+ CUDA_HOST_BINARY_INIT(name, uint64, uint16, uint64), \
2651
+ CUDA_HOST_BINARY_INIT(name, uint64, uint32, uint64), \
2652
+ CUDA_HOST_BINARY_INIT(name, uint64, uint64, uint64), \
2653
+ \
2654
+ CUDA_HOST_BINARY_INIT(name, int8, bool, int8), \
2655
+ CUDA_HOST_BINARY_INIT(name, int8, uint8, int16), \
2656
+ CUDA_HOST_BINARY_INIT(name, int8, uint16, int32), \
2657
+ CUDA_HOST_BINARY_INIT(name, int8, uint32, int64), \
2658
+ CUDA_HOST_BINARY_INIT(name, int8, int8, int8), \
2659
+ CUDA_HOST_BINARY_INIT(name, int8, int16, int16), \
2660
+ CUDA_HOST_BINARY_INIT(name, int8, int32, int32), \
2661
+ CUDA_HOST_BINARY_INIT(name, int8, int64, int64), \
2662
+ \
2663
+ CUDA_HOST_BINARY_INIT(name, int16, bool, int16), \
2664
+ CUDA_HOST_BINARY_INIT(name, int16, uint8, int16), \
2665
+ CUDA_HOST_BINARY_INIT(name, int16, uint16, int32), \
2666
+ CUDA_HOST_BINARY_INIT(name, int16, uint32, int64), \
2667
+ CUDA_HOST_BINARY_INIT(name, int16, int8, int16), \
2668
+ CUDA_HOST_BINARY_INIT(name, int16, int16, int16), \
2669
+ CUDA_HOST_BINARY_INIT(name, int16, int32, int32), \
2670
+ CUDA_HOST_BINARY_INIT(name, int16, int64, int64), \
2671
+ \
2672
+ CUDA_HOST_BINARY_INIT(name, int32, bool, int32), \
2673
+ CUDA_HOST_BINARY_INIT(name, int32, uint8, int32), \
2674
+ CUDA_HOST_BINARY_INIT(name, int32, uint16, int32), \
2675
+ CUDA_HOST_BINARY_INIT(name, int32, uint32, int64), \
2676
+ CUDA_HOST_BINARY_INIT(name, int32, int8, int32), \
2677
+ CUDA_HOST_BINARY_INIT(name, int32, int16, int32), \
2678
+ CUDA_HOST_BINARY_INIT(name, int32, int32, int32), \
2679
+ CUDA_HOST_BINARY_INIT(name, int32, int64, int64), \
2680
+ \
2681
+ CUDA_HOST_BINARY_INIT(name, int64, bool, int64), \
2682
+ CUDA_HOST_BINARY_INIT(name, int64, uint8, int64), \
2683
+ CUDA_HOST_BINARY_INIT(name, int64, uint16, int64), \
2684
+ CUDA_HOST_BINARY_INIT(name, int64, uint32, int64), \
2685
+ CUDA_HOST_BINARY_INIT(name, int64, int8, int64), \
2686
+ CUDA_HOST_BINARY_INIT(name, int64, int16, int64), \
2687
+ CUDA_HOST_BINARY_INIT(name, int64, int32, int64), \
2688
+ CUDA_HOST_BINARY_INIT(name, int64, int64, int64)
2689
+
2690
+
2691
/*
 * Expand CUDA_HOST_ALL_BITWISE once for each of the three bitwise functions
 * (and, or, xor).  The resulting names are the ones referenced by the
 * matching CUDA_HOST_ALL_BITWISE_INIT entries in bitwise_kernels[].
 * NOTE(review): presumably each expansion defines one static host wrapper
 * per type triple -- confirm against CUDA_HOST_BINARY.
 */
+ CUDA_HOST_ALL_BITWISE(bitwise_and)
2692
+ CUDA_HOST_ALL_BITWISE(bitwise_or)
2693
+ CUDA_HOST_ALL_BITWISE(bitwise_xor)
2694
+
2695
+
2696
/*
 * Kernel initializer table for bitwise_and, bitwise_or and bitwise_xor.
 * The table ends with a sentinel entry whose name and sig are NULL;
 * gm_init_cuda_binary_kernels() walks it until it reaches that entry.
 */
+ static const gm_kernel_init_t bitwise_kernels[] = {
2697
+ CUDA_HOST_ALL_BITWISE_INIT(bitwise_and),
2698
+ CUDA_HOST_ALL_BITWISE_INIT(bitwise_or),
2699
+ CUDA_HOST_ALL_BITWISE_INIT(bitwise_xor),
2700
+
2701
+ { .name = NULL, .sig = NULL }
2702
+ };
2703
+
2704
+
2705
+ /****************************************************************************/
2706
+ /* Two return values */
2707
+ /****************************************************************************/
2708
+
2709
/*
 * Defines the static host-side wrapper
 *   gm_cuda_host_fixed_1D_C_<name>_<t0>_<t1>_<t2>_<t3>(stack, ctx)
 * for a function with two inputs and two outputs.
 *
 * stack[0] and stack[1] are read as inputs, stack[2] and stack[3] are the
 * writable outputs; each pointer is obtained with apply_index().  The
 * element count N is taken from xnd_fixed_shape(&stack[0]).  The wrapper
 * forwards the four pointers and N to the device function of the same
 * name (gm_cuda_device_fixed_1D_C_...), ignores ctx, and always returns 0.
 */
+ #define CUDA_HOST_BINARY_MV(name, t0, t1, t2, t3) \
2710
+ static int \
2711
+ gm_cuda_host_fixed_1D_C_##name##_##t0##_##t1##_##t2##_##t3(xnd_t stack[], ndt_context_t *ctx) \
2712
+ { \
2713
+ const char *a0 = apply_index(&stack[0]); \
2714
+ const char *a1 = apply_index(&stack[1]); \
2715
+ char *a2 = apply_index(&stack[2]); \
2716
+ char *a3 = apply_index(&stack[3]); \
2717
+ const int64_t N = xnd_fixed_shape(&stack[0]); \
2718
+ (void)ctx; \
2719
+ \
2720
+ gm_cuda_device_fixed_1D_C_##name##_##t0##_##t1##_##t2##_##t3( \
2721
+ a0, a1, a2, a3, N); \
2722
+ \
2723
+ return 0; \
2724
+ }
2725
+
2726
/*
 * Expands to a designated initializer for one two-input/two-output kernel:
 *   .name  the stringized function name,
 *   .sig   "... * t0, ... * t1 -> ... * t2, ... * t3",
 *   .OptC  the host wrapper generated by CUDA_HOST_BINARY_MV for the same
 *          function name and type quadruple.
 */
+ #define CUDA_HOST_BINARY_MV_INIT(func, t0, t1, t2, t3) \
2727
+ { .name = STRINGIZE(func), \
2728
+ .sig = "... * " STRINGIZE(t0) ", ... * " STRINGIZE(t1) " -> " \
2729
+ "... * " STRINGIZE(t2) ", ... * " STRINGIZE(t3), \
2730
+ .OptC = gm_cuda_host_fixed_1D_C_##func##_##t0##_##t1##_##t2##_##t3 }
2731
+
2732
/*
 * Expands CUDA_HOST_BINARY_MV(name, T, T, T, T) for every supported element
 * type T: the 8/16/32/64-bit unsigned and signed integers, bfloat16,
 * float32 and float64.  Inputs and outputs always share the same type.
 * Keep this list in step with CUDA_HOST_ALL_BINARY_MV_INIT.
 */
+ #define CUDA_HOST_ALL_BINARY_MV(name) \
2733
+ CUDA_HOST_BINARY_MV(name, uint8, uint8, uint8, uint8) \
2734
+ CUDA_HOST_BINARY_MV(name, uint16, uint16, uint16, uint16) \
2735
+ CUDA_HOST_BINARY_MV(name, uint32, uint32, uint32, uint32) \
2736
+ CUDA_HOST_BINARY_MV(name, uint64, uint64, uint64, uint64) \
2737
+ CUDA_HOST_BINARY_MV(name, int8, int8, int8, int8) \
2738
+ CUDA_HOST_BINARY_MV(name, int16, int16, int16, int16) \
2739
+ CUDA_HOST_BINARY_MV(name, int32, int32, int32, int32) \
2740
+ CUDA_HOST_BINARY_MV(name, int64, int64, int64, int64) \
2741
+ CUDA_HOST_BINARY_MV(name, bfloat16, bfloat16, bfloat16, bfloat16) \
2742
+ CUDA_HOST_BINARY_MV(name, float32, float32, float32, float32) \
2743
+ CUDA_HOST_BINARY_MV(name, float64, float64, float64, float64)
2744
+
2745
/*
 * Expands to a comma-separated list of CUDA_HOST_BINARY_MV_INIT entries,
 * one per type quadruple generated by CUDA_HOST_ALL_BINARY_MV and in the
 * same order.  The last entry has no trailing comma; the user of the macro
 * supplies it.
 */
+ #define CUDA_HOST_ALL_BINARY_MV_INIT(name) \
2746
+ CUDA_HOST_BINARY_MV_INIT(name, uint8, uint8, uint8, uint8), \
2747
+ CUDA_HOST_BINARY_MV_INIT(name, uint16, uint16, uint16, uint16), \
2748
+ CUDA_HOST_BINARY_MV_INIT(name, uint32, uint32, uint32, uint32), \
2749
+ CUDA_HOST_BINARY_MV_INIT(name, uint64, uint64, uint64, uint64), \
2750
+ CUDA_HOST_BINARY_MV_INIT(name, int8, int8, int8, int8), \
2751
+ CUDA_HOST_BINARY_MV_INIT(name, int16, int16, int16, int16), \
2752
+ CUDA_HOST_BINARY_MV_INIT(name, int32, int32, int32, int32), \
2753
+ CUDA_HOST_BINARY_MV_INIT(name, int64, int64, int64, int64), \
2754
+ CUDA_HOST_BINARY_MV_INIT(name, bfloat16, bfloat16, bfloat16, bfloat16), \
2755
+ CUDA_HOST_BINARY_MV_INIT(name, float32, float32, float32, float32), \
2756
+ CUDA_HOST_BINARY_MV_INIT(name, float64, float64, float64, float64)
2757
+
2758
/* Generate the divmod host wrappers (two inputs, two outputs) for every
   type listed in CUDA_HOST_ALL_BINARY_MV. */
+ CUDA_HOST_ALL_BINARY_MV(divmod)
2759
+
2760
+
2761
/*
 * Kernel initializer table for the two-output functions (currently only
 * divmod).  Terminated by a sentinel entry whose name and sig are NULL;
 * gm_init_cuda_binary_kernels() registers each entry with gm_add_kernel().
 */
+ static const gm_kernel_init_t binary_mv_kernels[] = {
2762
+ CUDA_HOST_ALL_BINARY_MV_INIT(divmod),
2763
+
2764
+ { .name = NULL, .sig = NULL }
2765
+ };
2766
+
2767
+
2768
+ /****************************************************************************/
2769
+ /* Initialize kernel table */
2770
+ /****************************************************************************/
2771
+
2772
/*
 * Make `bool` available as a type name for the function signatures below.
 * NOTE(review): presumably declared only at this point (instead of using
 * <stdbool.h>) so that `bool` stays a plain identifier while the kernel
 * macros above paste and stringize it as a type name -- confirm.
 */
+ typedef _Bool bool;
2773
+
2774
/*
 * Typecheck callback registered for the entries of binary_kernels[].
 * Forwards all of its arguments unchanged to cuda_binary_typecheck(),
 * passing binary_kernel_location as the kernel-location function, and
 * returns whatever that call returns.
 */
+ static const gm_kernel_set_t *
2775
+ binary_typecheck(ndt_apply_spec_t *spec, const gm_func_t *f, const ndt_t *types[],
2776
+ const int64_t li[], int nin, int nout, bool check_broadcast,
2777
+ ndt_context_t *ctx)
2778
+ {
2779
+ return cuda_binary_typecheck(binary_kernel_location, spec, f, types, li,
2780
+ nin, nout, check_broadcast, ctx);
2781
+ }
2782
+
2783
/*
 * Typecheck callback registered for the entries of bitwise_kernels[].
 * Identical to binary_typecheck() except that bitwise_kernel_location is
 * passed to cuda_binary_typecheck() as the kernel-location function.
 */
+ static const gm_kernel_set_t *
2784
+ bitwise_typecheck(ndt_apply_spec_t *spec, const gm_func_t *f, const ndt_t *types[],
2785
+ const int64_t li[], int nin, int nout, bool check_broadcast,
2786
+ ndt_context_t *ctx)
2787
+ {
2788
+ return cuda_binary_typecheck(bitwise_kernel_location, spec, f, types, li,
2789
+ nin, nout, check_broadcast, ctx);
2790
+ }
2791
+
2792
+
2793
/*
 * Register all CUDA binary kernels defined in this file with `tbl`.
 *
 * Three sentinel-terminated tables are walked in order: binary_kernels and
 * bitwise_kernels are added through gm_add_kernel_typecheck() with their
 * respective typecheck callbacks, binary_mv_kernels through gm_add_kernel().
 *
 * tbl: table handed to the registration functions.
 * ctx: context handed to the registration functions (presumably receives
 *      the error details on failure -- confirm).
 *
 * Returns 0 if every registration succeeds, or -1 as soon as one of the
 * registration calls returns a negative value; the remaining kernels are
 * then not registered.
 */
+ int
2794
+ gm_init_cuda_binary_kernels(gm_tbl_t *tbl, ndt_context_t *ctx)
2795
+ {
2796
+ const gm_kernel_init_t *k;
2797
+
2798
/* Kernels selected through binary_typecheck(). */
+ for (k = binary_kernels; k->name != NULL; k++) {
2799
+ if (gm_add_kernel_typecheck(tbl, k, ctx, &binary_typecheck) < 0) {
2800
+ return -1;
2801
+ }
2802
+ }
2803
+
2804
/* Bitwise kernels, selected through bitwise_typecheck(). */
+ for (k = bitwise_kernels; k->name != NULL; k++) {
2805
+ if (gm_add_kernel_typecheck(tbl, k, ctx, &bitwise_typecheck) < 0) {
2806
+ return -1;
2807
+ }
2808
+ }
2809
+
2810
/* Two-output kernels are registered without a custom typecheck callback. */
+ for (k = binary_mv_kernels; k->name != NULL; k++) {
2811
+ if (gm_add_kernel(tbl, k, ctx) < 0) {
2812
+ return -1;
2813
+ }
2814
+ }
2815
+
2816
+ return 0;
2817
+ }