gumath 0.2.0dev5 → 0.2.0dev8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (99) hide show
  1. checksums.yaml +4 -4
  2. data/CONTRIBUTING.md +7 -2
  3. data/Gemfile +0 -3
  4. data/ext/ruby_gumath/GPATH +0 -0
  5. data/ext/ruby_gumath/GRTAGS +0 -0
  6. data/ext/ruby_gumath/GTAGS +0 -0
  7. data/ext/ruby_gumath/extconf.rb +0 -5
  8. data/ext/ruby_gumath/functions.c +10 -2
  9. data/ext/ruby_gumath/gufunc_object.c +15 -4
  10. data/ext/ruby_gumath/gufunc_object.h +9 -3
  11. data/ext/ruby_gumath/gumath/Makefile +63 -0
  12. data/ext/ruby_gumath/gumath/Makefile.in +1 -0
  13. data/ext/ruby_gumath/gumath/config.h +56 -0
  14. data/ext/ruby_gumath/gumath/config.h.in +3 -0
  15. data/ext/ruby_gumath/gumath/config.log +497 -0
  16. data/ext/ruby_gumath/gumath/config.status +1034 -0
  17. data/ext/ruby_gumath/gumath/configure +375 -4
  18. data/ext/ruby_gumath/gumath/configure.ac +47 -3
  19. data/ext/ruby_gumath/gumath/libgumath/Makefile +236 -0
  20. data/ext/ruby_gumath/gumath/libgumath/Makefile.in +90 -24
  21. data/ext/ruby_gumath/gumath/libgumath/Makefile.vc +54 -15
  22. data/ext/ruby_gumath/gumath/libgumath/apply.c +92 -28
  23. data/ext/ruby_gumath/gumath/libgumath/apply.o +0 -0
  24. data/ext/ruby_gumath/gumath/libgumath/common.o +0 -0
  25. data/ext/ruby_gumath/gumath/libgumath/cpu_device_binary.o +0 -0
  26. data/ext/ruby_gumath/gumath/libgumath/cpu_device_unary.o +0 -0
  27. data/ext/ruby_gumath/gumath/libgumath/cpu_host_binary.o +0 -0
  28. data/ext/ruby_gumath/gumath/libgumath/cpu_host_unary.o +0 -0
  29. data/ext/ruby_gumath/gumath/libgumath/examples.o +0 -0
  30. data/ext/ruby_gumath/gumath/libgumath/extending/graph.c +27 -20
  31. data/ext/ruby_gumath/gumath/libgumath/extending/pdist.c +1 -1
  32. data/ext/ruby_gumath/gumath/libgumath/func.c +13 -9
  33. data/ext/ruby_gumath/gumath/libgumath/func.o +0 -0
  34. data/ext/ruby_gumath/gumath/libgumath/graph.o +0 -0
  35. data/ext/ruby_gumath/gumath/libgumath/gumath.h +55 -14
  36. data/ext/ruby_gumath/gumath/libgumath/kernels/common.c +513 -0
  37. data/ext/ruby_gumath/gumath/libgumath/kernels/common.h +155 -0
  38. data/ext/ruby_gumath/gumath/libgumath/kernels/contrib/bfloat16.h +520 -0
  39. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.cc +1123 -0
  40. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_binary.h +1062 -0
  41. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_msvc.cc +555 -0
  42. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.cc +368 -0
  43. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_device_unary.h +335 -0
  44. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_binary.c +2952 -0
  45. data/ext/ruby_gumath/gumath/libgumath/kernels/cpu_host_unary.c +1100 -0
  46. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.cu +1143 -0
  47. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_binary.h +1061 -0
  48. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.cu +528 -0
  49. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_device_unary.h +463 -0
  50. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_binary.c +2817 -0
  51. data/ext/ruby_gumath/gumath/libgumath/kernels/cuda_host_unary.c +1331 -0
  52. data/ext/ruby_gumath/gumath/libgumath/kernels/device.hh +614 -0
  53. data/ext/ruby_gumath/gumath/libgumath/libgumath.a +0 -0
  54. data/ext/ruby_gumath/gumath/libgumath/libgumath.so +1 -0
  55. data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0 +1 -0
  56. data/ext/ruby_gumath/gumath/libgumath/libgumath.so.0.2.0dev3 +0 -0
  57. data/ext/ruby_gumath/gumath/libgumath/nploops.o +0 -0
  58. data/ext/ruby_gumath/gumath/libgumath/pdist.o +0 -0
  59. data/ext/ruby_gumath/gumath/libgumath/quaternion.o +0 -0
  60. data/ext/ruby_gumath/gumath/libgumath/tbl.o +0 -0
  61. data/ext/ruby_gumath/gumath/libgumath/thread.c +17 -4
  62. data/ext/ruby_gumath/gumath/libgumath/thread.o +0 -0
  63. data/ext/ruby_gumath/gumath/libgumath/xndloops.c +110 -0
  64. data/ext/ruby_gumath/gumath/libgumath/xndloops.o +0 -0
  65. data/ext/ruby_gumath/gumath/python/gumath/__init__.py +150 -0
  66. data/ext/ruby_gumath/gumath/python/gumath/_gumath.c +446 -80
  67. data/ext/ruby_gumath/gumath/python/gumath/cuda.c +78 -0
  68. data/ext/ruby_gumath/gumath/python/gumath/examples.c +0 -5
  69. data/ext/ruby_gumath/gumath/python/gumath/functions.c +2 -2
  70. data/ext/ruby_gumath/gumath/python/gumath/gumath.h +246 -0
  71. data/ext/ruby_gumath/gumath/python/gumath/libgumath.a +0 -0
  72. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so +1 -0
  73. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0 +1 -0
  74. data/ext/ruby_gumath/gumath/python/gumath/libgumath.so.0.2.0dev3 +0 -0
  75. data/ext/ruby_gumath/gumath/python/gumath/pygumath.h +31 -2
  76. data/ext/ruby_gumath/gumath/python/gumath_aux.py +767 -0
  77. data/ext/ruby_gumath/gumath/python/randdec.py +535 -0
  78. data/ext/ruby_gumath/gumath/python/randfloat.py +177 -0
  79. data/ext/ruby_gumath/gumath/python/test_gumath.py +1504 -24
  80. data/ext/ruby_gumath/gumath/python/test_xndarray.py +462 -0
  81. data/ext/ruby_gumath/gumath/setup.py +67 -6
  82. data/ext/ruby_gumath/gumath/tools/detect_cuda_arch.cc +35 -0
  83. data/ext/ruby_gumath/include/gumath.h +55 -14
  84. data/ext/ruby_gumath/include/ruby_gumath.h +4 -1
  85. data/ext/ruby_gumath/lib/libgumath.a +0 -0
  86. data/ext/ruby_gumath/lib/libgumath.so.0.2.0dev3 +0 -0
  87. data/ext/ruby_gumath/ruby_gumath.c +231 -70
  88. data/ext/ruby_gumath/ruby_gumath.h +4 -1
  89. data/ext/ruby_gumath/ruby_gumath_internal.h +25 -0
  90. data/ext/ruby_gumath/util.c +34 -0
  91. data/ext/ruby_gumath/util.h +9 -0
  92. data/gumath.gemspec +3 -2
  93. data/lib/gumath.rb +55 -1
  94. data/lib/gumath/version.rb +2 -2
  95. data/lib/ruby_gumath.so +0 -0
  96. metadata +63 -10
  97. data/ext/ruby_gumath/gumath/libgumath/extending/bfloat16.c +0 -130
  98. data/ext/ruby_gumath/gumath/libgumath/kernels/binary.c +0 -547
  99. data/ext/ruby_gumath/gumath/libgumath/kernels/unary.c +0 -449
@@ -0,0 +1,463 @@
1
+ /*
2
+ * BSD 3-Clause License
3
+ *
4
+ * Copyright (c) 2017-2018, plures
5
+ * All rights reserved.
6
+ *
7
+ * Redistribution and use in source and binary forms, with or without
8
+ * modification, are permitted provided that the following conditions are met:
9
+ *
10
+ * 1. Redistributions of source code must retain the above copyright notice,
11
+ * this list of conditions and the following disclaimer.
12
+ *
13
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
14
+ * this list of conditions and the following disclaimer in the documentation
15
+ * and/or other materials provided with the distribution.
16
+ *
17
+ * 3. Neither the name of the copyright holder nor the names of its
18
+ * contributors may be used to endorse or promote products derived from
19
+ * this software without specific prior written permission.
20
+ *
21
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ */
32
+
33
+
34
+ #ifndef CUDA_DEVICE_UNARY_H
35
+ #define CUDA_DEVICE_UNARY_H
36
+
37
+
38
/*
 * Scalar type aliases used by the kernel declarations in this header.
 *
 * C++ translation units additionally get the 16-bit float and complex
 * aliases built on cuda_fp16.h, contrib/bfloat16.h and thrust/complex.h.
 * C translation units only need the fixed-width integers and bool.
 */
#ifdef __cplusplus
#include <cinttypes>
#include <cuda_fp16.h>
#include <thrust/complex.h>
#include "contrib/bfloat16.h"

typedef half float16_t;
typedef tf::bfloat16 bfloat16_t;
typedef thrust::complex<float> complex64_t;
typedef thrust::complex<double> complex128_t;
#else
/* bool is not a keyword in C before C23; without this include the bool_t
 * typedef below only compiles if some other header was included first. */
#include <stdbool.h>
#include <stdint.h>
#endif


typedef bool bool_t;
typedef float float32_t;
typedef double float64_t;
56
+
57
+
58
+ /*****************************************************************************/
59
+ /* Cuda device kernel signature */
60
+ /*****************************************************************************/
61
+
62
/*
 * Linkage prefix for the kernel prototypes: C++ translation units declare
 * the kernels extern "C", C translation units use no prefix.
 */
#ifdef __cplusplus
#define CUDA_DEVICE_LINKAGE extern "C"
#else
#define CUDA_DEVICE_LINKAGE
#endif

/*
 * Declare the three prototypes of the unary kernel "name" from t0 to t1:
 *
 *   gm_cuda_device_fixed_1D_C_<name>_<t0>_<t1>(a0, a1, N)
 *   gm_cuda_device_fixed_1D_S_<name>_<t0>_<t1>(a0, a1, s0, s1, N)
 *   gm_cuda_device_0D_<name>_<t0>_<t1>(a0, a1)
 *
 * NOTE(review): s0/s1 are presumably the strides of a0/a1 -- confirm
 * against the kernel definitions.
 */
#define CUDA_DEVICE_UNARY_DECL(name, t0, t1) \
CUDA_DEVICE_LINKAGE void gm_cuda_device_fixed_1D_C_##name##_##t0##_##t1( \
    const char *a0, char *a1, \
    const int64_t N); \
CUDA_DEVICE_LINKAGE void gm_cuda_device_fixed_1D_S_##name##_##t0##_##t1( \
    const char *a0, char *a1, \
    const int64_t s0, const int64_t s1, \
    const int64_t N); \
CUDA_DEVICE_LINKAGE void gm_cuda_device_0D_##name##_##t0##_##t1( \
    const char *a0, char *a1);

/* Declare the single prototype of the 1D reduction kernel "name". */
#define CUDA_DEVICE_UNARY_REDUCE_DECL(name, t0, t1) \
CUDA_DEVICE_LINKAGE void gm_cuda_device_1D_C_reduce_##name##_##t0##_##t1( \
    const char *a0, char *a1, \
    const int64_t N);

/* Placeholder for a type pair without a kernel: expands to nothing. */
#define CUDA_DEVICE_NOIMPL_DECL(name, t0, t1)
89
+
90
+
91
+ /*****************************************************************************/
92
+ /* Copy, abs */
93
+ /*****************************************************************************/
94
+
95
/*
 * Declare the unary kernel "name" for every listed (input, output) type
 * pair.  Rows are grouped by input type.  Pairs listed with
 * CUDA_DEVICE_NOIMPL_DECL (every complex32 combination) get no prototype.
 */
#define CUDA_DEVICE_ALL_UNARY_DECL(name) \
    /* bool */ \
    CUDA_DEVICE_UNARY_DECL(name, bool, bool) \
    CUDA_DEVICE_UNARY_DECL(name, bool, uint8) \
    CUDA_DEVICE_UNARY_DECL(name, bool, uint16) \
    CUDA_DEVICE_UNARY_DECL(name, bool, uint32) \
    CUDA_DEVICE_UNARY_DECL(name, bool, uint64) \
    CUDA_DEVICE_UNARY_DECL(name, bool, int8) \
    CUDA_DEVICE_UNARY_DECL(name, bool, int16) \
    CUDA_DEVICE_UNARY_DECL(name, bool, int32) \
    CUDA_DEVICE_UNARY_DECL(name, bool, int64) \
    CUDA_DEVICE_UNARY_DECL(name, bool, bfloat16) \
    CUDA_DEVICE_UNARY_DECL(name, bool, float16) \
    CUDA_DEVICE_UNARY_DECL(name, bool, float32) \
    CUDA_DEVICE_UNARY_DECL(name, bool, float64) \
    CUDA_DEVICE_NOIMPL_DECL(name, bool, complex32) \
    CUDA_DEVICE_UNARY_DECL(name, bool, complex64) \
    CUDA_DEVICE_UNARY_DECL(name, bool, complex128) \
    /* uint8 */ \
    CUDA_DEVICE_UNARY_DECL(name, uint8, uint8) \
    CUDA_DEVICE_UNARY_DECL(name, uint8, uint16) \
    CUDA_DEVICE_UNARY_DECL(name, uint8, uint32) \
    CUDA_DEVICE_UNARY_DECL(name, uint8, uint64) \
    CUDA_DEVICE_UNARY_DECL(name, uint8, int16) \
    CUDA_DEVICE_UNARY_DECL(name, uint8, int32) \
    CUDA_DEVICE_UNARY_DECL(name, uint8, int64) \
    CUDA_DEVICE_UNARY_DECL(name, uint8, bfloat16) \
    CUDA_DEVICE_UNARY_DECL(name, uint8, float16) \
    CUDA_DEVICE_UNARY_DECL(name, uint8, float32) \
    CUDA_DEVICE_UNARY_DECL(name, uint8, float64) \
    CUDA_DEVICE_NOIMPL_DECL(name, uint8, complex32) \
    CUDA_DEVICE_UNARY_DECL(name, uint8, complex64) \
    CUDA_DEVICE_UNARY_DECL(name, uint8, complex128) \
    /* uint16 */ \
    CUDA_DEVICE_UNARY_DECL(name, uint16, uint16) \
    CUDA_DEVICE_UNARY_DECL(name, uint16, uint32) \
    CUDA_DEVICE_UNARY_DECL(name, uint16, uint64) \
    CUDA_DEVICE_UNARY_DECL(name, uint16, int32) \
    CUDA_DEVICE_UNARY_DECL(name, uint16, int64) \
    CUDA_DEVICE_UNARY_DECL(name, uint16, float32) \
    CUDA_DEVICE_UNARY_DECL(name, uint16, float64) \
    CUDA_DEVICE_UNARY_DECL(name, uint16, complex64) \
    CUDA_DEVICE_UNARY_DECL(name, uint16, complex128) \
    /* uint32 */ \
    CUDA_DEVICE_UNARY_DECL(name, uint32, uint32) \
    CUDA_DEVICE_UNARY_DECL(name, uint32, uint64) \
    CUDA_DEVICE_UNARY_DECL(name, uint32, int64) \
    CUDA_DEVICE_UNARY_DECL(name, uint32, float64) \
    CUDA_DEVICE_UNARY_DECL(name, uint32, complex128) \
    /* uint64 */ \
    CUDA_DEVICE_UNARY_DECL(name, uint64, uint64) \
    /* int8 */ \
    CUDA_DEVICE_UNARY_DECL(name, int8, int8) \
    CUDA_DEVICE_UNARY_DECL(name, int8, int16) \
    CUDA_DEVICE_UNARY_DECL(name, int8, int32) \
    CUDA_DEVICE_UNARY_DECL(name, int8, int64) \
    CUDA_DEVICE_UNARY_DECL(name, int8, bfloat16) \
    CUDA_DEVICE_UNARY_DECL(name, int8, float16) \
    CUDA_DEVICE_UNARY_DECL(name, int8, float32) \
    CUDA_DEVICE_UNARY_DECL(name, int8, float64) \
    CUDA_DEVICE_NOIMPL_DECL(name, int8, complex32) \
    CUDA_DEVICE_UNARY_DECL(name, int8, complex64) \
    CUDA_DEVICE_UNARY_DECL(name, int8, complex128) \
    /* int16 */ \
    CUDA_DEVICE_UNARY_DECL(name, int16, int16) \
    CUDA_DEVICE_UNARY_DECL(name, int16, int32) \
    CUDA_DEVICE_UNARY_DECL(name, int16, int64) \
    CUDA_DEVICE_UNARY_DECL(name, int16, float32) \
    CUDA_DEVICE_UNARY_DECL(name, int16, float64) \
    CUDA_DEVICE_UNARY_DECL(name, int16, complex64) \
    CUDA_DEVICE_UNARY_DECL(name, int16, complex128) \
    /* int32 */ \
    CUDA_DEVICE_UNARY_DECL(name, int32, int32) \
    CUDA_DEVICE_UNARY_DECL(name, int32, int64) \
    CUDA_DEVICE_UNARY_DECL(name, int32, float64) \
    CUDA_DEVICE_UNARY_DECL(name, int32, complex128) \
    /* int64 */ \
    CUDA_DEVICE_UNARY_DECL(name, int64, int64) \
    /* bfloat16 */ \
    CUDA_DEVICE_UNARY_DECL(name, bfloat16, bfloat16) \
    CUDA_DEVICE_UNARY_DECL(name, bfloat16, float32) \
    CUDA_DEVICE_UNARY_DECL(name, bfloat16, float64) \
    CUDA_DEVICE_UNARY_DECL(name, bfloat16, complex64) \
    CUDA_DEVICE_UNARY_DECL(name, bfloat16, complex128) \
    /* float16 */ \
    CUDA_DEVICE_UNARY_DECL(name, float16, float16) \
    CUDA_DEVICE_UNARY_DECL(name, float16, float32) \
    CUDA_DEVICE_UNARY_DECL(name, float16, float64) \
    CUDA_DEVICE_NOIMPL_DECL(name, float16, complex32) \
    CUDA_DEVICE_UNARY_DECL(name, float16, complex64) \
    CUDA_DEVICE_UNARY_DECL(name, float16, complex128) \
    /* float32 */ \
    CUDA_DEVICE_UNARY_DECL(name, float32, float32) \
    CUDA_DEVICE_UNARY_DECL(name, float32, float64) \
    CUDA_DEVICE_UNARY_DECL(name, float32, complex64) \
    CUDA_DEVICE_UNARY_DECL(name, float32, complex128) \
    /* float64 */ \
    CUDA_DEVICE_UNARY_DECL(name, float64, float64) \
    CUDA_DEVICE_UNARY_DECL(name, float64, complex128) \
    /* complex32 */ \
    CUDA_DEVICE_NOIMPL_DECL(name, complex32, complex32) \
    CUDA_DEVICE_NOIMPL_DECL(name, complex32, complex64) \
    CUDA_DEVICE_NOIMPL_DECL(name, complex32, complex128) \
    /* complex64 */ \
    CUDA_DEVICE_UNARY_DECL(name, complex64, complex64) \
    CUDA_DEVICE_UNARY_DECL(name, complex64, complex128) \
    /* complex128 */ \
    CUDA_DEVICE_UNARY_DECL(name, complex128, complex128)
187
+
188
+
189
+ CUDA_DEVICE_ALL_UNARY_DECL(copy)
190
+ CUDA_DEVICE_ALL_UNARY_DECL(abs)
191
+
192
+
193
+ /*****************************************************************************/
194
+ /* Reduce */
195
+ /*****************************************************************************/
196
+
197
/*
 * Declare the 1D reduction kernel "name" for every listed (input, output)
 * type pair.  Rows are grouped by input type; pairs listed with
 * CUDA_DEVICE_NOIMPL_DECL get no prototype.
 *
 * NOTE(review): float32 -> complex64/complex128 and float64 -> complex128
 * are declared, while every other complex result in this table is NOIMPL.
 * Confirm that these three reductions really have kernels.
 */
#define CUDA_DEVICE_ALL_UNARY_REDUCE_DECL(name) \
    /* bool */ \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, bool, bool) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, bool, uint8) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, bool, uint16) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, bool, uint32) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, bool, uint64) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, bool, int8) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, bool, int16) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, bool, int32) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, bool, int64) \
    CUDA_DEVICE_NOIMPL_DECL(name, bool, bfloat16) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, bool, float16) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, bool, float32) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, bool, float64) \
    CUDA_DEVICE_NOIMPL_DECL(name, bool, complex32) \
    CUDA_DEVICE_NOIMPL_DECL(name, bool, complex64) \
    CUDA_DEVICE_NOIMPL_DECL(name, bool, complex128) \
    /* uint8 */ \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint8, uint8) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint8, uint16) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint8, uint32) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint8, uint64) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint8, int16) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint8, int32) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint8, int64) \
    CUDA_DEVICE_NOIMPL_DECL(name, uint8, bfloat16) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint8, float16) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint8, float32) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint8, float64) \
    CUDA_DEVICE_NOIMPL_DECL(name, uint8, complex32) \
    CUDA_DEVICE_NOIMPL_DECL(name, uint8, complex64) \
    CUDA_DEVICE_NOIMPL_DECL(name, uint8, complex128) \
    /* uint16 */ \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint16, uint16) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint16, uint32) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint16, uint64) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint16, int32) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint16, int64) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint16, float32) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint16, float64) \
    CUDA_DEVICE_NOIMPL_DECL(name, uint16, complex64) \
    CUDA_DEVICE_NOIMPL_DECL(name, uint16, complex128) \
    /* uint32 */ \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint32, uint32) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint32, uint64) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint32, int64) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint32, float64) \
    CUDA_DEVICE_NOIMPL_DECL(name, uint32, complex128) \
    /* uint64 */ \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, uint64, uint64) \
    /* int8 */ \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, int8, int8) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, int8, int16) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, int8, int32) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, int8, int64) \
    CUDA_DEVICE_NOIMPL_DECL(name, int8, bfloat16) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, int8, float16) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, int8, float32) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, int8, float64) \
    CUDA_DEVICE_NOIMPL_DECL(name, int8, complex32) \
    CUDA_DEVICE_NOIMPL_DECL(name, int8, complex64) \
    CUDA_DEVICE_NOIMPL_DECL(name, int8, complex128) \
    /* int16 */ \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, int16, int16) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, int16, int32) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, int16, int64) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, int16, float32) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, int16, float64) \
    CUDA_DEVICE_NOIMPL_DECL(name, int16, complex64) \
    CUDA_DEVICE_NOIMPL_DECL(name, int16, complex128) \
    /* int32 */ \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, int32, int32) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, int32, int64) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, int32, float64) \
    CUDA_DEVICE_NOIMPL_DECL(name, int32, complex128) \
    /* int64 */ \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, int64, int64) \
    /* bfloat16 */ \
    CUDA_DEVICE_NOIMPL_DECL(name, bfloat16, bfloat16) \
    CUDA_DEVICE_NOIMPL_DECL(name, bfloat16, float32) \
    CUDA_DEVICE_NOIMPL_DECL(name, bfloat16, float64) \
    CUDA_DEVICE_NOIMPL_DECL(name, bfloat16, complex64) \
    CUDA_DEVICE_NOIMPL_DECL(name, bfloat16, complex128) \
    /* float16 */ \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, float16, float16) \
    CUDA_DEVICE_NOIMPL_DECL(name, float16, float32) \
    CUDA_DEVICE_NOIMPL_DECL(name, float16, float64) \
    CUDA_DEVICE_NOIMPL_DECL(name, float16, complex32) \
    CUDA_DEVICE_NOIMPL_DECL(name, float16, complex64) \
    CUDA_DEVICE_NOIMPL_DECL(name, float16, complex128) \
    /* float32 */ \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, float32, float32) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, float32, float64) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, float32, complex64) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, float32, complex128) \
    /* float64 */ \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, float64, float64) \
    CUDA_DEVICE_UNARY_REDUCE_DECL(name, float64, complex128) \
    /* complex32 */ \
    CUDA_DEVICE_NOIMPL_DECL(name, complex32, complex32) \
    CUDA_DEVICE_NOIMPL_DECL(name, complex32, complex64) \
    CUDA_DEVICE_NOIMPL_DECL(name, complex32, complex128) \
    /* complex64 */ \
    CUDA_DEVICE_NOIMPL_DECL(name, complex64, complex64) \
    CUDA_DEVICE_NOIMPL_DECL(name, complex64, complex128) \
    /* complex128 */ \
    CUDA_DEVICE_NOIMPL_DECL(name, complex128, complex128)
304
+
305
+
306
+ CUDA_DEVICE_ALL_UNARY_REDUCE_DECL(add)
307
+ CUDA_DEVICE_ALL_UNARY_REDUCE_DECL(multiply)
308
+
309
+
310
+ /*****************************************************************************/
311
+ /* Bitwise NOT */
312
+ /*****************************************************************************/
313
+
314
+ CUDA_DEVICE_UNARY_DECL(invert, bool, bool)
315
+
316
+ CUDA_DEVICE_UNARY_DECL(invert, uint8, uint8)
317
+ CUDA_DEVICE_UNARY_DECL(invert, uint16, uint16)
318
+ CUDA_DEVICE_UNARY_DECL(invert, uint32, uint32)
319
+ CUDA_DEVICE_UNARY_DECL(invert, uint64, uint64)
320
+
321
+ CUDA_DEVICE_UNARY_DECL(invert, int8, int8)
322
+ CUDA_DEVICE_UNARY_DECL(invert, int16, int16)
323
+ CUDA_DEVICE_UNARY_DECL(invert, int32, int32)
324
+ CUDA_DEVICE_UNARY_DECL(invert, int64, int64)
325
+
326
+
327
+ /*****************************************************************************/
328
+ /* Negative */
329
+ /*****************************************************************************/
330
+
331
+ CUDA_DEVICE_UNARY_DECL(negative, uint8, int16)
332
+ CUDA_DEVICE_UNARY_DECL(negative, uint16, int32)
333
+ CUDA_DEVICE_UNARY_DECL(negative, uint32, int64)
334
+
335
+ CUDA_DEVICE_UNARY_DECL(negative, int8, int8)
336
+ CUDA_DEVICE_UNARY_DECL(negative, int16, int16)
337
+ CUDA_DEVICE_UNARY_DECL(negative, int32, int32)
338
+ CUDA_DEVICE_UNARY_DECL(negative, int64, int64)
339
+
340
+ CUDA_DEVICE_UNARY_DECL(negative, bfloat16, bfloat16)
341
+ CUDA_DEVICE_UNARY_DECL(negative, float16, float16)
342
+ CUDA_DEVICE_UNARY_DECL(negative, float32, float32)
343
+ CUDA_DEVICE_UNARY_DECL(negative, float64, float64)
344
+
345
+ CUDA_DEVICE_NOIMPL_DECL(negative, complex32, complex32)
346
+ CUDA_DEVICE_UNARY_DECL(negative, complex64, complex64)
347
+ CUDA_DEVICE_UNARY_DECL(negative, complex128, complex128)
348
+
349
+
350
+ /*****************************************************************************/
351
+ /* Math */
352
+ /*****************************************************************************/
353
+
354
/*
 * Real-valued math kernels.  The kernel name carries a suffix chosen by the
 * result type: <name>f for float32 results, <name>b16 for bfloat16 results,
 * and the bare <name> for float64 results.
 */
#define CUDA_DEVICE_UNARY_ALL_REAL_MATH_DECL(name) \
    CUDA_DEVICE_UNARY_DECL(name##f, uint16, float32) \
    CUDA_DEVICE_UNARY_DECL(name##f, int16, float32) \
    CUDA_DEVICE_UNARY_DECL(name##b16, bfloat16, bfloat16) \
    CUDA_DEVICE_UNARY_DECL(name##f, float32, float32) \
    CUDA_DEVICE_UNARY_DECL(name, uint32, float64) \
    CUDA_DEVICE_UNARY_DECL(name, int32, float64) \
    CUDA_DEVICE_UNARY_DECL(name, float64, float64)

/*
 * The real-valued kernels plus the complex ones.  complex32 is listed as
 * NOIMPL, matching every other complex32 row in this header; the header
 * defines complex64_t and complex128_t but no complex32_t.
 */
#define CUDA_DEVICE_UNARY_ALL_COMPLEX_MATH_DECL(name) \
    CUDA_DEVICE_UNARY_ALL_REAL_MATH_DECL(name) \
    CUDA_DEVICE_NOIMPL_DECL(name, complex32, complex32) \
    CUDA_DEVICE_UNARY_DECL(name, complex64, complex64) \
    CUDA_DEVICE_UNARY_DECL(name, complex128, complex128)

/* float16 kernels, named <name>f16, for uint8, int8 and float16 input. */
#define CUDA_DEVICE_UNARY_ALL_HALF_MATH_DECL(name) \
    CUDA_DEVICE_UNARY_DECL(name##f16, uint8, float16) \
    CUDA_DEVICE_UNARY_DECL(name##f16, int8, float16) \
    CUDA_DEVICE_UNARY_DECL(name##f16, float16, float16)

/*
 * Half kernels followed by the real-valued kernels.  The last line carries
 * no continuation backslash, so the macro ends here regardless of what
 * follows it in the file.
 */
#define CUDA_DEVICE_UNARY_ALL_REAL_MATH_WITH_HALF_DECL(name) \
    CUDA_DEVICE_UNARY_ALL_HALF_MATH_DECL(name) \
    CUDA_DEVICE_UNARY_ALL_REAL_MATH_DECL(name)

/* Half kernels followed by the real-valued and complex kernels. */
#define CUDA_DEVICE_UNARY_ALL_COMPLEX_MATH_WITH_HALF_DECL(name) \
    CUDA_DEVICE_UNARY_ALL_HALF_MATH_DECL(name) \
    CUDA_DEVICE_UNARY_ALL_COMPLEX_MATH_DECL(name)
381
+
382
+
383
+ /*****************************************************************************/
384
+ /* Abs functions */
385
+ /*****************************************************************************/
386
+
387
+ CUDA_DEVICE_UNARY_ALL_REAL_MATH_WITH_HALF_DECL(fabs)
388
+
389
+
390
+ /*****************************************************************************/
391
+ /* Exponential functions */
392
+ /*****************************************************************************/
393
+
394
+ CUDA_DEVICE_UNARY_ALL_COMPLEX_MATH_WITH_HALF_DECL(exp)
395
+ CUDA_DEVICE_UNARY_ALL_REAL_MATH_WITH_HALF_DECL(exp2)
396
+ CUDA_DEVICE_UNARY_ALL_REAL_MATH_DECL(expm1)
397
+
398
+
399
+ /*****************************************************************************/
400
+ /* Logarithm functions */
401
+ /*****************************************************************************/
402
+
403
+ CUDA_DEVICE_UNARY_ALL_COMPLEX_MATH_WITH_HALF_DECL(log)
404
+ CUDA_DEVICE_UNARY_ALL_COMPLEX_MATH_WITH_HALF_DECL(log10)
405
+ CUDA_DEVICE_UNARY_ALL_REAL_MATH_WITH_HALF_DECL(log2)
406
+ CUDA_DEVICE_UNARY_ALL_REAL_MATH_DECL(log1p)
407
+ CUDA_DEVICE_UNARY_ALL_REAL_MATH_DECL(logb)
408
+
409
+
410
+ /*****************************************************************************/
411
+ /* Power functions */
412
+ /*****************************************************************************/
413
+
414
+ CUDA_DEVICE_UNARY_ALL_COMPLEX_MATH_WITH_HALF_DECL(sqrt)
415
+ CUDA_DEVICE_UNARY_ALL_REAL_MATH_DECL(cbrt)
416
+
417
+
418
+ /*****************************************************************************/
419
+ /* Trigonometric functions */
420
+ /*****************************************************************************/
421
+
422
+ CUDA_DEVICE_UNARY_ALL_COMPLEX_MATH_WITH_HALF_DECL(sin)
423
+ CUDA_DEVICE_UNARY_ALL_COMPLEX_MATH_WITH_HALF_DECL(cos)
424
+ CUDA_DEVICE_UNARY_ALL_COMPLEX_MATH_DECL(tan)
425
+ CUDA_DEVICE_UNARY_ALL_COMPLEX_MATH_DECL(asin)
426
+ CUDA_DEVICE_UNARY_ALL_COMPLEX_MATH_DECL(acos)
427
+ CUDA_DEVICE_UNARY_ALL_COMPLEX_MATH_DECL(atan)
428
+
429
+
430
+ /*****************************************************************************/
431
+ /* Hyperbolic functions */
432
+ /*****************************************************************************/
433
+
434
+ CUDA_DEVICE_UNARY_ALL_COMPLEX_MATH_DECL(sinh)
435
+ CUDA_DEVICE_UNARY_ALL_COMPLEX_MATH_DECL(cosh)
436
+ CUDA_DEVICE_UNARY_ALL_COMPLEX_MATH_DECL(tanh)
437
+ CUDA_DEVICE_UNARY_ALL_COMPLEX_MATH_DECL(asinh)
438
+ CUDA_DEVICE_UNARY_ALL_COMPLEX_MATH_DECL(acosh)
439
+ CUDA_DEVICE_UNARY_ALL_COMPLEX_MATH_DECL(atanh)
440
+
441
+
442
+ /*****************************************************************************/
443
+ /* Error and gamma functions */
444
+ /*****************************************************************************/
445
+
446
+ CUDA_DEVICE_UNARY_ALL_REAL_MATH_DECL(erf)
447
+ CUDA_DEVICE_UNARY_ALL_REAL_MATH_DECL(erfc)
448
+ CUDA_DEVICE_UNARY_ALL_REAL_MATH_DECL(lgamma)
449
+ CUDA_DEVICE_UNARY_ALL_REAL_MATH_DECL(tgamma)
450
+
451
+
452
+ /*****************************************************************************/
453
+ /* Ceiling, floor, trunc, round, nearbyint */
454
+ /*****************************************************************************/
455
+
456
+ CUDA_DEVICE_UNARY_ALL_REAL_MATH_DECL(ceil)
457
+ CUDA_DEVICE_UNARY_ALL_REAL_MATH_DECL(floor)
458
+ CUDA_DEVICE_UNARY_ALL_REAL_MATH_DECL(trunc)
459
+ CUDA_DEVICE_UNARY_ALL_REAL_MATH_DECL(round)
460
+ CUDA_DEVICE_UNARY_ALL_REAL_MATH_DECL(nearbyint)
461
+
462
+
463
+ #endif /* CUDA_DEVICE_UNARY_H */