triton-windows 3.3.1.post19__cp312-cp312-win_amd64.whl → 3.3.1.post21__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of triton-windows might be problematic; see the registry's advisory page for details.

Files changed (108)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/backends/amd/driver.py +6 -1
  3. triton/backends/nvidia/compiler.py +1 -3
  4. triton/backends/nvidia/driver.py +7 -3
  5. triton/runtime/autotuner.py +2 -2
  6. triton/runtime/build.py +5 -5
  7. triton/windows_utils.py +11 -4
  8. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.3.1.post21.dist-info}/METADATA +1 -1
  9. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.3.1.post21.dist-info}/RECORD +11 -108
  10. triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h +0 -358
  11. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +0 -1010
  12. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +0 -1638
  13. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +0 -1814
  14. triton/backends/amd/include/hip/amd_detail/amd_hip_bfloat16.h +0 -293
  15. triton/backends/amd/include/hip/amd_detail/amd_hip_common.h +0 -32
  16. triton/backends/amd/include/hip/amd_detail/amd_hip_complex.h +0 -174
  17. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +0 -835
  18. triton/backends/amd/include/hip/amd_detail/amd_hip_fp16.h +0 -1809
  19. triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +0 -1391
  20. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +0 -108
  21. triton/backends/amd/include/hip/amd_detail/amd_hip_math_constants.h +0 -124
  22. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h +0 -405
  23. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +0 -196
  24. triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +0 -565
  25. triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h +0 -2226
  26. triton/backends/amd/include/hip/amd_detail/amd_math_functions.h +0 -104
  27. triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h +0 -244
  28. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +0 -538
  29. triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +0 -288
  30. triton/backends/amd/include/hip/amd_detail/concepts.hpp +0 -30
  31. triton/backends/amd/include/hip/amd_detail/device_library_decls.h +0 -133
  32. triton/backends/amd/include/hip/amd_detail/functional_grid_launch.hpp +0 -218
  33. triton/backends/amd/include/hip/amd_detail/grid_launch.h +0 -67
  34. triton/backends/amd/include/hip/amd_detail/grid_launch.hpp +0 -50
  35. triton/backends/amd/include/hip/amd_detail/grid_launch_GGL.hpp +0 -26
  36. triton/backends/amd/include/hip/amd_detail/helpers.hpp +0 -137
  37. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +0 -1446
  38. triton/backends/amd/include/hip/amd_detail/hip_assert.h +0 -101
  39. triton/backends/amd/include/hip/amd_detail/hip_cooperative_groups_helper.h +0 -242
  40. triton/backends/amd/include/hip/amd_detail/hip_fp16_gcc.h +0 -254
  41. triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h +0 -96
  42. triton/backends/amd/include/hip/amd_detail/hip_ldg.h +0 -100
  43. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +0 -10570
  44. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +0 -78
  45. triton/backends/amd/include/hip/amd_detail/host_defines.h +0 -184
  46. triton/backends/amd/include/hip/amd_detail/hsa_helpers.hpp +0 -102
  47. triton/backends/amd/include/hip/amd_detail/macro_based_grid_launch.hpp +0 -798
  48. triton/backends/amd/include/hip/amd_detail/math_fwd.h +0 -698
  49. triton/backends/amd/include/hip/amd_detail/ockl_image.h +0 -177
  50. triton/backends/amd/include/hip/amd_detail/program_state.hpp +0 -107
  51. triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h +0 -491
  52. triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h +0 -478
  53. triton/backends/amd/include/hip/channel_descriptor.h +0 -39
  54. triton/backends/amd/include/hip/device_functions.h +0 -38
  55. triton/backends/amd/include/hip/driver_types.h +0 -468
  56. triton/backends/amd/include/hip/hip_bf16.h +0 -36
  57. triton/backends/amd/include/hip/hip_bfloat16.h +0 -44
  58. triton/backends/amd/include/hip/hip_common.h +0 -100
  59. triton/backends/amd/include/hip/hip_complex.h +0 -38
  60. triton/backends/amd/include/hip/hip_cooperative_groups.h +0 -46
  61. triton/backends/amd/include/hip/hip_deprecated.h +0 -95
  62. triton/backends/amd/include/hip/hip_ext.h +0 -161
  63. triton/backends/amd/include/hip/hip_fp16.h +0 -36
  64. triton/backends/amd/include/hip/hip_fp8.h +0 -33
  65. triton/backends/amd/include/hip/hip_gl_interop.h +0 -32
  66. triton/backends/amd/include/hip/hip_hcc.h +0 -24
  67. triton/backends/amd/include/hip/hip_math_constants.h +0 -36
  68. triton/backends/amd/include/hip/hip_profile.h +0 -27
  69. triton/backends/amd/include/hip/hip_runtime.h +0 -75
  70. triton/backends/amd/include/hip/hip_runtime_api.h +0 -9261
  71. triton/backends/amd/include/hip/hip_texture_types.h +0 -29
  72. triton/backends/amd/include/hip/hip_vector_types.h +0 -41
  73. triton/backends/amd/include/hip/hip_version.h +0 -17
  74. triton/backends/amd/include/hip/hiprtc.h +0 -421
  75. triton/backends/amd/include/hip/library_types.h +0 -78
  76. triton/backends/amd/include/hip/math_functions.h +0 -42
  77. triton/backends/amd/include/hip/surface_types.h +0 -63
  78. triton/backends/amd/include/hip/texture_types.h +0 -194
  79. triton/backends/amd/include/hsa/Brig.h +0 -1131
  80. triton/backends/amd/include/hsa/amd_hsa_common.h +0 -91
  81. triton/backends/amd/include/hsa/amd_hsa_elf.h +0 -462
  82. triton/backends/amd/include/hsa/amd_hsa_kernel_code.h +0 -269
  83. triton/backends/amd/include/hsa/amd_hsa_queue.h +0 -109
  84. triton/backends/amd/include/hsa/amd_hsa_signal.h +0 -80
  85. triton/backends/amd/include/hsa/hsa.h +0 -5738
  86. triton/backends/amd/include/hsa/hsa_amd_tool.h +0 -91
  87. triton/backends/amd/include/hsa/hsa_api_trace.h +0 -579
  88. triton/backends/amd/include/hsa/hsa_api_trace_version.h +0 -68
  89. triton/backends/amd/include/hsa/hsa_ext_amd.h +0 -3146
  90. triton/backends/amd/include/hsa/hsa_ext_finalize.h +0 -531
  91. triton/backends/amd/include/hsa/hsa_ext_image.h +0 -1454
  92. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +0 -488
  93. triton/backends/amd/include/hsa/hsa_ven_amd_loader.h +0 -667
  94. triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +0 -416
  95. triton/backends/amd/include/roctracer/ext/prof_protocol.h +0 -107
  96. triton/backends/amd/include/roctracer/hip_ostream_ops.h +0 -4515
  97. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +0 -1727
  98. triton/backends/amd/include/roctracer/hsa_prof_str.h +0 -3059
  99. triton/backends/amd/include/roctracer/roctracer.h +0 -779
  100. triton/backends/amd/include/roctracer/roctracer_ext.h +0 -81
  101. triton/backends/amd/include/roctracer/roctracer_hcc.h +0 -24
  102. triton/backends/amd/include/roctracer/roctracer_hip.h +0 -37
  103. triton/backends/amd/include/roctracer/roctracer_hsa.h +0 -112
  104. triton/backends/amd/include/roctracer/roctracer_plugin.h +0 -137
  105. triton/backends/amd/include/roctracer/roctracer_roctx.h +0 -67
  106. triton/backends/amd/include/roctracer/roctx.h +0 -229
  107. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.3.1.post21.dist-info}/WHEEL +0 -0
  108. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.3.1.post21.dist-info}/top_level.txt +0 -0
@@ -1,538 +0,0 @@
1
- /*
2
- Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
-
4
- Permission is hereby granted, free of charge, to any person obtaining a copy
5
- of this software and associated documentation files (the "Software"), to deal
6
- in the Software without restriction, including without limitation the rights
7
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
- copies of the Software, and to permit persons to whom the Software is
9
- furnished to do so, subject to the following conditions:
10
-
11
- The above copyright notice and this permission notice shall be included in
12
- all copies or substantial portions of the Software.
13
-
14
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20
- THE SOFTWARE.
21
- */
22
-
23
- #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
24
- #define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
25
-
26
- __device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
27
- union { int i; unsigned u; float f; } tmp; tmp.u = src;
28
- tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
29
- return tmp.u;
30
- }
31
-
32
- __device__ static inline float __hip_ds_bpermutef(int index, float src) {
33
- union { int i; unsigned u; float f; } tmp; tmp.f = src;
34
- tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
35
- return tmp.f;
36
- }
37
-
38
- __device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
39
- union { int i; unsigned u; float f; } tmp; tmp.u = src;
40
- tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
41
- return tmp.u;
42
- }
43
-
44
- __device__ static inline float __hip_ds_permutef(int index, float src) {
45
- union { int i; unsigned u; float f; } tmp; tmp.f = src;
46
- tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
47
- return tmp.f;
48
- }
49
-
50
- #define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
51
- #define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
52
-
53
- template <int pattern>
54
- __device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
55
- union { int i; unsigned u; float f; } tmp; tmp.u = src;
56
- tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
57
- return tmp.u;
58
- }
59
-
60
- template <int pattern>
61
- __device__ static inline float __hip_ds_swizzlef_N(float src) {
62
- union { int i; unsigned u; float f; } tmp; tmp.f = src;
63
- tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
64
- return tmp.f;
65
- }
66
-
67
- #define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
68
- __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
69
-
70
- template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
71
- __device__ static inline int __hip_move_dpp_N(int src) {
72
- return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
73
- bound_ctrl);
74
- }
75
-
76
- static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
77
-
78
- // warp vote function __all __any __ballot
79
- __device__
80
- inline
81
- int __all(int predicate) {
82
- return __ockl_wfall_i32(predicate);
83
- }
84
-
85
- __device__
86
- inline
87
- int __any(int predicate) {
88
- return __ockl_wfany_i32(predicate);
89
- }
90
-
91
- // XXX from llvm/include/llvm/IR/InstrTypes.h
92
- #define ICMP_NE 33
93
-
94
- __device__
95
- inline
96
- unsigned long long int __ballot(int predicate) {
97
- return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
98
- }
99
-
100
- __device__
101
- inline
102
- unsigned long long int __ballot64(int predicate) {
103
- return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
104
- }
105
-
106
- // See amd_warp_sync_functions.h for an explanation of this preprocessor flag.
107
- #ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
108
- // Since threads in a wave do not make independent progress, __activemask()
109
- // always returns the exact active mask, i.e, all active threads in the wave.
110
- __device__
111
- inline
112
- unsigned long long __activemask() {
113
- return __ballot(true);
114
- }
115
- #endif // HIP_ENABLE_WARP_SYNC_BUILTINS
116
-
117
- __device__ static inline unsigned int __lane_id() {
118
- return __builtin_amdgcn_mbcnt_hi(
119
- -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
120
- }
121
-
122
- __device__
123
- inline
124
- int __shfl(int var, int src_lane, int width = warpSize) {
125
- int self = __lane_id();
126
- int index = (src_lane & (width - 1)) + (self & ~(width-1));
127
- return __builtin_amdgcn_ds_bpermute(index<<2, var);
128
- }
129
- __device__
130
- inline
131
- unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
132
- union { int i; unsigned u; float f; } tmp; tmp.u = var;
133
- tmp.i = __shfl(tmp.i, src_lane, width);
134
- return tmp.u;
135
- }
136
- __device__
137
- inline
138
- float __shfl(float var, int src_lane, int width = warpSize) {
139
- union { int i; unsigned u; float f; } tmp; tmp.f = var;
140
- tmp.i = __shfl(tmp.i, src_lane, width);
141
- return tmp.f;
142
- }
143
- __device__
144
- inline
145
- double __shfl(double var, int src_lane, int width = warpSize) {
146
- static_assert(sizeof(double) == 2 * sizeof(int), "");
147
- static_assert(sizeof(double) == sizeof(uint64_t), "");
148
-
149
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
150
- tmp[0] = __shfl(tmp[0], src_lane, width);
151
- tmp[1] = __shfl(tmp[1], src_lane, width);
152
-
153
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
154
- double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
155
- return tmp1;
156
- }
157
- __device__
158
- inline
159
- long __shfl(long var, int src_lane, int width = warpSize)
160
- {
161
- #ifndef _MSC_VER
162
- static_assert(sizeof(long) == 2 * sizeof(int), "");
163
- static_assert(sizeof(long) == sizeof(uint64_t), "");
164
-
165
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
166
- tmp[0] = __shfl(tmp[0], src_lane, width);
167
- tmp[1] = __shfl(tmp[1], src_lane, width);
168
-
169
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
170
- long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
171
- return tmp1;
172
- #else
173
- static_assert(sizeof(long) == sizeof(int), "");
174
- return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
175
- #endif
176
- }
177
- __device__
178
- inline
179
- unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
180
- #ifndef _MSC_VER
181
- static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
182
- static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
183
-
184
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
185
- tmp[0] = __shfl(tmp[0], src_lane, width);
186
- tmp[1] = __shfl(tmp[1], src_lane, width);
187
-
188
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
189
- unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
190
- return tmp1;
191
- #else
192
- static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
193
- return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
194
- #endif
195
- }
196
- __device__
197
- inline
198
- long long __shfl(long long var, int src_lane, int width = warpSize)
199
- {
200
- static_assert(sizeof(long long) == 2 * sizeof(int), "");
201
- static_assert(sizeof(long long) == sizeof(uint64_t), "");
202
-
203
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
204
- tmp[0] = __shfl(tmp[0], src_lane, width);
205
- tmp[1] = __shfl(tmp[1], src_lane, width);
206
-
207
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
208
- long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
209
- return tmp1;
210
- }
211
- __device__
212
- inline
213
- unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
214
- static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
215
- static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
216
-
217
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
218
- tmp[0] = __shfl(tmp[0], src_lane, width);
219
- tmp[1] = __shfl(tmp[1], src_lane, width);
220
-
221
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
222
- unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
223
- return tmp1;
224
- }
225
-
226
- __device__
227
- inline
228
- int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
229
- int self = __lane_id();
230
- int index = self - lane_delta;
231
- index = (index < (self & ~(width-1)))?self:index;
232
- return __builtin_amdgcn_ds_bpermute(index<<2, var);
233
- }
234
- __device__
235
- inline
236
- unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
237
- union { int i; unsigned u; float f; } tmp; tmp.u = var;
238
- tmp.i = __shfl_up(tmp.i, lane_delta, width);
239
- return tmp.u;
240
- }
241
- __device__
242
- inline
243
- float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
244
- union { int i; unsigned u; float f; } tmp; tmp.f = var;
245
- tmp.i = __shfl_up(tmp.i, lane_delta, width);
246
- return tmp.f;
247
- }
248
- __device__
249
- inline
250
- double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
251
- static_assert(sizeof(double) == 2 * sizeof(int), "");
252
- static_assert(sizeof(double) == sizeof(uint64_t), "");
253
-
254
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
255
- tmp[0] = __shfl_up(tmp[0], lane_delta, width);
256
- tmp[1] = __shfl_up(tmp[1], lane_delta, width);
257
-
258
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
259
- double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
260
- return tmp1;
261
- }
262
- __device__
263
- inline
264
- long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
265
- {
266
- #ifndef _MSC_VER
267
- static_assert(sizeof(long) == 2 * sizeof(int), "");
268
- static_assert(sizeof(long) == sizeof(uint64_t), "");
269
-
270
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
271
- tmp[0] = __shfl_up(tmp[0], lane_delta, width);
272
- tmp[1] = __shfl_up(tmp[1], lane_delta, width);
273
-
274
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
275
- long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
276
- return tmp1;
277
- #else
278
- static_assert(sizeof(long) == sizeof(int), "");
279
- return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
280
- #endif
281
- }
282
-
283
- __device__
284
- inline
285
- unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
286
- {
287
- #ifndef _MSC_VER
288
- static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
289
- static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
290
-
291
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
292
- tmp[0] = __shfl_up(tmp[0], lane_delta, width);
293
- tmp[1] = __shfl_up(tmp[1], lane_delta, width);
294
-
295
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
296
- unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
297
- return tmp1;
298
- #else
299
- static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
300
- return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
301
- #endif
302
- }
303
-
304
- __device__
305
- inline
306
- long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
307
- {
308
- static_assert(sizeof(long long) == 2 * sizeof(int), "");
309
- static_assert(sizeof(long long) == sizeof(uint64_t), "");
310
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
311
- tmp[0] = __shfl_up(tmp[0], lane_delta, width);
312
- tmp[1] = __shfl_up(tmp[1], lane_delta, width);
313
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
314
- long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
315
- return tmp1;
316
- }
317
-
318
- __device__
319
- inline
320
- unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
321
- {
322
- static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
323
- static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
324
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
325
- tmp[0] = __shfl_up(tmp[0], lane_delta, width);
326
- tmp[1] = __shfl_up(tmp[1], lane_delta, width);
327
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
328
- unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
329
- return tmp1;
330
- }
331
-
332
- __device__
333
- inline
334
- int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
335
- int self = __lane_id();
336
- int index = self + lane_delta;
337
- index = (int)((self&(width-1))+lane_delta) >= width?self:index;
338
- return __builtin_amdgcn_ds_bpermute(index<<2, var);
339
- }
340
- __device__
341
- inline
342
- unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
343
- union { int i; unsigned u; float f; } tmp; tmp.u = var;
344
- tmp.i = __shfl_down(tmp.i, lane_delta, width);
345
- return tmp.u;
346
- }
347
- __device__
348
- inline
349
- float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
350
- union { int i; unsigned u; float f; } tmp; tmp.f = var;
351
- tmp.i = __shfl_down(tmp.i, lane_delta, width);
352
- return tmp.f;
353
- }
354
- __device__
355
- inline
356
- double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
357
- static_assert(sizeof(double) == 2 * sizeof(int), "");
358
- static_assert(sizeof(double) == sizeof(uint64_t), "");
359
-
360
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
361
- tmp[0] = __shfl_down(tmp[0], lane_delta, width);
362
- tmp[1] = __shfl_down(tmp[1], lane_delta, width);
363
-
364
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
365
- double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
366
- return tmp1;
367
- }
368
- __device__
369
- inline
370
- long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
371
- {
372
- #ifndef _MSC_VER
373
- static_assert(sizeof(long) == 2 * sizeof(int), "");
374
- static_assert(sizeof(long) == sizeof(uint64_t), "");
375
-
376
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
377
- tmp[0] = __shfl_down(tmp[0], lane_delta, width);
378
- tmp[1] = __shfl_down(tmp[1], lane_delta, width);
379
-
380
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
381
- long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
382
- return tmp1;
383
- #else
384
- static_assert(sizeof(long) == sizeof(int), "");
385
- return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
386
- #endif
387
- }
388
- __device__
389
- inline
390
- unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
391
- {
392
- #ifndef _MSC_VER
393
- static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
394
- static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
395
-
396
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
397
- tmp[0] = __shfl_down(tmp[0], lane_delta, width);
398
- tmp[1] = __shfl_down(tmp[1], lane_delta, width);
399
-
400
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
401
- unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
402
- return tmp1;
403
- #else
404
- static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
405
- return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
406
- #endif
407
- }
408
- __device__
409
- inline
410
- long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
411
- {
412
- static_assert(sizeof(long long) == 2 * sizeof(int), "");
413
- static_assert(sizeof(long long) == sizeof(uint64_t), "");
414
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
415
- tmp[0] = __shfl_down(tmp[0], lane_delta, width);
416
- tmp[1] = __shfl_down(tmp[1], lane_delta, width);
417
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
418
- long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
419
- return tmp1;
420
- }
421
- __device__
422
- inline
423
- unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
424
- {
425
- static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
426
- static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
427
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
428
- tmp[0] = __shfl_down(tmp[0], lane_delta, width);
429
- tmp[1] = __shfl_down(tmp[1], lane_delta, width);
430
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
431
- unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
432
- return tmp1;
433
- }
434
-
435
- __device__
436
- inline
437
- int __shfl_xor(int var, int lane_mask, int width = warpSize) {
438
- int self = __lane_id();
439
- int index = self^lane_mask;
440
- index = index >= ((self+width)&~(width-1))?self:index;
441
- return __builtin_amdgcn_ds_bpermute(index<<2, var);
442
- }
443
- __device__
444
- inline
445
- unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
446
- union { int i; unsigned u; float f; } tmp; tmp.u = var;
447
- tmp.i = __shfl_xor(tmp.i, lane_mask, width);
448
- return tmp.u;
449
- }
450
- __device__
451
- inline
452
- float __shfl_xor(float var, int lane_mask, int width = warpSize) {
453
- union { int i; unsigned u; float f; } tmp; tmp.f = var;
454
- tmp.i = __shfl_xor(tmp.i, lane_mask, width);
455
- return tmp.f;
456
- }
457
- __device__
458
- inline
459
- double __shfl_xor(double var, int lane_mask, int width = warpSize) {
460
- static_assert(sizeof(double) == 2 * sizeof(int), "");
461
- static_assert(sizeof(double) == sizeof(uint64_t), "");
462
-
463
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
464
- tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
465
- tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
466
-
467
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
468
- double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
469
- return tmp1;
470
- }
471
- __device__
472
- inline
473
- long __shfl_xor(long var, int lane_mask, int width = warpSize)
474
- {
475
- #ifndef _MSC_VER
476
- static_assert(sizeof(long) == 2 * sizeof(int), "");
477
- static_assert(sizeof(long) == sizeof(uint64_t), "");
478
-
479
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
480
- tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
481
- tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
482
-
483
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
484
- long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
485
- return tmp1;
486
- #else
487
- static_assert(sizeof(long) == sizeof(int), "");
488
- return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
489
- #endif
490
- }
491
- __device__
492
- inline
493
- unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
494
- {
495
- #ifndef _MSC_VER
496
- static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
497
- static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
498
-
499
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
500
- tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
501
- tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
502
-
503
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
504
- unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
505
- return tmp1;
506
- #else
507
- static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
508
- return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
509
- #endif
510
- }
511
- __device__
512
- inline
513
- long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
514
- {
515
- static_assert(sizeof(long long) == 2 * sizeof(int), "");
516
- static_assert(sizeof(long long) == sizeof(uint64_t), "");
517
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
518
- tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
519
- tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
520
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
521
- long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
522
- return tmp1;
523
- }
524
- __device__
525
- inline
526
- unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
527
- {
528
- static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
529
- static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
530
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
531
- tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
532
- tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
533
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
534
- unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
535
- return tmp1;
536
- }
537
-
538
- #endif