triton-windows 3.3.0.post19__cp310-cp310-win_amd64.whl → 3.3.1.post21__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of triton-windows might be problematic. Click here for more details.

Files changed (116) hide show
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +1 -1
  3. triton/backends/amd/driver.py +6 -1
  4. triton/backends/nvidia/compiler.py +1 -3
  5. triton/backends/nvidia/driver.py +7 -3
  6. triton/runtime/autotuner.py +2 -2
  7. triton/runtime/build.py +5 -5
  8. triton/runtime/tcc/lib/python310.def +1610 -0
  9. triton/runtime/tcc/lib/python311.def +1633 -0
  10. triton/runtime/tcc/lib/python312.def +1703 -0
  11. triton/runtime/tcc/lib/python313.def +1651 -0
  12. triton/runtime/tcc/lib/python313t.def +1656 -0
  13. triton/runtime/tcc/lib/python39.def +1644 -0
  14. triton/runtime/tcc/lib/python3t.def +905 -0
  15. triton/windows_utils.py +11 -4
  16. {triton_windows-3.3.0.post19.dist-info → triton_windows-3.3.1.post21.dist-info}/METADATA +1 -1
  17. {triton_windows-3.3.0.post19.dist-info → triton_windows-3.3.1.post21.dist-info}/RECORD +19 -109
  18. {triton_windows-3.3.0.post19.dist-info → triton_windows-3.3.1.post21.dist-info}/WHEEL +1 -1
  19. triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h +0 -358
  20. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +0 -1010
  21. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +0 -1638
  22. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +0 -1814
  23. triton/backends/amd/include/hip/amd_detail/amd_hip_bfloat16.h +0 -293
  24. triton/backends/amd/include/hip/amd_detail/amd_hip_common.h +0 -32
  25. triton/backends/amd/include/hip/amd_detail/amd_hip_complex.h +0 -174
  26. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +0 -835
  27. triton/backends/amd/include/hip/amd_detail/amd_hip_fp16.h +0 -1809
  28. triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +0 -1391
  29. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +0 -108
  30. triton/backends/amd/include/hip/amd_detail/amd_hip_math_constants.h +0 -124
  31. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h +0 -405
  32. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +0 -196
  33. triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +0 -565
  34. triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h +0 -2226
  35. triton/backends/amd/include/hip/amd_detail/amd_math_functions.h +0 -104
  36. triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h +0 -244
  37. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +0 -538
  38. triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +0 -288
  39. triton/backends/amd/include/hip/amd_detail/concepts.hpp +0 -30
  40. triton/backends/amd/include/hip/amd_detail/device_library_decls.h +0 -133
  41. triton/backends/amd/include/hip/amd_detail/functional_grid_launch.hpp +0 -218
  42. triton/backends/amd/include/hip/amd_detail/grid_launch.h +0 -67
  43. triton/backends/amd/include/hip/amd_detail/grid_launch.hpp +0 -50
  44. triton/backends/amd/include/hip/amd_detail/grid_launch_GGL.hpp +0 -26
  45. triton/backends/amd/include/hip/amd_detail/helpers.hpp +0 -137
  46. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +0 -1446
  47. triton/backends/amd/include/hip/amd_detail/hip_assert.h +0 -101
  48. triton/backends/amd/include/hip/amd_detail/hip_cooperative_groups_helper.h +0 -242
  49. triton/backends/amd/include/hip/amd_detail/hip_fp16_gcc.h +0 -254
  50. triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h +0 -96
  51. triton/backends/amd/include/hip/amd_detail/hip_ldg.h +0 -100
  52. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +0 -10570
  53. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +0 -78
  54. triton/backends/amd/include/hip/amd_detail/host_defines.h +0 -184
  55. triton/backends/amd/include/hip/amd_detail/hsa_helpers.hpp +0 -102
  56. triton/backends/amd/include/hip/amd_detail/macro_based_grid_launch.hpp +0 -798
  57. triton/backends/amd/include/hip/amd_detail/math_fwd.h +0 -698
  58. triton/backends/amd/include/hip/amd_detail/ockl_image.h +0 -177
  59. triton/backends/amd/include/hip/amd_detail/program_state.hpp +0 -107
  60. triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h +0 -491
  61. triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h +0 -478
  62. triton/backends/amd/include/hip/channel_descriptor.h +0 -39
  63. triton/backends/amd/include/hip/device_functions.h +0 -38
  64. triton/backends/amd/include/hip/driver_types.h +0 -468
  65. triton/backends/amd/include/hip/hip_bf16.h +0 -36
  66. triton/backends/amd/include/hip/hip_bfloat16.h +0 -44
  67. triton/backends/amd/include/hip/hip_common.h +0 -100
  68. triton/backends/amd/include/hip/hip_complex.h +0 -38
  69. triton/backends/amd/include/hip/hip_cooperative_groups.h +0 -46
  70. triton/backends/amd/include/hip/hip_deprecated.h +0 -95
  71. triton/backends/amd/include/hip/hip_ext.h +0 -161
  72. triton/backends/amd/include/hip/hip_fp16.h +0 -36
  73. triton/backends/amd/include/hip/hip_fp8.h +0 -33
  74. triton/backends/amd/include/hip/hip_gl_interop.h +0 -32
  75. triton/backends/amd/include/hip/hip_hcc.h +0 -24
  76. triton/backends/amd/include/hip/hip_math_constants.h +0 -36
  77. triton/backends/amd/include/hip/hip_profile.h +0 -27
  78. triton/backends/amd/include/hip/hip_runtime.h +0 -75
  79. triton/backends/amd/include/hip/hip_runtime_api.h +0 -9261
  80. triton/backends/amd/include/hip/hip_texture_types.h +0 -29
  81. triton/backends/amd/include/hip/hip_vector_types.h +0 -41
  82. triton/backends/amd/include/hip/hip_version.h +0 -17
  83. triton/backends/amd/include/hip/hiprtc.h +0 -421
  84. triton/backends/amd/include/hip/library_types.h +0 -78
  85. triton/backends/amd/include/hip/math_functions.h +0 -42
  86. triton/backends/amd/include/hip/surface_types.h +0 -63
  87. triton/backends/amd/include/hip/texture_types.h +0 -194
  88. triton/backends/amd/include/hsa/Brig.h +0 -1131
  89. triton/backends/amd/include/hsa/amd_hsa_common.h +0 -91
  90. triton/backends/amd/include/hsa/amd_hsa_elf.h +0 -462
  91. triton/backends/amd/include/hsa/amd_hsa_kernel_code.h +0 -269
  92. triton/backends/amd/include/hsa/amd_hsa_queue.h +0 -109
  93. triton/backends/amd/include/hsa/amd_hsa_signal.h +0 -80
  94. triton/backends/amd/include/hsa/hsa.h +0 -5738
  95. triton/backends/amd/include/hsa/hsa_amd_tool.h +0 -91
  96. triton/backends/amd/include/hsa/hsa_api_trace.h +0 -579
  97. triton/backends/amd/include/hsa/hsa_api_trace_version.h +0 -68
  98. triton/backends/amd/include/hsa/hsa_ext_amd.h +0 -3146
  99. triton/backends/amd/include/hsa/hsa_ext_finalize.h +0 -531
  100. triton/backends/amd/include/hsa/hsa_ext_image.h +0 -1454
  101. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +0 -488
  102. triton/backends/amd/include/hsa/hsa_ven_amd_loader.h +0 -667
  103. triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +0 -416
  104. triton/backends/amd/include/roctracer/ext/prof_protocol.h +0 -107
  105. triton/backends/amd/include/roctracer/hip_ostream_ops.h +0 -4515
  106. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +0 -1727
  107. triton/backends/amd/include/roctracer/hsa_prof_str.h +0 -3059
  108. triton/backends/amd/include/roctracer/roctracer.h +0 -779
  109. triton/backends/amd/include/roctracer/roctracer_ext.h +0 -81
  110. triton/backends/amd/include/roctracer/roctracer_hcc.h +0 -24
  111. triton/backends/amd/include/roctracer/roctracer_hip.h +0 -37
  112. triton/backends/amd/include/roctracer/roctracer_hsa.h +0 -112
  113. triton/backends/amd/include/roctracer/roctracer_plugin.h +0 -137
  114. triton/backends/amd/include/roctracer/roctracer_roctx.h +0 -67
  115. triton/backends/amd/include/roctracer/roctx.h +0 -229
  116. {triton_windows-3.3.0.post19.dist-info → triton_windows-3.3.1.post21.dist-info}/top_level.txt +0 -0
@@ -1,538 +0,0 @@
1
- /*
2
- Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
-
4
- Permission is hereby granted, free of charge, to any person obtaining a copy
5
- of this software and associated documentation files (the "Software"), to deal
6
- in the Software without restriction, including without limitation the rights
7
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
- copies of the Software, and to permit persons to whom the Software is
9
- furnished to do so, subject to the following conditions:
10
-
11
- The above copyright notice and this permission notice shall be included in
12
- all copies or substantial portions of the Software.
13
-
14
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20
- THE SOFTWARE.
21
- */
22
-
23
- #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
24
- #define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
25
-
26
- __device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
27
- union { int i; unsigned u; float f; } tmp; tmp.u = src;
28
- tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
29
- return tmp.u;
30
- }
31
-
32
- __device__ static inline float __hip_ds_bpermutef(int index, float src) {
33
- union { int i; unsigned u; float f; } tmp; tmp.f = src;
34
- tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
35
- return tmp.f;
36
- }
37
-
38
- __device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
39
- union { int i; unsigned u; float f; } tmp; tmp.u = src;
40
- tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
41
- return tmp.u;
42
- }
43
-
44
- __device__ static inline float __hip_ds_permutef(int index, float src) {
45
- union { int i; unsigned u; float f; } tmp; tmp.f = src;
46
- tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
47
- return tmp.f;
48
- }
49
-
50
- #define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
51
- #define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
52
-
53
- template <int pattern>
54
- __device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
55
- union { int i; unsigned u; float f; } tmp; tmp.u = src;
56
- tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
57
- return tmp.u;
58
- }
59
-
60
- template <int pattern>
61
- __device__ static inline float __hip_ds_swizzlef_N(float src) {
62
- union { int i; unsigned u; float f; } tmp; tmp.f = src;
63
- tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
64
- return tmp.f;
65
- }
66
-
67
- #define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
68
- __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
69
-
70
- template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
71
- __device__ static inline int __hip_move_dpp_N(int src) {
72
- return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
73
- bound_ctrl);
74
- }
75
-
76
- static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
77
-
78
- // warp vote function __all __any __ballot
79
- __device__
80
- inline
81
- int __all(int predicate) {
82
- return __ockl_wfall_i32(predicate);
83
- }
84
-
85
- __device__
86
- inline
87
- int __any(int predicate) {
88
- return __ockl_wfany_i32(predicate);
89
- }
90
-
91
- // XXX from llvm/include/llvm/IR/InstrTypes.h
92
- #define ICMP_NE 33
93
-
94
- __device__
95
- inline
96
- unsigned long long int __ballot(int predicate) {
97
- return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
98
- }
99
-
100
- __device__
101
- inline
102
- unsigned long long int __ballot64(int predicate) {
103
- return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
104
- }
105
-
106
- // See amd_warp_sync_functions.h for an explanation of this preprocessor flag.
107
- #ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
108
- // Since threads in a wave do not make independent progress, __activemask()
109
- // always returns the exact active mask, i.e, all active threads in the wave.
110
- __device__
111
- inline
112
- unsigned long long __activemask() {
113
- return __ballot(true);
114
- }
115
- #endif // HIP_ENABLE_WARP_SYNC_BUILTINS
116
-
117
- __device__ static inline unsigned int __lane_id() {
118
- return __builtin_amdgcn_mbcnt_hi(
119
- -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
120
- }
121
-
122
- __device__
123
- inline
124
- int __shfl(int var, int src_lane, int width = warpSize) {
125
- int self = __lane_id();
126
- int index = (src_lane & (width - 1)) + (self & ~(width-1));
127
- return __builtin_amdgcn_ds_bpermute(index<<2, var);
128
- }
129
- __device__
130
- inline
131
- unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
132
- union { int i; unsigned u; float f; } tmp; tmp.u = var;
133
- tmp.i = __shfl(tmp.i, src_lane, width);
134
- return tmp.u;
135
- }
136
- __device__
137
- inline
138
- float __shfl(float var, int src_lane, int width = warpSize) {
139
- union { int i; unsigned u; float f; } tmp; tmp.f = var;
140
- tmp.i = __shfl(tmp.i, src_lane, width);
141
- return tmp.f;
142
- }
143
- __device__
144
- inline
145
- double __shfl(double var, int src_lane, int width = warpSize) {
146
- static_assert(sizeof(double) == 2 * sizeof(int), "");
147
- static_assert(sizeof(double) == sizeof(uint64_t), "");
148
-
149
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
150
- tmp[0] = __shfl(tmp[0], src_lane, width);
151
- tmp[1] = __shfl(tmp[1], src_lane, width);
152
-
153
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
154
- double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
155
- return tmp1;
156
- }
157
- __device__
158
- inline
159
- long __shfl(long var, int src_lane, int width = warpSize)
160
- {
161
- #ifndef _MSC_VER
162
- static_assert(sizeof(long) == 2 * sizeof(int), "");
163
- static_assert(sizeof(long) == sizeof(uint64_t), "");
164
-
165
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
166
- tmp[0] = __shfl(tmp[0], src_lane, width);
167
- tmp[1] = __shfl(tmp[1], src_lane, width);
168
-
169
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
170
- long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
171
- return tmp1;
172
- #else
173
- static_assert(sizeof(long) == sizeof(int), "");
174
- return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
175
- #endif
176
- }
177
- __device__
178
- inline
179
- unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
180
- #ifndef _MSC_VER
181
- static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
182
- static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
183
-
184
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
185
- tmp[0] = __shfl(tmp[0], src_lane, width);
186
- tmp[1] = __shfl(tmp[1], src_lane, width);
187
-
188
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
189
- unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
190
- return tmp1;
191
- #else
192
- static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
193
- return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
194
- #endif
195
- }
196
- __device__
197
- inline
198
- long long __shfl(long long var, int src_lane, int width = warpSize)
199
- {
200
- static_assert(sizeof(long long) == 2 * sizeof(int), "");
201
- static_assert(sizeof(long long) == sizeof(uint64_t), "");
202
-
203
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
204
- tmp[0] = __shfl(tmp[0], src_lane, width);
205
- tmp[1] = __shfl(tmp[1], src_lane, width);
206
-
207
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
208
- long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
209
- return tmp1;
210
- }
211
- __device__
212
- inline
213
- unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
214
- static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
215
- static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
216
-
217
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
218
- tmp[0] = __shfl(tmp[0], src_lane, width);
219
- tmp[1] = __shfl(tmp[1], src_lane, width);
220
-
221
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
222
- unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
223
- return tmp1;
224
- }
225
-
226
- __device__
227
- inline
228
- int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
229
- int self = __lane_id();
230
- int index = self - lane_delta;
231
- index = (index < (self & ~(width-1)))?self:index;
232
- return __builtin_amdgcn_ds_bpermute(index<<2, var);
233
- }
234
- __device__
235
- inline
236
- unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
237
- union { int i; unsigned u; float f; } tmp; tmp.u = var;
238
- tmp.i = __shfl_up(tmp.i, lane_delta, width);
239
- return tmp.u;
240
- }
241
- __device__
242
- inline
243
- float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
244
- union { int i; unsigned u; float f; } tmp; tmp.f = var;
245
- tmp.i = __shfl_up(tmp.i, lane_delta, width);
246
- return tmp.f;
247
- }
248
- __device__
249
- inline
250
- double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
251
- static_assert(sizeof(double) == 2 * sizeof(int), "");
252
- static_assert(sizeof(double) == sizeof(uint64_t), "");
253
-
254
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
255
- tmp[0] = __shfl_up(tmp[0], lane_delta, width);
256
- tmp[1] = __shfl_up(tmp[1], lane_delta, width);
257
-
258
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
259
- double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
260
- return tmp1;
261
- }
262
- __device__
263
- inline
264
- long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
265
- {
266
- #ifndef _MSC_VER
267
- static_assert(sizeof(long) == 2 * sizeof(int), "");
268
- static_assert(sizeof(long) == sizeof(uint64_t), "");
269
-
270
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
271
- tmp[0] = __shfl_up(tmp[0], lane_delta, width);
272
- tmp[1] = __shfl_up(tmp[1], lane_delta, width);
273
-
274
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
275
- long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
276
- return tmp1;
277
- #else
278
- static_assert(sizeof(long) == sizeof(int), "");
279
- return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
280
- #endif
281
- }
282
-
283
- __device__
284
- inline
285
- unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
286
- {
287
- #ifndef _MSC_VER
288
- static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
289
- static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
290
-
291
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
292
- tmp[0] = __shfl_up(tmp[0], lane_delta, width);
293
- tmp[1] = __shfl_up(tmp[1], lane_delta, width);
294
-
295
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
296
- unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
297
- return tmp1;
298
- #else
299
- static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
300
- return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
301
- #endif
302
- }
303
-
304
- __device__
305
- inline
306
- long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
307
- {
308
- static_assert(sizeof(long long) == 2 * sizeof(int), "");
309
- static_assert(sizeof(long long) == sizeof(uint64_t), "");
310
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
311
- tmp[0] = __shfl_up(tmp[0], lane_delta, width);
312
- tmp[1] = __shfl_up(tmp[1], lane_delta, width);
313
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
314
- long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
315
- return tmp1;
316
- }
317
-
318
- __device__
319
- inline
320
- unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
321
- {
322
- static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
323
- static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
324
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
325
- tmp[0] = __shfl_up(tmp[0], lane_delta, width);
326
- tmp[1] = __shfl_up(tmp[1], lane_delta, width);
327
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
328
- unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
329
- return tmp1;
330
- }
331
-
332
- __device__
333
- inline
334
- int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
335
- int self = __lane_id();
336
- int index = self + lane_delta;
337
- index = (int)((self&(width-1))+lane_delta) >= width?self:index;
338
- return __builtin_amdgcn_ds_bpermute(index<<2, var);
339
- }
340
- __device__
341
- inline
342
- unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
343
- union { int i; unsigned u; float f; } tmp; tmp.u = var;
344
- tmp.i = __shfl_down(tmp.i, lane_delta, width);
345
- return tmp.u;
346
- }
347
- __device__
348
- inline
349
- float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
350
- union { int i; unsigned u; float f; } tmp; tmp.f = var;
351
- tmp.i = __shfl_down(tmp.i, lane_delta, width);
352
- return tmp.f;
353
- }
354
- __device__
355
- inline
356
- double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
357
- static_assert(sizeof(double) == 2 * sizeof(int), "");
358
- static_assert(sizeof(double) == sizeof(uint64_t), "");
359
-
360
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
361
- tmp[0] = __shfl_down(tmp[0], lane_delta, width);
362
- tmp[1] = __shfl_down(tmp[1], lane_delta, width);
363
-
364
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
365
- double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
366
- return tmp1;
367
- }
368
- __device__
369
- inline
370
- long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
371
- {
372
- #ifndef _MSC_VER
373
- static_assert(sizeof(long) == 2 * sizeof(int), "");
374
- static_assert(sizeof(long) == sizeof(uint64_t), "");
375
-
376
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
377
- tmp[0] = __shfl_down(tmp[0], lane_delta, width);
378
- tmp[1] = __shfl_down(tmp[1], lane_delta, width);
379
-
380
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
381
- long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
382
- return tmp1;
383
- #else
384
- static_assert(sizeof(long) == sizeof(int), "");
385
- return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
386
- #endif
387
- }
388
- __device__
389
- inline
390
- unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
391
- {
392
- #ifndef _MSC_VER
393
- static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
394
- static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
395
-
396
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
397
- tmp[0] = __shfl_down(tmp[0], lane_delta, width);
398
- tmp[1] = __shfl_down(tmp[1], lane_delta, width);
399
-
400
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
401
- unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
402
- return tmp1;
403
- #else
404
- static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
405
- return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
406
- #endif
407
- }
408
- __device__
409
- inline
410
- long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
411
- {
412
- static_assert(sizeof(long long) == 2 * sizeof(int), "");
413
- static_assert(sizeof(long long) == sizeof(uint64_t), "");
414
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
415
- tmp[0] = __shfl_down(tmp[0], lane_delta, width);
416
- tmp[1] = __shfl_down(tmp[1], lane_delta, width);
417
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
418
- long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
419
- return tmp1;
420
- }
421
- __device__
422
- inline
423
- unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
424
- {
425
- static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
426
- static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
427
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
428
- tmp[0] = __shfl_down(tmp[0], lane_delta, width);
429
- tmp[1] = __shfl_down(tmp[1], lane_delta, width);
430
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
431
- unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
432
- return tmp1;
433
- }
434
-
435
- __device__
436
- inline
437
- int __shfl_xor(int var, int lane_mask, int width = warpSize) {
438
- int self = __lane_id();
439
- int index = self^lane_mask;
440
- index = index >= ((self+width)&~(width-1))?self:index;
441
- return __builtin_amdgcn_ds_bpermute(index<<2, var);
442
- }
443
- __device__
444
- inline
445
- unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
446
- union { int i; unsigned u; float f; } tmp; tmp.u = var;
447
- tmp.i = __shfl_xor(tmp.i, lane_mask, width);
448
- return tmp.u;
449
- }
450
- __device__
451
- inline
452
- float __shfl_xor(float var, int lane_mask, int width = warpSize) {
453
- union { int i; unsigned u; float f; } tmp; tmp.f = var;
454
- tmp.i = __shfl_xor(tmp.i, lane_mask, width);
455
- return tmp.f;
456
- }
457
- __device__
458
- inline
459
- double __shfl_xor(double var, int lane_mask, int width = warpSize) {
460
- static_assert(sizeof(double) == 2 * sizeof(int), "");
461
- static_assert(sizeof(double) == sizeof(uint64_t), "");
462
-
463
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
464
- tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
465
- tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
466
-
467
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
468
- double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
469
- return tmp1;
470
- }
471
- __device__
472
- inline
473
- long __shfl_xor(long var, int lane_mask, int width = warpSize)
474
- {
475
- #ifndef _MSC_VER
476
- static_assert(sizeof(long) == 2 * sizeof(int), "");
477
- static_assert(sizeof(long) == sizeof(uint64_t), "");
478
-
479
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
480
- tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
481
- tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
482
-
483
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
484
- long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
485
- return tmp1;
486
- #else
487
- static_assert(sizeof(long) == sizeof(int), "");
488
- return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
489
- #endif
490
- }
491
- __device__
492
- inline
493
- unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
494
- {
495
- #ifndef _MSC_VER
496
- static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
497
- static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
498
-
499
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
500
- tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
501
- tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
502
-
503
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
504
- unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
505
- return tmp1;
506
- #else
507
- static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
508
- return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
509
- #endif
510
- }
511
- __device__
512
- inline
513
- long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
514
- {
515
- static_assert(sizeof(long long) == 2 * sizeof(int), "");
516
- static_assert(sizeof(long long) == sizeof(uint64_t), "");
517
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
518
- tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
519
- tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
520
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
521
- long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
522
- return tmp1;
523
- }
524
- __device__
525
- inline
526
- unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
527
- {
528
- static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
529
- static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
530
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
531
- tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
532
- tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
533
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
534
- unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
535
- return tmp1;
536
- }
537
-
538
- #endif