triton-windows 3.3.1.post19__cp310-cp310-win_amd64.whl → 3.4.0.post20__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of triton-windows has been flagged as potentially problematic; consult the package registry's advisory page for more details.

Files changed (166)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +4 -1
  3. triton/_filecheck.py +87 -0
  4. triton/_internal_testing.py +26 -15
  5. triton/_utils.py +110 -21
  6. triton/backends/__init__.py +20 -23
  7. triton/backends/amd/__init__.py +0 -0
  8. triton/backends/amd/compiler.py +112 -78
  9. triton/backends/amd/driver.c +5 -2
  10. triton/backends/amd/driver.py +149 -47
  11. triton/backends/compiler.py +7 -21
  12. triton/backends/nvidia/bin/ptxas.exe +0 -0
  13. triton/backends/nvidia/compiler.py +92 -93
  14. triton/backends/nvidia/driver.c +90 -98
  15. triton/backends/nvidia/driver.py +303 -128
  16. triton/compiler/code_generator.py +212 -111
  17. triton/compiler/compiler.py +110 -25
  18. triton/experimental/__init__.py +0 -0
  19. triton/experimental/gluon/__init__.py +4 -0
  20. triton/experimental/gluon/_compiler.py +0 -0
  21. triton/experimental/gluon/_runtime.py +99 -0
  22. triton/experimental/gluon/language/__init__.py +18 -0
  23. triton/experimental/gluon/language/_core.py +312 -0
  24. triton/experimental/gluon/language/_layouts.py +230 -0
  25. triton/experimental/gluon/language/_math.py +12 -0
  26. triton/experimental/gluon/language/_semantic.py +287 -0
  27. triton/experimental/gluon/language/_standard.py +47 -0
  28. triton/experimental/gluon/language/nvidia/__init__.py +4 -0
  29. triton/experimental/gluon/language/nvidia/blackwell/__init__.py +202 -0
  30. triton/experimental/gluon/language/nvidia/blackwell/tma.py +32 -0
  31. triton/experimental/gluon/language/nvidia/hopper/__init__.py +11 -0
  32. triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +51 -0
  33. triton/experimental/gluon/language/nvidia/hopper/tma.py +96 -0
  34. triton/experimental/gluon/nvidia/__init__.py +4 -0
  35. triton/experimental/gluon/nvidia/blackwell.py +3 -0
  36. triton/experimental/gluon/nvidia/hopper.py +40 -0
  37. triton/knobs.py +481 -0
  38. triton/language/__init__.py +39 -14
  39. triton/language/core.py +794 -537
  40. triton/language/extra/cuda/__init__.py +10 -7
  41. triton/language/extra/cuda/gdc.py +42 -0
  42. triton/language/extra/cuda/libdevice.py +394 -394
  43. triton/language/extra/cuda/utils.py +21 -21
  44. triton/language/extra/hip/libdevice.py +113 -104
  45. triton/language/math.py +65 -66
  46. triton/language/random.py +12 -2
  47. triton/language/semantic.py +1706 -1770
  48. triton/language/standard.py +116 -51
  49. triton/runtime/autotuner.py +117 -59
  50. triton/runtime/build.py +76 -12
  51. triton/runtime/cache.py +18 -47
  52. triton/runtime/driver.py +32 -29
  53. triton/runtime/interpreter.py +72 -35
  54. triton/runtime/jit.py +146 -110
  55. triton/testing.py +16 -12
  56. triton/tools/disasm.py +3 -4
  57. triton/tools/tensor_descriptor.py +36 -0
  58. triton/windows_utils.py +14 -6
  59. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.4.0.post20.dist-info}/METADATA +7 -2
  60. triton_windows-3.4.0.post20.dist-info/RECORD +186 -0
  61. triton_windows-3.4.0.post20.dist-info/entry_points.txt +3 -0
  62. triton_windows-3.4.0.post20.dist-info/licenses/LICENSE +23 -0
  63. triton_windows-3.4.0.post20.dist-info/top_level.txt +1 -0
  64. triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h +0 -358
  65. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +0 -1010
  66. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +0 -1638
  67. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +0 -1814
  68. triton/backends/amd/include/hip/amd_detail/amd_hip_bfloat16.h +0 -293
  69. triton/backends/amd/include/hip/amd_detail/amd_hip_common.h +0 -32
  70. triton/backends/amd/include/hip/amd_detail/amd_hip_complex.h +0 -174
  71. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +0 -835
  72. triton/backends/amd/include/hip/amd_detail/amd_hip_fp16.h +0 -1809
  73. triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +0 -1391
  74. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +0 -108
  75. triton/backends/amd/include/hip/amd_detail/amd_hip_math_constants.h +0 -124
  76. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h +0 -405
  77. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +0 -196
  78. triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +0 -565
  79. triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h +0 -2226
  80. triton/backends/amd/include/hip/amd_detail/amd_math_functions.h +0 -104
  81. triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h +0 -244
  82. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +0 -538
  83. triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +0 -288
  84. triton/backends/amd/include/hip/amd_detail/concepts.hpp +0 -30
  85. triton/backends/amd/include/hip/amd_detail/device_library_decls.h +0 -133
  86. triton/backends/amd/include/hip/amd_detail/functional_grid_launch.hpp +0 -218
  87. triton/backends/amd/include/hip/amd_detail/grid_launch.h +0 -67
  88. triton/backends/amd/include/hip/amd_detail/grid_launch.hpp +0 -50
  89. triton/backends/amd/include/hip/amd_detail/grid_launch_GGL.hpp +0 -26
  90. triton/backends/amd/include/hip/amd_detail/helpers.hpp +0 -137
  91. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +0 -1446
  92. triton/backends/amd/include/hip/amd_detail/hip_assert.h +0 -101
  93. triton/backends/amd/include/hip/amd_detail/hip_cooperative_groups_helper.h +0 -242
  94. triton/backends/amd/include/hip/amd_detail/hip_fp16_gcc.h +0 -254
  95. triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h +0 -96
  96. triton/backends/amd/include/hip/amd_detail/hip_ldg.h +0 -100
  97. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +0 -10570
  98. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +0 -78
  99. triton/backends/amd/include/hip/amd_detail/host_defines.h +0 -184
  100. triton/backends/amd/include/hip/amd_detail/hsa_helpers.hpp +0 -102
  101. triton/backends/amd/include/hip/amd_detail/macro_based_grid_launch.hpp +0 -798
  102. triton/backends/amd/include/hip/amd_detail/math_fwd.h +0 -698
  103. triton/backends/amd/include/hip/amd_detail/ockl_image.h +0 -177
  104. triton/backends/amd/include/hip/amd_detail/program_state.hpp +0 -107
  105. triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h +0 -491
  106. triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h +0 -478
  107. triton/backends/amd/include/hip/channel_descriptor.h +0 -39
  108. triton/backends/amd/include/hip/device_functions.h +0 -38
  109. triton/backends/amd/include/hip/driver_types.h +0 -468
  110. triton/backends/amd/include/hip/hip_bf16.h +0 -36
  111. triton/backends/amd/include/hip/hip_bfloat16.h +0 -44
  112. triton/backends/amd/include/hip/hip_common.h +0 -100
  113. triton/backends/amd/include/hip/hip_complex.h +0 -38
  114. triton/backends/amd/include/hip/hip_cooperative_groups.h +0 -46
  115. triton/backends/amd/include/hip/hip_deprecated.h +0 -95
  116. triton/backends/amd/include/hip/hip_ext.h +0 -161
  117. triton/backends/amd/include/hip/hip_fp16.h +0 -36
  118. triton/backends/amd/include/hip/hip_fp8.h +0 -33
  119. triton/backends/amd/include/hip/hip_gl_interop.h +0 -32
  120. triton/backends/amd/include/hip/hip_hcc.h +0 -24
  121. triton/backends/amd/include/hip/hip_math_constants.h +0 -36
  122. triton/backends/amd/include/hip/hip_profile.h +0 -27
  123. triton/backends/amd/include/hip/hip_runtime.h +0 -75
  124. triton/backends/amd/include/hip/hip_runtime_api.h +0 -9261
  125. triton/backends/amd/include/hip/hip_texture_types.h +0 -29
  126. triton/backends/amd/include/hip/hip_vector_types.h +0 -41
  127. triton/backends/amd/include/hip/hip_version.h +0 -17
  128. triton/backends/amd/include/hip/hiprtc.h +0 -421
  129. triton/backends/amd/include/hip/library_types.h +0 -78
  130. triton/backends/amd/include/hip/math_functions.h +0 -42
  131. triton/backends/amd/include/hip/surface_types.h +0 -63
  132. triton/backends/amd/include/hip/texture_types.h +0 -194
  133. triton/backends/amd/include/hsa/Brig.h +0 -1131
  134. triton/backends/amd/include/hsa/amd_hsa_common.h +0 -91
  135. triton/backends/amd/include/hsa/amd_hsa_elf.h +0 -462
  136. triton/backends/amd/include/hsa/amd_hsa_kernel_code.h +0 -269
  137. triton/backends/amd/include/hsa/amd_hsa_queue.h +0 -109
  138. triton/backends/amd/include/hsa/amd_hsa_signal.h +0 -80
  139. triton/backends/amd/include/hsa/hsa.h +0 -5738
  140. triton/backends/amd/include/hsa/hsa_amd_tool.h +0 -91
  141. triton/backends/amd/include/hsa/hsa_api_trace.h +0 -579
  142. triton/backends/amd/include/hsa/hsa_api_trace_version.h +0 -68
  143. triton/backends/amd/include/hsa/hsa_ext_amd.h +0 -3146
  144. triton/backends/amd/include/hsa/hsa_ext_finalize.h +0 -531
  145. triton/backends/amd/include/hsa/hsa_ext_image.h +0 -1454
  146. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +0 -488
  147. triton/backends/amd/include/hsa/hsa_ven_amd_loader.h +0 -667
  148. triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +0 -416
  149. triton/backends/amd/include/roctracer/ext/prof_protocol.h +0 -107
  150. triton/backends/amd/include/roctracer/hip_ostream_ops.h +0 -4515
  151. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +0 -1727
  152. triton/backends/amd/include/roctracer/hsa_prof_str.h +0 -3059
  153. triton/backends/amd/include/roctracer/roctracer.h +0 -779
  154. triton/backends/amd/include/roctracer/roctracer_ext.h +0 -81
  155. triton/backends/amd/include/roctracer/roctracer_hcc.h +0 -24
  156. triton/backends/amd/include/roctracer/roctracer_hip.h +0 -37
  157. triton/backends/amd/include/roctracer/roctracer_hsa.h +0 -112
  158. triton/backends/amd/include/roctracer/roctracer_plugin.h +0 -137
  159. triton/backends/amd/include/roctracer/roctracer_roctx.h +0 -67
  160. triton/backends/amd/include/roctracer/roctx.h +0 -229
  161. triton/language/_utils.py +0 -21
  162. triton/language/extra/cuda/_experimental_tma.py +0 -106
  163. triton/tools/experimental_descriptor.py +0 -32
  164. triton_windows-3.3.1.post19.dist-info/RECORD +0 -260
  165. triton_windows-3.3.1.post19.dist-info/top_level.txt +0 -14
  166. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.4.0.post20.dist-info}/WHEEL +0 -0
@@ -1,538 +0,0 @@
1
- /*
2
- Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
-
4
- Permission is hereby granted, free of charge, to any person obtaining a copy
5
- of this software and associated documentation files (the "Software"), to deal
6
- in the Software without restriction, including without limitation the rights
7
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
- copies of the Software, and to permit persons to whom the Software is
9
- furnished to do so, subject to the following conditions:
10
-
11
- The above copyright notice and this permission notice shall be included in
12
- all copies or substantial portions of the Software.
13
-
14
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20
- THE SOFTWARE.
21
- */
22
-
23
- #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
24
- #define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
25
-
26
- __device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
27
- union { int i; unsigned u; float f; } tmp; tmp.u = src;
28
- tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
29
- return tmp.u;
30
- }
31
-
32
- __device__ static inline float __hip_ds_bpermutef(int index, float src) {
33
- union { int i; unsigned u; float f; } tmp; tmp.f = src;
34
- tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
35
- return tmp.f;
36
- }
37
-
38
- __device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
39
- union { int i; unsigned u; float f; } tmp; tmp.u = src;
40
- tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
41
- return tmp.u;
42
- }
43
-
44
- __device__ static inline float __hip_ds_permutef(int index, float src) {
45
- union { int i; unsigned u; float f; } tmp; tmp.f = src;
46
- tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
47
- return tmp.f;
48
- }
49
-
50
- #define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
51
- #define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
52
-
53
- template <int pattern>
54
- __device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
55
- union { int i; unsigned u; float f; } tmp; tmp.u = src;
56
- tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
57
- return tmp.u;
58
- }
59
-
60
- template <int pattern>
61
- __device__ static inline float __hip_ds_swizzlef_N(float src) {
62
- union { int i; unsigned u; float f; } tmp; tmp.f = src;
63
- tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
64
- return tmp.f;
65
- }
66
-
67
- #define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
68
- __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
69
-
70
- template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
71
- __device__ static inline int __hip_move_dpp_N(int src) {
72
- return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
73
- bound_ctrl);
74
- }
75
-
76
- static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
77
-
78
- // warp vote function __all __any __ballot
79
- __device__
80
- inline
81
- int __all(int predicate) {
82
- return __ockl_wfall_i32(predicate);
83
- }
84
-
85
- __device__
86
- inline
87
- int __any(int predicate) {
88
- return __ockl_wfany_i32(predicate);
89
- }
90
-
91
- // XXX from llvm/include/llvm/IR/InstrTypes.h
92
- #define ICMP_NE 33
93
-
94
- __device__
95
- inline
96
- unsigned long long int __ballot(int predicate) {
97
- return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
98
- }
99
-
100
- __device__
101
- inline
102
- unsigned long long int __ballot64(int predicate) {
103
- return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
104
- }
105
-
106
- // See amd_warp_sync_functions.h for an explanation of this preprocessor flag.
107
- #ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
108
- // Since threads in a wave do not make independent progress, __activemask()
109
- // always returns the exact active mask, i.e, all active threads in the wave.
110
- __device__
111
- inline
112
- unsigned long long __activemask() {
113
- return __ballot(true);
114
- }
115
- #endif // HIP_ENABLE_WARP_SYNC_BUILTINS
116
-
117
- __device__ static inline unsigned int __lane_id() {
118
- return __builtin_amdgcn_mbcnt_hi(
119
- -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
120
- }
121
-
122
- __device__
123
- inline
124
- int __shfl(int var, int src_lane, int width = warpSize) {
125
- int self = __lane_id();
126
- int index = (src_lane & (width - 1)) + (self & ~(width-1));
127
- return __builtin_amdgcn_ds_bpermute(index<<2, var);
128
- }
129
- __device__
130
- inline
131
- unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
132
- union { int i; unsigned u; float f; } tmp; tmp.u = var;
133
- tmp.i = __shfl(tmp.i, src_lane, width);
134
- return tmp.u;
135
- }
136
- __device__
137
- inline
138
- float __shfl(float var, int src_lane, int width = warpSize) {
139
- union { int i; unsigned u; float f; } tmp; tmp.f = var;
140
- tmp.i = __shfl(tmp.i, src_lane, width);
141
- return tmp.f;
142
- }
143
- __device__
144
- inline
145
- double __shfl(double var, int src_lane, int width = warpSize) {
146
- static_assert(sizeof(double) == 2 * sizeof(int), "");
147
- static_assert(sizeof(double) == sizeof(uint64_t), "");
148
-
149
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
150
- tmp[0] = __shfl(tmp[0], src_lane, width);
151
- tmp[1] = __shfl(tmp[1], src_lane, width);
152
-
153
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
154
- double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
155
- return tmp1;
156
- }
157
- __device__
158
- inline
159
- long __shfl(long var, int src_lane, int width = warpSize)
160
- {
161
- #ifndef _MSC_VER
162
- static_assert(sizeof(long) == 2 * sizeof(int), "");
163
- static_assert(sizeof(long) == sizeof(uint64_t), "");
164
-
165
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
166
- tmp[0] = __shfl(tmp[0], src_lane, width);
167
- tmp[1] = __shfl(tmp[1], src_lane, width);
168
-
169
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
170
- long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
171
- return tmp1;
172
- #else
173
- static_assert(sizeof(long) == sizeof(int), "");
174
- return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
175
- #endif
176
- }
177
- __device__
178
- inline
179
- unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
180
- #ifndef _MSC_VER
181
- static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
182
- static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
183
-
184
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
185
- tmp[0] = __shfl(tmp[0], src_lane, width);
186
- tmp[1] = __shfl(tmp[1], src_lane, width);
187
-
188
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
189
- unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
190
- return tmp1;
191
- #else
192
- static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
193
- return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
194
- #endif
195
- }
196
- __device__
197
- inline
198
- long long __shfl(long long var, int src_lane, int width = warpSize)
199
- {
200
- static_assert(sizeof(long long) == 2 * sizeof(int), "");
201
- static_assert(sizeof(long long) == sizeof(uint64_t), "");
202
-
203
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
204
- tmp[0] = __shfl(tmp[0], src_lane, width);
205
- tmp[1] = __shfl(tmp[1], src_lane, width);
206
-
207
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
208
- long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
209
- return tmp1;
210
- }
211
- __device__
212
- inline
213
- unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
214
- static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
215
- static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
216
-
217
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
218
- tmp[0] = __shfl(tmp[0], src_lane, width);
219
- tmp[1] = __shfl(tmp[1], src_lane, width);
220
-
221
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
222
- unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
223
- return tmp1;
224
- }
225
-
226
- __device__
227
- inline
228
- int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
229
- int self = __lane_id();
230
- int index = self - lane_delta;
231
- index = (index < (self & ~(width-1)))?self:index;
232
- return __builtin_amdgcn_ds_bpermute(index<<2, var);
233
- }
234
- __device__
235
- inline
236
- unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
237
- union { int i; unsigned u; float f; } tmp; tmp.u = var;
238
- tmp.i = __shfl_up(tmp.i, lane_delta, width);
239
- return tmp.u;
240
- }
241
- __device__
242
- inline
243
- float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
244
- union { int i; unsigned u; float f; } tmp; tmp.f = var;
245
- tmp.i = __shfl_up(tmp.i, lane_delta, width);
246
- return tmp.f;
247
- }
248
- __device__
249
- inline
250
- double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
251
- static_assert(sizeof(double) == 2 * sizeof(int), "");
252
- static_assert(sizeof(double) == sizeof(uint64_t), "");
253
-
254
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
255
- tmp[0] = __shfl_up(tmp[0], lane_delta, width);
256
- tmp[1] = __shfl_up(tmp[1], lane_delta, width);
257
-
258
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
259
- double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
260
- return tmp1;
261
- }
262
- __device__
263
- inline
264
- long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
265
- {
266
- #ifndef _MSC_VER
267
- static_assert(sizeof(long) == 2 * sizeof(int), "");
268
- static_assert(sizeof(long) == sizeof(uint64_t), "");
269
-
270
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
271
- tmp[0] = __shfl_up(tmp[0], lane_delta, width);
272
- tmp[1] = __shfl_up(tmp[1], lane_delta, width);
273
-
274
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
275
- long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
276
- return tmp1;
277
- #else
278
- static_assert(sizeof(long) == sizeof(int), "");
279
- return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
280
- #endif
281
- }
282
-
283
- __device__
284
- inline
285
- unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
286
- {
287
- #ifndef _MSC_VER
288
- static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
289
- static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
290
-
291
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
292
- tmp[0] = __shfl_up(tmp[0], lane_delta, width);
293
- tmp[1] = __shfl_up(tmp[1], lane_delta, width);
294
-
295
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
296
- unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
297
- return tmp1;
298
- #else
299
- static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
300
- return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
301
- #endif
302
- }
303
-
304
- __device__
305
- inline
306
- long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
307
- {
308
- static_assert(sizeof(long long) == 2 * sizeof(int), "");
309
- static_assert(sizeof(long long) == sizeof(uint64_t), "");
310
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
311
- tmp[0] = __shfl_up(tmp[0], lane_delta, width);
312
- tmp[1] = __shfl_up(tmp[1], lane_delta, width);
313
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
314
- long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
315
- return tmp1;
316
- }
317
-
318
- __device__
319
- inline
320
- unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
321
- {
322
- static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
323
- static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
324
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
325
- tmp[0] = __shfl_up(tmp[0], lane_delta, width);
326
- tmp[1] = __shfl_up(tmp[1], lane_delta, width);
327
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
328
- unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
329
- return tmp1;
330
- }
331
-
332
- __device__
333
- inline
334
- int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
335
- int self = __lane_id();
336
- int index = self + lane_delta;
337
- index = (int)((self&(width-1))+lane_delta) >= width?self:index;
338
- return __builtin_amdgcn_ds_bpermute(index<<2, var);
339
- }
340
- __device__
341
- inline
342
- unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
343
- union { int i; unsigned u; float f; } tmp; tmp.u = var;
344
- tmp.i = __shfl_down(tmp.i, lane_delta, width);
345
- return tmp.u;
346
- }
347
- __device__
348
- inline
349
- float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
350
- union { int i; unsigned u; float f; } tmp; tmp.f = var;
351
- tmp.i = __shfl_down(tmp.i, lane_delta, width);
352
- return tmp.f;
353
- }
354
- __device__
355
- inline
356
- double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
357
- static_assert(sizeof(double) == 2 * sizeof(int), "");
358
- static_assert(sizeof(double) == sizeof(uint64_t), "");
359
-
360
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
361
- tmp[0] = __shfl_down(tmp[0], lane_delta, width);
362
- tmp[1] = __shfl_down(tmp[1], lane_delta, width);
363
-
364
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
365
- double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
366
- return tmp1;
367
- }
368
- __device__
369
- inline
370
- long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
371
- {
372
- #ifndef _MSC_VER
373
- static_assert(sizeof(long) == 2 * sizeof(int), "");
374
- static_assert(sizeof(long) == sizeof(uint64_t), "");
375
-
376
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
377
- tmp[0] = __shfl_down(tmp[0], lane_delta, width);
378
- tmp[1] = __shfl_down(tmp[1], lane_delta, width);
379
-
380
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
381
- long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
382
- return tmp1;
383
- #else
384
- static_assert(sizeof(long) == sizeof(int), "");
385
- return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
386
- #endif
387
- }
388
- __device__
389
- inline
390
- unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
391
- {
392
- #ifndef _MSC_VER
393
- static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
394
- static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
395
-
396
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
397
- tmp[0] = __shfl_down(tmp[0], lane_delta, width);
398
- tmp[1] = __shfl_down(tmp[1], lane_delta, width);
399
-
400
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
401
- unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
402
- return tmp1;
403
- #else
404
- static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
405
- return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
406
- #endif
407
- }
408
- __device__
409
- inline
410
- long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
411
- {
412
- static_assert(sizeof(long long) == 2 * sizeof(int), "");
413
- static_assert(sizeof(long long) == sizeof(uint64_t), "");
414
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
415
- tmp[0] = __shfl_down(tmp[0], lane_delta, width);
416
- tmp[1] = __shfl_down(tmp[1], lane_delta, width);
417
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
418
- long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
419
- return tmp1;
420
- }
421
- __device__
422
- inline
423
- unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
424
- {
425
- static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
426
- static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
427
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
428
- tmp[0] = __shfl_down(tmp[0], lane_delta, width);
429
- tmp[1] = __shfl_down(tmp[1], lane_delta, width);
430
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
431
- unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
432
- return tmp1;
433
- }
434
-
435
- __device__
436
- inline
437
- int __shfl_xor(int var, int lane_mask, int width = warpSize) {
438
- int self = __lane_id();
439
- int index = self^lane_mask;
440
- index = index >= ((self+width)&~(width-1))?self:index;
441
- return __builtin_amdgcn_ds_bpermute(index<<2, var);
442
- }
443
- __device__
444
- inline
445
- unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
446
- union { int i; unsigned u; float f; } tmp; tmp.u = var;
447
- tmp.i = __shfl_xor(tmp.i, lane_mask, width);
448
- return tmp.u;
449
- }
450
- __device__
451
- inline
452
- float __shfl_xor(float var, int lane_mask, int width = warpSize) {
453
- union { int i; unsigned u; float f; } tmp; tmp.f = var;
454
- tmp.i = __shfl_xor(tmp.i, lane_mask, width);
455
- return tmp.f;
456
- }
457
- __device__
458
- inline
459
- double __shfl_xor(double var, int lane_mask, int width = warpSize) {
460
- static_assert(sizeof(double) == 2 * sizeof(int), "");
461
- static_assert(sizeof(double) == sizeof(uint64_t), "");
462
-
463
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
464
- tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
465
- tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
466
-
467
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
468
- double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
469
- return tmp1;
470
- }
471
- __device__
472
- inline
473
- long __shfl_xor(long var, int lane_mask, int width = warpSize)
474
- {
475
- #ifndef _MSC_VER
476
- static_assert(sizeof(long) == 2 * sizeof(int), "");
477
- static_assert(sizeof(long) == sizeof(uint64_t), "");
478
-
479
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
480
- tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
481
- tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
482
-
483
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
484
- long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
485
- return tmp1;
486
- #else
487
- static_assert(sizeof(long) == sizeof(int), "");
488
- return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
489
- #endif
490
- }
491
- __device__
492
- inline
493
- unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
494
- {
495
- #ifndef _MSC_VER
496
- static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
497
- static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
498
-
499
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
500
- tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
501
- tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
502
-
503
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
504
- unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
505
- return tmp1;
506
- #else
507
- static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
508
- return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
509
- #endif
510
- }
511
- __device__
512
- inline
513
- long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
514
- {
515
- static_assert(sizeof(long long) == 2 * sizeof(int), "");
516
- static_assert(sizeof(long long) == sizeof(uint64_t), "");
517
- int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
518
- tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
519
- tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
520
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
521
- long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
522
- return tmp1;
523
- }
524
- __device__
525
- inline
526
- unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
527
- {
528
- static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
529
- static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
530
- unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
531
- tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
532
- tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
533
- uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
534
- unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
535
- return tmp1;
536
- }
537
-
538
- #endif