triton-windows 3.3.1.post19__cp313-cp313-win_amd64.whl → 3.3.1.post21__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of triton-windows might be problematic. See the registry's advisory page for more details.

Files changed (108)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/backends/amd/driver.py +6 -1
  3. triton/backends/nvidia/compiler.py +1 -3
  4. triton/backends/nvidia/driver.py +7 -3
  5. triton/runtime/autotuner.py +2 -2
  6. triton/runtime/build.py +5 -5
  7. triton/windows_utils.py +11 -4
  8. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.3.1.post21.dist-info}/METADATA +1 -1
  9. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.3.1.post21.dist-info}/RECORD +11 -108
  10. triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h +0 -358
  11. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +0 -1010
  12. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +0 -1638
  13. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +0 -1814
  14. triton/backends/amd/include/hip/amd_detail/amd_hip_bfloat16.h +0 -293
  15. triton/backends/amd/include/hip/amd_detail/amd_hip_common.h +0 -32
  16. triton/backends/amd/include/hip/amd_detail/amd_hip_complex.h +0 -174
  17. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +0 -835
  18. triton/backends/amd/include/hip/amd_detail/amd_hip_fp16.h +0 -1809
  19. triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +0 -1391
  20. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +0 -108
  21. triton/backends/amd/include/hip/amd_detail/amd_hip_math_constants.h +0 -124
  22. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h +0 -405
  23. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +0 -196
  24. triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +0 -565
  25. triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h +0 -2226
  26. triton/backends/amd/include/hip/amd_detail/amd_math_functions.h +0 -104
  27. triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h +0 -244
  28. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +0 -538
  29. triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +0 -288
  30. triton/backends/amd/include/hip/amd_detail/concepts.hpp +0 -30
  31. triton/backends/amd/include/hip/amd_detail/device_library_decls.h +0 -133
  32. triton/backends/amd/include/hip/amd_detail/functional_grid_launch.hpp +0 -218
  33. triton/backends/amd/include/hip/amd_detail/grid_launch.h +0 -67
  34. triton/backends/amd/include/hip/amd_detail/grid_launch.hpp +0 -50
  35. triton/backends/amd/include/hip/amd_detail/grid_launch_GGL.hpp +0 -26
  36. triton/backends/amd/include/hip/amd_detail/helpers.hpp +0 -137
  37. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +0 -1446
  38. triton/backends/amd/include/hip/amd_detail/hip_assert.h +0 -101
  39. triton/backends/amd/include/hip/amd_detail/hip_cooperative_groups_helper.h +0 -242
  40. triton/backends/amd/include/hip/amd_detail/hip_fp16_gcc.h +0 -254
  41. triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h +0 -96
  42. triton/backends/amd/include/hip/amd_detail/hip_ldg.h +0 -100
  43. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +0 -10570
  44. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +0 -78
  45. triton/backends/amd/include/hip/amd_detail/host_defines.h +0 -184
  46. triton/backends/amd/include/hip/amd_detail/hsa_helpers.hpp +0 -102
  47. triton/backends/amd/include/hip/amd_detail/macro_based_grid_launch.hpp +0 -798
  48. triton/backends/amd/include/hip/amd_detail/math_fwd.h +0 -698
  49. triton/backends/amd/include/hip/amd_detail/ockl_image.h +0 -177
  50. triton/backends/amd/include/hip/amd_detail/program_state.hpp +0 -107
  51. triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h +0 -491
  52. triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h +0 -478
  53. triton/backends/amd/include/hip/channel_descriptor.h +0 -39
  54. triton/backends/amd/include/hip/device_functions.h +0 -38
  55. triton/backends/amd/include/hip/driver_types.h +0 -468
  56. triton/backends/amd/include/hip/hip_bf16.h +0 -36
  57. triton/backends/amd/include/hip/hip_bfloat16.h +0 -44
  58. triton/backends/amd/include/hip/hip_common.h +0 -100
  59. triton/backends/amd/include/hip/hip_complex.h +0 -38
  60. triton/backends/amd/include/hip/hip_cooperative_groups.h +0 -46
  61. triton/backends/amd/include/hip/hip_deprecated.h +0 -95
  62. triton/backends/amd/include/hip/hip_ext.h +0 -161
  63. triton/backends/amd/include/hip/hip_fp16.h +0 -36
  64. triton/backends/amd/include/hip/hip_fp8.h +0 -33
  65. triton/backends/amd/include/hip/hip_gl_interop.h +0 -32
  66. triton/backends/amd/include/hip/hip_hcc.h +0 -24
  67. triton/backends/amd/include/hip/hip_math_constants.h +0 -36
  68. triton/backends/amd/include/hip/hip_profile.h +0 -27
  69. triton/backends/amd/include/hip/hip_runtime.h +0 -75
  70. triton/backends/amd/include/hip/hip_runtime_api.h +0 -9261
  71. triton/backends/amd/include/hip/hip_texture_types.h +0 -29
  72. triton/backends/amd/include/hip/hip_vector_types.h +0 -41
  73. triton/backends/amd/include/hip/hip_version.h +0 -17
  74. triton/backends/amd/include/hip/hiprtc.h +0 -421
  75. triton/backends/amd/include/hip/library_types.h +0 -78
  76. triton/backends/amd/include/hip/math_functions.h +0 -42
  77. triton/backends/amd/include/hip/surface_types.h +0 -63
  78. triton/backends/amd/include/hip/texture_types.h +0 -194
  79. triton/backends/amd/include/hsa/Brig.h +0 -1131
  80. triton/backends/amd/include/hsa/amd_hsa_common.h +0 -91
  81. triton/backends/amd/include/hsa/amd_hsa_elf.h +0 -462
  82. triton/backends/amd/include/hsa/amd_hsa_kernel_code.h +0 -269
  83. triton/backends/amd/include/hsa/amd_hsa_queue.h +0 -109
  84. triton/backends/amd/include/hsa/amd_hsa_signal.h +0 -80
  85. triton/backends/amd/include/hsa/hsa.h +0 -5738
  86. triton/backends/amd/include/hsa/hsa_amd_tool.h +0 -91
  87. triton/backends/amd/include/hsa/hsa_api_trace.h +0 -579
  88. triton/backends/amd/include/hsa/hsa_api_trace_version.h +0 -68
  89. triton/backends/amd/include/hsa/hsa_ext_amd.h +0 -3146
  90. triton/backends/amd/include/hsa/hsa_ext_finalize.h +0 -531
  91. triton/backends/amd/include/hsa/hsa_ext_image.h +0 -1454
  92. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +0 -488
  93. triton/backends/amd/include/hsa/hsa_ven_amd_loader.h +0 -667
  94. triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +0 -416
  95. triton/backends/amd/include/roctracer/ext/prof_protocol.h +0 -107
  96. triton/backends/amd/include/roctracer/hip_ostream_ops.h +0 -4515
  97. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +0 -1727
  98. triton/backends/amd/include/roctracer/hsa_prof_str.h +0 -3059
  99. triton/backends/amd/include/roctracer/roctracer.h +0 -779
  100. triton/backends/amd/include/roctracer/roctracer_ext.h +0 -81
  101. triton/backends/amd/include/roctracer/roctracer_hcc.h +0 -24
  102. triton/backends/amd/include/roctracer/roctracer_hip.h +0 -37
  103. triton/backends/amd/include/roctracer/roctracer_hsa.h +0 -112
  104. triton/backends/amd/include/roctracer/roctracer_plugin.h +0 -137
  105. triton/backends/amd/include/roctracer/roctracer_roctx.h +0 -67
  106. triton/backends/amd/include/roctracer/roctx.h +0 -229
  107. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.3.1.post21.dist-info}/WHEEL +0 -0
  108. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.3.1.post21.dist-info}/top_level.txt +0 -0
@@ -1,1010 +0,0 @@
1
- /*
2
- Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
-
4
- Permission is hereby granted, free of charge, to any person obtaining a copy
5
- of this software and associated documentation files (the "Software"), to deal
6
- in the Software without restriction, including without limitation the rights
7
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
- copies of the Software, and to permit persons to whom the Software is
9
- furnished to do so, subject to the following conditions:
10
-
11
- The above copyright notice and this permission notice shall be included in
12
- all copies or substantial portions of the Software.
13
-
14
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20
- THE SOFTWARE.
21
- */
22
-
23
- #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
24
- #define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
25
-
26
- #if !defined(__HIPCC_RTC__)
27
- #include <hip/amd_detail/amd_hip_common.h>
28
- #include <hip/amd_detail/device_library_decls.h>
29
- #include <hip/amd_detail/hip_assert.h>
30
- #include "host_defines.h"
31
- #include "math_fwd.h"
32
- #include <hip/hip_runtime_api.h>
33
- #include <stddef.h>
34
- #include <hip/hip_vector_types.h>
35
- #endif // !defined(__HIPCC_RTC__)
36
-
37
- #if defined(__clang__) && defined(__HIP__)
38
- extern "C" __device__ int printf(const char *fmt, ...);
39
- #else
40
- template <typename... All>
41
- static inline __device__ void printf(const char* format, All... all) {}
42
- #endif
43
-
44
- extern "C" __device__ unsigned long long __ockl_steadyctr_u64();
45
-
46
- /*
47
- Integer Intrinsics
48
- */
49
-
50
- // integer intrinsic function __poc __clz __ffs __brev
51
- __device__ static inline unsigned int __popc(unsigned int input) {
52
- return __builtin_popcount(input);
53
- }
54
- __device__ static inline unsigned int __popcll(unsigned long long int input) {
55
- return __builtin_popcountll(input);
56
- }
57
-
58
- __device__ static inline int __clz(int input) {
59
- return __ockl_clz_u32((uint)input);
60
- }
61
-
62
- __device__ static inline int __clzll(long long int input) {
63
- return __ockl_clz_u64((uint64_t)input);
64
- }
65
-
66
- __device__ static inline unsigned int __ffs(unsigned int input) {
67
- return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
68
- }
69
-
70
- __device__ static inline unsigned int __ffsll(unsigned long long int input) {
71
- return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
72
- }
73
-
74
- __device__ static inline unsigned int __ffs(int input) {
75
- return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
76
- }
77
-
78
- __device__ static inline unsigned int __ffsll(long long int input) {
79
- return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
80
- }
81
-
82
- // Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE),
83
- // find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit position.
84
- // If not found, return -1.
85
- __device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
86
- uint64_t temp_mask = mask;
87
- int32_t temp_offset = offset;
88
-
89
- if (offset == 0) {
90
- temp_mask &= (1 << base);
91
- temp_offset = 1;
92
- }
93
- else if (offset < 0) {
94
- temp_mask = __builtin_bitreverse64(mask);
95
- base = 63 - base;
96
- temp_offset = -offset;
97
- }
98
-
99
- temp_mask = temp_mask & ((~0ULL) << base);
100
- if (__builtin_popcountll(temp_mask) < temp_offset)
101
- return -1;
102
- int32_t total = 0;
103
- for (int i = 0x20; i > 0; i >>= 1) {
104
- uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
105
- int32_t pcnt = __builtin_popcountll(temp_mask_lo);
106
- if (pcnt < temp_offset) {
107
- temp_mask = temp_mask >> i;
108
- temp_offset -= pcnt;
109
- total += i;
110
- }
111
- else {
112
- temp_mask = temp_mask_lo;
113
- }
114
- }
115
- if (offset < 0)
116
- return 63 - total;
117
- else
118
- return total;
119
- }
120
-
121
- __device__ static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) {
122
- uint64_t temp_mask = mask;
123
- int32_t temp_offset = offset;
124
- if (offset == 0) {
125
- temp_mask &= (1 << base);
126
- temp_offset = 1;
127
- }
128
- else if (offset < 0) {
129
- temp_mask = __builtin_bitreverse64(mask);
130
- base = 63 - base;
131
- temp_offset = -offset;
132
- }
133
- temp_mask = temp_mask & ((~0ULL) << base);
134
- if (__builtin_popcountll(temp_mask) < temp_offset)
135
- return -1;
136
- int32_t total = 0;
137
- for (int i = 0x20; i > 0; i >>= 1) {
138
- uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
139
- int32_t pcnt = __builtin_popcountll(temp_mask_lo);
140
- if (pcnt < temp_offset) {
141
- temp_mask = temp_mask >> i;
142
- temp_offset -= pcnt;
143
- total += i;
144
- }
145
- else {
146
- temp_mask = temp_mask_lo;
147
- }
148
- }
149
- if (offset < 0)
150
- return 63 - total;
151
- else
152
- return total;
153
- }
154
- __device__ static inline unsigned int __brev(unsigned int input) {
155
- return __builtin_bitreverse32(input);
156
- }
157
-
158
- __device__ static inline unsigned long long int __brevll(unsigned long long int input) {
159
- return __builtin_bitreverse64(input);
160
- }
161
-
162
- __device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
163
- return input == 0 ? -1 : __builtin_ctzl(input);
164
- }
165
-
166
- __device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
167
- uint32_t offset = src1 & 31;
168
- uint32_t width = src2 & 31;
169
- return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
170
- }
171
-
172
- __device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
173
- uint64_t offset = src1 & 63;
174
- uint64_t width = src2 & 63;
175
- return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
176
- }
177
-
178
- __device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
179
- uint32_t offset = src2 & 31;
180
- uint32_t width = src3 & 31;
181
- uint32_t mask = (1 << width) - 1;
182
- return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
183
- }
184
-
185
- __device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
186
- uint64_t offset = src2 & 63;
187
- uint64_t width = src3 & 63;
188
- uint64_t mask = (1ULL << width) - 1;
189
- return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
190
- }
191
-
192
- __device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
193
- {
194
- uint32_t mask_shift = shift & 31;
195
- return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
196
- }
197
-
198
- __device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
199
- {
200
- uint32_t min_shift = shift >= 32 ? 32 : shift;
201
- return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
202
- }
203
-
204
- __device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
205
- {
206
- return __builtin_amdgcn_alignbit(hi, lo, shift);
207
- }
208
-
209
- __device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)
210
- {
211
- return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
212
- }
213
-
214
- __device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
215
- __device__ static unsigned int __hadd(int x, int y);
216
- __device__ static int __mul24(int x, int y);
217
- __device__ static long long int __mul64hi(long long int x, long long int y);
218
- __device__ static int __mulhi(int x, int y);
219
- __device__ static int __rhadd(int x, int y);
220
- __device__ static unsigned int __sad(int x, int y,unsigned int z);
221
- __device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
222
- __device__ static int __umul24(unsigned int x, unsigned int y);
223
- __device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
224
- __device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
225
- __device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
226
- __device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
227
-
228
- struct ucharHolder {
229
- union {
230
- unsigned char c[4];
231
- unsigned int ui;
232
- };
233
- } __attribute__((aligned(4)));
234
-
235
- struct uchar2Holder {
236
- union {
237
- unsigned int ui[2];
238
- unsigned char c[8];
239
- };
240
- } __attribute__((aligned(8)));
241
-
242
- __device__
243
- static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
244
- struct uchar2Holder cHoldVal;
245
- struct ucharHolder cHoldKey;
246
- cHoldKey.ui = s;
247
- cHoldVal.ui[0] = x;
248
- cHoldVal.ui[1] = y;
249
- unsigned int result;
250
- result = cHoldVal.c[cHoldKey.c[0] & 0x07];
251
- result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
252
- result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
253
- result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
254
- return result;
255
- }
256
-
257
- __device__ static inline unsigned int __hadd(int x, int y) {
258
- int z = x + y;
259
- int sign = z & 0x8000000;
260
- int value = z & 0x7FFFFFFF;
261
- return ((value) >> 1 || sign);
262
- }
263
-
264
- __device__ static inline int __mul24(int x, int y) {
265
- return __ockl_mul24_i32(x, y);
266
- }
267
-
268
- __device__ static inline long long __mul64hi(long long int x, long long int y) {
269
- unsigned long long x0 = (unsigned long long)x & 0xffffffffUL;
270
- long long x1 = x >> 32;
271
- unsigned long long y0 = (unsigned long long)y & 0xffffffffUL;
272
- long long y1 = y >> 32;
273
- unsigned long long z0 = x0*y0;
274
- long long t = x1*y0 + (z0 >> 32);
275
- long long z1 = t & 0xffffffffL;
276
- long long z2 = t >> 32;
277
- z1 = x0*y1 + z1;
278
- return x1*y1 + z2 + (z1 >> 32);
279
- }
280
-
281
- __device__ static inline int __mulhi(int x, int y) {
282
- return __ockl_mul_hi_i32(x, y);
283
- }
284
-
285
- __device__ static inline int __rhadd(int x, int y) {
286
- int z = x + y + 1;
287
- int sign = z & 0x8000000;
288
- int value = z & 0x7FFFFFFF;
289
- return ((value) >> 1 || sign);
290
- }
291
- __device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
292
- return x > y ? x - y + z : y - x + z;
293
- }
294
- __device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
295
- return (x + y) >> 1;
296
- }
297
- __device__ static inline int __umul24(unsigned int x, unsigned int y) {
298
- return __ockl_mul24_u32(x, y);
299
- }
300
-
301
- __device__
302
- static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
303
- unsigned long long x0 = x & 0xffffffffUL;
304
- unsigned long long x1 = x >> 32;
305
- unsigned long long y0 = y & 0xffffffffUL;
306
- unsigned long long y1 = y >> 32;
307
- unsigned long long z0 = x0*y0;
308
- unsigned long long t = x1*y0 + (z0 >> 32);
309
- unsigned long long z1 = t & 0xffffffffUL;
310
- unsigned long long z2 = t >> 32;
311
- z1 = x0*y1 + z1;
312
- return x1*y1 + z2 + (z1 >> 32);
313
- }
314
-
315
- __device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
316
- return __ockl_mul_hi_u32(x, y);
317
- }
318
- __device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
319
- return (x + y + 1) >> 1;
320
- }
321
- __device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
322
- return __ockl_sadd_u32(x, y, z);
323
- }
324
-
325
- __device__
326
- static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_lo(x,y);};
327
-
328
- __device__
329
- static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_hi(x,y);};
330
-
331
- /*
332
- HIP specific device functions
333
- */
334
-
335
- #if !defined(__HIPCC_RTC__)
336
- #include "amd_warp_functions.h"
337
- #include "amd_warp_sync_functions.h"
338
- #endif
339
-
340
- #define MASK1 0x00ff00ff
341
- #define MASK2 0xff00ff00
342
-
343
- __device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
344
- char4 out;
345
- unsigned one1 = in1.w & MASK1;
346
- unsigned one2 = in2.w & MASK1;
347
- out.w = (one1 + one2) & MASK1;
348
- one1 = in1.w & MASK2;
349
- one2 = in2.w & MASK2;
350
- out.w = out.w | ((one1 + one2) & MASK2);
351
- return out;
352
- }
353
-
354
- __device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
355
- char4 out;
356
- unsigned one1 = in1.w & MASK1;
357
- unsigned one2 = in2.w & MASK1;
358
- out.w = (one1 - one2) & MASK1;
359
- one1 = in1.w & MASK2;
360
- one2 = in2.w & MASK2;
361
- out.w = out.w | ((one1 - one2) & MASK2);
362
- return out;
363
- }
364
-
365
- __device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
366
- char4 out;
367
- unsigned one1 = in1.w & MASK1;
368
- unsigned one2 = in2.w & MASK1;
369
- out.w = (one1 * one2) & MASK1;
370
- one1 = in1.w & MASK2;
371
- one2 = in2.w & MASK2;
372
- out.w = out.w | ((one1 * one2) & MASK2);
373
- return out;
374
- }
375
-
376
- __device__ static inline float __double2float_rd(double x) {
377
- return __ocml_cvtrtn_f32_f64(x);
378
- }
379
- __device__ static inline float __double2float_rn(double x) { return x; }
380
- __device__ static inline float __double2float_ru(double x) {
381
- return __ocml_cvtrtp_f32_f64(x);
382
- }
383
- __device__ static inline float __double2float_rz(double x) {
384
- return __ocml_cvtrtz_f32_f64(x);
385
- }
386
-
387
- __device__ static inline int __double2hiint(double x) {
388
- static_assert(sizeof(double) == 2 * sizeof(int), "");
389
-
390
- int tmp[2];
391
- __builtin_memcpy(tmp, &x, sizeof(tmp));
392
-
393
- return tmp[1];
394
- }
395
- __device__ static inline int __double2loint(double x) {
396
- static_assert(sizeof(double) == 2 * sizeof(int), "");
397
-
398
- int tmp[2];
399
- __builtin_memcpy(tmp, &x, sizeof(tmp));
400
-
401
- return tmp[0];
402
- }
403
-
404
- __device__ static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); }
405
- __device__ static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); }
406
- __device__ static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); }
407
- __device__ static inline int __double2int_rz(double x) { return (int)x; }
408
-
409
- __device__ static inline long long int __double2ll_rd(double x) {
410
- return (long long)__ocml_floor_f64(x);
411
- }
412
- __device__ static inline long long int __double2ll_rn(double x) {
413
- return (long long)__ocml_rint_f64(x);
414
- }
415
- __device__ static inline long long int __double2ll_ru(double x) {
416
- return (long long)__ocml_ceil_f64(x);
417
- }
418
- __device__ static inline long long int __double2ll_rz(double x) { return (long long)x; }
419
-
420
- __device__ static inline unsigned int __double2uint_rd(double x) {
421
- return (unsigned int)__ocml_floor_f64(x);
422
- }
423
- __device__ static inline unsigned int __double2uint_rn(double x) {
424
- return (unsigned int)__ocml_rint_f64(x);
425
- }
426
- __device__ static inline unsigned int __double2uint_ru(double x) {
427
- return (unsigned int)__ocml_ceil_f64(x);
428
- }
429
- __device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
430
-
431
- __device__ static inline unsigned long long int __double2ull_rd(double x) {
432
- return (unsigned long long int)__ocml_floor_f64(x);
433
- }
434
- __device__ static inline unsigned long long int __double2ull_rn(double x) {
435
- return (unsigned long long int)__ocml_rint_f64(x);
436
- }
437
- __device__ static inline unsigned long long int __double2ull_ru(double x) {
438
- return (unsigned long long int)__ocml_ceil_f64(x);
439
- }
440
- __device__ static inline unsigned long long int __double2ull_rz(double x) {
441
- return (unsigned long long int)x;
442
- }
443
- __device__ static inline long long int __double_as_longlong(double x) {
444
- static_assert(sizeof(long long) == sizeof(double), "");
445
-
446
- long long tmp;
447
- __builtin_memcpy(&tmp, &x, sizeof(tmp));
448
-
449
- return tmp;
450
- }
451
-
452
- /*
453
- __device__ unsigned short __float2half_rn(float x);
454
- __device__ float __half2float(unsigned short);
455
-
456
- The above device function are not a valid .
457
- Use
458
- __device__ __half __float2half_rn(float x);
459
- __device__ float __half2float(__half);
460
- from hip_fp16.h
461
-
462
- CUDA implements half as unsigned short whereas, HIP doesn't.
463
-
464
- */
465
-
466
- __device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
467
- __device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
468
- __device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
469
- __device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
470
-
471
- __device__ static inline long long int __float2ll_rd(float x) {
472
- return (long long int)__ocml_floor_f32(x);
473
- }
474
- __device__ static inline long long int __float2ll_rn(float x) {
475
- return (long long int)__ocml_rint_f32(x);
476
- }
477
- __device__ static inline long long int __float2ll_ru(float x) {
478
- return (long long int)__ocml_ceil_f32(x);
479
- }
480
- __device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
481
-
482
- __device__ static inline unsigned int __float2uint_rd(float x) {
483
- return (unsigned int)__ocml_floor_f32(x);
484
- }
485
- __device__ static inline unsigned int __float2uint_rn(float x) {
486
- return (unsigned int)__ocml_rint_f32(x);
487
- }
488
- __device__ static inline unsigned int __float2uint_ru(float x) {
489
- return (unsigned int)__ocml_ceil_f32(x);
490
- }
491
- __device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
492
-
493
- __device__ static inline unsigned long long int __float2ull_rd(float x) {
494
- return (unsigned long long int)__ocml_floor_f32(x);
495
- }
496
- __device__ static inline unsigned long long int __float2ull_rn(float x) {
497
- return (unsigned long long int)__ocml_rint_f32(x);
498
- }
499
- __device__ static inline unsigned long long int __float2ull_ru(float x) {
500
- return (unsigned long long int)__ocml_ceil_f32(x);
501
- }
502
- __device__ static inline unsigned long long int __float2ull_rz(float x) {
503
- return (unsigned long long int)x;
504
- }
505
-
506
- __device__ static inline int __float_as_int(float x) {
507
- static_assert(sizeof(int) == sizeof(float), "");
508
-
509
- int tmp;
510
- __builtin_memcpy(&tmp, &x, sizeof(tmp));
511
-
512
- return tmp;
513
- }
514
-
515
- __device__ static inline unsigned int __float_as_uint(float x) {
516
- static_assert(sizeof(unsigned int) == sizeof(float), "");
517
-
518
- unsigned int tmp;
519
- __builtin_memcpy(&tmp, &x, sizeof(tmp));
520
-
521
- return tmp;
522
- }
523
-
524
- __device__ static inline double __hiloint2double(int hi, int lo) {
525
- static_assert(sizeof(double) == sizeof(uint64_t), "");
526
-
527
- uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
528
- double tmp1;
529
- __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
530
-
531
- return tmp1;
532
- }
533
-
534
- __device__ static inline double __int2double_rn(int x) { return (double)x; }
535
-
536
- __device__ static inline float __int2float_rd(int x) {
537
- return __ocml_cvtrtn_f32_s32(x);
538
- }
539
- __device__ static inline float __int2float_rn(int x) { return (float)x; }
540
- __device__ static inline float __int2float_ru(int x) {
541
- return __ocml_cvtrtp_f32_s32(x);
542
- }
543
- __device__ static inline float __int2float_rz(int x) {
544
- return __ocml_cvtrtz_f32_s32(x);
545
- }
546
-
547
- __device__ static inline float __int_as_float(int x) {
548
- static_assert(sizeof(float) == sizeof(int), "");
549
-
550
- float tmp;
551
- __builtin_memcpy(&tmp, &x, sizeof(tmp));
552
-
553
- return tmp;
554
- }
555
-
556
- __device__ static inline double __ll2double_rd(long long int x) {
557
- return __ocml_cvtrtn_f64_s64(x);
558
- }
559
- __device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
560
- __device__ static inline double __ll2double_ru(long long int x) {
561
- return __ocml_cvtrtp_f64_s64(x);
562
- }
563
- __device__ static inline double __ll2double_rz(long long int x) {
564
- return __ocml_cvtrtz_f64_s64(x);
565
- }
566
-
567
- __device__ static inline float __ll2float_rd(long long int x) {
568
- return __ocml_cvtrtn_f32_s64(x);
569
- }
570
- __device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
571
- __device__ static inline float __ll2float_ru(long long int x) {
572
- return __ocml_cvtrtp_f32_s64(x);
573
- }
574
- __device__ static inline float __ll2float_rz(long long int x) {
575
- return __ocml_cvtrtz_f32_s64(x);
576
- }
577
-
578
- __device__ static inline double __longlong_as_double(long long int x) {
579
- static_assert(sizeof(double) == sizeof(long long), "");
580
-
581
- double tmp;
582
- __builtin_memcpy(&tmp, &x, sizeof(tmp));
583
-
584
- return tmp;
585
- }
586
-
587
- __device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }
588
-
589
- __device__ static inline float __uint2float_rd(unsigned int x) {
590
- return __ocml_cvtrtn_f32_u32(x);
591
- }
592
- __device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
593
- __device__ static inline float __uint2float_ru(unsigned int x) {
594
- return __ocml_cvtrtp_f32_u32(x);
595
- }
596
- __device__ static inline float __uint2float_rz(unsigned int x) {
597
- return __ocml_cvtrtz_f32_u32(x);
598
- }
599
-
600
- __device__ static inline float __uint_as_float(unsigned int x) {
601
- static_assert(sizeof(float) == sizeof(unsigned int), "");
602
-
603
- float tmp;
604
- __builtin_memcpy(&tmp, &x, sizeof(tmp));
605
-
606
- return tmp;
607
- }
608
-
609
- __device__ static inline double __ull2double_rd(unsigned long long int x) {
610
- return __ocml_cvtrtn_f64_u64(x);
611
- }
612
- __device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
613
- __device__ static inline double __ull2double_ru(unsigned long long int x) {
614
- return __ocml_cvtrtp_f64_u64(x);
615
- }
616
- __device__ static inline double __ull2double_rz(unsigned long long int x) {
617
- return __ocml_cvtrtz_f64_u64(x);
618
- }
619
-
620
- __device__ static inline float __ull2float_rd(unsigned long long int x) {
621
- return __ocml_cvtrtn_f32_u64(x);
622
- }
623
- __device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
624
- __device__ static inline float __ull2float_ru(unsigned long long int x) {
625
- return __ocml_cvtrtp_f32_u64(x);
626
- }
627
- __device__ static inline float __ull2float_rz(unsigned long long int x) {
628
- return __ocml_cvtrtz_f32_u64(x);
629
- }
630
-
631
- #if defined(__clang__) && defined(__HIP__)
632
-
633
- // Clock functions
634
- __device__ long long int __clock64();
635
- __device__ long long int __clock();
636
- __device__ long long int clock64();
637
- __device__ long long int clock();
638
- __device__ long long int wall_clock64();
639
- // hip.amdgcn.bc - named sync
640
- __device__ void __named_sync();
641
-
642
- #ifdef __HIP_DEVICE_COMPILE__
643
-
644
- // Clock function to return GPU core cycle count.
645
- // GPU can change its core clock frequency at runtime. The maximum frequency can be queried
646
- // through hipDeviceAttributeClockRate attribute.
647
- __device__
648
- inline __attribute((always_inline))
649
- long long int __clock64() {
650
- #if __has_builtin(__builtin_amdgcn_s_memtime)
651
- // Exists on gfx8, gfx9, gfx10.1, gfx10.2, gfx10.3
652
- return (long long int) __builtin_amdgcn_s_memtime();
653
- #else
654
- // Subject to change when better solution available
655
- return (long long int) __builtin_readcyclecounter();
656
- #endif
657
- }
658
-
659
- __device__
660
- inline __attribute((always_inline))
661
- long long int __clock() { return __clock64(); }
662
-
663
- // Clock function to return wall clock count at a constant frequency that can be queried
664
- // through hipDeviceAttributeWallClockRate attribute.
665
- __device__
666
- inline __attribute__((always_inline))
667
- long long int wall_clock64() {
668
- return (long long int) __ockl_steadyctr_u64();
669
- }
670
-
671
- __device__
672
- inline __attribute__((always_inline))
673
- long long int clock64() { return __clock64(); }
674
-
675
- __device__
676
- inline __attribute__((always_inline))
677
- long long int clock() { return __clock(); }
678
-
679
- // hip.amdgcn.bc - named sync
680
- __device__
681
- inline
682
- void __named_sync() { __builtin_amdgcn_s_barrier(); }
683
-
684
- #endif // __HIP_DEVICE_COMPILE__
685
-
686
- // hip.amdgcn.bc - lanemask
687
- __device__
688
- inline
689
- uint64_t __lanemask_gt()
690
- {
691
- uint32_t lane = __ockl_lane_u32();
692
- if (lane == 63)
693
- return 0;
694
- uint64_t ballot = __ballot64(1);
695
- uint64_t mask = (~((uint64_t)0)) << (lane + 1);
696
- return mask & ballot;
697
- }
698
-
699
- __device__
700
- inline
701
- uint64_t __lanemask_lt()
702
- {
703
- uint32_t lane = __ockl_lane_u32();
704
- int64_t ballot = __ballot64(1);
705
- uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
706
- return mask & ballot;
707
- }
708
-
709
- __device__
710
- inline
711
- uint64_t __lanemask_eq()
712
- {
713
- uint32_t lane = __ockl_lane_u32();
714
- int64_t mask = ((uint64_t)1 << lane);
715
- return mask;
716
- }
717
-
718
-
719
- __device__ inline void* __local_to_generic(void* p) { return p; }
720
-
721
- #ifdef __HIP_DEVICE_COMPILE__
722
- __device__
723
- inline
724
- void* __get_dynamicgroupbaseptr()
725
- {
726
- // Get group segment base pointer.
727
- return (char*)__local_to_generic((void*)__to_local(__builtin_amdgcn_groupstaticsize()));
728
- }
729
- #else
730
- __device__
731
- void* __get_dynamicgroupbaseptr();
732
- #endif // __HIP_DEVICE_COMPILE__
733
-
734
- __device__
735
- inline
736
- void *__amdgcn_get_dynamicgroupbaseptr() {
737
- return __get_dynamicgroupbaseptr();
738
- }
739
-
740
- // Memory Fence Functions
741
- __device__
742
- inline
743
- static void __threadfence()
744
- {
745
- __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
746
- }
747
-
748
- __device__
749
- inline
750
- static void __threadfence_block()
751
- {
752
- __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
753
- }
754
-
755
- __device__
756
- inline
757
- static void __threadfence_system()
758
- {
759
- __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
760
- }
761
- __device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
762
- if (flags) {
763
- __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
764
- __builtin_amdgcn_s_barrier();
765
- __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
766
- } else {
767
- __builtin_amdgcn_s_barrier();
768
- }
769
- }
770
-
771
- __device__
772
- inline
773
- static void __barrier(int n)
774
- {
775
- __work_group_barrier((__cl_mem_fence_flags)n);
776
- }
777
-
778
- __device__
779
- inline
780
- __attribute__((convergent))
781
- void __syncthreads()
782
- {
783
- __barrier(__CLK_LOCAL_MEM_FENCE);
784
- }
785
-
786
- __device__
787
- inline
788
- __attribute__((convergent))
789
- int __syncthreads_count(int predicate)
790
- {
791
- return __ockl_wgred_add_i32(!!predicate);
792
- }
793
-
794
- __device__
795
- inline
796
- __attribute__((convergent))
797
- int __syncthreads_and(int predicate)
798
- {
799
- return __ockl_wgred_and_i32(!!predicate);
800
- }
801
-
802
- __device__
803
- inline
804
- __attribute__((convergent))
805
- int __syncthreads_or(int predicate)
806
- {
807
- return __ockl_wgred_or_i32(!!predicate);
808
- }
809
-
810
- // hip.amdgcn.bc - device routine
811
- /*
812
- HW_ID Register bit structure for RDNA2 & RDNA3
813
- WAVE_ID 4:0 Wave id within the SIMD.
814
- SIMD_ID 9:8 SIMD_ID within the WGP: [0] = row, [1] = column.
815
- WGP_ID 13:10 Physical WGP ID.
816
- SA_ID 16 Shader Array ID
817
- SE_ID 20:18 Shader Engine the wave is assigned to for gfx11
818
- SE_ID 19:18 Shader Engine the wave is assigned to for gfx10
819
- DP_RATE 31:29 Number of double-precision float units per SIMD
820
-
821
- HW_ID Register bit structure for GCN and CDNA
822
- WAVE_ID 3:0 Wave buffer slot number. 0-9.
823
- SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
824
- PIPE_ID 7:6 Pipeline from which the wave was dispatched.
825
- CU_ID 11:8 Compute Unit the wave is assigned to.
826
- SH_ID 12 Shader Array (within an SE) the wave is assigned to.
827
- SE_ID 15:13 Shader Engine the wave is assigned to for gfx908, gfx90a, gfx940-942
828
- 14:13 Shader Engine the wave is assigned to for Vega.
829
- TG_ID 19:16 Thread-group ID
830
- VM_ID 23:20 Virtual Memory ID
831
- QUEUE_ID 26:24 Queue from which this wave was dispatched.
832
- STATE_ID 29:27 State ID (graphics only, not compute).
833
- ME_ID 31:30 Micro-engine ID.
834
-
835
- XCC_ID Register bit structure for gfx940
836
- XCC_ID 3:0 XCC the wave is assigned to.
837
- */
838
-
839
- #if (defined (__GFX10__) || defined (__GFX11__))
840
- #define HW_ID 23
841
- #else
842
- #define HW_ID 4
843
- #endif
844
-
845
- #if (defined(__GFX10__) || defined(__GFX11__))
846
- #define HW_ID_WGP_ID_SIZE 4
847
- #define HW_ID_WGP_ID_OFFSET 10
848
- #if (defined(__AMDGCN_CUMODE__))
849
- #define HW_ID_CU_ID_SIZE 1
850
- #define HW_ID_CU_ID_OFFSET 8
851
- #endif
852
- #else
853
- #define HW_ID_CU_ID_SIZE 4
854
- #define HW_ID_CU_ID_OFFSET 8
855
- #endif
856
-
857
- #if (defined(__gfx908__) || defined(__gfx90a__) || \
858
- defined(__GFX11__))
859
- #define HW_ID_SE_ID_SIZE 3
860
- #else //4 SEs/XCC for gfx940-942
861
- #define HW_ID_SE_ID_SIZE 2
862
- #endif
863
- #if (defined(__GFX10__) || defined(__GFX11__))
864
- #define HW_ID_SE_ID_OFFSET 18
865
- #define HW_ID_SA_ID_OFFSET 16
866
- #define HW_ID_SA_ID_SIZE 1
867
- #else
868
- #define HW_ID_SE_ID_OFFSET 13
869
- #endif
870
-
871
- #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
872
- #define XCC_ID 20
873
- #define XCC_ID_XCC_ID_SIZE 4
874
- #define XCC_ID_XCC_ID_OFFSET 0
875
- #endif
876
-
877
- #if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
878
- (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
879
- #define __HIP_NO_IMAGE_SUPPORT 1
880
- #endif
881
-
882
- /*
883
- Encoding of parameter bitmask
884
- HW_ID 5:0 HW_ID
885
- OFFSET 10:6 Range: 0..31
886
- SIZE 15:11 Range: 1..32
887
- */
888
-
889
- #define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
890
-
891
- /*
892
- __smid returns the wave's assigned Compute Unit and Shader Engine.
893
- The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
894
- Note: the results vary over time.
895
- SZ minus 1 since SIZE is 1-based.
896
- */
897
- __device__
898
- inline
899
- unsigned __smid(void)
900
- {
901
- unsigned se_id = __builtin_amdgcn_s_getreg(
902
- GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
903
- #if (defined(__GFX10__) || defined(__GFX11__))
904
- unsigned wgp_id = __builtin_amdgcn_s_getreg(
905
- GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
906
- unsigned sa_id = __builtin_amdgcn_s_getreg(
907
- GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
908
- #if (defined(__AMDGCN_CUMODE__))
909
- unsigned cu_id = __builtin_amdgcn_s_getreg(
910
- GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
911
- #endif
912
- #else
913
- #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
914
- unsigned xcc_id = __builtin_amdgcn_s_getreg(
915
- GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
916
- #endif
917
- unsigned cu_id = __builtin_amdgcn_s_getreg(
918
- GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
919
- #endif
920
- #if (defined(__GFX10__) || defined(__GFX11__))
921
- unsigned temp = se_id;
922
- temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
923
- temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
924
- #if (defined(__AMDGCN_CUMODE__))
925
- temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
926
- #endif
927
- return temp;
928
- //TODO : CU Mode impl
929
- #elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
930
- unsigned temp = xcc_id;
931
- temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
932
- temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
933
- return temp;
934
- #else
935
- return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
936
- #endif
937
- }
938
-
939
- /**
940
- * Map HIP_DYNAMIC_SHARED to "extern __shared__" for compatibility with old HIP applications
941
- * To be removed in a future release.
942
- */
943
- #define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
944
- #define HIP_DYNAMIC_SHARED_ATTRIBUTE
945
-
946
- #endif //defined(__clang__) && defined(__HIP__)
947
-
948
-
949
- // loop unrolling
950
- static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
951
- auto dstPtr = static_cast<unsigned char*>(dst);
952
- auto srcPtr = static_cast<const unsigned char*>(src);
953
-
954
- while (size >= 4u) {
955
- dstPtr[0] = srcPtr[0];
956
- dstPtr[1] = srcPtr[1];
957
- dstPtr[2] = srcPtr[2];
958
- dstPtr[3] = srcPtr[3];
959
-
960
- size -= 4u;
961
- srcPtr += 4u;
962
- dstPtr += 4u;
963
- }
964
- switch (size) {
965
- case 3:
966
- dstPtr[2] = srcPtr[2];
967
- case 2:
968
- dstPtr[1] = srcPtr[1];
969
- case 1:
970
- dstPtr[0] = srcPtr[0];
971
- }
972
-
973
- return dst;
974
- }
975
-
976
- static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
977
- auto dstPtr = static_cast<unsigned char*>(dst);
978
-
979
- while (size >= 4u) {
980
- dstPtr[0] = val;
981
- dstPtr[1] = val;
982
- dstPtr[2] = val;
983
- dstPtr[3] = val;
984
-
985
- size -= 4u;
986
- dstPtr += 4u;
987
- }
988
- switch (size) {
989
- case 3:
990
- dstPtr[2] = val;
991
- case 2:
992
- dstPtr[1] = val;
993
- case 1:
994
- dstPtr[0] = val;
995
- }
996
-
997
- return dst;
998
- }
999
- #ifndef __OPENMP_AMDGCN__
1000
- static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
1001
- return __hip_hc_memcpy(dst, src, size);
1002
- }
1003
-
1004
- static inline __device__ void* memset(void* ptr, int val, size_t size) {
1005
- unsigned char val8 = static_cast<unsigned char>(val);
1006
- return __hip_hc_memset(ptr, val8, size);
1007
- }
1008
- #endif // !__OPENMP_AMDGCN__
1009
-
1010
- #endif