triton-windows 3.2.0.post18__cp313-cp313-win_amd64.whl → 3.2.0.post21__cp313-cp313-win_amd64.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.

Potentially problematic release: this version of triton-windows might be problematic.

Files changed (111)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/backends/amd/driver.py +6 -1
  3. triton/backends/nvidia/compiler.py +1 -3
  4. triton/backends/nvidia/driver.c +1 -0
  5. triton/backends/nvidia/driver.py +8 -3
  6. triton/runtime/autotuner.py +2 -2
  7. triton/runtime/build.py +14 -6
  8. triton/runtime/tcc/lib/python310.def +1610 -0
  9. triton/runtime/tcc/lib/python311.def +1633 -0
  10. triton/runtime/tcc/lib/python312.def +1703 -0
  11. triton/runtime/tcc/lib/python313.def +1651 -0
  12. triton/runtime/tcc/lib/python313t.def +1656 -0
  13. triton/runtime/tcc/lib/python39.def +1644 -0
  14. triton/runtime/tcc/lib/python3t.def +905 -0
  15. triton/windows_utils.py +11 -4
  16. {triton_windows-3.2.0.post18.dist-info → triton_windows-3.2.0.post21.dist-info}/METADATA +1 -1
  17. {triton_windows-3.2.0.post18.dist-info → triton_windows-3.2.0.post21.dist-info}/RECORD +19 -104
  18. {triton_windows-3.2.0.post18.dist-info → triton_windows-3.2.0.post21.dist-info}/WHEEL +1 -1
  19. triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h +0 -358
  20. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +0 -1031
  21. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +0 -1612
  22. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +0 -1337
  23. triton/backends/amd/include/hip/amd_detail/amd_hip_bfloat16.h +0 -293
  24. triton/backends/amd/include/hip/amd_detail/amd_hip_common.h +0 -32
  25. triton/backends/amd/include/hip/amd_detail/amd_hip_complex.h +0 -174
  26. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +0 -829
  27. triton/backends/amd/include/hip/amd_detail/amd_hip_fp16.h +0 -1809
  28. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +0 -108
  29. triton/backends/amd/include/hip/amd_detail/amd_hip_math_constants.h +0 -124
  30. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h +0 -405
  31. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +0 -196
  32. triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +0 -565
  33. triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h +0 -2226
  34. triton/backends/amd/include/hip/amd_detail/amd_math_functions.h +0 -104
  35. triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h +0 -244
  36. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +0 -494
  37. triton/backends/amd/include/hip/amd_detail/concepts.hpp +0 -30
  38. triton/backends/amd/include/hip/amd_detail/device_library_decls.h +0 -133
  39. triton/backends/amd/include/hip/amd_detail/functional_grid_launch.hpp +0 -218
  40. triton/backends/amd/include/hip/amd_detail/grid_launch.h +0 -67
  41. triton/backends/amd/include/hip/amd_detail/grid_launch.hpp +0 -50
  42. triton/backends/amd/include/hip/amd_detail/grid_launch_GGL.hpp +0 -26
  43. triton/backends/amd/include/hip/amd_detail/helpers.hpp +0 -137
  44. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +0 -1350
  45. triton/backends/amd/include/hip/amd_detail/hip_assert.h +0 -101
  46. triton/backends/amd/include/hip/amd_detail/hip_cooperative_groups_helper.h +0 -242
  47. triton/backends/amd/include/hip/amd_detail/hip_fp16_gcc.h +0 -254
  48. triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h +0 -96
  49. triton/backends/amd/include/hip/amd_detail/hip_ldg.h +0 -100
  50. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +0 -10169
  51. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +0 -77
  52. triton/backends/amd/include/hip/amd_detail/host_defines.h +0 -180
  53. triton/backends/amd/include/hip/amd_detail/hsa_helpers.hpp +0 -102
  54. triton/backends/amd/include/hip/amd_detail/macro_based_grid_launch.hpp +0 -798
  55. triton/backends/amd/include/hip/amd_detail/math_fwd.h +0 -698
  56. triton/backends/amd/include/hip/amd_detail/ockl_image.h +0 -177
  57. triton/backends/amd/include/hip/amd_detail/program_state.hpp +0 -107
  58. triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h +0 -491
  59. triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h +0 -478
  60. triton/backends/amd/include/hip/channel_descriptor.h +0 -39
  61. triton/backends/amd/include/hip/device_functions.h +0 -38
  62. triton/backends/amd/include/hip/driver_types.h +0 -468
  63. triton/backends/amd/include/hip/hip_bf16.h +0 -36
  64. triton/backends/amd/include/hip/hip_bfloat16.h +0 -44
  65. triton/backends/amd/include/hip/hip_common.h +0 -100
  66. triton/backends/amd/include/hip/hip_complex.h +0 -38
  67. triton/backends/amd/include/hip/hip_cooperative_groups.h +0 -46
  68. triton/backends/amd/include/hip/hip_deprecated.h +0 -95
  69. triton/backends/amd/include/hip/hip_ext.h +0 -159
  70. triton/backends/amd/include/hip/hip_fp16.h +0 -36
  71. triton/backends/amd/include/hip/hip_gl_interop.h +0 -32
  72. triton/backends/amd/include/hip/hip_hcc.h +0 -24
  73. triton/backends/amd/include/hip/hip_math_constants.h +0 -36
  74. triton/backends/amd/include/hip/hip_profile.h +0 -27
  75. triton/backends/amd/include/hip/hip_runtime.h +0 -75
  76. triton/backends/amd/include/hip/hip_runtime_api.h +0 -8919
  77. triton/backends/amd/include/hip/hip_texture_types.h +0 -29
  78. triton/backends/amd/include/hip/hip_vector_types.h +0 -41
  79. triton/backends/amd/include/hip/hip_version.h +0 -17
  80. triton/backends/amd/include/hip/hiprtc.h +0 -421
  81. triton/backends/amd/include/hip/library_types.h +0 -78
  82. triton/backends/amd/include/hip/math_functions.h +0 -42
  83. triton/backends/amd/include/hip/surface_types.h +0 -63
  84. triton/backends/amd/include/hip/texture_types.h +0 -194
  85. triton/backends/amd/include/hsa/Brig.h +0 -1131
  86. triton/backends/amd/include/hsa/amd_hsa_common.h +0 -91
  87. triton/backends/amd/include/hsa/amd_hsa_elf.h +0 -436
  88. triton/backends/amd/include/hsa/amd_hsa_kernel_code.h +0 -269
  89. triton/backends/amd/include/hsa/amd_hsa_queue.h +0 -109
  90. triton/backends/amd/include/hsa/amd_hsa_signal.h +0 -80
  91. triton/backends/amd/include/hsa/hsa.h +0 -5729
  92. triton/backends/amd/include/hsa/hsa_amd_tool.h +0 -91
  93. triton/backends/amd/include/hsa/hsa_api_trace.h +0 -566
  94. triton/backends/amd/include/hsa/hsa_ext_amd.h +0 -3090
  95. triton/backends/amd/include/hsa/hsa_ext_finalize.h +0 -531
  96. triton/backends/amd/include/hsa/hsa_ext_image.h +0 -1454
  97. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +0 -488
  98. triton/backends/amd/include/hsa/hsa_ven_amd_loader.h +0 -667
  99. triton/backends/amd/include/roctracer/ext/prof_protocol.h +0 -107
  100. triton/backends/amd/include/roctracer/hip_ostream_ops.h +0 -4435
  101. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +0 -1467
  102. triton/backends/amd/include/roctracer/hsa_prof_str.h +0 -3027
  103. triton/backends/amd/include/roctracer/roctracer.h +0 -779
  104. triton/backends/amd/include/roctracer/roctracer_ext.h +0 -81
  105. triton/backends/amd/include/roctracer/roctracer_hcc.h +0 -24
  106. triton/backends/amd/include/roctracer/roctracer_hip.h +0 -37
  107. triton/backends/amd/include/roctracer/roctracer_hsa.h +0 -112
  108. triton/backends/amd/include/roctracer/roctracer_plugin.h +0 -137
  109. triton/backends/amd/include/roctracer/roctracer_roctx.h +0 -67
  110. triton/backends/amd/include/roctracer/roctx.h +0 -229
  111. {triton_windows-3.2.0.post18.dist-info → triton_windows-3.2.0.post21.dist-info}/top_level.txt +0 -0
@@ -1,1031 +0,0 @@
- /*
- Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- */
-
- #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
- #define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
-
- #if !defined(__HIPCC_RTC__)
- #include <hip/amd_detail/amd_hip_common.h>
- #include <hip/amd_detail/device_library_decls.h>
- #include <hip/amd_detail/hip_assert.h>
- #include "host_defines.h"
- #include "math_fwd.h"
- #include <hip/hip_runtime_api.h>
- #include <stddef.h>
- #include <hip/hip_vector_types.h>
- #endif // !defined(__HIPCC_RTC__)
-
- #if defined(__clang__) && defined(__HIP__)
- extern "C" __device__ int printf(const char *fmt, ...);
- #else
- template <typename... All>
- static inline __device__ void printf(const char* format, All... all) {}
- #endif
-
- extern "C" __device__ unsigned long long __ockl_steadyctr_u64();
-
- /*
- Integer Intrinsics
- */
-
- // integer intrinsic function __poc __clz __ffs __brev
- __device__ static inline unsigned int __popc(unsigned int input) {
- return __builtin_popcount(input);
- }
- __device__ static inline unsigned int __popcll(unsigned long long int input) {
- return __builtin_popcountll(input);
- }
-
- __device__ static inline int __clz(int input) {
- return __ockl_clz_u32((uint)input);
- }
-
- __device__ static inline int __clzll(long long int input) {
- return __ockl_clz_u64((uint64_t)input);
- }
-
- __device__ static inline unsigned int __ffs(unsigned int input) {
- return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
- }
-
- __device__ static inline unsigned int __ffsll(unsigned long long int input) {
- return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
- }
-
- __device__ static inline unsigned int __ffs(int input) {
- return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
- }
-
- __device__ static inline unsigned int __ffsll(long long int input) {
- return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
- }
-
- // Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE),
- // find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit position.
- // If not found, return -1.
- __device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
- uint64_t temp_mask = mask;
- int32_t temp_offset = offset;
-
- if (offset == 0) {
- temp_mask &= (1 << base);
- temp_offset = 1;
- }
- else if (offset < 0) {
- temp_mask = __builtin_bitreverse64(mask);
- base = 63 - base;
- temp_offset = -offset;
- }
-
- temp_mask = temp_mask & ((~0ULL) << base);
- if (__builtin_popcountll(temp_mask) < temp_offset)
- return -1;
- int32_t total = 0;
- for (int i = 0x20; i > 0; i >>= 1) {
- uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
- int32_t pcnt = __builtin_popcountll(temp_mask_lo);
- if (pcnt < temp_offset) {
- temp_mask = temp_mask >> i;
- temp_offset -= pcnt;
- total += i;
- }
- else {
- temp_mask = temp_mask_lo;
- }
- }
- if (offset < 0)
- return 63 - total;
- else
- return total;
- }
-
- __device__ static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) {
- uint64_t temp_mask = mask;
- int32_t temp_offset = offset;
- if (offset == 0) {
- temp_mask &= (1 << base);
- temp_offset = 1;
- }
- else if (offset < 0) {
- temp_mask = __builtin_bitreverse64(mask);
- base = 63 - base;
- temp_offset = -offset;
- }
- temp_mask = temp_mask & ((~0ULL) << base);
- if (__builtin_popcountll(temp_mask) < temp_offset)
- return -1;
- int32_t total = 0;
- for (int i = 0x20; i > 0; i >>= 1) {
- uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
- int32_t pcnt = __builtin_popcountll(temp_mask_lo);
- if (pcnt < temp_offset) {
- temp_mask = temp_mask >> i;
- temp_offset -= pcnt;
- total += i;
- }
- else {
- temp_mask = temp_mask_lo;
- }
- }
- if (offset < 0)
- return 63 - total;
- else
- return total;
- }
- __device__ static inline unsigned int __brev(unsigned int input) {
- return __builtin_bitreverse32(input);
- }
-
- __device__ static inline unsigned long long int __brevll(unsigned long long int input) {
- return __builtin_bitreverse64(input);
- }
-
- __device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
- return input == 0 ? -1 : __builtin_ctzl(input);
- }
-
- __device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
- uint32_t offset = src1 & 31;
- uint32_t width = src2 & 31;
- return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
- }
-
- __device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
- uint64_t offset = src1 & 63;
- uint64_t width = src2 & 63;
- return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
- }
-
- __device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
- uint32_t offset = src2 & 31;
- uint32_t width = src3 & 31;
- uint32_t mask = (1 << width) - 1;
- return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
- }
-
- __device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
- uint64_t offset = src2 & 63;
- uint64_t width = src3 & 63;
- uint64_t mask = (1ULL << width) - 1;
- return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
- }
-
- __device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
- {
- uint32_t mask_shift = shift & 31;
- return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
- }
-
- __device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
- {
- uint32_t min_shift = shift >= 32 ? 32 : shift;
- return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
- }
-
- __device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
- {
- return __builtin_amdgcn_alignbit(hi, lo, shift);
- }
-
- __device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)
- {
- return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
- }
-
- __device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
- __device__ static unsigned int __hadd(int x, int y);
- __device__ static int __mul24(int x, int y);
- __device__ static long long int __mul64hi(long long int x, long long int y);
- __device__ static int __mulhi(int x, int y);
- __device__ static int __rhadd(int x, int y);
- __device__ static unsigned int __sad(int x, int y,unsigned int z);
- __device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
- __device__ static int __umul24(unsigned int x, unsigned int y);
- __device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
- __device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
- __device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
- __device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
-
- struct ucharHolder {
- union {
- unsigned char c[4];
- unsigned int ui;
- };
- } __attribute__((aligned(4)));
-
- struct uchar2Holder {
- union {
- unsigned int ui[2];
- unsigned char c[8];
- };
- } __attribute__((aligned(8)));
-
- __device__
- static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
- struct uchar2Holder cHoldVal;
- struct ucharHolder cHoldKey;
- cHoldKey.ui = s;
- cHoldVal.ui[0] = x;
- cHoldVal.ui[1] = y;
- unsigned int result;
- result = cHoldVal.c[cHoldKey.c[0] & 0x07];
- result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
- result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
- result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
- return result;
- }
-
- __device__ static inline unsigned int __hadd(int x, int y) {
- int z = x + y;
- int sign = z & 0x8000000;
- int value = z & 0x7FFFFFFF;
- return ((value) >> 1 || sign);
- }
-
- __device__ static inline int __mul24(int x, int y) {
- return __ockl_mul24_i32(x, y);
- }
-
- __device__ static inline long long __mul64hi(long long int x, long long int y) {
- ulong x0 = (ulong)x & 0xffffffffUL;
- long x1 = x >> 32;
- ulong y0 = (ulong)y & 0xffffffffUL;
- long y1 = y >> 32;
- ulong z0 = x0*y0;
- long t = x1*y0 + (z0 >> 32);
- long z1 = t & 0xffffffffL;
- long z2 = t >> 32;
- z1 = x0*y1 + z1;
- return x1*y1 + z2 + (z1 >> 32);
- }
-
- __device__ static inline int __mulhi(int x, int y) {
- return __ockl_mul_hi_i32(x, y);
- }
-
- __device__ static inline int __rhadd(int x, int y) {
- int z = x + y + 1;
- int sign = z & 0x8000000;
- int value = z & 0x7FFFFFFF;
- return ((value) >> 1 || sign);
- }
- __device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
- return x > y ? x - y + z : y - x + z;
- }
- __device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
- return (x + y) >> 1;
- }
- __device__ static inline int __umul24(unsigned int x, unsigned int y) {
- return __ockl_mul24_u32(x, y);
- }
-
- __device__
- static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
- ulong x0 = x & 0xffffffffUL;
- ulong x1 = x >> 32;
- ulong y0 = y & 0xffffffffUL;
- ulong y1 = y >> 32;
- ulong z0 = x0*y0;
- ulong t = x1*y0 + (z0 >> 32);
- ulong z1 = t & 0xffffffffUL;
- ulong z2 = t >> 32;
- z1 = x0*y1 + z1;
- return x1*y1 + z2 + (z1 >> 32);
- }
-
- __device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
- return __ockl_mul_hi_u32(x, y);
- }
- __device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
- return (x + y + 1) >> 1;
- }
- __device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
- return __ockl_sadd_u32(x, y, z);
- }
-
- __device__ static inline unsigned int __lane_id() {
- return __builtin_amdgcn_mbcnt_hi(
- -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
- }
-
- __device__
- static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_lo(x,y);};
-
- __device__
- static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_hi(x,y);};
-
- /*
- HIP specific device functions
- */
-
- #if !defined(__HIPCC_RTC__)
- #include "amd_warp_functions.h"
- #endif
-
- #define MASK1 0x00ff00ff
- #define MASK2 0xff00ff00
-
- __device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
- char4 out;
- unsigned one1 = in1.w & MASK1;
- unsigned one2 = in2.w & MASK1;
- out.w = (one1 + one2) & MASK1;
- one1 = in1.w & MASK2;
- one2 = in2.w & MASK2;
- out.w = out.w | ((one1 + one2) & MASK2);
- return out;
- }
-
- __device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
- char4 out;
- unsigned one1 = in1.w & MASK1;
- unsigned one2 = in2.w & MASK1;
- out.w = (one1 - one2) & MASK1;
- one1 = in1.w & MASK2;
- one2 = in2.w & MASK2;
- out.w = out.w | ((one1 - one2) & MASK2);
- return out;
- }
-
- __device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
- char4 out;
- unsigned one1 = in1.w & MASK1;
- unsigned one2 = in2.w & MASK1;
- out.w = (one1 * one2) & MASK1;
- one1 = in1.w & MASK2;
- one2 = in2.w & MASK2;
- out.w = out.w | ((one1 * one2) & MASK2);
- return out;
- }
-
- __device__ static inline float __double2float_rd(double x) {
- return __ocml_cvtrtn_f32_f64(x);
- }
- __device__ static inline float __double2float_rn(double x) { return x; }
- __device__ static inline float __double2float_ru(double x) {
- return __ocml_cvtrtp_f32_f64(x);
- }
- __device__ static inline float __double2float_rz(double x) {
- return __ocml_cvtrtz_f32_f64(x);
- }
-
- __device__ static inline int __double2hiint(double x) {
- static_assert(sizeof(double) == 2 * sizeof(int), "");
-
- int tmp[2];
- __builtin_memcpy(tmp, &x, sizeof(tmp));
-
- return tmp[1];
- }
- __device__ static inline int __double2loint(double x) {
- static_assert(sizeof(double) == 2 * sizeof(int), "");
-
- int tmp[2];
- __builtin_memcpy(tmp, &x, sizeof(tmp));
-
- return tmp[0];
- }
-
- __device__ static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); }
- __device__ static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); }
- __device__ static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); }
- __device__ static inline int __double2int_rz(double x) { return (int)x; }
-
- __device__ static inline long long int __double2ll_rd(double x) {
- return (long long)__ocml_floor_f64(x);
- }
- __device__ static inline long long int __double2ll_rn(double x) {
- return (long long)__ocml_rint_f64(x);
- }
- __device__ static inline long long int __double2ll_ru(double x) {
- return (long long)__ocml_ceil_f64(x);
- }
- __device__ static inline long long int __double2ll_rz(double x) { return (long long)x; }
-
- __device__ static inline unsigned int __double2uint_rd(double x) {
- return (unsigned int)__ocml_floor_f64(x);
- }
- __device__ static inline unsigned int __double2uint_rn(double x) {
- return (unsigned int)__ocml_rint_f64(x);
- }
- __device__ static inline unsigned int __double2uint_ru(double x) {
- return (unsigned int)__ocml_ceil_f64(x);
- }
- __device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
-
- __device__ static inline unsigned long long int __double2ull_rd(double x) {
- return (unsigned long long int)__ocml_floor_f64(x);
- }
- __device__ static inline unsigned long long int __double2ull_rn(double x) {
- return (unsigned long long int)__ocml_rint_f64(x);
- }
- __device__ static inline unsigned long long int __double2ull_ru(double x) {
- return (unsigned long long int)__ocml_ceil_f64(x);
- }
- __device__ static inline unsigned long long int __double2ull_rz(double x) {
- return (unsigned long long int)x;
- }
- __device__ static inline long long int __double_as_longlong(double x) {
- static_assert(sizeof(long long) == sizeof(double), "");
-
- long long tmp;
- __builtin_memcpy(&tmp, &x, sizeof(tmp));
-
- return tmp;
- }
-
- /*
- __device__ unsigned short __float2half_rn(float x);
- __device__ float __half2float(unsigned short);
-
- The above device function are not a valid .
- Use
- __device__ __half __float2half_rn(float x);
- __device__ float __half2float(__half);
- from hip_fp16.h
-
- CUDA implements half as unsigned short whereas, HIP doesn't.
-
- */
-
- __device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
- __device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
- __device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
- __device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
-
- __device__ static inline long long int __float2ll_rd(float x) {
- return (long long int)__ocml_floor_f32(x);
- }
- __device__ static inline long long int __float2ll_rn(float x) {
- return (long long int)__ocml_rint_f32(x);
- }
- __device__ static inline long long int __float2ll_ru(float x) {
- return (long long int)__ocml_ceil_f32(x);
- }
- __device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
-
- __device__ static inline unsigned int __float2uint_rd(float x) {
- return (unsigned int)__ocml_floor_f32(x);
- }
- __device__ static inline unsigned int __float2uint_rn(float x) {
- return (unsigned int)__ocml_rint_f32(x);
- }
- __device__ static inline unsigned int __float2uint_ru(float x) {
- return (unsigned int)__ocml_ceil_f32(x);
- }
- __device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
-
- __device__ static inline unsigned long long int __float2ull_rd(float x) {
- return (unsigned long long int)__ocml_floor_f32(x);
- }
- __device__ static inline unsigned long long int __float2ull_rn(float x) {
- return (unsigned long long int)__ocml_rint_f32(x);
- }
- __device__ static inline unsigned long long int __float2ull_ru(float x) {
- return (unsigned long long int)__ocml_ceil_f32(x);
- }
- __device__ static inline unsigned long long int __float2ull_rz(float x) {
- return (unsigned long long int)x;
- }
-
- __device__ static inline int __float_as_int(float x) {
- static_assert(sizeof(int) == sizeof(float), "");
-
- int tmp;
- __builtin_memcpy(&tmp, &x, sizeof(tmp));
-
- return tmp;
- }
-
- __device__ static inline unsigned int __float_as_uint(float x) {
- static_assert(sizeof(unsigned int) == sizeof(float), "");
-
- unsigned int tmp;
- __builtin_memcpy(&tmp, &x, sizeof(tmp));
-
- return tmp;
- }
-
- __device__ static inline double __hiloint2double(int hi, int lo) {
- static_assert(sizeof(double) == sizeof(uint64_t), "");
-
- uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
- double tmp1;
- __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
-
- return tmp1;
- }
-
- __device__ static inline double __int2double_rn(int x) { return (double)x; }
-
- __device__ static inline float __int2float_rd(int x) {
- return __ocml_cvtrtn_f32_s32(x);
- }
- __device__ static inline float __int2float_rn(int x) { return (float)x; }
- __device__ static inline float __int2float_ru(int x) {
- return __ocml_cvtrtp_f32_s32(x);
- }
- __device__ static inline float __int2float_rz(int x) {
- return __ocml_cvtrtz_f32_s32(x);
- }
-
- __device__ static inline float __int_as_float(int x) {
- static_assert(sizeof(float) == sizeof(int), "");
-
- float tmp;
- __builtin_memcpy(&tmp, &x, sizeof(tmp));
-
- return tmp;
- }
-
- __device__ static inline double __ll2double_rd(long long int x) {
- return __ocml_cvtrtn_f64_s64(x);
- }
- __device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
- __device__ static inline double __ll2double_ru(long long int x) {
- return __ocml_cvtrtp_f64_s64(x);
- }
- __device__ static inline double __ll2double_rz(long long int x) {
- return __ocml_cvtrtz_f64_s64(x);
- }
-
- __device__ static inline float __ll2float_rd(long long int x) {
- return __ocml_cvtrtn_f32_s64(x);
- }
- __device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
- __device__ static inline float __ll2float_ru(long long int x) {
- return __ocml_cvtrtp_f32_s64(x);
- }
- __device__ static inline float __ll2float_rz(long long int x) {
- return __ocml_cvtrtz_f32_s64(x);
- }
-
- __device__ static inline double __longlong_as_double(long long int x) {
- static_assert(sizeof(double) == sizeof(long long), "");
-
- double tmp;
- __builtin_memcpy(&tmp, &x, sizeof(tmp));
-
- return tmp;
- }
-
- __device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }
-
- __device__ static inline float __uint2float_rd(unsigned int x) {
- return __ocml_cvtrtn_f32_u32(x);
- }
- __device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
- __device__ static inline float __uint2float_ru(unsigned int x) {
- return __ocml_cvtrtp_f32_u32(x);
- }
- __device__ static inline float __uint2float_rz(unsigned int x) {
- return __ocml_cvtrtz_f32_u32(x);
- }
-
- __device__ static inline float __uint_as_float(unsigned int x) {
- static_assert(sizeof(float) == sizeof(unsigned int), "");
-
- float tmp;
- __builtin_memcpy(&tmp, &x, sizeof(tmp));
-
- return tmp;
- }
-
- __device__ static inline double __ull2double_rd(unsigned long long int x) {
- return __ocml_cvtrtn_f64_u64(x);
- }
- __device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
- __device__ static inline double __ull2double_ru(unsigned long long int x) {
- return __ocml_cvtrtp_f64_u64(x);
- }
- __device__ static inline double __ull2double_rz(unsigned long long int x) {
- return __ocml_cvtrtz_f64_u64(x);
- }
-
- __device__ static inline float __ull2float_rd(unsigned long long int x) {
- return __ocml_cvtrtn_f32_u64(x);
- }
- __device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
- __device__ static inline float __ull2float_ru(unsigned long long int x) {
- return __ocml_cvtrtp_f32_u64(x);
- }
- __device__ static inline float __ull2float_rz(unsigned long long int x) {
- return __ocml_cvtrtz_f32_u64(x);
- }
-
- #if defined(__clang__) && defined(__HIP__)
-
- // Clock functions
- __device__ long long int __clock64();
- __device__ long long int __clock();
- __device__ long long int clock64();
- __device__ long long int clock();
- __device__ long long int wall_clock64();
- // hip.amdgcn.bc - named sync
- __device__ void __named_sync();
-
- #ifdef __HIP_DEVICE_COMPILE__
-
- // Clock function to return GPU core cycle count.
- // GPU can change its core clock frequency at runtime. The maximum frequency can be queried
- // through hipDeviceAttributeClockRate attribute.
- __device__
- inline __attribute((always_inline))
- long long int __clock64() {
- #if __has_builtin(__builtin_amdgcn_s_memtime)
- // Exists on gfx8, gfx9, gfx10.1, gfx10.2, gfx10.3
- return (long long int) __builtin_amdgcn_s_memtime();
- #else
- // Subject to change when better solution available
- return (long long int) __builtin_readcyclecounter();
- #endif
- }
-
- __device__
- inline __attribute((always_inline))
- long long int __clock() { return __clock64(); }
-
- // Clock function to return wall clock count at a constant frequency that can be queried
- // through hipDeviceAttributeWallClockRate attribute.
- __device__
- inline __attribute__((always_inline))
- long long int wall_clock64() {
- return (long long int) __ockl_steadyctr_u64();
- }
-
- __device__
- inline __attribute__((always_inline))
- long long int clock64() { return __clock64(); }
-
- __device__
- inline __attribute__((always_inline))
- long long int clock() { return __clock(); }
-
- // hip.amdgcn.bc - named sync
- __device__
- inline
- void __named_sync() { __builtin_amdgcn_s_barrier(); }
-
- #endif // __HIP_DEVICE_COMPILE__
-
- // warp vote function __all __any __ballot
- __device__
- inline
- int __all(int predicate) {
- return __ockl_wfall_i32(predicate);
- }
-
- __device__
- inline
- int __any(int predicate) {
- return __ockl_wfany_i32(predicate);
- }
-
- // XXX from llvm/include/llvm/IR/InstrTypes.h
- #define ICMP_NE 33
-
- __device__
- inline
- unsigned long long int __ballot(int predicate) {
- return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
- }
-
- __device__
- inline
- unsigned long long int __ballot64(int predicate) {
- return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
- }
-
- // hip.amdgcn.bc - lanemask
- __device__
- inline
- uint64_t __lanemask_gt()
- {
- uint32_t lane = __ockl_lane_u32();
- if (lane == 63)
- return 0;
- uint64_t ballot = __ballot64(1);
- uint64_t mask = (~((uint64_t)0)) << (lane + 1);
- return mask & ballot;
- }
-
- __device__
- inline
- uint64_t __lanemask_lt()
- {
- uint32_t lane = __ockl_lane_u32();
- int64_t ballot = __ballot64(1);
- uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
- return mask & ballot;
- }
-
- __device__
- inline
- uint64_t __lanemask_eq()
- {
- uint32_t lane = __ockl_lane_u32();
- int64_t mask = ((uint64_t)1 << lane);
- return mask;
- }
-
-
- __device__ inline void* __local_to_generic(void* p) { return p; }
-
- #ifdef __HIP_DEVICE_COMPILE__
- __device__
- inline
- void* __get_dynamicgroupbaseptr()
- {
- // Get group segment base pointer.
- return (char*)__local_to_generic((void*)__to_local(__builtin_amdgcn_groupstaticsize()));
- }
- #else
- __device__
- void* __get_dynamicgroupbaseptr();
- #endif // __HIP_DEVICE_COMPILE__
-
- __device__
- inline
- void *__amdgcn_get_dynamicgroupbaseptr() {
- return __get_dynamicgroupbaseptr();
- }
-
- // Memory Fence Functions
- __device__
- inline
- static void __threadfence()
- {
- __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
- }
-
- __device__
- inline
- static void __threadfence_block()
- {
- __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
- }
-
- __device__
- inline
- static void __threadfence_system()
- {
- __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
- }
- __device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
- if (flags) {
- __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
- __builtin_amdgcn_s_barrier();
- __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
- } else {
- __builtin_amdgcn_s_barrier();
- }
- }
-
- __device__
- inline
- static void __barrier(int n)
- {
- __work_group_barrier((__cl_mem_fence_flags)n);
- }
-
- __device__
- inline
- __attribute__((convergent))
- void __syncthreads()
- {
- __barrier(__CLK_LOCAL_MEM_FENCE);
- }
-
- __device__
- inline
- __attribute__((convergent))
- int __syncthreads_count(int predicate)
- {
- return __ockl_wgred_add_i32(!!predicate);
- }
-
- __device__
- inline
- __attribute__((convergent))
- int __syncthreads_and(int predicate)
- {
- return __ockl_wgred_and_i32(!!predicate);
- }
-
- __device__
- inline
- __attribute__((convergent))
- int __syncthreads_or(int predicate)
- {
- return __ockl_wgred_or_i32(!!predicate);
- }
-
- // hip.amdgcn.bc - device routine
- /*
- HW_ID Register bit structure for RDNA2 & RDNA3
- WAVE_ID 4:0 Wave id within the SIMD.
- SIMD_ID 9:8 SIMD_ID within the WGP: [0] = row, [1] = column.
- WGP_ID 13:10 Physical WGP ID.
- SA_ID 16 Shader Array ID
- SE_ID 20:18 Shader Engine the wave is assigned to for gfx11
- SE_ID 19:18 Shader Engine the wave is assigned to for gfx10
- DP_RATE 31:29 Number of double-precision float units per SIMD
-
- HW_ID Register bit structure for GCN and CDNA
- WAVE_ID 3:0 Wave buffer slot number. 0-9.
- SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
- PIPE_ID 7:6 Pipeline from which the wave was dispatched.
- CU_ID 11:8 Compute Unit the wave is assigned to.
- SH_ID 12 Shader Array (within an SE) the wave is assigned to.
- SE_ID 15:13 Shader Engine the wave is assigned to for gfx908, gfx90a, gfx940-942
- 14:13 Shader Engine the wave is assigned to for Vega.
- TG_ID 19:16 Thread-group ID
- VM_ID 23:20 Virtual Memory ID
- QUEUE_ID 26:24 Queue from which this wave was dispatched.
- STATE_ID 29:27 State ID (graphics only, not compute).
- ME_ID 31:30 Micro-engine ID.
-
- XCC_ID Register bit structure for gfx940
- XCC_ID 3:0 XCC the wave is assigned to.
- */
-
- #if (defined (__GFX10__) || defined (__GFX11__))
- #define HW_ID 23
- #else
- #define HW_ID 4
- #endif
-
- #if (defined(__GFX10__) || defined(__GFX11__))
- #define HW_ID_WGP_ID_SIZE 4
- #define HW_ID_WGP_ID_OFFSET 10
- #else
- #define HW_ID_CU_ID_SIZE 4
- #define HW_ID_CU_ID_OFFSET 8
- #endif
-
- #if (defined(__gfx908__) || defined(__gfx90a__) || \
- defined(__GFX11__))
- #define HW_ID_SE_ID_SIZE 3
- #else //4 SEs/XCC for gfx940-942
- #define HW_ID_SE_ID_SIZE 2
- #endif
- #if (defined(__GFX10__) || defined(__GFX11__))
- #define HW_ID_SE_ID_OFFSET 18
- #define HW_ID_SA_ID_OFFSET 16
- #define HW_ID_SA_ID_SIZE 1
- #else
- #define HW_ID_SE_ID_OFFSET 13
- #endif
-
- #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
- #define XCC_ID 20
- #define XCC_ID_XCC_ID_SIZE 4
- #define XCC_ID_XCC_ID_OFFSET 0
- #endif
-
- #if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
- (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
- #define __HIP_NO_IMAGE_SUPPORT 1
- #endif
-
- /*
- Encoding of parameter bitmask
- HW_ID 5:0 HW_ID
- OFFSET 10:6 Range: 0..31
- SIZE 15:11 Range: 1..32
- */
-
- #define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
-
- /*
- __smid returns the wave's assigned Compute Unit and Shader Engine.
- The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
- Note: the results vary over time.
- SZ minus 1 since SIZE is 1-based.
- */
- __device__
- inline
- unsigned __smid(void)
- {
- unsigned se_id = __builtin_amdgcn_s_getreg(
- GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
- #if (defined(__GFX10__) || defined(__GFX11__))
- unsigned wgp_id = __builtin_amdgcn_s_getreg(
- GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
- unsigned sa_id = __builtin_amdgcn_s_getreg(
- GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
- #else
- #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
- unsigned xcc_id = __builtin_amdgcn_s_getreg(
- GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
- #endif
- unsigned cu_id = __builtin_amdgcn_s_getreg(
- GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
- #endif
- #if (defined(__GFX10__) || defined(__GFX11__))
- unsigned temp = se_id;
- temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
- temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
- return temp;
- //TODO : CU Mode impl
- #elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
- unsigned temp = xcc_id;
- temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
- temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
- return temp;
- #else
- return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
- #endif
- }
-
- /**
- * Map HIP_DYNAMIC_SHARED to "extern __shared__" for compatibility with old HIP applications
- * To be removed in a future release.
- */
- #define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
- #define HIP_DYNAMIC_SHARED_ATTRIBUTE
-
- #endif //defined(__clang__) && defined(__HIP__)
-
-
- // loop unrolling
- static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
- auto dstPtr = static_cast<unsigned char*>(dst);
- auto srcPtr = static_cast<const unsigned char*>(src);
-
- while (size >= 4u) {
- dstPtr[0] = srcPtr[0];
- dstPtr[1] = srcPtr[1];
- dstPtr[2] = srcPtr[2];
- dstPtr[3] = srcPtr[3];
-
- size -= 4u;
- srcPtr += 4u;
- dstPtr += 4u;
- }
- switch (size) {
- case 3:
- dstPtr[2] = srcPtr[2];
- case 2:
- dstPtr[1] = srcPtr[1];
- case 1:
- dstPtr[0] = srcPtr[0];
- }
-
- return dst;
- }
-
- static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
- auto dstPtr = static_cast<unsigned char*>(dst);
-
- while (size >= 4u) {
- dstPtr[0] = val;
- dstPtr[1] = val;
- dstPtr[2] = val;
- dstPtr[3] = val;
-
- size -= 4u;
- dstPtr += 4u;
- }
- switch (size) {
- case 3:
- dstPtr[2] = val;
- case 2:
- dstPtr[1] = val;
- case 1:
- dstPtr[0] = val;
- }
-
- return dst;
- }
- #ifndef __OPENMP_AMDGCN__
- static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
- return __hip_hc_memcpy(dst, src, size);
- }
-
- static inline __device__ void* memset(void* ptr, int val, size_t size) {
- unsigned char val8 = static_cast<unsigned char>(val);
- return __hip_hc_memset(ptr, val8, size);
- }
- #endif // !__OPENMP_AMDGCN__
-
- #endif