triton-windows 3.3.1.post19__cp310-cp310-win_amd64.whl → 3.3.1.post21__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of triton-windows might be problematic. Click here for more details.

Files changed (108) hide show
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/backends/amd/driver.py +6 -1
  3. triton/backends/nvidia/compiler.py +1 -3
  4. triton/backends/nvidia/driver.py +7 -3
  5. triton/runtime/autotuner.py +2 -2
  6. triton/runtime/build.py +5 -5
  7. triton/windows_utils.py +11 -4
  8. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.3.1.post21.dist-info}/METADATA +1 -1
  9. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.3.1.post21.dist-info}/RECORD +11 -108
  10. triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h +0 -358
  11. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +0 -1010
  12. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +0 -1638
  13. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +0 -1814
  14. triton/backends/amd/include/hip/amd_detail/amd_hip_bfloat16.h +0 -293
  15. triton/backends/amd/include/hip/amd_detail/amd_hip_common.h +0 -32
  16. triton/backends/amd/include/hip/amd_detail/amd_hip_complex.h +0 -174
  17. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +0 -835
  18. triton/backends/amd/include/hip/amd_detail/amd_hip_fp16.h +0 -1809
  19. triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +0 -1391
  20. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +0 -108
  21. triton/backends/amd/include/hip/amd_detail/amd_hip_math_constants.h +0 -124
  22. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h +0 -405
  23. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +0 -196
  24. triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +0 -565
  25. triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h +0 -2226
  26. triton/backends/amd/include/hip/amd_detail/amd_math_functions.h +0 -104
  27. triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h +0 -244
  28. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +0 -538
  29. triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +0 -288
  30. triton/backends/amd/include/hip/amd_detail/concepts.hpp +0 -30
  31. triton/backends/amd/include/hip/amd_detail/device_library_decls.h +0 -133
  32. triton/backends/amd/include/hip/amd_detail/functional_grid_launch.hpp +0 -218
  33. triton/backends/amd/include/hip/amd_detail/grid_launch.h +0 -67
  34. triton/backends/amd/include/hip/amd_detail/grid_launch.hpp +0 -50
  35. triton/backends/amd/include/hip/amd_detail/grid_launch_GGL.hpp +0 -26
  36. triton/backends/amd/include/hip/amd_detail/helpers.hpp +0 -137
  37. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +0 -1446
  38. triton/backends/amd/include/hip/amd_detail/hip_assert.h +0 -101
  39. triton/backends/amd/include/hip/amd_detail/hip_cooperative_groups_helper.h +0 -242
  40. triton/backends/amd/include/hip/amd_detail/hip_fp16_gcc.h +0 -254
  41. triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h +0 -96
  42. triton/backends/amd/include/hip/amd_detail/hip_ldg.h +0 -100
  43. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +0 -10570
  44. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +0 -78
  45. triton/backends/amd/include/hip/amd_detail/host_defines.h +0 -184
  46. triton/backends/amd/include/hip/amd_detail/hsa_helpers.hpp +0 -102
  47. triton/backends/amd/include/hip/amd_detail/macro_based_grid_launch.hpp +0 -798
  48. triton/backends/amd/include/hip/amd_detail/math_fwd.h +0 -698
  49. triton/backends/amd/include/hip/amd_detail/ockl_image.h +0 -177
  50. triton/backends/amd/include/hip/amd_detail/program_state.hpp +0 -107
  51. triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h +0 -491
  52. triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h +0 -478
  53. triton/backends/amd/include/hip/channel_descriptor.h +0 -39
  54. triton/backends/amd/include/hip/device_functions.h +0 -38
  55. triton/backends/amd/include/hip/driver_types.h +0 -468
  56. triton/backends/amd/include/hip/hip_bf16.h +0 -36
  57. triton/backends/amd/include/hip/hip_bfloat16.h +0 -44
  58. triton/backends/amd/include/hip/hip_common.h +0 -100
  59. triton/backends/amd/include/hip/hip_complex.h +0 -38
  60. triton/backends/amd/include/hip/hip_cooperative_groups.h +0 -46
  61. triton/backends/amd/include/hip/hip_deprecated.h +0 -95
  62. triton/backends/amd/include/hip/hip_ext.h +0 -161
  63. triton/backends/amd/include/hip/hip_fp16.h +0 -36
  64. triton/backends/amd/include/hip/hip_fp8.h +0 -33
  65. triton/backends/amd/include/hip/hip_gl_interop.h +0 -32
  66. triton/backends/amd/include/hip/hip_hcc.h +0 -24
  67. triton/backends/amd/include/hip/hip_math_constants.h +0 -36
  68. triton/backends/amd/include/hip/hip_profile.h +0 -27
  69. triton/backends/amd/include/hip/hip_runtime.h +0 -75
  70. triton/backends/amd/include/hip/hip_runtime_api.h +0 -9261
  71. triton/backends/amd/include/hip/hip_texture_types.h +0 -29
  72. triton/backends/amd/include/hip/hip_vector_types.h +0 -41
  73. triton/backends/amd/include/hip/hip_version.h +0 -17
  74. triton/backends/amd/include/hip/hiprtc.h +0 -421
  75. triton/backends/amd/include/hip/library_types.h +0 -78
  76. triton/backends/amd/include/hip/math_functions.h +0 -42
  77. triton/backends/amd/include/hip/surface_types.h +0 -63
  78. triton/backends/amd/include/hip/texture_types.h +0 -194
  79. triton/backends/amd/include/hsa/Brig.h +0 -1131
  80. triton/backends/amd/include/hsa/amd_hsa_common.h +0 -91
  81. triton/backends/amd/include/hsa/amd_hsa_elf.h +0 -462
  82. triton/backends/amd/include/hsa/amd_hsa_kernel_code.h +0 -269
  83. triton/backends/amd/include/hsa/amd_hsa_queue.h +0 -109
  84. triton/backends/amd/include/hsa/amd_hsa_signal.h +0 -80
  85. triton/backends/amd/include/hsa/hsa.h +0 -5738
  86. triton/backends/amd/include/hsa/hsa_amd_tool.h +0 -91
  87. triton/backends/amd/include/hsa/hsa_api_trace.h +0 -579
  88. triton/backends/amd/include/hsa/hsa_api_trace_version.h +0 -68
  89. triton/backends/amd/include/hsa/hsa_ext_amd.h +0 -3146
  90. triton/backends/amd/include/hsa/hsa_ext_finalize.h +0 -531
  91. triton/backends/amd/include/hsa/hsa_ext_image.h +0 -1454
  92. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +0 -488
  93. triton/backends/amd/include/hsa/hsa_ven_amd_loader.h +0 -667
  94. triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +0 -416
  95. triton/backends/amd/include/roctracer/ext/prof_protocol.h +0 -107
  96. triton/backends/amd/include/roctracer/hip_ostream_ops.h +0 -4515
  97. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +0 -1727
  98. triton/backends/amd/include/roctracer/hsa_prof_str.h +0 -3059
  99. triton/backends/amd/include/roctracer/roctracer.h +0 -779
  100. triton/backends/amd/include/roctracer/roctracer_ext.h +0 -81
  101. triton/backends/amd/include/roctracer/roctracer_hcc.h +0 -24
  102. triton/backends/amd/include/roctracer/roctracer_hip.h +0 -37
  103. triton/backends/amd/include/roctracer/roctracer_hsa.h +0 -112
  104. triton/backends/amd/include/roctracer/roctracer_plugin.h +0 -137
  105. triton/backends/amd/include/roctracer/roctracer_roctx.h +0 -67
  106. triton/backends/amd/include/roctracer/roctx.h +0 -229
  107. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.3.1.post21.dist-info}/WHEEL +0 -0
  108. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.3.1.post21.dist-info}/top_level.txt +0 -0
@@ -1,1638 +0,0 @@
1
- /*
2
- Copyright (c) 2015 - Present Advanced Micro Devices, Inc. All rights reserved.
3
-
4
- Permission is hereby granted, free of charge, to any person obtaining a copy
5
- of this software and associated documentation files (the "Software"), to deal
6
- in the Software without restriction, including without limitation the rights
7
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
- copies of the Software, and to permit persons to whom the Software is
9
- furnished to do so, subject to the following conditions:
10
-
11
- The above copyright notice and this permission notice shall be included in
12
- all copies or substantial portions of the Software.
13
-
14
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20
- THE SOFTWARE.
21
- */
22
-
23
- #pragma once
24
-
25
- #if !defined(__HIPCC_RTC__)
26
- #include "amd_device_functions.h"
27
- #endif
28
-
29
- #if __has_builtin(__hip_atomic_compare_exchange_strong)
30
-
31
- template<bool B, typename T, typename F> struct Cond_t;
32
-
33
- template<typename T, typename F> struct Cond_t<true, T, F> { using type = T; };
34
- template<typename T, typename F> struct Cond_t<false, T, F> { using type = F; };
35
-
36
- #if !__HIP_DEVICE_COMPILE__
37
- //TODO: Remove this after compiler pre-defines the following Macros.
38
- #define __HIP_MEMORY_SCOPE_SINGLETHREAD 1
39
- #define __HIP_MEMORY_SCOPE_WAVEFRONT 2
40
- #define __HIP_MEMORY_SCOPE_WORKGROUP 3
41
- #define __HIP_MEMORY_SCOPE_AGENT 4
42
- #define __HIP_MEMORY_SCOPE_SYSTEM 5
43
- #endif
44
-
45
- #if !defined(__HIPCC_RTC__)
46
- #include "amd_hip_unsafe_atomics.h"
47
- #endif
48
-
49
- // Atomic expanders
50
- template<
51
- int mem_order = __ATOMIC_SEQ_CST,
52
- int mem_scope= __HIP_MEMORY_SCOPE_SYSTEM,
53
- typename T,
54
- typename Op,
55
- typename F>
56
- inline
57
- __attribute__((always_inline, device))
58
- T hip_cas_expander(T* p, T x, Op op, F f) noexcept
59
- {
60
- using FP = __attribute__((address_space(0))) const void*;
61
-
62
- __device__
63
- extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");
64
-
65
- if (is_shared_workaround((FP)p))
66
- return f();
67
-
68
- using U = typename Cond_t<
69
- sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;
70
-
71
- auto q = reinterpret_cast<U*>(p);
72
-
73
- U tmp0{__hip_atomic_load(q, mem_order, mem_scope)};
74
- U tmp1;
75
- do {
76
- tmp1 = tmp0;
77
-
78
- op(reinterpret_cast<T&>(tmp1), x);
79
- } while (!__hip_atomic_compare_exchange_strong(q, &tmp0, tmp1, mem_order,
80
- mem_order, mem_scope));
81
-
82
- return reinterpret_cast<const T&>(tmp0);
83
- }
84
-
85
- template<
86
- int mem_order = __ATOMIC_SEQ_CST,
87
- int mem_scope= __HIP_MEMORY_SCOPE_SYSTEM,
88
- typename T,
89
- typename Cmp,
90
- typename F>
91
- inline
92
- __attribute__((always_inline, device))
93
- T hip_cas_extrema_expander(T* p, T x, Cmp cmp, F f) noexcept
94
- {
95
- using FP = __attribute__((address_space(0))) const void*;
96
-
97
- __device__
98
- extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");
99
-
100
- if (is_shared_workaround((FP)p))
101
- return f();
102
-
103
- using U = typename Cond_t<
104
- sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;
105
-
106
- auto q = reinterpret_cast<U*>(p);
107
-
108
- U tmp{__hip_atomic_load(q, mem_order, mem_scope)};
109
- while (cmp(x, reinterpret_cast<const T&>(tmp)) &&
110
- !__hip_atomic_compare_exchange_strong(q, &tmp, x, mem_order, mem_order,
111
- mem_scope));
112
-
113
- return reinterpret_cast<const T&>(tmp);
114
- }
115
-
116
- __device__
117
- inline
118
- int atomicCAS(int* address, int compare, int val) {
119
- __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
120
- __HIP_MEMORY_SCOPE_AGENT);
121
- return compare;
122
- }
123
-
124
- __device__
125
- inline
126
- int atomicCAS_system(int* address, int compare, int val) {
127
- __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
128
- __HIP_MEMORY_SCOPE_SYSTEM);
129
- return compare;
130
- }
131
-
132
- __device__
133
- inline
134
- unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val) {
135
- __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
136
- __HIP_MEMORY_SCOPE_AGENT);
137
- return compare;
138
- }
139
-
140
- __device__
141
- inline
142
- unsigned int atomicCAS_system(unsigned int* address, unsigned int compare, unsigned int val) {
143
- __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
144
- __HIP_MEMORY_SCOPE_SYSTEM);
145
- return compare;
146
- }
147
-
148
- __device__
149
- inline
150
- unsigned long atomicCAS(unsigned long* address, unsigned long compare, unsigned long val) {
151
- __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
152
- __HIP_MEMORY_SCOPE_AGENT);
153
- return compare;
154
- }
155
-
156
- __device__
157
- inline
158
- unsigned long atomicCAS_system(unsigned long* address, unsigned long compare, unsigned long val) {
159
- __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
160
- __HIP_MEMORY_SCOPE_SYSTEM);
161
- return compare;
162
- }
163
-
164
- __device__
165
- inline
166
- unsigned long long atomicCAS(unsigned long long* address, unsigned long long compare,
167
- unsigned long long val) {
168
- __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
169
- __HIP_MEMORY_SCOPE_AGENT);
170
- return compare;
171
- }
172
-
173
- __device__
174
- inline
175
- unsigned long long atomicCAS_system(unsigned long long* address, unsigned long long compare,
176
- unsigned long long val) {
177
- __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
178
- __HIP_MEMORY_SCOPE_SYSTEM);
179
- return compare;
180
- }
181
-
182
- __device__
183
- inline
184
- float atomicCAS(float* address, float compare, float val) {
185
- __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
186
- __HIP_MEMORY_SCOPE_AGENT);
187
- return compare;
188
- }
189
-
190
- __device__
191
- inline
192
- float atomicCAS_system(float* address, float compare, float val) {
193
- __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
194
- __HIP_MEMORY_SCOPE_SYSTEM);
195
- return compare;
196
- }
197
-
198
- __device__
199
- inline
200
- double atomicCAS(double* address, double compare, double val) {
201
- __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
202
- __HIP_MEMORY_SCOPE_AGENT);
203
- return compare;
204
- }
205
-
206
- __device__
207
- inline
208
- double atomicCAS_system(double* address, double compare, double val) {
209
- __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
210
- __HIP_MEMORY_SCOPE_SYSTEM);
211
- return compare;
212
- }
213
-
214
- __device__
215
- inline
216
- int atomicAdd(int* address, int val) {
217
- return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
218
- }
219
-
220
- __device__
221
- inline
222
- int atomicAdd_system(int* address, int val) {
223
- return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
224
- }
225
-
226
- __device__
227
- inline
228
- unsigned int atomicAdd(unsigned int* address, unsigned int val) {
229
- return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
230
- }
231
-
232
- __device__
233
- inline
234
- unsigned int atomicAdd_system(unsigned int* address, unsigned int val) {
235
- return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
236
- }
237
-
238
- __device__
239
- inline
240
- unsigned long atomicAdd(unsigned long* address, unsigned long val) {
241
- return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
242
- }
243
-
244
- __device__
245
- inline
246
- unsigned long atomicAdd_system(unsigned long* address, unsigned long val) {
247
- return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
248
- }
249
-
250
- __device__
251
- inline
252
- unsigned long long atomicAdd(unsigned long long* address, unsigned long long val) {
253
- return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
254
- }
255
-
256
- __device__
257
- inline
258
- unsigned long long atomicAdd_system(unsigned long long* address, unsigned long long val) {
259
- return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
260
- }
261
-
262
- __device__
263
- inline
264
- float atomicAdd(float* address, float val) {
265
- #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
266
- return unsafeAtomicAdd(address, val);
267
- #else
268
- return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
269
- #endif
270
- }
271
-
272
- __device__
273
- inline
274
- float atomicAdd_system(float* address, float val) {
275
- return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
276
- }
277
-
278
- #if !defined(__HIPCC_RTC__)
279
- DEPRECATED("use atomicAdd instead")
280
- #endif // !defined(__HIPCC_RTC__)
281
- __device__
282
- inline
283
- void atomicAddNoRet(float* address, float val)
284
- {
285
- __ockl_atomic_add_noret_f32(address, val);
286
- }
287
-
288
- __device__
289
- inline
290
- double atomicAdd(double* address, double val) {
291
- #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
292
- return unsafeAtomicAdd(address, val);
293
- #else
294
- return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
295
- #endif
296
- }
297
-
298
- __device__
299
- inline
300
- double atomicAdd_system(double* address, double val) {
301
- return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
302
- }
303
-
304
- __device__
305
- inline
306
- int atomicSub(int* address, int val) {
307
- return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
308
- }
309
-
310
- __device__
311
- inline
312
- int atomicSub_system(int* address, int val) {
313
- return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
314
- }
315
-
316
- __device__
317
- inline
318
- unsigned int atomicSub(unsigned int* address, unsigned int val) {
319
- return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
320
- }
321
-
322
- __device__
323
- inline
324
- unsigned int atomicSub_system(unsigned int* address, unsigned int val) {
325
- return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
326
- }
327
-
328
- __device__
329
- inline
330
- unsigned long atomicSub(unsigned long* address, unsigned long val) {
331
- return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
332
- }
333
-
334
- __device__
335
- inline
336
- unsigned long atomicSub_system(unsigned long* address, unsigned long val) {
337
- return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
338
- }
339
-
340
- __device__
341
- inline
342
- unsigned long long atomicSub(unsigned long long* address, unsigned long long val) {
343
- return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
344
- }
345
-
346
- __device__
347
- inline
348
- unsigned long long atomicSub_system(unsigned long long* address, unsigned long long val) {
349
- return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
350
- }
351
-
352
- __device__
353
- inline
354
- float atomicSub(float* address, float val) {
355
- #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
356
- return unsafeAtomicAdd(address, -val);
357
- #else
358
- return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
359
- #endif
360
- }
361
-
362
- __device__
363
- inline
364
- float atomicSub_system(float* address, float val) {
365
- return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
366
- }
367
-
368
- __device__
369
- inline
370
- double atomicSub(double* address, double val) {
371
- #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
372
- return unsafeAtomicAdd(address, -val);
373
- #else
374
- return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
375
- #endif
376
- }
377
-
378
- __device__
379
- inline
380
- double atomicSub_system(double* address, double val) {
381
- return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
382
- }
383
-
384
- __device__
385
- inline
386
- int atomicExch(int* address, int val) {
387
- return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
388
- }
389
-
390
- __device__
391
- inline
392
- int atomicExch_system(int* address, int val) {
393
- return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
394
- }
395
-
396
- __device__
397
- inline
398
- unsigned int atomicExch(unsigned int* address, unsigned int val) {
399
- return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
400
- }
401
-
402
- __device__
403
- inline
404
- unsigned int atomicExch_system(unsigned int* address, unsigned int val) {
405
- return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
406
- }
407
-
408
- __device__
409
- inline
410
- unsigned long atomicExch(unsigned long* address, unsigned long val) {
411
- return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
412
- }
413
-
414
- __device__
415
- inline
416
- unsigned long atomicExch_system(unsigned long* address, unsigned long val) {
417
- return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
418
- }
419
-
420
- __device__
421
- inline
422
- unsigned long long atomicExch(unsigned long long* address, unsigned long long val) {
423
- return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
424
- }
425
-
426
- __device__
427
- inline
428
- unsigned long long atomicExch_system(unsigned long long* address, unsigned long long val) {
429
- return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
430
- }
431
-
432
- __device__
433
- inline
434
- float atomicExch(float* address, float val) {
435
- return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
436
- }
437
-
438
- __device__
439
- inline
440
- float atomicExch_system(float* address, float val) {
441
- return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
442
- }
443
-
444
- __device__
445
- inline
446
- double atomicExch(double* address, double val) {
447
- return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
448
- }
449
-
450
- __device__
451
- inline
452
- double atomicExch_system(double* address, double val) {
453
- return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
454
- }
455
-
456
- __device__
457
- inline
458
- int atomicMin(int* address, int val) {
459
- #if defined(__gfx941__)
460
- return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
461
- address, val, [](int x, int y) { return x < y; }, [=]() {
462
- return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
463
- __HIP_MEMORY_SCOPE_AGENT);
464
- });
465
- #else
466
- return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
467
- #endif // __gfx941__
468
- }
469
-
470
- __device__
471
- inline
472
- int atomicMin_system(int* address, int val) {
473
- #if defined(__gfx941__)
474
- return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
475
- address, val, [](int x, int y) { return x < y; }, [=]() {
476
- return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
477
- __HIP_MEMORY_SCOPE_SYSTEM);
478
- });
479
- #else
480
- return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
481
- #endif // __gfx941__
482
- }
483
-
484
- __device__
485
- inline
486
- unsigned int atomicMin(unsigned int* address, unsigned int val) {
487
- #if defined(__gfx941__)
488
- return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
489
- address, val, [](unsigned int x, unsigned int y) { return x < y; }, [=]() {
490
- return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
491
- __HIP_MEMORY_SCOPE_AGENT);
492
- });
493
- #else
494
- return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
495
- #endif // __gfx941__
496
-
497
- }
498
-
499
- __device__
500
- inline
501
- unsigned int atomicMin_system(unsigned int* address, unsigned int val) {
502
- #if defined(__gfx941__)
503
- return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
504
- address, val, [](unsigned int x, unsigned int y) { return x < y; }, [=]() {
505
- return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
506
- __HIP_MEMORY_SCOPE_SYSTEM);
507
- });
508
- #else
509
- return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
510
- #endif // __gfx941__
511
- }
512
-
513
- __device__
514
- inline
515
- unsigned long long atomicMin(unsigned long* address, unsigned long val) {
516
- #if defined(__gfx941__)
517
- return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
518
- address,
519
- val,
520
- [](unsigned long x, unsigned long y) { return x < y; },
521
- [=]() {
522
- return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
523
- __HIP_MEMORY_SCOPE_AGENT);
524
- });
525
- #else
526
- return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
527
- #endif // __gfx941__
528
- }
529
-
530
- __device__
531
- inline
532
- unsigned long atomicMin_system(unsigned long* address, unsigned long val) {
533
- #if defined(__gfx941__)
534
- return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
535
- address,
536
- val,
537
- [](unsigned long x, unsigned long y) { return x < y; },
538
- [=]() {
539
- return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
540
- __HIP_MEMORY_SCOPE_SYSTEM);
541
- });
542
- #else
543
- return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
544
- #endif // __gfx941__
545
- }
546
-
547
- __device__
548
- inline
549
- unsigned long long atomicMin(unsigned long long* address, unsigned long long val) {
550
- #if defined(__gfx941__)
551
- return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
552
- address,
553
- val,
554
- [](unsigned long long x, unsigned long long y) { return x < y; },
555
- [=]() {
556
- return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
557
- __HIP_MEMORY_SCOPE_AGENT);
558
- });
559
- #else
560
- return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
561
- #endif // __gfx941__
562
- }
563
-
564
- __device__
565
- inline
566
- unsigned long long atomicMin_system(unsigned long long* address, unsigned long long val) {
567
- #if defined(__gfx941__)
568
- return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
569
- address,
570
- val,
571
- [](unsigned long long x, unsigned long long y) { return x < y; },
572
- [=]() {
573
- return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
574
- __HIP_MEMORY_SCOPE_SYSTEM);
575
- });
576
- #else
577
- return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
578
- #endif // __gfx941__
579
- }
580
-
581
- __device__
582
- inline
583
- long long atomicMin(long long* address, long long val) {
584
- #if defined(__gfx941__)
585
- return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
586
- address, val, [](long long x, long long y) { return x < y; },
587
- [=]() {
588
- return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
589
- });
590
- #else
591
- return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
592
- #endif // __gfx941__
593
- }
594
-
595
- __device__
596
- inline
597
- long long atomicMin_system(long long* address, long long val) {
598
- #if defined(__gfx941__)
599
- return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
600
- address, val, [](long long x, long long y) { return x < y; },
601
- [=]() {
602
- return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
603
- });
604
- #else
605
- return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
606
- #endif // __gfx941__
607
- }
608
-
609
- __device__
610
- inline
611
- float atomicMin(float* addr, float val) {
612
- #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
613
- return unsafeAtomicMin(addr, val);
614
- #else
615
- typedef union u_hold {
616
- float a;
617
- unsigned int b;
618
- } u_hold_t;
619
- u_hold_t u{val};
620
- bool neg_zero = 0x80000000U == u.b;
621
- #if __has_builtin(__hip_atomic_load) && \
622
- __has_builtin(__hip_atomic_compare_exchange_strong)
623
- float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
624
- bool done = false;
625
- while (!done && (value > val || (neg_zero && value == 0.0f))) {
626
- done = __hip_atomic_compare_exchange_strong(addr, &value, val,
627
- __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
628
- }
629
- return value;
630
- #else
631
- unsigned int *uaddr = (unsigned int *)addr;
632
- unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
633
- bool done = false;
634
- while (!done && (__uint_as_float(value) > val || (neg_zero && __uint_as_float(value) == 0.0f))) {
635
- done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
636
- __ATOMIC_RELAXED, __ATOMIC_RELAXED);
637
- }
638
- return __uint_as_float(value);
639
- #endif
640
- #endif
641
- }
642
-
643
- __device__
644
- inline
645
- float atomicMin_system(float* address, float val) {
646
- unsigned int* uaddr { reinterpret_cast<unsigned int*>(address) };
647
- #if __has_builtin(__hip_atomic_load)
648
- unsigned int tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
649
- #else
650
- unsigned int tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
651
- #endif
652
- float value = __uint_as_float(tmp);
653
-
654
- while (val < value) {
655
- value = atomicCAS_system(address, value, val);
656
- }
657
-
658
- return value;
659
- }
660
-
661
- __device__
662
- inline
663
- double atomicMin(double* addr, double val) {
664
- #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
665
- return unsafeAtomicMin(addr, val);
666
- #else
667
- typedef union u_hold {
668
- double a;
669
- unsigned long long b;
670
- } u_hold_t;
671
- u_hold_t u{val};
672
- bool neg_zero = 0x8000000000000000ULL == u.b;
673
- #if __has_builtin(__hip_atomic_load) && \
674
- __has_builtin(__hip_atomic_compare_exchange_strong)
675
- double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
676
- bool done = false;
677
- while (!done && (value > val || (neg_zero && value == 0.0))) {
678
- done = __hip_atomic_compare_exchange_strong(addr, &value, val,
679
- __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
680
- }
681
- return value;
682
- #else
683
- unsigned long long *uaddr = (unsigned long long *)addr;
684
- unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
685
- bool done = false;
686
- while (!done &&
687
- (__longlong_as_double(value) > val || (neg_zero && __longlong_as_double(value) == 0.0))) {
688
- done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
689
- __ATOMIC_RELAXED, __ATOMIC_RELAXED);
690
- }
691
- return __longlong_as_double(value);
692
- #endif
693
- #endif
694
- }
695
-
696
- __device__
697
- inline
698
- double atomicMin_system(double* address, double val) {
699
- unsigned long long* uaddr { reinterpret_cast<unsigned long long*>(address) };
700
- #if __has_builtin(__hip_atomic_load)
701
- unsigned long long tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
702
- #else
703
- unsigned long long tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
704
- #endif
705
- double value = __longlong_as_double(tmp);
706
-
707
- while (val < value) {
708
- value = atomicCAS_system(address, value, val);
709
- }
710
-
711
- return value;
712
- }
713
-
714
- __device__
715
- inline
716
- int atomicMax(int* address, int val) {
717
- #if defined(__gfx941__)
718
- return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
719
- address, val, [](int x, int y) { return y < x; }, [=]() {
720
- return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
721
- __HIP_MEMORY_SCOPE_AGENT);
722
- });
723
- #else
724
- return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
725
- #endif // __gfx941__
726
- }
727
-
728
- __device__
729
- inline
730
- int atomicMax_system(int* address, int val) {
731
- #if defined(__gfx941__)
732
- return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
733
- address, val, [](int x, int y) { return y < x; }, [=]() {
734
- return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
735
- __HIP_MEMORY_SCOPE_SYSTEM);
736
- });
737
- #else
738
- return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
739
- #endif // __gfx941__
740
- }
741
-
742
- __device__
743
- inline
744
- unsigned int atomicMax(unsigned int* address, unsigned int val) {
745
- #if defined(__gfx941__)
746
- return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
747
- address, val, [](unsigned int x, unsigned int y) { return y < x; }, [=]() {
748
- return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
749
- __HIP_MEMORY_SCOPE_AGENT);
750
- });
751
- #else
752
- return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
753
- #endif // __gfx941__
754
- }
755
-
756
- __device__
757
- inline
758
- unsigned int atomicMax_system(unsigned int* address, unsigned int val) {
759
- #if defined(__gfx941__)
760
- return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
761
- address, val, [](unsigned int x, unsigned int y) { return y < x; }, [=]() {
762
- return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
763
- __HIP_MEMORY_SCOPE_SYSTEM);
764
- });
765
- #else
766
- return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
767
- #endif // __gfx941__
768
- }
769
-
770
- __device__
771
- inline
772
- unsigned long atomicMax(unsigned long* address, unsigned long val) {
773
- #if defined(__gfx941__)
774
- return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
775
- address,
776
- val,
777
- [](unsigned long x, unsigned long y) { return y < x; },
778
- [=]() {
779
- return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
780
- __HIP_MEMORY_SCOPE_AGENT);
781
- });
782
- #else
783
- return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
784
- #endif // __gfx941__
785
- }
786
-
787
- __device__
788
- inline
789
- unsigned long atomicMax_system(unsigned long* address, unsigned long val) {
790
- #if defined(__gfx941__)
791
- return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
792
- address,
793
- val,
794
- [](unsigned long x, unsigned long y) { return y < x; },
795
- [=]() {
796
- return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
797
- __HIP_MEMORY_SCOPE_SYSTEM);
798
- });
799
- #else
800
- return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
801
- #endif // __gfx941__
802
- }
803
-
804
- __device__
805
- inline
806
- unsigned long long atomicMax(unsigned long long* address, unsigned long long val) {
807
- #if defined(__gfx941__)
808
- return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
809
- address,
810
- val,
811
- [](unsigned long long x, unsigned long long y) { return y < x; },
812
- [=]() {
813
- return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
814
- __HIP_MEMORY_SCOPE_AGENT);
815
- });
816
- #else
817
- return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
818
- #endif // __gfx941__
819
- }
820
-
821
- __device__
822
- inline
823
- unsigned long long atomicMax_system(unsigned long long* address, unsigned long long val) {
824
- #if defined(__gfx941__)
825
- return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
826
- address,
827
- val,
828
- [](unsigned long long x, unsigned long long y) { return y < x; },
829
- [=]() {
830
- return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
831
- __HIP_MEMORY_SCOPE_SYSTEM);
832
- });
833
- #else
834
- return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
835
- #endif // __gfx941__
836
- }
837
-
838
- __device__
839
- inline
840
- long long atomicMax(long long* address, long long val) {
841
- #if defined(__gfx941__)
842
- return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
843
- address, val, [](long long x, long long y) { return y < x; },
844
- [=]() {
845
- return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
846
- });
847
- #else
848
- return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
849
- #endif // __gfx941__
850
- }
851
-
852
- __device__
853
- inline
854
- long long atomicMax_system(long long* address, long long val) {
855
- #if defined(__gfx941__)
856
- return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
857
- address, val, [](long long x, long long y) { return y < x; },
858
- [=]() {
859
- return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
860
- });
861
- #else
862
- return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
863
- #endif // __gfx941__
864
- }
865
-
866
- __device__
867
- inline
868
- float atomicMax(float* addr, float val) {
869
- #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
870
- return unsafeAtomicMax(addr, val);
871
- #else
872
- typedef union u_hold {
873
- float a;
874
- unsigned int b;
875
- } u_hold_t;
876
- u_hold_t u{val};
877
- bool neg_zero = 0x80000000U == u.b;
878
- #if __has_builtin(__hip_atomic_load) && \
879
- __has_builtin(__hip_atomic_compare_exchange_strong)
880
- float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
881
- bool done = false;
882
- while (!done && (value < val || (neg_zero && value == 0.0f))) {
883
- done = __hip_atomic_compare_exchange_strong(addr, &value, val,
884
- __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
885
- }
886
- return value;
887
- #else
888
- unsigned int *uaddr = (unsigned int *)addr;
889
- unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
890
- bool done = false;
891
- while (!done && (__uint_as_float(value) < val || (neg_zero && __uint_as_float(value) == 0.0f))) {
892
- done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
893
- __ATOMIC_RELAXED, __ATOMIC_RELAXED);
894
- }
895
- return __uint_as_float(value);
896
- #endif
897
- #endif
898
- }
899
-
900
- __device__
901
- inline
902
- float atomicMax_system(float* address, float val) {
903
- unsigned int* uaddr { reinterpret_cast<unsigned int*>(address) };
904
- #if __has_builtin(__hip_atomic_load)
905
- unsigned int tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
906
- #else
907
- unsigned int tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
908
- #endif
909
- float value = __uint_as_float(tmp);
910
-
911
- while (value < val) {
912
- value = atomicCAS_system(address, value, val);
913
- }
914
-
915
- return value;
916
- }
917
-
918
- __device__
919
- inline
920
- double atomicMax(double* addr, double val) {
921
- #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
922
- return unsafeAtomicMax(addr, val);
923
- #else
924
- typedef union u_hold {
925
- double a;
926
- unsigned long long b;
927
- } u_hold_t;
928
- u_hold_t u{val};
929
- bool neg_zero = 0x8000000000000000ULL == u.b;
930
- #if __has_builtin(__hip_atomic_load) && \
931
- __has_builtin(__hip_atomic_compare_exchange_strong)
932
- double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
933
- bool done = false;
934
- while (!done && (value < val || (neg_zero && value == 0.0))) {
935
- done = __hip_atomic_compare_exchange_strong(addr, &value, val,
936
- __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
937
- }
938
- return value;
939
- #else
940
- unsigned long long *uaddr = (unsigned long long *)addr;
941
- unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
942
- bool done = false;
943
- while (!done &&
944
- (__longlong_as_double(value) < val || (neg_zero && __longlong_as_double(value) == 0.0))) {
945
- done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
946
- __ATOMIC_RELAXED, __ATOMIC_RELAXED);
947
- }
948
- return __longlong_as_double(value);
949
- #endif
950
- #endif
951
- }
952
-
953
- __device__
954
- inline
955
- double atomicMax_system(double* address, double val) {
956
- unsigned long long* uaddr { reinterpret_cast<unsigned long long*>(address) };
957
- #if __has_builtin(__hip_atomic_load)
958
- unsigned long long tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
959
- #else
960
- unsigned long long tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
961
- #endif
962
- double value = __longlong_as_double(tmp);
963
-
964
- while (value < val) {
965
- value = atomicCAS_system(address, value, val);
966
- }
967
-
968
- return value;
969
- }
970
-
971
- __device__
972
- inline
973
- unsigned int atomicInc(unsigned int* address, unsigned int val)
974
- {
975
- #if defined(__gfx941__)
976
- return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
977
- address,
978
- val,
979
- [](unsigned int& x, unsigned int y) { x = (x >= y) ? 0 : (x + 1); },
980
- [=]() {
981
- return
982
- __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent");
983
- });
984
- #else
985
- return __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent");
986
- #endif // __gfx941__
987
-
988
- }
989
-
990
- __device__
991
- inline
992
- unsigned int atomicDec(unsigned int* address, unsigned int val)
993
- {
994
- #if defined(__gfx941__)
995
- return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
996
- address,
997
- val,
998
- [](unsigned int& x, unsigned int y) { x = (!x || x > y) ? y : (x - 1); },
999
- [=]() {
1000
- return
1001
- __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");
1002
- });
1003
- #else
1004
- return __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");
1005
- #endif // __gfx941__
1006
-
1007
- }
1008
-
1009
- __device__
1010
- inline
1011
- int atomicAnd(int* address, int val) {
1012
- #if defined(__gfx941__)
1013
- return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
1014
- address, val, [](int& x, int y) { x &= y; }, [=]() {
1015
- return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
1016
- __HIP_MEMORY_SCOPE_AGENT);
1017
- });
1018
- #else
1019
- return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
1020
- #endif // __gfx941__
1021
- }
1022
-
1023
- __device__
1024
- inline
1025
- int atomicAnd_system(int* address, int val) {
1026
- #if defined(__gfx941__)
1027
- return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
1028
- address, val, [](int& x, int y) { x &= y; }, [=]() {
1029
- return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
1030
- __HIP_MEMORY_SCOPE_SYSTEM);
1031
- });
1032
- #else
1033
- return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
1034
- #endif // __gfx941__
1035
- }
1036
-
1037
- __device__
1038
- inline
1039
- unsigned int atomicAnd(unsigned int* address, unsigned int val) {
1040
- #if defined(__gfx941__)
1041
- return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
1042
- address, val, [](unsigned int& x, unsigned int y) { x &= y; }, [=]() {
1043
- return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
1044
- __HIP_MEMORY_SCOPE_AGENT);
1045
- });
1046
- #else
1047
- return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
1048
- #endif // __gfx941__
1049
- }
1050
-
1051
- __device__
1052
- inline
1053
- unsigned int atomicAnd_system(unsigned int* address, unsigned int val) {
1054
- #if defined(__gfx941__)
1055
- return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
1056
- address, val, [](unsigned int& x, unsigned int y) { x &= y; }, [=]() {
1057
- return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
1058
- __HIP_MEMORY_SCOPE_SYSTEM);
1059
- });
1060
- #else
1061
- return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
1062
- #endif // __gfx941__
1063
- }
1064
-
1065
- __device__
1066
- inline
1067
- unsigned long atomicAnd(unsigned long* address, unsigned long val) {
1068
- #if defined(__gfx941__)
1069
- return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
1070
- address, val, [](unsigned long& x, unsigned long y) { x &= y; }, [=]() {
1071
- return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
1072
- __HIP_MEMORY_SCOPE_AGENT);
1073
- });
1074
- #else
1075
- return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
1076
- #endif // __gfx941__
1077
- }
1078
-
1079
- __device__
1080
- inline
1081
- unsigned long atomicAnd_system(unsigned long* address, unsigned long val) {
1082
- #if defined(__gfx941__)
1083
- return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
1084
- address, val, [](unsigned long& x, unsigned long y) { x &= y; }, [=]() {
1085
- return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
1086
- __HIP_MEMORY_SCOPE_SYSTEM);
1087
- });
1088
- #else
1089
- return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
1090
- #endif // __gfx941__
1091
- }
1092
-
1093
- __device__
1094
- inline
1095
- unsigned long long atomicAnd(unsigned long long* address, unsigned long long val) {
1096
- #if defined(__gfx941__)
1097
- return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
1098
- address,
1099
- val,
1100
- [](unsigned long long& x, unsigned long long y) { x &= y; },
1101
- [=]() {
1102
- return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
1103
- __HIP_MEMORY_SCOPE_AGENT);
1104
- });
1105
- #else
1106
- return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
1107
- #endif // __gfx941__
1108
- }
1109
-
1110
- __device__
1111
- inline
1112
- unsigned long long atomicAnd_system(unsigned long long* address, unsigned long long val) {
1113
- #if defined(__gfx941__)
1114
- return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
1115
- address,
1116
- val,
1117
- [](unsigned long long& x, unsigned long long y) { x &= y; },
1118
- [=]() {
1119
- return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
1120
- __HIP_MEMORY_SCOPE_SYSTEM);
1121
- });
1122
- #else
1123
- return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
1124
- #endif // __gfx941__
1125
- }
1126
-
1127
- __device__
1128
- inline
1129
- int atomicOr(int* address, int val) {
1130
- #if defined(__gfx941__)
1131
- return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
1132
- address, val, [](int& x, int y) { x |= y; }, [=]() {
1133
- return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
1134
- __HIP_MEMORY_SCOPE_AGENT);
1135
- });
1136
- #else
1137
- return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
1138
- #endif // __gfx941__
1139
- }
1140
-
1141
- __device__
1142
- inline
1143
- int atomicOr_system(int* address, int val) {
1144
- #if defined(__gfx941__)
1145
- return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
1146
- address, val, [](int& x, int y) { x |= y; }, [=]() {
1147
- return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
1148
- __HIP_MEMORY_SCOPE_SYSTEM);
1149
- });
1150
- #else
1151
- return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
1152
- #endif // __gfx941__
1153
- }
1154
-
1155
- __device__
1156
- inline
1157
- unsigned int atomicOr(unsigned int* address, unsigned int val) {
1158
- #if defined(__gfx941__)
1159
- return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
1160
- address, val, [](unsigned int& x, unsigned int y) { x |= y; }, [=]() {
1161
- return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
1162
- __HIP_MEMORY_SCOPE_AGENT);
1163
- });
1164
- #else
1165
- return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
1166
- #endif // __gfx941__
1167
- }
1168
-
1169
- __device__
1170
- inline
1171
- unsigned int atomicOr_system(unsigned int* address, unsigned int val) {
1172
- #if defined(__gfx941__)
1173
- return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
1174
- address, val, [](unsigned int& x, unsigned int y) { x |= y; }, [=]() {
1175
- return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
1176
- __HIP_MEMORY_SCOPE_SYSTEM);
1177
- });
1178
- #else
1179
- return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
1180
- #endif // __gfx941__
1181
- }
1182
-
1183
- __device__
1184
- inline
1185
- unsigned long atomicOr(unsigned long* address, unsigned long val) {
1186
- #if defined(__gfx941__)
1187
- return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
1188
- address, val, [](unsigned long& x, unsigned long y) { x |= y; }, [=]() {
1189
- return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
1190
- __HIP_MEMORY_SCOPE_AGENT);
1191
- });
1192
- #else
1193
- return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
1194
- #endif // __gfx941__
1195
- }
1196
-
1197
- __device__
1198
- inline
1199
- unsigned long atomicOr_system(unsigned long* address, unsigned long val) {
1200
- #if defined(__gfx941__)
1201
- return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
1202
- address, val, [](unsigned long& x, unsigned long y) { x |= y; }, [=]() {
1203
- return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
1204
- __HIP_MEMORY_SCOPE_SYSTEM);
1205
- });
1206
- #else
1207
- return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
1208
- #endif // __gfx941__
1209
- }
1210
-
1211
- __device__
1212
- inline
1213
- unsigned long long atomicOr(unsigned long long* address, unsigned long long val) {
1214
- #if defined(__gfx941__)
1215
- return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
1216
- address,
1217
- val,
1218
- [](unsigned long long& x, unsigned long long y) { x |= y; },
1219
- [=]() {
1220
- return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
1221
- __HIP_MEMORY_SCOPE_AGENT);
1222
- });
1223
- #else
1224
- return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
1225
- #endif // __gfx941__
1226
- }
1227
-
1228
- __device__
1229
- inline
1230
- unsigned long long atomicOr_system(unsigned long long* address, unsigned long long val) {
1231
- #if defined(__gfx941__)
1232
- return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
1233
- address,
1234
- val,
1235
- [](unsigned long long& x, unsigned long long y) { x |= y; },
1236
- [=]() {
1237
- return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
1238
- __HIP_MEMORY_SCOPE_SYSTEM);
1239
- });
1240
- #else
1241
- return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
1242
- #endif // __gfx941__
1243
- }
1244
-
1245
- __device__
1246
- inline
1247
- int atomicXor(int* address, int val) {
1248
- #if defined(__gfx941__)
1249
- return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
1250
- address, val, [](int& x, int y) { x ^= y; }, [=]() {
1251
- return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
1252
- __HIP_MEMORY_SCOPE_AGENT);
1253
- });
1254
- #else
1255
- return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
1256
- #endif // __gfx941__
1257
- }
1258
-
1259
- __device__
1260
- inline
1261
- int atomicXor_system(int* address, int val) {
1262
- #if defined(__gfx941__)
1263
- return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
1264
-         address, val, [](int& x, int y) { x ^= y; }, [=]() {
-             return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
-                                           __HIP_MEMORY_SCOPE_SYSTEM);
-         });
- #else
-     return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
- #endif // __gfx941__
- }
-
- __device__
- inline
- unsigned int atomicXor(unsigned int* address, unsigned int val) {
- #if defined(__gfx941__)
-     return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
-         address, val, [](unsigned int& x, unsigned int y) { x ^= y; }, [=]() {
-             return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
-                                           __HIP_MEMORY_SCOPE_AGENT);
-         });
- #else
-     return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
- #endif // __gfx941__
- }
-
- __device__
- inline
- unsigned int atomicXor_system(unsigned int* address, unsigned int val) {
- #if defined(__gfx941__)
-     return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
-         address, val, [](unsigned int& x, unsigned int y) { x ^= y; }, [=]() {
-             return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
-                                           __HIP_MEMORY_SCOPE_SYSTEM);
-         });
- #else
-     return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
- #endif // __gfx941__
- }
-
- __device__
- inline
- unsigned long atomicXor(unsigned long* address, unsigned long val) {
- #if defined(__gfx941__)
-     return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
-         address, val, [](unsigned long& x, unsigned long y) { x ^= y; }, [=]() {
-             return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
-                                           __HIP_MEMORY_SCOPE_AGENT);
-         });
- #else
-     return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
- #endif // __gfx941__
- }
-
- __device__
- inline
- unsigned long atomicXor_system(unsigned long* address, unsigned long val) {
- #if defined(__gfx941__)
-     return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
-         address, val, [](unsigned long& x, unsigned long y) { x ^= y; }, [=]() {
-             return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
-                                           __HIP_MEMORY_SCOPE_SYSTEM);
-         });
- #else
-     return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
- #endif // __gfx941__
- }
-
- __device__
- inline
- unsigned long long atomicXor(unsigned long long* address, unsigned long long val) {
- #if defined(__gfx941__)
-     return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
-         address,
-         val,
-         [](unsigned long long& x, unsigned long long y) { x ^= y; },
-         [=]() {
-             return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
-                                           __HIP_MEMORY_SCOPE_AGENT);
-         });
- #else
-     return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
- #endif // __gfx941__
- }
-
- __device__
- inline
- unsigned long long atomicXor_system(unsigned long long* address, unsigned long long val) {
-     return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
- }
-
- #else // __hip_atomic_compare_exchange_strong
-
- __device__
- inline
- int atomicCAS(int* address, int compare, int val)
- {
-     __atomic_compare_exchange_n(
-         address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
-
-     return compare;
- }
- __device__
- inline
- unsigned int atomicCAS(
-     unsigned int* address, unsigned int compare, unsigned int val)
- {
-     __atomic_compare_exchange_n(
-         address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
-
-     return compare;
- }
- __device__
- inline
- unsigned long long atomicCAS(
-     unsigned long long* address,
-     unsigned long long compare,
-     unsigned long long val)
- {
-     __atomic_compare_exchange_n(
-         address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
-
-     return compare;
- }
-
- __device__
- inline
- int atomicAdd(int* address, int val)
- {
-     return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
- }
- __device__
- inline
- unsigned int atomicAdd(unsigned int* address, unsigned int val)
- {
-     return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
- }
- __device__
- inline
- unsigned long long atomicAdd(
-     unsigned long long* address, unsigned long long val)
- {
-     return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
- }
- __device__
- inline
- float atomicAdd(float* address, float val)
- {
- #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
-     return unsafeAtomicAdd(address, val);
- #else
-     return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
- #endif
- }
-
- #if !defined(__HIPCC_RTC__)
- DEPRECATED("use atomicAdd instead")
- #endif // !defined(__HIPCC_RTC__)
- __device__
- inline
- void atomicAddNoRet(float* address, float val)
- {
-     __ockl_atomic_add_noret_f32(address, val);
- }
-
- __device__
- inline
- double atomicAdd(double* address, double val)
- {
- #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
-     return unsafeAtomicAdd(address, val);
- #else
-     return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
- #endif
- }
-
- __device__
- inline
- int atomicSub(int* address, int val)
- {
-     return __atomic_fetch_sub(address, val, __ATOMIC_RELAXED);
- }
- __device__
- inline
- unsigned int atomicSub(unsigned int* address, unsigned int val)
- {
-     return __atomic_fetch_sub(address, val, __ATOMIC_RELAXED);
- }
-
- __device__
- inline
- int atomicExch(int* address, int val)
- {
-     return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
- }
- __device__
- inline
- unsigned int atomicExch(unsigned int* address, unsigned int val)
- {
-     return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
- }
- __device__
- inline
- unsigned long long atomicExch(unsigned long long* address, unsigned long long val)
- {
-     return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
- }
- __device__
- inline
- float atomicExch(float* address, float val)
- {
-     return __uint_as_float(__atomic_exchange_n(
-         reinterpret_cast<unsigned int*>(address),
-         __float_as_uint(val),
-         __ATOMIC_RELAXED));
- }
-
- __device__
- inline
- int atomicMin(int* address, int val)
- {
-     return __atomic_fetch_min(address, val, __ATOMIC_RELAXED);
- }
- __device__
- inline
- unsigned int atomicMin(unsigned int* address, unsigned int val)
- {
-     return __atomic_fetch_min(address, val, __ATOMIC_RELAXED);
- }
- __device__
- inline
- unsigned long long atomicMin(
-     unsigned long long* address, unsigned long long val)
- {
-     unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
-     while (val < tmp) {
-         const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
-
-         if (tmp1 != tmp) { tmp = tmp1; continue; }
-
-         tmp = atomicCAS(address, tmp, val);
-     }
-
-     return tmp;
- }
- __device__ inline long long atomicMin(long long* address, long long val) {
-     long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
-     while (val < tmp) {
-         const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
-
-         if (tmp1 != tmp) {
-             tmp = tmp1;
-             continue;
-         }
-
-         tmp = atomicCAS(address, tmp, val);
-     }
-     return tmp;
- }
-
- __device__
- inline
- int atomicMax(int* address, int val)
- {
-     return __atomic_fetch_max(address, val, __ATOMIC_RELAXED);
- }
- __device__
- inline
- unsigned int atomicMax(unsigned int* address, unsigned int val)
- {
-     return __atomic_fetch_max(address, val, __ATOMIC_RELAXED);
- }
- __device__
- inline
- unsigned long long atomicMax(
-     unsigned long long* address, unsigned long long val)
- {
-     unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
-     while (tmp < val) {
-         const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
-
-         if (tmp1 != tmp) { tmp = tmp1; continue; }
-
-         tmp = atomicCAS(address, tmp, val);
-     }
-
-     return tmp;
- }
- __device__ inline long long atomicMax(long long* address, long long val) {
-     long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
-     while (tmp < val) {
-         const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
-
-         if (tmp1 != tmp) {
-             tmp = tmp1;
-             continue;
-         }
-
-         tmp = atomicCAS(address, tmp, val);
-     }
-     return tmp;
- }
-
- __device__
- inline
- unsigned int atomicInc(unsigned int* address, unsigned int val)
- {
-     return __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent");
- }
-
- __device__
- inline
- unsigned int atomicDec(unsigned int* address, unsigned int val)
- {
-     return __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");
- }
-
- __device__
- inline
- int atomicAnd(int* address, int val)
- {
-     return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
- }
- __device__
- inline
- unsigned int atomicAnd(unsigned int* address, unsigned int val)
- {
-     return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
- }
- __device__
- inline
- unsigned long long atomicAnd(
-     unsigned long long* address, unsigned long long val)
- {
-     return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
- }
-
- __device__
- inline
- int atomicOr(int* address, int val)
- {
-     return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
- }
- __device__
- inline
- unsigned int atomicOr(unsigned int* address, unsigned int val)
- {
-     return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
- }
- __device__
- inline
- unsigned long long atomicOr(
-     unsigned long long* address, unsigned long long val)
- {
-     return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
- }
-
- __device__
- inline
- int atomicXor(int* address, int val)
- {
-     return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
- }
- __device__
- inline
- unsigned int atomicXor(unsigned int* address, unsigned int val)
- {
-     return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
- }
- __device__
- inline
- unsigned long long atomicXor(
-     unsigned long long* address, unsigned long long val)
- {
-     return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
- }
-
- #endif // __hip_atomic_compare_exchange_strong
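
The fallback branch removed above emulates 64-bit atomicMin and atomicMax with a relaxed load followed by a compare-and-swap loop, because that branch does not rely on a single fetch-min/fetch-max builtin for 64-bit operands. The sketch below illustrates that load-then-CAS pattern on the host, using std::atomic as a stand-in for the device-side __atomic_* builtins; the helper name cas_loop_min is hypothetical and not part of the header, and the sketch is an illustration of the technique, not the header's implementation.

// Host-side sketch of the CAS-loop "min" used by the removed 64-bit atomicMin
// fallback; relaxed ordering mirrors __ATOMIC_RELAXED in the header.
#include <atomic>
#include <cstdint>

static uint64_t cas_loop_min(std::atomic<uint64_t>& obj, uint64_t val) {
    uint64_t cur = obj.load(std::memory_order_relaxed);
    while (val < cur) {
        // On failure, compare_exchange_weak reloads the latest value into
        // `cur`, so the loop re-checks whether `val` is still the smaller one.
        if (obj.compare_exchange_weak(cur, val, std::memory_order_relaxed,
                                      std::memory_order_relaxed))
            break;
    }
    return cur;  // the value observed before any successful swap
}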