PyPI - triton-windows - Versions diffs - 3.2.0.post11__cp39-cp39-win_amd64.whl → 3.3.0a0.post11__cp39-cp39-win_amd64.whl - Mend

triton-windows 3.2.0.post11__cp39-cp39-win_amd64.whl → 3.3.0a0.post11__cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of triton-windows might be problematic. Click here for more details.

Files changed (68)

triton/_C/libtriton.pyd +0 -0
triton/__init__.py +3 -3
triton/_internal_testing.py +59 -4
triton/_utils.py +35 -0
triton/backends/amd/compiler.py +121 -74
triton/backends/amd/driver.py +77 -43
triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +28 -49
triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +35 -9
triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +761 -284
triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +9 -3
triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +1391 -0
triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +3 -3
triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +44 -0
triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +288 -0
triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +110 -14
triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +504 -103
triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +2 -1
triton/backends/amd/include/hip/amd_detail/host_defines.h +4 -0
triton/backends/amd/include/hip/hip_ext.h +4 -2
triton/backends/amd/include/hip/hip_fp8.h +33 -0
triton/backends/amd/include/hip/hip_runtime_api.h +375 -33
triton/backends/amd/include/hip/hip_version.h +3 -3
triton/backends/amd/include/hip/hiprtc.h +25 -25
triton/backends/amd/include/hsa/amd_hsa_elf.h +40 -14
triton/backends/amd/include/hsa/hsa.h +11 -2
triton/backends/amd/include/hsa/hsa_api_trace.h +30 -17
triton/backends/amd/include/hsa/hsa_api_trace_version.h +68 -0
triton/backends/amd/include/hsa/hsa_ext_amd.h +83 -27
triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +46 -46
triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +416 -0
triton/backends/amd/include/roctracer/hip_ostream_ops.h +84 -4
triton/backends/amd/include/roctracer/hsa_ostream_ops.h +260 -0
triton/backends/amd/include/roctracer/hsa_prof_str.h +51 -19
triton/backends/amd/lib/asanrtl.bc +0 -0
triton/backends/compiler.py +25 -225
triton/backends/driver.py +7 -2
triton/backends/nvidia/bin/ptxas.exe +0 -0
triton/backends/nvidia/compiler.py +135 -90
triton/backends/nvidia/driver.c +0 -1
triton/backends/nvidia/driver.py +135 -49
triton/backends/nvidia/include/cuda.h +2162 -241
triton/backends/nvidia/lib/x64/cuda.lib +0 -0
triton/compiler/__init__.py +2 -2
triton/compiler/code_generator.py +334 -231
triton/compiler/compiler.py +77 -66
triton/language/__init__.py +22 -5
triton/language/core.py +448 -74
triton/language/extra/cuda/_experimental_tma.py +3 -5
triton/language/math.py +1 -1
triton/language/random.py +2 -1
triton/language/semantic.py +206 -52
triton/language/standard.py +35 -18
triton/runtime/_allocation.py +32 -0
triton/runtime/autotuner.py +27 -32
triton/runtime/build.py +1 -48
triton/runtime/cache.py +6 -6
triton/runtime/errors.py +10 -0
triton/runtime/interpreter.py +179 -45
triton/runtime/jit.py +149 -190
triton/testing.py +39 -11
triton/tools/compile.py +27 -20
triton/tools/{compile.c → extra/cuda/compile.c} +1 -0
triton/tools/mxfp.py +301 -0
{triton_windows-3.2.0.post11.dist-info → triton_windows-3.3.0a0.post11.dist-info}/METADATA +5 -2
{triton_windows-3.2.0.post11.dist-info → triton_windows-3.3.0a0.post11.dist-info}/RECORD +68 -59
{triton_windows-3.2.0.post11.dist-info → triton_windows-3.3.0a0.post11.dist-info}/top_level.txt +2 -0
/triton/tools/{compile.h → extra/cuda/compile.h} +0 -0
{triton_windows-3.2.0.post11.dist-info → triton_windows-3.3.0a0.post11.dist-info}/WHEEL +0 -0

triton/backends/amd/driver.py CHANGED Viewed

@@ -37,7 +37,7 @@ def _find_already_mmapped_dylib_on_linux(lib_name):
     # Load libc and get the dl_iterate_phdr symbol.
     try:
         dl_iterate_phdr = ctypes.CDLL('libc.so.6').dl_iterate_phdr
-    except:
+    except Exception:
         return None
     # argtypes must use c_char_p to accept create_string_buffer.
     dl_iterate_phdr.argtypes = [callback_t, c_char_p]
@@ -185,35 +185,32 @@ def ty_to_cpp(ty):
     }[ty]
-def make_launcher(constants, signature, ids, warp_size):
-    start_desc = len(signature)
-    #signature = generate_cu_signature(constants, signature, ids)
-    arg_decls = ', '.join(f"{ty_to_cpp(ty)} arg{i}" for i, ty in signature.items())
+def make_launcher(constants, signature, warp_size):
+    def _serialize_signature(sig):
+        if isinstance(sig, tuple):
+            return ','.join(map(_serialize_signature, sig))
+        return sig
     def _extracted_type(ty):
+        if isinstance(ty, tuple):
+            val = ','.join(map(_extracted_type, ty))
+            return f"[{val}]"
         if ty[0] == '*':
             return "PyObject*"
-        return {
-            'i1': 'int32_t',
-            'i8': 'int8_t',
-            'i16': 'int16_t',
-            'i32': 'int32_t',
-            'i64': 'int64_t',
-            'u1': 'uint32_t',
-            'u8': 'uint8_t',
-            'u16': 'uint16_t',
-            'u32': 'uint32_t',
-            'u64': 'uint64_t',
-            'fp16': 'float',
-            'bf16': 'float',
-            'fp32': 'float',
-            'f32': 'float',
-            'fp64': 'double',
-        }[ty]
+        if ty in ("constexpr"):
+            return "PyObject*"
+        return ty_to_cpp(ty)
     def format_of(ty):
+        if isinstance(ty, tuple):
+            val = ''.join(map(format_of, ty))
+            return f"({val})"
+        if ty[0] == '*':
+            return "O"
+        if ty in ("constexpr"):
+            return "O"
         return {
-            "PyObject*": "O",
             "float": "f",
             "double": "d",
             "long": "l",
@@ -225,16 +222,29 @@ def make_launcher(constants, signature, ids, warp_size):
             "uint16_t": "H",
             "uint32_t": "I",
             "uint64_t": "K",
-        }[ty]
+        }[ty_to_cpp(ty)]
-    args_format = ''.join([format_of(_extracted_type(ty)) for ty in signature.values()])
-    format = "iiiKKOOOO" + args_format
+    args_format = ''.join([format_of(ty) for ty in signature.values()])
+    format = "piiiKKOOOO" + args_format
+    signature = ','.join(map(_serialize_signature, signature.values()))
+    signature = list(filter(bool, signature.split(',')))
+    signature = {i: s for i, s in enumerate(signature)}
     args_list = ', ' + ', '.join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else ''
+    # Record the end of regular arguments;
+    # subsequent arguments are architecture-specific descriptors, such as tensor descriptors for CUDA.
+    arg_decls = ', '.join(f"{ty_to_cpp(ty)} arg{i}" for i, ty in signature.items() if ty != "constexpr")
+    internal_args_list = []
+    for i, ty in signature.items():
+        if ty[0] == "*":
+            internal_args_list.append(f"ptr_info{i}.dev_ptr")
+        elif ty != "constexpr":
+            internal_args_list.append(f"_arg{i}")
     libhip_path = _get_path_to_hip_runtime_dylib()
     # generate glue code
-    params = [i for i in signature.keys() if i not in constants]
+    params = list(range(len(signature)))
+    params = [f"&arg{i}" for i, ty in signature.items() if ty != "constexpr"]
+    params.append("&global_scratch")
     src = f"""
 #define __HIP_PLATFORM_AMD__
 #include <hip/hip_runtime.h>
@@ -257,6 +267,12 @@ static const char *hipLibSearchPaths[] = {{"{libhip_path}"}};
                   unsigned int blockDimY, unsigned int blockDimZ,             \\
                   unsigned int sharedMemBytes, hipStream_t stream,            \\
                   void **kernelParams, void **extra)                          \\
+  FOR_EACH_ERR_FN(hipModuleLaunchCooperativeKernel, hipFunction_t f,          \\
+                  unsigned int gridDimX, unsigned int gridDimY,               \\
+                  unsigned int gridDimZ, unsigned int blockDimX,              \\
+                  unsigned int blockDimY, unsigned int blockDimZ,             \\
+                  unsigned int sharedMemBytes, hipStream_t stream,            \\
+                  void **kernelParams, void **extra)                          \\
   FOR_EACH_ERR_FN(hipPointerGetAttribute, void *data,                         \\
                   hipPointer_attribute attribute, hipDeviceptr_t ptr)
@@ -328,13 +344,18 @@ static inline void gpuAssert(hipError_t code, const char *file, int line)
 #define HIP_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }}
-static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, hipStream_t stream, hipFunction_t function{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
+static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, hipStream_t stream, hipFunction_t function{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
   // printf("_launch hip kernel\\n");
-  void *params[] = {{ {', '.join(f"&arg{i}" for i in params)} }};
+  hipDeviceptr_t global_scratch = 0;
+  void *params[] = {{ {', '.join(params)} }};
+  if (gridX*gridY*gridZ > 0 && launch_cooperative_grid) {{
+    HIP_CHECK(hipSymbolTable.hipModuleLaunchCooperativeKernel(function, gridX, gridY, gridZ, {warp_size}*num_warps, 1, 1, shared_memory, stream, params, 0));
+    return;
+  }}
   if (gridX*gridY*gridZ > 0) {{
-      HIP_CHECK(hipSymbolTable.hipModuleLaunchKernel(function, gridX, gridY, gridZ, {warp_size}*num_warps, 1, 1, shared_memory, stream, params, 0));
-    }}
+    HIP_CHECK(hipSymbolTable.hipModuleLaunchKernel(function, gridX, gridY, gridZ, {warp_size}*num_warps, 1, 1, shared_memory, stream, params, 0));
   }}
+}}
 typedef struct _DevicePtrInfo {{
     hipDeviceptr_t dev_ptr;
@@ -387,12 +408,14 @@ static PyObject* launch(PyObject* self, PyObject* args) {{
   int gridX, gridY, gridZ;
   uint64_t _stream;
   uint64_t _function;
+  int launch_cooperative_grid;
   PyObject *launch_enter_hook = NULL;
   PyObject *launch_exit_hook = NULL;
   PyObject *kernel_metadata = NULL;
   PyObject *launch_metadata = NULL;
   {' '.join([f"{_extracted_type(ty)} _arg{i}; " for i, ty in signature.items()])}
-  if(!PyArg_ParseTuple(args, \"{format}\", &gridX, &gridY, &gridZ, &_stream, &_function,
+  if(!PyArg_ParseTuple(args, \"{format}\", &launch_cooperative_grid,
+                                           &gridX, &gridY, &gridZ, &_stream, &_function,
                                            &kernel_metadata, &launch_metadata,
                                            &launch_enter_hook, &launch_exit_hook {args_list})) {{
     return NULL;
@@ -415,7 +438,7 @@ static PyObject* launch(PyObject* self, PyObject* args) {{
   // raise exception asap
   {"; ".join([f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;" if ty[0] == "*" else "" for i, ty in signature.items()])};
-  _launch(gridX, gridY, gridZ, num_warps, num_ctas, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (hipStream_t)_stream, (hipFunction_t)_function{', ' + ', '.join(f"ptr_info{i}.dev_ptr" if ty[0]=="*" else f"_arg{i}"for i, ty in signature.items()) if len(signature) > 0 else ''});
+  _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (hipStream_t)_stream, (hipFunction_t)_function{', ' + ', '.join(internal_args_list) if len(internal_args_list) > 0 else ''});
   if(launch_exit_hook != Py_None){{
     PyObject* args = Py_BuildValue("(O)", launch_metadata);
@@ -464,17 +487,17 @@ PyMODINIT_FUNC PyInit___triton_launcher(void) {{
 class HIPLauncher(object):
     def __init__(self, src, metadata):
-        ids = {"ids_of_const_exprs": src.fn.constexprs if hasattr(src, "fn") else tuple()}
         constants = src.constants if hasattr(src, "constants") else dict()
-        cst_key = lambda i: src.fn.arg_names.index(i) if isinstance(i, str) else i
-        constants = {cst_key(key): value for key, value in constants.items()}
-        signature = {cst_key(key): value for key, value in src.signature.items()}
-        src = make_launcher(constants, signature, ids, metadata.warp_size)
+        arg_idx = lambda x: (src.fn.arg_names.index(x), ) if isinstance(x, str) else x
+        constants = {arg_idx(idx): value for idx, value in constants.items()}
+        signature = {idx: value for idx, value in src.signature.items()}
+        src = make_launcher(constants, signature, metadata.warp_size)
         mod = compile_module_from_src(src, "__triton_launcher")
         self.launch = mod.launch
+        self.launch_cooperative_grid = metadata.launch_cooperative_grid
-    def __call__(self, *args, **kwargs):
-        self.launch(*args, **kwargs)
+    def __call__(self, *args):
+        self.launch(self.launch_cooperative_grid, *args)
 class HIPDriver(GPUDriver):
@@ -490,8 +513,11 @@ class HIPDriver(GPUDriver):
     @staticmethod
     def is_active():
-        import torch
-        return torch.version.hip is not None
+        try:
+            import torch
+            return torch.version.hip is not None
+        except ImportError:
+            return False
     def get_current_target(self):
         device = self.get_current_device()
@@ -500,6 +526,11 @@ class HIPDriver(GPUDriver):
         warp_size = device_properties['warpSize']
         return GPUTarget("hip", arch.split(':')[0], warp_size)
+    def get_active_torch_device(self):
+        import torch
+        # when using hip devices, the device string in pytorch is "cuda"
+        return torch.device("cuda", self.get_current_device())
     def get_benchmarker(self):
         from triton.testing import do_bench
         return do_bench
@@ -510,3 +541,6 @@ class HIPDriver(GPUDriver):
         # It's the same as the Nvidia backend.
         cache_size = 256 * 1024 * 1024
         return torch.empty(int(cache_size // 4), dtype=torch.int, device='cuda')
+    def clear_cache(self, cache):
+        cache.zero_()

triton/backends/amd/include/hip/amd_detail/amd_device_functions.h CHANGED Viewed

@@ -266,14 +266,14 @@ __device__ static inline int __mul24(int x, int y) {
 }
 __device__ static inline long long __mul64hi(long long int x, long long int y) {
-    ulong x0 = (ulong)x & 0xffffffffUL;
-    long x1 = x >> 32;
-    ulong y0 = (ulong)y & 0xffffffffUL;
-    long y1 = y >> 32;
-    ulong z0 = x0*y0;
-    long t = x1*y0 + (z0 >> 32);
-    long z1 = t & 0xffffffffL;
-    long z2 = t >> 32;
+    unsigned long long x0 = (unsigned long long)x & 0xffffffffUL;
+    long long x1 = x >> 32;
+    unsigned long long y0 = (unsigned long long)y & 0xffffffffUL;
+    long long y1 = y >> 32;
+    unsigned long long z0 = x0*y0;
+    long long t = x1*y0 + (z0 >> 32);
+    long long z1 = t & 0xffffffffL;
+    long long z2 = t >> 32;
     z1 = x0*y1 + z1;
     return x1*y1 + z2 + (z1 >> 32);
 }
@@ -300,14 +300,14 @@ __device__ static inline int __umul24(unsigned int x, unsigned int y) {
 __device__
 static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
-    ulong x0 = x & 0xffffffffUL;
-    ulong x1 = x >> 32;
-    ulong y0 = y & 0xffffffffUL;
-    ulong y1 = y >> 32;
-    ulong z0 = x0*y0;
-    ulong t = x1*y0 + (z0 >> 32);
-    ulong z1 = t & 0xffffffffUL;
-    ulong z2 = t >> 32;
+    unsigned long long x0 = x & 0xffffffffUL;
+    unsigned long long x1 = x >> 32;
+    unsigned long long y0 = y & 0xffffffffUL;
+    unsigned long long y1 = y >> 32;
+    unsigned long long z0 = x0*y0;
+    unsigned long long t = x1*y0 + (z0 >> 32);
+    unsigned long long z1 = t & 0xffffffffUL;
+    unsigned long long z2 = t >> 32;
     z1 = x0*y1 + z1;
     return x1*y1 + z2 + (z1 >> 32);
 }
@@ -322,11 +322,6 @@ __device__ static inline unsigned int __usad(unsigned int x, unsigned int y, uns
     return __ockl_sadd_u32(x, y, z);
 }
-__device__ static inline unsigned int __lane_id() {
-    return  __builtin_amdgcn_mbcnt_hi(
-        -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
-}
 __device__
 static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_lo(x,y);};
@@ -339,6 +334,7 @@ HIP specific device functions
 #if !defined(__HIPCC_RTC__)
 #include "amd_warp_functions.h"
+#include "amd_warp_sync_functions.h"
 #endif
 #define MASK1 0x00ff00ff
@@ -687,34 +683,6 @@ void __named_sync() { __builtin_amdgcn_s_barrier(); }
 #endif // __HIP_DEVICE_COMPILE__
-// warp vote function __all __any __ballot
-__device__
-inline
-int __all(int predicate) {
-    return __ockl_wfall_i32(predicate);
-}
-__device__
-inline
-int __any(int predicate) {
-    return __ockl_wfany_i32(predicate);
-}
-// XXX from llvm/include/llvm/IR/InstrTypes.h
-#define ICMP_NE 33
-__device__
-inline
-unsigned long long int __ballot(int predicate) {
-    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
-}
-__device__
-inline
-unsigned long long int __ballot64(int predicate) {
-    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
-}
 // hip.amdgcn.bc - lanemask
 __device__
 inline
@@ -877,6 +845,10 @@ int __syncthreads_or(int predicate)
 #if (defined(__GFX10__) || defined(__GFX11__))
   #define HW_ID_WGP_ID_SIZE   4
   #define HW_ID_WGP_ID_OFFSET 10
+  #if (defined(__AMDGCN_CUMODE__))
+    #define HW_ID_CU_ID_SIZE    1
+    #define HW_ID_CU_ID_OFFSET  8
+  #endif
 #else
   #define HW_ID_CU_ID_SIZE    4
   #define HW_ID_CU_ID_OFFSET  8
@@ -933,6 +905,10 @@ unsigned __smid(void)
             GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
       unsigned sa_id = __builtin_amdgcn_s_getreg(
             GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
+      #if (defined(__AMDGCN_CUMODE__))
+        unsigned cu_id = __builtin_amdgcn_s_getreg(
+            GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
+      #endif
     #else
       #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
       unsigned xcc_id = __builtin_amdgcn_s_getreg(
@@ -945,6 +921,9 @@ unsigned __smid(void)
       unsigned temp = se_id;
       temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
       temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
+      #if (defined(__AMDGCN_CUMODE__))
+        temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
+      #endif
       return temp;
       //TODO : CU Mode impl
     #elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))

triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h CHANGED Viewed

@@ -612,11 +612,17 @@ float atomicMin(float* addr, float val) {
 #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
   return unsafeAtomicMin(addr, val);
 #else
+  typedef union u_hold {
+    float a;
+    unsigned int b;
+  } u_hold_t;
+  u_hold_t u{val};
+  bool neg_zero = 0x80000000U == u.b;
   #if __has_builtin(__hip_atomic_load) && \
       __has_builtin(__hip_atomic_compare_exchange_strong)
   float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   bool done = false;
-  while (!done && value > val) {
+  while (!done && (value > val || (neg_zero && value == 0.0f))) {
     done = __hip_atomic_compare_exchange_strong(addr, &value, val,
                __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   }
@@ -625,7 +631,7 @@ float atomicMin(float* addr, float val) {
   unsigned int *uaddr = (unsigned int *)addr;
   unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
   bool done = false;
-  while (!done && __uint_as_float(value) > val) {
+  while (!done && (__uint_as_float(value) > val || (neg_zero && __uint_as_float(value) == 0.0f))) {
     done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
   }
@@ -658,11 +664,17 @@ double atomicMin(double* addr, double val) {
 #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
   return unsafeAtomicMin(addr, val);
 #else
+  typedef union u_hold {
+    double a;
+    unsigned long long b;
+  } u_hold_t;
+  u_hold_t u{val};
+  bool neg_zero = 0x8000000000000000ULL == u.b;
   #if __has_builtin(__hip_atomic_load) && \
       __has_builtin(__hip_atomic_compare_exchange_strong)
   double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   bool done = false;
-  while (!done && value > val) {
+  while (!done && (value > val || (neg_zero && value == 0.0)))  {
     done = __hip_atomic_compare_exchange_strong(addr, &value, val,
                __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   }
@@ -671,7 +683,8 @@ double atomicMin(double* addr, double val) {
   unsigned long long *uaddr = (unsigned long long *)addr;
   unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
   bool done = false;
-  while (!done && __longlong_as_double(value) > val) {
+  while (!done &&
+         (__longlong_as_double(value) > val || (neg_zero && __longlong_as_double(value) == 0.0))) {
     done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
   }
@@ -856,11 +869,17 @@ float atomicMax(float* addr, float val) {
 #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
   return unsafeAtomicMax(addr, val);
 #else
+  typedef union u_hold {
+    float a;
+    unsigned int b;
+  } u_hold_t;
+  u_hold_t u{val};
+  bool neg_zero = 0x80000000U == u.b;
   #if __has_builtin(__hip_atomic_load) && \
       __has_builtin(__hip_atomic_compare_exchange_strong)
   float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   bool done = false;
-  while (!done && value < val) {
+  while (!done && (value < val || (neg_zero && value == 0.0f))) {
     done = __hip_atomic_compare_exchange_strong(addr, &value, val,
                __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   }
@@ -869,7 +888,7 @@ float atomicMax(float* addr, float val) {
   unsigned int *uaddr = (unsigned int *)addr;
   unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
   bool done = false;
-  while (!done && __uint_as_float(value) < val) {
+  while (!done && (__uint_as_float(value) < val || (neg_zero && __uint_as_float(value) == 0.0f))) {
     done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
   }
@@ -902,11 +921,17 @@ double atomicMax(double* addr, double val) {
 #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
   return unsafeAtomicMax(addr, val);
 #else
+  typedef union u_hold {
+    double a;
+    unsigned long long b;
+  } u_hold_t;
+  u_hold_t u{val};
+  bool neg_zero = 0x8000000000000000ULL == u.b;
   #if __has_builtin(__hip_atomic_load) && \
       __has_builtin(__hip_atomic_compare_exchange_strong)
   double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   bool done = false;
-  while (!done && value < val) {
+  while (!done && (value < val || (neg_zero && value == 0.0))) {
     done = __hip_atomic_compare_exchange_strong(addr, &value, val,
                __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   }
@@ -915,7 +940,8 @@ double atomicMax(double* addr, double val) {
   unsigned long long *uaddr = (unsigned long long *)addr;
   unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
   bool done = false;
-  while (!done && __longlong_as_double(value) < val) {
+  while (!done &&
+         (__longlong_as_double(value) < val || (neg_zero && __longlong_as_double(value) == 0.0))) {
     done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
   }
@@ -977,7 +1003,7 @@ unsigned int atomicDec(unsigned int* address, unsigned int val)
 #else
   return __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");
 #endif // __gfx941__
 }
 __device__