triton-windows 3.2.0.post11__cp312-cp312-win_amd64.whl → 3.3.0a0.post11__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of triton-windows might be problematic.
- triton/_C/libtriton.pyd +0 -0
- triton/__init__.py +3 -3
- triton/_internal_testing.py +59 -4
- triton/_utils.py +35 -0
- triton/backends/amd/compiler.py +121 -74
- triton/backends/amd/driver.py +77 -43
- triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +28 -49
- triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +35 -9
- triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +761 -284
- triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +9 -3
- triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +1391 -0
- triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +3 -3
- triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +44 -0
- triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +288 -0
- triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +110 -14
- triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +504 -103
- triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +2 -1
- triton/backends/amd/include/hip/amd_detail/host_defines.h +4 -0
- triton/backends/amd/include/hip/hip_ext.h +4 -2
- triton/backends/amd/include/hip/hip_fp8.h +33 -0
- triton/backends/amd/include/hip/hip_runtime_api.h +375 -33
- triton/backends/amd/include/hip/hip_version.h +3 -3
- triton/backends/amd/include/hip/hiprtc.h +25 -25
- triton/backends/amd/include/hsa/amd_hsa_elf.h +40 -14
- triton/backends/amd/include/hsa/hsa.h +11 -2
- triton/backends/amd/include/hsa/hsa_api_trace.h +30 -17
- triton/backends/amd/include/hsa/hsa_api_trace_version.h +68 -0
- triton/backends/amd/include/hsa/hsa_ext_amd.h +83 -27
- triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +46 -46
- triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +416 -0
- triton/backends/amd/include/roctracer/hip_ostream_ops.h +84 -4
- triton/backends/amd/include/roctracer/hsa_ostream_ops.h +260 -0
- triton/backends/amd/include/roctracer/hsa_prof_str.h +51 -19
- triton/backends/amd/lib/asanrtl.bc +0 -0
- triton/backends/compiler.py +25 -225
- triton/backends/driver.py +7 -2
- triton/backends/nvidia/bin/ptxas.exe +0 -0
- triton/backends/nvidia/compiler.py +135 -90
- triton/backends/nvidia/driver.c +0 -1
- triton/backends/nvidia/driver.py +135 -49
- triton/backends/nvidia/include/cuda.h +2162 -241
- triton/backends/nvidia/lib/x64/cuda.lib +0 -0
- triton/compiler/__init__.py +2 -2
- triton/compiler/code_generator.py +334 -231
- triton/compiler/compiler.py +77 -66
- triton/language/__init__.py +22 -5
- triton/language/core.py +448 -74
- triton/language/extra/cuda/_experimental_tma.py +3 -5
- triton/language/math.py +1 -1
- triton/language/random.py +2 -1
- triton/language/semantic.py +206 -52
- triton/language/standard.py +35 -18
- triton/runtime/_allocation.py +32 -0
- triton/runtime/autotuner.py +27 -32
- triton/runtime/build.py +1 -48
- triton/runtime/cache.py +6 -6
- triton/runtime/errors.py +10 -0
- triton/runtime/interpreter.py +179 -45
- triton/runtime/jit.py +149 -190
- triton/testing.py +39 -11
- triton/tools/compile.py +27 -20
- triton/tools/{compile.c → extra/cuda/compile.c} +1 -0
- triton/tools/mxfp.py +301 -0
- {triton_windows-3.2.0.post11.dist-info → triton_windows-3.3.0a0.post11.dist-info}/METADATA +5 -2
- {triton_windows-3.2.0.post11.dist-info → triton_windows-3.3.0a0.post11.dist-info}/RECORD +68 -59
- {triton_windows-3.2.0.post11.dist-info → triton_windows-3.3.0a0.post11.dist-info}/top_level.txt +2 -0
- /triton/tools/{compile.h → extra/cuda/compile.h} +0 -0
- {triton_windows-3.2.0.post11.dist-info → triton_windows-3.3.0a0.post11.dist-info}/WHEEL +0 -0
triton/_C/libtriton.pyd
CHANGED
Binary file
triton/__init__.py
CHANGED
@@ -1,5 +1,5 @@
 """isort:skip_file"""
-__version__ = '3.2.0'
+__version__ = '3.3.0'

 # Users may not know how to add cl and CUDA to PATH. Let's do it before loading anything
 import os
@@ -32,6 +32,7 @@ from .runtime import (
 from .runtime.jit import jit
 from .compiler import compile, CompilationError
 from .errors import TritonError
+from .runtime._allocation import set_allocator

 from . import language
 from . import testing
@@ -44,7 +45,6 @@ __all__ = [
     "compile",
     "Config",
     "heuristics",
-    "impl",
     "InterpreterError",
     "jit",
     "JITFunction",
@@ -52,10 +52,10 @@ __all__ = [
     "language",
     "MockTensor",
     "next_power_of_2",
-    "ops",
     "OutOfResources",
     "reinterpret",
     "runtime",
+    "set_allocator",
     "TensorWrapper",
     "TritonError",
     "testing",
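Note: `set_allocator` (re-exported above from `triton.runtime._allocation`) lets user code supply the device buffers Triton requests at launch time. A minimal sketch, assuming the 3.3 callback shape of (size, alignment, stream) returning a device buffer; the allocator name is illustrative:

import torch
import triton

def my_allocator(size: int, alignment: int, stream):
    # torch.empty on CUDA returns allocations that satisfy typical
    # alignment requests, so the raw byte buffer can be handed back.
    return torch.empty(size, dtype=torch.int8, device="cuda")

triton.set_allocator(my_allocator)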
triton/_internal_testing.py
CHANGED
@@ -4,16 +4,18 @@ import numpy as np
 import torch
 import triton
 import triton.language as tl
+from triton.backends.nvidia.compiler import _path_to_binary
 import pytest

 from numpy.random import RandomState
 from typing import Optional, Union
-from triton.runtime.jit import TensorWrapper, reinterpret
+from triton.runtime.jit import TensorWrapper, reinterpret, type_canonicalisation_dict

 int_dtypes = ['int8', 'int16', 'int32', 'int64']
 uint_dtypes = ['uint8', 'uint16', 'uint32', 'uint64']
 integral_dtypes = int_dtypes + uint_dtypes
 float_dtypes = ['float16', 'float32', 'float64']
+float_dtypes_with_bfloat16 = float_dtypes + ['bfloat16']
 dtypes = integral_dtypes + float_dtypes
 dtypes_with_bfloat16 = dtypes + ['bfloat16']
 torch_float8_dtypes = ['float8_e4m3fn', 'float8_e5m2']
@@ -35,11 +37,45 @@ def is_cuda():
     return False if target is None else target.backend == "cuda"


+def is_hopper():
+    return is_cuda() and torch.cuda.get_device_capability()[0] >= 9
+
+
 def is_hip():
     target = get_current_target()
     return False if target is None else target.backend == "hip"


+def is_hip_mi200():
+    target = get_current_target()
+    if target is None or target.backend != 'hip':
+        return False
+    return target.arch == 'gfx90a'
+
+
+def is_hip_mi300():
+    target = get_current_target()
+    if target is None or target.backend != 'hip':
+        return False
+    return target.arch in ('gfx940', 'gfx941', 'gfx942')
+
+
+def is_hip_mi350():
+    target = get_current_target()
+    if target is None or target.backend != 'hip':
+        return False
+    return target.arch in ('gfx950')
+
+
+def is_hip_cdna():
+    return is_hip_mi200() or is_hip_mi300() or is_hip_mi350()
+
+
+def is_xpu():
+    target = get_current_target()
+    return False if target is None else target.backend == "xpu"
+
+
 def get_arch():
     target = get_current_target()
     return "" if target is None else str(target.arch)
@@ -94,6 +130,10 @@ def to_triton(x: np.ndarray, device, dst_type=None) -> Union[TensorWrapper, torch.Tensor]:
     return torch.tensor(x, device=device)


+def str_to_triton_dtype(x: str) -> tl.dtype:
+    return tl.str_to_ty(type_canonicalisation_dict[x])
+
+
 def torch_dtype_name(dtype) -> str:
     if isinstance(dtype, triton.language.dtype):
         return dtype.name
@@ -116,8 +156,23 @@ def to_numpy(x):
     raise ValueError(f"Not a triton-compatible tensor: {x}")


-def supports_tma():
+def supports_tma(byval_only=False):
+    if is_interpreter():
+        return True
+    if not is_cuda():
+        return False
+    _, cuda_version = _path_to_binary("ptxas")
+    min_cuda_version = (12, 0) if byval_only else (12, 3)
+    cuda_version_tuple = tuple(map(int, cuda_version.split(".")))
+    assert len(cuda_version_tuple) == 2, cuda_version_tuple
+    return torch.cuda.get_device_capability()[0] >= 9 and cuda_version_tuple >= min_cuda_version
+
+
+def tma_skip_msg(byval_only=False):
+    if byval_only:
+        return "Requires __grid_constant__ TMA support (NVIDIA Hopper or higher, CUDA 12.0 or higher)"
+    else:
+        return "Requires advanced TMA support (NVIDIA Hopper or higher, CUDA 12.3 or higher)"


-requires_tma = pytest.mark.skipif(not supports_tma(), reason=
+requires_tma = pytest.mark.skipif(not supports_tma(), reason=tma_skip_msg())
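For reference, the new gate in `supports_tma` compares the `ptxas` version as an integer tuple; a small illustration with made-up values:

# How the CUDA version check behaves (values are hypothetical).
cuda_version = "12.4"  # e.g. parsed from `ptxas --version`
cuda_version_tuple = tuple(map(int, cuda_version.split(".")))  # (12, 4)

assert cuda_version_tuple >= (12, 3)  # advanced TMA path (byval_only=False)
assert cuda_version_tuple >= (12, 0)  # __grid_constant__ path (byval_only=True)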
triton/_utils.py
ADDED
@@ -0,0 +1,35 @@
+from functools import reduce
+
+
+def get_iterable_path(iterable, path):
+    return reduce(lambda a, idx: a[idx], path, iterable)
+
+
+def set_iterable_path(iterable, path, val):
+    prev = iterable if len(path) == 1 else get_iterable_path(iterable, path[:-1])
+    prev[path[-1]] = val
+
+
+def find_paths_if(iterable, pred):
+    from .language import core
+    is_iterable = lambda x: isinstance(x, (list, tuple, core.tuple, core.tuple_type))
+    ret = dict()
+
+    def _impl(current, path):
+        path = (path[0], ) if len(path) == 1 else tuple(path)
+        if is_iterable(current):
+            for idx, item in enumerate(current):
+                _impl(item, path + (idx, ))
+        elif pred(path, current):
+            if len(path) == 1:
+                ret[(path[0], )] = None
+            else:
+                ret[tuple(path)] = None
+
+    if is_iterable(iterable):
+        _impl(iterable, [])
+    elif pred(list(), iterable):
+        ret = {tuple(): None}
+    else:
+        ret = dict()
+    return list(ret.keys())
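These helpers address nested argument structures by index paths (tuples of indices); a quick sketch of their behavior on plain lists and tuples:

from triton._utils import find_paths_if, get_iterable_path, set_iterable_path

nested = [1, ("a", 2), [3, "b"]]

# Index paths of every leaf matching the predicate:
print(find_paths_if(nested, lambda path, x: isinstance(x, str)))  # [(1, 0), (2, 1)]

print(get_iterable_path(nested, (1, 0)))  # 'a'
set_iterable_path(nested, (2, 1), "c")    # nested becomes [1, ('a', 2), [3, 'c']]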
triton/backends/amd/compiler.py
CHANGED
@@ -1,4 +1,4 @@
-from triton.backends.compiler import BaseBackend, GPUTarget, AttrsDescriptor, register_descriptor
+from triton.backends.compiler import BaseBackend, GPUTarget
 from triton._C.libtriton import ir, passes, llvm, amd
 from dataclasses import dataclass
 from typing import Any, Dict, Tuple
@@ -13,16 +13,13 @@ from pathlib import Path


 def min_dot_size(target: GPUTarget):
-    return lambda lhsType, rhsType: (16, 16, 8)
-    # Other architectures will only support 16,16,16
-    return lambda lhsType, rhsType: (16, 16, 16)
+    # If some given configuration is not supported in hardware we fallback to FMA and cast arguments
+    return lambda lhsType, rhsType: (1, 1, 1)
+
+
+def is_pingpong_enabled(arch):
+    default = "1" if arch == "gfx942" else "0"
+    return os.getenv("TRITON_HIP_USE_BLOCK_PINGPONG", default) == "1"


 @dataclass(frozen=True)
@@ -31,10 +28,6 @@ class HIPOptions:
     waves_per_eu: int = 1
     num_stages: int = 2
     num_ctas: int = 1
-    num_buffers_warp_spec: int = 0
-    num_consumer_groups: int = 0
-    reg_dec_producer: int = 0
-    reg_inc_consumer: int = 0
     extern_libs: dict = None
     cluster_dims: tuple = (1, 1, 1)
     debug: bool = False
@@ -45,6 +38,7 @@ class HIPOptions:
     default_dot_input_precision: str = "ieee"
     allowed_dot_input_precisions: Tuple[str] = ("ieee", )
     enable_fp_fusion: bool = True
+    launch_cooperative_grid: bool = False
     matrix_instr_nonkdim: int = 0
     kpack: int = 1
     allow_flush_denorm: bool = False
@@ -52,11 +46,23 @@ class HIPOptions:
     backend_name: str = 'hip'

     # The following option provides hints to the AMDGPU backend regarding instruction scheduling
-    # for all `tt.dot` operations in a kernel. The "
+    # for all `tt.dot` operations in a kernel. The "none" variant preserves the default
     # instruction scheduling of the AMDGPU backend which aims at maximizing occupancy.
     # The option is experimental and may change at any time regarding its semantics and/or may
     # be gone entirely anytime.
+    #
+    # Current experimental scheduling variants:
+    #
+    # llvm-iglp-0: injects `llvm.amdgcn.iglp_opt` intrinsic call with value `0` to the GEMM's
+    #              k-loop; i.e., "interleave DS and MFMA instructions for small GEMM kernels".
+    # llvm-iglp-1: injects `llvm.amdgcn.iglp_opt` intrinsic call with value `1` to the GEMM's
+    #              k-loop; i.e., "interleave DS and MFMA instructions for single wave small
+    #              GEMM kernels.".
+    # local-prefetch: implements instruction scheduling similar to the one from the ROCm Composable
+    #                 Kernel library. Note, this variant requires the use of buffer load/store ops
+    #                 and a special software pipelining style - i.e., 1x LDS and 1x register
+    #                 prefetch buffers for each GEMM tile.
+    instruction_sched_variant: str = 'none'

     def __post_init__(self):
         default_libdir = Path(__file__).parent / 'lib'
@@ -64,6 +70,9 @@ class HIPOptions:
         # Ignore user-defined warp size for gfx9
         warp_size = 32 if 'gfx10' in self.arch or 'gfx11' in self.arch or 'gfx12' in self.arch else 64
         object.__setattr__(self, 'warp_size', warp_size)
+        # Only kpack=1 is supported on gfx950
+        kpack = 1 if self.arch == 'gfx950' else self.kpack
+        object.__setattr__(self, 'kpack', kpack)
         libs = ["ocml", "ockl"]
         for lib in libs:
             extern_libs[lib] = str(default_libdir / f'{lib}.bc')
@@ -76,44 +85,6 @@ class HIPOptions:
         return hashlib.sha256(key.encode("utf-8")).hexdigest()


-@register_descriptor
-class HIPAttrsDescriptor(AttrsDescriptor):
-    # This property asserts if the underlying storage area of a given pointer
-    # can be resepresented as a 32 bit integer. When this is true, we can be
-    # sure that all indices into the tensor behind that pointer can use 32-bit
-    # indexing. That opens the door for the AMD backend to use buffer load/store
-    # instrinsics, which requires this property. Buffer load/store intrinsics
-    # gives direct out-of-bound support and simplifies index calculation for
-    # lower register pressure.
-    __slots__ = ("pointer_range_32")
-
-    def _add_backend_properties(self, params=None, values=None):
-        self.property_values["tt.pointer_range"] = 32
-        if params is None or values is None:
-            return
-
-        self.arg_properties["tt.pointer_range"] = [
-            param.num for param, arg in zip(params, values) if HIPAttrsDescriptor.is_within2gb(arg)
-            and not param.do_not_specialize and not param.do_not_specialize_on_alignment
-        ]
-
-    @staticmethod
-    def is_within2gb(arg):
-        if hasattr(arg, "ptr_range"):
-            return arg.ptr_range() <= 2**31 - 1
-        if "torch.Tensor" in str(type(arg)) and hasattr(arg, "untyped_storage"):
-            # Please note that 2**31-1 is the max int32 positive limit
-            return arg.untyped_storage().size() <= 2**31 - 1
-        return False
-
-    @staticmethod
-    def get_property_key(val, align):
-        generic_key = AttrsDescriptor.get_property_key(val, align)
-        hip_key = "S" if HIPAttrsDescriptor.is_within2gb(val) else "N"
-        key = (generic_key + hip_key).replace("N", "")
-        return key if key else "N"
-
-
 class HIPBackend(BaseBackend):

     @staticmethod
@@ -126,17 +97,25 @@ class HIPBackend(BaseBackend):
         self.binary_ext = "hsaco"

     def parse_options(self, opts) -> Any:
-        args = {'arch': self.target.arch}
+        args = {'arch': os.getenv("TRITON_OVERRIDE_ARCH", self.target.arch)}
+
+        # Enable XF32 (TF32) for CDNA3 GPUs
+        if self.target.arch in ('gfx940', 'gfx941', 'gfx942'):
+            allowed_dot_input_precisions = set(HIPOptions.allowed_dot_input_precisions)
+            allowed_dot_input_precisions.update({'tf32'})
+            args["allowed_dot_input_precisions"] = tuple(sorted(allowed_dot_input_precisions))

         if "supported_fp8_dtypes" not in opts:
             supported_fp8_dtypes = set(HIPOptions.supported_fp8_dtypes)
             if self.target.arch in ('gfx940', 'gfx941', 'gfx942'):
-                supported_fp8_dtypes.update({'fp8e4b8', 'fp8e5b16'})
+                supported_fp8_dtypes.update({'fp8e4nv', 'fp8e4b8', 'fp8e5b16'})
+            elif self.target.arch in ('gfx950'):
+                supported_fp8_dtypes.update({'fp8e4nv', 'fp8e5'})
             args["supported_fp8_dtypes"] = tuple(sorted(supported_fp8_dtypes))

         if "enable_fp_fusion" not in opts:
             args["enable_fp_fusion"] = os.getenv("TRITON_DEFAULT_FP_FUSION", "1") == "1"
-        args.update({k: opts[k] for k in HIPOptions.__dataclass_fields__.keys() if k in opts})
+        args.update({k: opts[k] for k in HIPOptions.__dataclass_fields__.keys() if k in opts and opts[k] is not None})
         return HIPOptions(**args)

     def pack_metadata(self, metadata):
@@ -149,23 +128,49 @@ class HIPBackend(BaseBackend):
             metadata.cluster_dims[2],
         )

-    def get_codegen_implementation(self):
+    def get_codegen_implementation(self, options):
         codegen_fns = {"min_dot_size": min_dot_size(self.target)}
         return codegen_fns

     def get_module_map(self) -> Dict[str, ModuleType]:
         from triton.language.extra.hip import libdevice
+
         return {"triton.language.extra.libdevice": libdevice}

     def load_dialects(self, ctx):
         amd.load_dialects(ctx)

+    @staticmethod
+    @functools.lru_cache()
+    def use_buffer_ops():
+        return os.environ.get("AMDGCN_USE_BUFFER_OPS", "0") == "1"
+
+    @staticmethod
+    def is_within_2gb(arg):
+        import torch
+
+        MAX_INT_32 = 2**31 - 1
+        if hasattr(arg, "ptr_range"):
+            return arg.ptr_range() <= MAX_INT_32
+        if isinstance(arg, torch.Tensor) and hasattr(arg, "untyped_storage"):
+            return arg.untyped_storage().size() <= MAX_INT_32
+        return False
+
+    @staticmethod
+    def parse_attr(desc):
+        ret = BaseBackend.parse_attr(desc)
+        if "S" in desc:
+            ret += [["tt.pointer_range", 32]]
+        return ret

     @staticmethod
-    def
+    def get_arg_specialization(arg, ty, **kwargs):
+        ret = BaseBackend.get_arg_specialization(arg, ty, **kwargs)
+        # Only attempt to do buffer ops specialization if buffer ops are enabled.
+        # Otherwise the is_within_2gb check is unnecessary overhead.
+        if HIPBackend.use_buffer_ops() and ty == "tensor" and HIPBackend.is_within_2gb(arg):
+            ret += "S"
+        return ret

     @staticmethod
     def path_to_rocm_lld():
@@ -193,8 +198,8 @@ class HIPBackend(BaseBackend):
         pm.enable_debug()
         passes.common.add_inliner(pm)
         passes.ttir.add_rewrite_tensor_pointer(pm)
-        passes.ttir.add_combine(pm)
         passes.common.add_canonicalizer(pm)
+        passes.ttir.add_combine(pm)
         passes.ttir.add_reorder_broadcast(pm)
         passes.common.add_cse(pm)
         passes.common.add_licm(pm)
@@ -219,24 +224,38 @@ class HIPBackend(BaseBackend):
         passes.ttgpuir.add_remove_layout_conversions(pm)
         amd.passes.ttgpuir.add_optimize_epilogue(pm)
         passes.ttgpuir.add_optimize_dot_operands(pm, True)
+        amd.passes.ttgpuir.add_hoist_layout_conversions(pm)
+
+        global_prefetch = int(os.getenv("TRITON_HIP_GLOBAL_PREFETCH", "0"))
+        local_prefetch = int(os.getenv("TRITON_HIP_LOCAL_PREFETCH", "0"))
+
+        # The `local-prefetch` scheduling variant requires turning on buffer ops.
+        if options.instruction_sched_variant == "local-prefetch":
+            global_prefetch = local_prefetch = 1
+
         if amd.has_matrix_core_feature(options.arch):
             assert options.num_stages != 0, ("Triton AMD backend pipeliner has been updated. "
                                              "We used to trigger software pipelining with "
                                              "num_stages == 0. Now it will not happen anymore; "
                                              "please update to use num_stages == 2 for "
                                              "equivalent behavior in the past.")
-            amd.passes.ttgpuir.
+            amd.passes.ttgpuir.add_stream_pipeline(pm, options.num_stages, global_prefetch, local_prefetch)
             passes.common.add_canonicalizer(pm)
+        if options.instruction_sched_variant.lower() != "none":
+            amd.passes.ttgpuir.insert_instruction_sched_hints(pm, options.instruction_sched_variant)
         passes.ttgpuir.add_optimize_dot_operands(pm, True)
         passes.ttgpuir.add_remove_layout_conversions(pm)
         passes.ttgpuir.add_reduce_data_duplication(pm)
         if amd.has_matrix_core_feature(options.arch):
             amd.passes.ttgpuir.add_reorder_instructions(pm)
+            use_block_pingpong = is_pingpong_enabled(options.arch)
+            if use_block_pingpong and options.num_stages == 2:
+                amd.passes.ttgpuir.add_block_pingpong(pm)
+
+        if HIPBackend.use_buffer_ops():
             amd.passes.ttgpuir.add_canonicalize_pointers(pm)
             passes.common.add_canonicalizer(pm)
-            amd.passes.ttgpuir.add_convert_to_buffer_ops(pm)
+            amd.passes.ttgpuir.add_convert_to_buffer_ops(pm, options.arch)
         passes.common.add_canonicalizer(pm)
         passes.common.add_cse(pm)
         passes.common.add_symbol_dce(pm)
@@ -278,7 +297,8 @@ class HIPBackend(BaseBackend):
         passes.common.add_canonicalizer(pm)
         passes.common.add_cse(pm)
         passes.common.add_symbol_dce(pm)
+        if options.instruction_sched_variant.lower() != "none":
+            amd.passes.ttgpuir.lower_instruction_sched_hints(pm, options.arch, options.num_stages)
         if os.environ.get("TRITON_DISABLE_LINE_INFO", "0") == "0":
             passes.llvmir.add_di_scope(pm)
         amd.passes.ttgpuir.add_builtin_func_to_llvmir(pm, __HIP_FTZ)
@@ -289,12 +309,15 @@ class HIPBackend(BaseBackend):
         context = llvm.context()
         llvm_mod = llvm.to_module(mod, context)
         amd.attach_target_triple(llvm_mod)
+        target_features = ''
+        if os.environ.get("TRITON_ENABLE_ASAN", "0") == "1":
+            target_features = '+xnack'
+        llvm.attach_datalayout(llvm_mod, amd.TARGET_TRIPLE, options.arch, target_features)

         # Set various control constants on the LLVM module so that device
         # libraries can resolve references to them.
         amd.set_isa_version(llvm_mod, options.arch)
-        amd.set_abi_version(llvm_mod,
+        amd.set_abi_version(llvm_mod, 500)
         amd.set_bool_control_constant(llvm_mod, "__oclc_finite_only_opt", False)
         amd.set_bool_control_constant(llvm_mod, "__oclc_correctly_rounded_sqrt32", True)
         amd.set_bool_control_constant(llvm_mod, "__oclc_unsafe_math_opt", False)
@@ -305,25 +328,46 @@ class HIPBackend(BaseBackend):
         # The public kernel should be kernel 0.
         fns[0].set_calling_conv(amd.CALLING_CONV_AMDGPU_KERNEL)
         fns[0].add_fn_attr("amdgpu-flat-work-group-size", f"1,{options.num_warps*options.warp_size}")
+        # LLVM AMDGPU backend supports the attribute "amdgpu-waves-per-eu"="<min>[, <max>]".
+        # This attribute may be attached to a kernel function definition and is an optimization hint.
+        # <min> parameter specifies the requested minimum number of waves per EU, and optional <max> parameter
+        # specifies the requested maximum number of waves per EU (must be greater than <min> if specified).
+        # If <max> is omitted, then there is no restriction on the maximum number of waves per EU other than
+        # the one dictated by the hardware for which the kernel is compiled. Passing 0, 0 as <min>, <max>
+        # implies the default behavior (no limits).
         fns[0].add_fn_attr("amdgpu-waves-per-eu", f"{options.waves_per_eu}")
         denormal_mode = "preserve-sign" if options.allow_flush_denorm else "ieee"
         fns[0].add_fn_attr("denormal-fp-math-f32", denormal_mode)
+        if os.environ.get("TRITON_ENABLE_ASAN", "0") == "1":
+            fns[0].add_fn_target_feature("+xnack")
+            fns[0].add_fn_asan_attr()

         # Hint the compiler that we'd like the firmware to set the kernel arguments
         # to user SGPRs so that the kernel does not need to s_load its arguments
         # from memory.
         amd.set_all_fn_arg_inreg(fns[0])

-        if options.extern_libs:
+        if os.environ.get("TRITON_ENABLE_ASAN", "0") == "1":
+            default_libdir = Path(__file__).parent / 'lib'
+            paths = [
+                str(default_libdir / 'asanrtl.bc'),
+                str(default_libdir / "ocml.bc"),
+                str(default_libdir / "ockl.bc")
+            ]
+            llvm.link_extern_libs(llvm_mod, paths)
+        elif options.extern_libs:
             paths = [path for (name, path) in options.extern_libs if amd.need_extern_lib(llvm_mod, name)]
             llvm.link_extern_libs(llvm_mod, paths)

         llvm.optimize_module(llvm_mod, llvm.OPTIMIZE_O3, options.arch, '', [], options.enable_fp_fusion)

         # Get some metadata
-        metadata["shared"] = src.get_int_attr("
+        metadata["shared"] = src.get_int_attr("ttg.shared")

         amd.cleanup_bitcode_metadata(llvm_mod)
+        # Disable inlining of print related functions,
+        # because inlining of these function could slow down compilation significantly
+        amd.disable_print_inline(llvm_mod)
         return str(llvm_mod)

     @staticmethod
@@ -343,7 +387,10 @@ class HIPBackend(BaseBackend):

     @staticmethod
     def make_hsaco(src, metadata, options):
+        target_features = ''
+        if os.environ.get("TRITON_ENABLE_ASAN", "0") == "1":
+            target_features = '+xnack'
+        hsaco = amd.assemble_amdgcn(src, options.arch, target_features)

         rocm_path = HIPBackend.path_to_rocm_lld()
         with tempfile.NamedTemporaryFile() as tmp_out: