triton-windows 3.2.0.post11__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of triton-windows might be problematic.

Files changed (154)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +85 -0
  3. triton/_internal_testing.py +123 -0
  4. triton/backends/__init__.py +50 -0
  5. triton/backends/amd/compiler.py +368 -0
  6. triton/backends/amd/driver.c +211 -0
  7. triton/backends/amd/driver.py +512 -0
  8. triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h +358 -0
  9. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +1031 -0
  10. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +1612 -0
  11. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +1337 -0
  12. triton/backends/amd/include/hip/amd_detail/amd_hip_bfloat16.h +293 -0
  13. triton/backends/amd/include/hip/amd_detail/amd_hip_common.h +32 -0
  14. triton/backends/amd/include/hip/amd_detail/amd_hip_complex.h +174 -0
  15. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +829 -0
  16. triton/backends/amd/include/hip/amd_detail/amd_hip_fp16.h +1809 -0
  17. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +108 -0
  18. triton/backends/amd/include/hip/amd_detail/amd_hip_math_constants.h +124 -0
  19. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h +405 -0
  20. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +196 -0
  21. triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +565 -0
  22. triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h +2226 -0
  23. triton/backends/amd/include/hip/amd_detail/amd_math_functions.h +104 -0
  24. triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h +244 -0
  25. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +494 -0
  26. triton/backends/amd/include/hip/amd_detail/concepts.hpp +30 -0
  27. triton/backends/amd/include/hip/amd_detail/device_library_decls.h +133 -0
  28. triton/backends/amd/include/hip/amd_detail/functional_grid_launch.hpp +218 -0
  29. triton/backends/amd/include/hip/amd_detail/grid_launch.h +67 -0
  30. triton/backends/amd/include/hip/amd_detail/grid_launch.hpp +50 -0
  31. triton/backends/amd/include/hip/amd_detail/grid_launch_GGL.hpp +26 -0
  32. triton/backends/amd/include/hip/amd_detail/helpers.hpp +137 -0
  33. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +1350 -0
  34. triton/backends/amd/include/hip/amd_detail/hip_assert.h +101 -0
  35. triton/backends/amd/include/hip/amd_detail/hip_cooperative_groups_helper.h +242 -0
  36. triton/backends/amd/include/hip/amd_detail/hip_fp16_gcc.h +254 -0
  37. triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h +96 -0
  38. triton/backends/amd/include/hip/amd_detail/hip_ldg.h +100 -0
  39. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +10169 -0
  40. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +77 -0
  41. triton/backends/amd/include/hip/amd_detail/host_defines.h +180 -0
  42. triton/backends/amd/include/hip/amd_detail/hsa_helpers.hpp +102 -0
  43. triton/backends/amd/include/hip/amd_detail/macro_based_grid_launch.hpp +798 -0
  44. triton/backends/amd/include/hip/amd_detail/math_fwd.h +698 -0
  45. triton/backends/amd/include/hip/amd_detail/ockl_image.h +177 -0
  46. triton/backends/amd/include/hip/amd_detail/program_state.hpp +107 -0
  47. triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h +491 -0
  48. triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h +478 -0
  49. triton/backends/amd/include/hip/channel_descriptor.h +39 -0
  50. triton/backends/amd/include/hip/device_functions.h +38 -0
  51. triton/backends/amd/include/hip/driver_types.h +468 -0
  52. triton/backends/amd/include/hip/hip_bf16.h +36 -0
  53. triton/backends/amd/include/hip/hip_bfloat16.h +44 -0
  54. triton/backends/amd/include/hip/hip_common.h +100 -0
  55. triton/backends/amd/include/hip/hip_complex.h +38 -0
  56. triton/backends/amd/include/hip/hip_cooperative_groups.h +46 -0
  57. triton/backends/amd/include/hip/hip_deprecated.h +95 -0
  58. triton/backends/amd/include/hip/hip_ext.h +159 -0
  59. triton/backends/amd/include/hip/hip_fp16.h +36 -0
  60. triton/backends/amd/include/hip/hip_gl_interop.h +32 -0
  61. triton/backends/amd/include/hip/hip_hcc.h +24 -0
  62. triton/backends/amd/include/hip/hip_math_constants.h +36 -0
  63. triton/backends/amd/include/hip/hip_profile.h +27 -0
  64. triton/backends/amd/include/hip/hip_runtime.h +75 -0
  65. triton/backends/amd/include/hip/hip_runtime_api.h +8919 -0
  66. triton/backends/amd/include/hip/hip_texture_types.h +29 -0
  67. triton/backends/amd/include/hip/hip_vector_types.h +41 -0
  68. triton/backends/amd/include/hip/hip_version.h +17 -0
  69. triton/backends/amd/include/hip/hiprtc.h +421 -0
  70. triton/backends/amd/include/hip/library_types.h +78 -0
  71. triton/backends/amd/include/hip/math_functions.h +42 -0
  72. triton/backends/amd/include/hip/surface_types.h +63 -0
  73. triton/backends/amd/include/hip/texture_types.h +194 -0
  74. triton/backends/amd/include/hsa/Brig.h +1131 -0
  75. triton/backends/amd/include/hsa/amd_hsa_common.h +91 -0
  76. triton/backends/amd/include/hsa/amd_hsa_elf.h +436 -0
  77. triton/backends/amd/include/hsa/amd_hsa_kernel_code.h +269 -0
  78. triton/backends/amd/include/hsa/amd_hsa_queue.h +109 -0
  79. triton/backends/amd/include/hsa/amd_hsa_signal.h +80 -0
  80. triton/backends/amd/include/hsa/hsa.h +5729 -0
  81. triton/backends/amd/include/hsa/hsa_amd_tool.h +91 -0
  82. triton/backends/amd/include/hsa/hsa_api_trace.h +566 -0
  83. triton/backends/amd/include/hsa/hsa_ext_amd.h +3090 -0
  84. triton/backends/amd/include/hsa/hsa_ext_finalize.h +531 -0
  85. triton/backends/amd/include/hsa/hsa_ext_image.h +1454 -0
  86. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +488 -0
  87. triton/backends/amd/include/hsa/hsa_ven_amd_loader.h +667 -0
  88. triton/backends/amd/include/roctracer/ext/prof_protocol.h +107 -0
  89. triton/backends/amd/include/roctracer/hip_ostream_ops.h +4435 -0
  90. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +1467 -0
  91. triton/backends/amd/include/roctracer/hsa_prof_str.h +3027 -0
  92. triton/backends/amd/include/roctracer/roctracer.h +779 -0
  93. triton/backends/amd/include/roctracer/roctracer_ext.h +81 -0
  94. triton/backends/amd/include/roctracer/roctracer_hcc.h +24 -0
  95. triton/backends/amd/include/roctracer/roctracer_hip.h +37 -0
  96. triton/backends/amd/include/roctracer/roctracer_hsa.h +112 -0
  97. triton/backends/amd/include/roctracer/roctracer_plugin.h +137 -0
  98. triton/backends/amd/include/roctracer/roctracer_roctx.h +67 -0
  99. triton/backends/amd/include/roctracer/roctx.h +229 -0
  100. triton/backends/amd/lib/ockl.bc +0 -0
  101. triton/backends/amd/lib/ocml.bc +0 -0
  102. triton/backends/compiler.py +304 -0
  103. triton/backends/driver.py +48 -0
  104. triton/backends/nvidia/__init__.py +0 -0
  105. triton/backends/nvidia/bin/ptxas.exe +0 -0
  106. triton/backends/nvidia/compiler.py +410 -0
  107. triton/backends/nvidia/driver.c +451 -0
  108. triton/backends/nvidia/driver.py +524 -0
  109. triton/backends/nvidia/include/cuda.h +24359 -0
  110. triton/backends/nvidia/lib/libdevice.10.bc +0 -0
  111. triton/backends/nvidia/lib/x64/cuda.lib +0 -0
  112. triton/compiler/__init__.py +4 -0
  113. triton/compiler/code_generator.py +1303 -0
  114. triton/compiler/compiler.py +430 -0
  115. triton/compiler/errors.py +51 -0
  116. triton/compiler/make_launcher.py +0 -0
  117. triton/errors.py +5 -0
  118. triton/language/__init__.py +294 -0
  119. triton/language/_utils.py +21 -0
  120. triton/language/core.py +2694 -0
  121. triton/language/extra/__init__.py +26 -0
  122. triton/language/extra/cuda/__init__.py +13 -0
  123. triton/language/extra/cuda/_experimental_tma.py +108 -0
  124. triton/language/extra/cuda/libdevice.py +1629 -0
  125. triton/language/extra/cuda/utils.py +109 -0
  126. triton/language/extra/hip/__init__.py +3 -0
  127. triton/language/extra/hip/libdevice.py +475 -0
  128. triton/language/extra/libdevice.py +786 -0
  129. triton/language/math.py +250 -0
  130. triton/language/random.py +207 -0
  131. triton/language/semantic.py +1796 -0
  132. triton/language/standard.py +452 -0
  133. triton/runtime/__init__.py +23 -0
  134. triton/runtime/autotuner.py +408 -0
  135. triton/runtime/build.py +111 -0
  136. triton/runtime/cache.py +295 -0
  137. triton/runtime/driver.py +60 -0
  138. triton/runtime/errors.py +26 -0
  139. triton/runtime/interpreter.py +1235 -0
  140. triton/runtime/jit.py +951 -0
  141. triton/testing.py +511 -0
  142. triton/tools/__init__.py +0 -0
  143. triton/tools/build_extern.py +365 -0
  144. triton/tools/compile.c +67 -0
  145. triton/tools/compile.h +14 -0
  146. triton/tools/compile.py +155 -0
  147. triton/tools/disasm.py +144 -0
  148. triton/tools/experimental_descriptor.py +32 -0
  149. triton/tools/link.py +322 -0
  150. triton/windows_utils.py +375 -0
  151. triton_windows-3.2.0.post11.dist-info/METADATA +39 -0
  152. triton_windows-3.2.0.post11.dist-info/RECORD +154 -0
  153. triton_windows-3.2.0.post11.dist-info/WHEEL +5 -0
  154. triton_windows-3.2.0.post11.dist-info/top_level.txt +12 -0
triton/_C/libtriton.pyd ADDED
Binary file
triton/__init__.py ADDED
@@ -0,0 +1,85 @@
+ """isort:skip_file"""
+ __version__ = '3.2.0'
+
+ # Users may not know how to add cl and CUDA to PATH. Let's do it before loading anything
+ import os
+ if os.name == "nt":
+     from .windows_utils import find_cuda, find_msvc_winsdk
+     msvc_winsdk_inc_dirs, _ = find_msvc_winsdk()
+     if msvc_winsdk_inc_dirs:
+         cl_path = msvc_winsdk_inc_dirs[0].replace(r"\include", r"\bin\Hostx64\x64")
+         os.environ["PATH"] = cl_path + os.pathsep + os.environ["PATH"]
+     cuda_bin_path, _, _ = find_cuda()
+     if cuda_bin_path:
+         os.environ["PATH"] = cuda_bin_path + os.pathsep + os.environ["PATH"]
+
+ # ---------------------------------------
+ # Note: import order is significant here.
+
+ # submodules
+ from .runtime import (
+     autotune,
+     Config,
+     heuristics,
+     JITFunction,
+     KernelInterface,
+     reinterpret,
+     TensorWrapper,
+     OutOfResources,
+     InterpreterError,
+     MockTensor,
+ )
+ from .runtime.jit import jit
+ from .compiler import compile, CompilationError
+ from .errors import TritonError
+
+ from . import language
+ from . import testing
+ from . import tools
+
+ __all__ = [
+     "autotune",
+     "cdiv",
+     "CompilationError",
+     "compile",
+     "Config",
+     "heuristics",
+     "impl",
+     "InterpreterError",
+     "jit",
+     "JITFunction",
+     "KernelInterface",
+     "language",
+     "MockTensor",
+     "next_power_of_2",
+     "ops",
+     "OutOfResources",
+     "reinterpret",
+     "runtime",
+     "TensorWrapper",
+     "TritonError",
+     "testing",
+     "tools",
+ ]
+
+ # -------------------------------------
+ # misc. utilities that don't fit well
+ # into any specific module
+ # -------------------------------------
+
+
+ def cdiv(x: int, y: int):
+     return (x + y - 1) // y
+
+
+ def next_power_of_2(n: int):
+     """Return the smallest power of 2 greater than or equal to n"""
+     n -= 1
+     n |= n >> 1
+     n |= n >> 2
+     n |= n >> 4
+     n |= n >> 8
+     n |= n >> 16
+     n |= n >> 32
+     n += 1
+     return n
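
For readers skimming the diff, a minimal sketch of how the two helpers defined at the end of this file behave (illustrative only; this snippet is not part of the wheel):

    import triton

    triton.cdiv(10, 3)          # 4  -- ceiling division: (10 + 3 - 1) // 3
    triton.next_power_of_2(17)  # 32 -- smallest power of 2 >= 17
    triton.next_power_of_2(32)  # 32 -- exact powers of 2 are returned unchanged

Both helpers are commonly used to size launch grids and pick block dimensions for kernels.
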
triton/_internal_testing.py ADDED
@@ -0,0 +1,123 @@
+ import os
+ import re
+ import numpy as np
+ import torch
+ import triton
+ import triton.language as tl
+ import pytest
+
+ from numpy.random import RandomState
+ from typing import Optional, Union
+ from triton.runtime.jit import TensorWrapper, reinterpret
+
+ int_dtypes = ['int8', 'int16', 'int32', 'int64']
+ uint_dtypes = ['uint8', 'uint16', 'uint32', 'uint64']
+ integral_dtypes = int_dtypes + uint_dtypes
+ float_dtypes = ['float16', 'float32', 'float64']
+ dtypes = integral_dtypes + float_dtypes
+ dtypes_with_bfloat16 = dtypes + ['bfloat16']
+ torch_float8_dtypes = ['float8_e4m3fn', 'float8_e5m2']
+ torch_dtypes = ['bool'] + int_dtypes + ['uint8'] + float_dtypes + ['bfloat16']
+
+
+ def is_interpreter():
+     return os.environ.get('TRITON_INTERPRET', '0') == '1'
+
+
+ def get_current_target():
+     if is_interpreter():
+         return None
+     return triton.runtime.driver.active.get_current_target()
+
+
+ def is_cuda():
+     target = get_current_target()
+     return False if target is None else target.backend == "cuda"
+
+
+ def is_hip():
+     target = get_current_target()
+     return False if target is None else target.backend == "hip"
+
+
+ def get_arch():
+     target = get_current_target()
+     return "" if target is None else str(target.arch)
+
+
+ def numpy_random(shape, dtype_str, rs: Optional[RandomState] = None, low=None, high=None):
+     """
+     Override `rs` if you're calling this function twice and don't want the same
+     result for both calls.
+     """
+     if isinstance(shape, int):
+         shape = (shape, )
+     if rs is None:
+         rs = RandomState(seed=17)
+     if dtype_str in int_dtypes + uint_dtypes:
+         iinfo = np.iinfo(getattr(np, dtype_str))
+         low = iinfo.min if low is None else max(low, iinfo.min)
+         high = iinfo.max if high is None else min(high, iinfo.max)
+         dtype = getattr(np, dtype_str)
+         x = rs.randint(low, high, shape, dtype=dtype)
+         x[x == 0] = 1  # Workaround. Never return zero so tests of division don't error out.
+         return x
+     elif dtype_str and 'float8' in dtype_str:
+         x = rs.randint(20, 40, shape, dtype=np.int8)
+         return x
+     elif dtype_str in float_dtypes:
+         return rs.normal(0, 1, shape).astype(dtype_str)
+     elif dtype_str == 'bfloat16':
+         return (rs.normal(0, 1, shape).astype('float32').view('uint32') & np.uint32(0xffff0000)).view('float32')
+     elif dtype_str in ['bool', 'int1', 'bool_']:
+         return rs.normal(0, 1, shape) > 0.0
+     else:
+         raise RuntimeError(f'Unknown dtype {dtype_str}')
+
+
+ def to_triton(x: np.ndarray, device, dst_type=None) -> Union[TensorWrapper, torch.Tensor]:
+     '''
+     Note: We need dst_type because the type of x can be different from dst_type.
+     For example: x is of type `float32`, dst_type is `bfloat16`.
+     If dst_type is None, we infer dst_type from x.
+     '''
+     t = x.dtype.name
+     if t in uint_dtypes:
+         signed_type_name = t.lstrip('u')  # e.g. "uint16" -> "int16"
+         x_signed = x.astype(getattr(np, signed_type_name))
+         return reinterpret(torch.tensor(x_signed, device=device), getattr(tl, t))
+     else:
+         if dst_type and 'float8' in dst_type:
+             return reinterpret(torch.tensor(x, device=device), getattr(tl, dst_type))
+         if t == 'float32' and dst_type == 'bfloat16':
+             return torch.tensor(x, device=device).bfloat16()
+         return torch.tensor(x, device=device)
+
+
+ def torch_dtype_name(dtype) -> str:
+     if isinstance(dtype, triton.language.dtype):
+         return dtype.name
+     elif isinstance(dtype, torch.dtype):
+         # 'torch.int64' -> 'int64'
+         m = re.match(r'^torch\.(\w+)$', str(dtype))
+         return m.group(1)
+     else:
+         raise TypeError(f'not a triton or torch dtype: {type(dtype)}')
+
+
+ def to_numpy(x):
+     if isinstance(x, TensorWrapper):
+         return x.base.cpu().numpy().astype(getattr(np, torch_dtype_name(x.dtype)))
+     elif isinstance(x, torch.Tensor):
+         if x.dtype is torch.bfloat16:
+             return x.cpu().float().numpy()
+         return x.cpu().numpy()
+     else:
+         raise ValueError(f"Not a triton-compatible tensor: {x}")
+
+
+ def supports_tma():
+     return is_cuda() and torch.cuda.get_device_capability()[0] >= 9
+
+
+ requires_tma = pytest.mark.skipif(not supports_tma(), reason="Requires TMA support (NVIDIA Hopper or higher)")
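
As a rough illustration of how these testing helpers are typically combined (a sketch assuming a CUDA-capable PyTorch install; not code from the package):

    from triton._internal_testing import numpy_random, to_triton, to_numpy

    x_np = numpy_random((4, 4), dtype_str='float32')   # reproducible random block (seed 17 by default)
    x_tri = to_triton(x_np, device='cuda')             # plain torch.Tensor on the GPU for float32
    assert (to_numpy(x_tri) == x_np).all()             # float32 values round-trip exactly

For unsigned or float8 dtypes, to_triton instead returns a TensorWrapper produced by reinterpret, which is why to_numpy has a dedicated branch for that case.
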
triton/backends/__init__.py ADDED
@@ -0,0 +1,50 @@
+ import os
+ import importlib.util
+ import inspect
+ from dataclasses import dataclass
+ from .driver import DriverBase
+ from .compiler import BaseBackend
+
+
+ def _load_module(name, path):
+     spec = importlib.util.spec_from_file_location(name, path)
+     module = importlib.util.module_from_spec(spec)
+     spec.loader.exec_module(module)
+     return module
+
+
+ def _find_concrete_subclasses(module, base_class):
+     ret = []
+     for attr_name in dir(module):
+         attr = getattr(module, attr_name)
+         if isinstance(attr, type) and issubclass(attr, base_class) and not inspect.isabstract(attr):
+             ret.append(attr)
+     if len(ret) == 0:
+         raise RuntimeError(f"Found 0 concrete subclasses of {base_class} in {module}: {ret}")
+     if len(ret) > 1:
+         raise RuntimeError(f"Found >1 concrete subclasses of {base_class} in {module}: {ret}")
+     return ret[0]
+
+
+ @dataclass(frozen=True)
+ class Backend:
+     compiler: BaseBackend = None
+     driver: DriverBase = None
+
+
+ def _discover_backends():
+     backends = dict()
+     root = os.path.dirname(__file__)
+     for name in os.listdir(root):
+         if not os.path.isdir(os.path.join(root, name)):
+             continue
+         if name.startswith('__'):
+             continue
+         compiler = _load_module(name, os.path.join(root, name, 'compiler.py'))
+         driver = _load_module(name, os.path.join(root, name, 'driver.py'))
+         backends[name] = Backend(_find_concrete_subclasses(compiler, BaseBackend),
+                                  _find_concrete_subclasses(driver, DriverBase))
+     return backends
+
+
+ backends = _discover_backends()
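
To make the discovery mechanism concrete, a small hedged sketch of inspecting its result (assumes the wheel is installed; the exact class names come from the per-backend compiler.py and driver.py modules loaded above):

    from triton.backends import backends

    for name, backend in backends.items():
        # For this wheel, the 'amd' entry resolves to the HIPBackend class shown
        # later in this diff, paired with the concrete driver subclass from amd/driver.py.
        print(name, backend.compiler.__name__, backend.driver.__name__)

Because each Backend is keyed by directory name, adding a new backend amounts to dropping a directory with its own compiler.py and driver.py next to the existing nvidia and amd ones.
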
triton/backends/amd/compiler.py ADDED
@@ -0,0 +1,368 @@
+ from triton.backends.compiler import BaseBackend, GPUTarget, AttrsDescriptor, register_descriptor
+ from triton._C.libtriton import ir, passes, llvm, amd
+ from dataclasses import dataclass
+ from typing import Any, Dict, Tuple
+ from types import ModuleType
+ import hashlib
+ import tempfile
+ import os
+ import re
+ import subprocess
+ import functools
+ from pathlib import Path
+
+
+ def min_dot_size(target: GPUTarget):
+     arch_str = target.arch
+     # CDNA 3.0 supports k==8 in all mfma variants except for int8
+     # (where the smallest `k` supported is 16)
+     if "gfx94" in arch_str:
+         return lambda lhsType, rhsType: (16, 16, 16) if (lhsType.is_int8() or rhsType.is_int8()) else (16, 16, 8)
+     # CDNA 2.0 always supports `k==8`
+     if "gfx9" in arch_str:
+         return lambda lhsType, rhsType: (16, 16, 8)
+     # Other architectures will only support 16,16,16
+     return lambda lhsType, rhsType: (16, 16, 16)
+
+
+ @dataclass(frozen=True)
+ class HIPOptions:
+     num_warps: int = 4
+     waves_per_eu: int = 1
+     num_stages: int = 2
+     num_ctas: int = 1
+     num_buffers_warp_spec: int = 0
+     num_consumer_groups: int = 0
+     reg_dec_producer: int = 0
+     reg_inc_consumer: int = 0
+     extern_libs: dict = None
+     cluster_dims: tuple = (1, 1, 1)
+     debug: bool = False
+     sanitize_overflow: bool = True
+     arch: str = None
+     supported_fp8_dtypes: Tuple[str] = ("fp8e5", )
+     deprecated_fp8_dtypes: Tuple[str] = ()
+     default_dot_input_precision: str = "ieee"
+     allowed_dot_input_precisions: Tuple[str] = ("ieee", )
+     enable_fp_fusion: bool = True
+     matrix_instr_nonkdim: int = 0
+     kpack: int = 1
+     allow_flush_denorm: bool = False
+     max_num_imprecise_acc_default: int = 0
+     backend_name: str = 'hip'
+
+     # The following option provides hints to the AMDGPU backend regarding instruction scheduling
+     # for all `tt.dot` operations in a kernel. The "default" variant preserves the default
+     # instruction scheduling of the AMDGPU backend, which aims at maximizing occupancy.
+     # The option is experimental; its semantics may change, or it may be removed entirely, at any time.
+     instruction_sched_variant: str = 'default'
+
+     def __post_init__(self):
+         default_libdir = Path(__file__).parent / 'lib'
+         extern_libs = {} if self.extern_libs is None else dict(self.extern_libs)
+         # Ignore user-defined warp size for gfx9
+         warp_size = 32 if 'gfx10' in self.arch or 'gfx11' in self.arch or 'gfx12' in self.arch else 64
+         object.__setattr__(self, 'warp_size', warp_size)
+         libs = ["ocml", "ockl"]
+         for lib in libs:
+             extern_libs[lib] = str(default_libdir / f'{lib}.bc')
+         object.__setattr__(self, 'extern_libs', tuple(extern_libs.items()))
+         assert self.num_warps > 0 and (self.num_warps & (self.num_warps - 1)) == 0, \
+             "num_warps must be a power of 2"
+
+     def hash(self):
+         key = '_'.join([f'{name}-{val}' for name, val in self.__dict__.items()])
+         return hashlib.sha256(key.encode("utf-8")).hexdigest()
+
+
+ @register_descriptor
+ class HIPAttrsDescriptor(AttrsDescriptor):
+     # This property records whether the underlying storage area of a given pointer
+     # can be represented with a 32-bit integer. When this is true, we can be
+     # sure that all indices into the tensor behind that pointer can use 32-bit
+     # indexing. That opens the door for the AMD backend to use buffer load/store
+     # intrinsics, which require this property. Buffer load/store intrinsics
+     # give direct out-of-bounds support and simplify index calculation for
+     # lower register pressure.
+     __slots__ = ("pointer_range_32")
+
+     def _add_backend_properties(self, params=None, values=None):
+         self.property_values["tt.pointer_range"] = 32
+         if params is None or values is None:
+             return
+
+         self.arg_properties["tt.pointer_range"] = [
+             param.num for param, arg in zip(params, values) if HIPAttrsDescriptor.is_within2gb(arg)
+             and not param.do_not_specialize and not param.do_not_specialize_on_alignment
+         ]
+
+     @staticmethod
+     def is_within2gb(arg):
+         if hasattr(arg, "ptr_range"):
+             return arg.ptr_range() <= 2**31 - 1
+         if "torch.Tensor" in str(type(arg)) and hasattr(arg, "untyped_storage"):
+             # Please note that 2**31-1 is the max int32 positive limit
+             return arg.untyped_storage().size() <= 2**31 - 1
+         return False
+
+     @staticmethod
+     def get_property_key(val, align):
+         generic_key = AttrsDescriptor.get_property_key(val, align)
+         hip_key = "S" if HIPAttrsDescriptor.is_within2gb(val) else "N"
+         key = (generic_key + hip_key).replace("N", "")
+         return key if key else "N"
+
+
+ class HIPBackend(BaseBackend):
+
+     @staticmethod
+     def supports_target(target: GPUTarget):
+         return target.backend == 'hip'
+
+     def __init__(self, target: GPUTarget) -> None:
+         super().__init__(target)
+         assert isinstance(target.arch, str)
+         self.binary_ext = "hsaco"
+
+     def parse_options(self, opts) -> Any:
+         args = {'arch': self.target.arch}
+
+         if "supported_fp8_dtypes" not in opts:
+             supported_fp8_dtypes = set(HIPOptions.supported_fp8_dtypes)
+             if self.target.arch in ('gfx940', 'gfx941', 'gfx942'):
+                 supported_fp8_dtypes.update({'fp8e4b8', 'fp8e5b16'})
+             args["supported_fp8_dtypes"] = tuple(sorted(supported_fp8_dtypes))
+
+         if "enable_fp_fusion" not in opts:
+             args["enable_fp_fusion"] = os.getenv("TRITON_DEFAULT_FP_FUSION", "1") == "1"
+         args.update({k: opts[k] for k in HIPOptions.__dataclass_fields__.keys() if k in opts})
+         return HIPOptions(**args)
+
+     def pack_metadata(self, metadata):
+         return (
+             metadata.num_warps,
+             metadata.num_ctas,
+             metadata.shared,
+             metadata.cluster_dims[0],
+             metadata.cluster_dims[1],
+             metadata.cluster_dims[2],
+         )
+
+     def get_codegen_implementation(self):
+         codegen_fns = {"min_dot_size": min_dot_size(self.target)}
+         return codegen_fns
+
+     def get_module_map(self) -> Dict[str, ModuleType]:
+         from triton.language.extra.hip import libdevice
+         return {"triton.language.extra.libdevice": libdevice}
+
+     def load_dialects(self, ctx):
+         amd.load_dialects(ctx)
+
+     def get_attrs_descriptor(self, params, args):
+         return HIPAttrsDescriptor(params, args)
+
+     @staticmethod
+     def compute_spec_key(arg, align):
+         return HIPAttrsDescriptor.get_property_key(arg, align)
+
+     @staticmethod
+     def path_to_rocm_lld():
+         # Check env path for ld.lld
+         lld_env_path = os.getenv("TRITON_HIP_LLD_PATH")
+         if lld_env_path is not None:
+             lld = Path(lld_env_path)
+             if lld.is_file():
+                 return lld
+         # Check backend for ld.lld (used for pytorch wheels)
+         lld = Path(__file__).parent / "llvm/bin/ld.lld"
+         if lld.is_file():
+             return lld
+         lld = Path("/opt/rocm/llvm/bin/ld.lld")
+         if lld.is_file():
+             return lld
+         lld = Path("/usr/bin/ld.lld")
+         if lld.is_file():
+             return lld
+         raise Exception("ROCm linker /opt/rocm/llvm/bin/ld.lld not found. Set 'TRITON_HIP_LLD_PATH' to its path.")
+
+     @staticmethod
+     def make_ttir(mod, metadata, options):
+         pm = ir.pass_manager(mod.context)
+         pm.enable_debug()
+         passes.common.add_inliner(pm)
+         passes.ttir.add_rewrite_tensor_pointer(pm)
+         passes.ttir.add_combine(pm)
+         passes.common.add_canonicalizer(pm)
+         passes.ttir.add_reorder_broadcast(pm)
+         passes.common.add_cse(pm)
+         passes.common.add_licm(pm)
+         passes.common.add_symbol_dce(pm)
+         passes.ttir.add_loop_unroll(pm)
+         pm.run(mod)
+         return mod
+
+     @staticmethod
+     def make_ttgir(mod, metadata, options):
+         pm = ir.pass_manager(mod.context)
+         pm.enable_debug()
+         passes.ttir.add_convert_to_ttgpuir(pm, f"hip:{options.arch}", options.num_warps, options.warp_size,
+                                            options.num_ctas)
+         pm.run(mod)
+         pm = ir.pass_manager(mod.context)
+         pm.enable_debug()
+         passes.ttgpuir.add_coalesce(pm)
+         passes.ttgpuir.add_remove_layout_conversions(pm)
+         passes.ttgpuir.add_optimize_thread_locality(pm)
+         amd.passes.ttgpuir.add_accelerate_matmul(pm, options.arch, options.matrix_instr_nonkdim, options.kpack)
+         passes.ttgpuir.add_remove_layout_conversions(pm)
+         amd.passes.ttgpuir.add_optimize_epilogue(pm)
+         passes.ttgpuir.add_optimize_dot_operands(pm, True)
+         if amd.has_matrix_core_feature(options.arch):
+             assert options.num_stages != 0, ("Triton AMD backend pipeliner has been updated. "
+                                              "We used to trigger software pipelining with "
+                                              "num_stages == 0. Now it will not happen anymore; "
+                                              "please update to use num_stages == 2 for "
+                                              "equivalent behavior in the past.")
+             amd.passes.ttgpuir.add_stream_pipelinev2(pm, options.num_stages)
+             passes.common.add_canonicalizer(pm)
+         amd.passes.ttgpuir.insert_instruction_sched_hints(pm)
+         passes.ttgpuir.add_optimize_dot_operands(pm, True)
+         passes.ttgpuir.add_remove_layout_conversions(pm)
+         passes.ttgpuir.add_reduce_data_duplication(pm)
+         if amd.has_matrix_core_feature(options.arch):
+             amd.passes.ttgpuir.add_reorder_instructions(pm)
+         if os.environ.get("AMDGCN_USE_BUFFER_OPS", "0") == "1":
+             amd.passes.ttgpuir.add_canonicalize_pointers(pm)
+             passes.common.add_canonicalizer(pm)
+             amd.passes.ttgpuir.add_convert_to_buffer_ops(pm)
+         passes.common.add_canonicalizer(pm)
+         passes.common.add_cse(pm)
+         passes.common.add_symbol_dce(pm)
+         pm.run(mod)
+         return mod
+
+     @staticmethod
+     def make_llir(src, metadata, options):
+         mod = src
+         # TritonGPU -> LLVM-IR (MLIR)
+         pm = ir.pass_manager(mod.context)
+         pm.enable_debug()
+         amd.passes.ttgpuir.add_decompose_unsupported_conversions(pm, options.arch)
+         # custom_lds_size is an experimental parameter that defines the amount of LDS available
+         # for one thread block. Measured in bytes.
+         #
+         # If custom_lds_size = 0, the pass will consider all LDS to be available for one thread block;
+         # the LDS size is determined by the provided arch name.
+         custom_lds_size = 0
+         amd.passes.ttgpuir.add_optimize_lds_usage(pm, options.arch, custom_lds_size)
+         passes.convert.add_scf_to_cf(pm)
+         passes.convert.add_index_to_llvmir(pm)
+
+         passes.ttgpuir.add_allocate_shared_memory(pm)
+         ## __HIP_FTZ is used to control the denorm flushing behavior of the exp2 op as follows:
+         ## 1. If __HIP_FTZ = 1, exp2 flushes denorms in input and output regardless
+         ##    of the value of kernel arg `allow_flush_denorm`.
+         ## 2. If __HIP_FTZ = 0, whether exp2 flushes denorms in input and output
+         ##    depends on the value of kernel arg `allow_flush_denorm`.
+         ## 3. __HIP_FTZ defaults to 1 and is not exposed as a kernel argument.
+         ##    For now it is used as a controller for developers only.
+         __HIP_FTZ = True
+         amd.passes.ttgpuir.add_to_llvmir(pm, options.arch, __HIP_FTZ)
+         passes.common.add_canonicalizer(pm)
+         passes.common.add_cse(pm)
+
+         passes.convert.add_cf_to_llvmir(pm)
+         passes.convert.add_arith_to_llvmir(pm)
+         passes.common.add_canonicalizer(pm)
+         passes.common.add_cse(pm)
+         passes.common.add_symbol_dce(pm)
+         amd.passes.ttgpuir.lower_instruction_sched_hints(pm, options.instruction_sched_variant)
+         if os.environ.get("TRITON_DISABLE_LINE_INFO", "0") == "0":
+             passes.llvmir.add_di_scope(pm)
+         amd.passes.ttgpuir.add_builtin_func_to_llvmir(pm, __HIP_FTZ)
+         pm.run(mod)
+
+         # LLVM-IR (MLIR) -> LLVM-IR (LLVM)
+         llvm.init_targets()
+         context = llvm.context()
+         llvm_mod = llvm.to_module(mod, context)
+         amd.attach_target_triple(llvm_mod)
+         llvm.attach_datalayout(llvm_mod, amd.TARGET_TRIPLE, options.arch, '')
+
+         # Set various control constants on the LLVM module so that device
+         # libraries can resolve references to them.
+         amd.set_isa_version(llvm_mod, options.arch)
+         amd.set_abi_version(llvm_mod, 400)
+         amd.set_bool_control_constant(llvm_mod, "__oclc_finite_only_opt", False)
+         amd.set_bool_control_constant(llvm_mod, "__oclc_correctly_rounded_sqrt32", True)
+         amd.set_bool_control_constant(llvm_mod, "__oclc_unsafe_math_opt", False)
+         amd.set_bool_control_constant(llvm_mod, "__oclc_wavefrontsize64", options.warp_size == 64)
+
+         # Set kernel attributes first given this may affect later optimizations.
+         fns = [fn for fn in llvm_mod.get_functions() if not fn.is_declaration()]
+         # The public kernel should be kernel 0.
+         fns[0].set_calling_conv(amd.CALLING_CONV_AMDGPU_KERNEL)
+         fns[0].add_fn_attr("amdgpu-flat-work-group-size", f"1,{options.num_warps*options.warp_size}")
+         fns[0].add_fn_attr("amdgpu-waves-per-eu", f"{options.waves_per_eu}")
+         denormal_mode = "preserve-sign" if options.allow_flush_denorm else "ieee"
+         fns[0].add_fn_attr("denormal-fp-math-f32", denormal_mode)
+
+         # Hint the compiler that we'd like the firmware to set the kernel arguments
+         # to user SGPRs so that the kernel does not need to s_load its arguments
+         # from memory.
+         amd.set_all_fn_arg_inreg(fns[0])
+
+         if options.extern_libs:
+             paths = [path for (name, path) in options.extern_libs if amd.need_extern_lib(llvm_mod, name)]
+             llvm.link_extern_libs(llvm_mod, paths)
+
+         llvm.optimize_module(llvm_mod, llvm.OPTIMIZE_O3, options.arch, '', [], options.enable_fp_fusion)
+
+         # Get some metadata
+         metadata["shared"] = src.get_int_attr("triton_gpu.shared")
+
+         amd.cleanup_bitcode_metadata(llvm_mod)
+         return str(llvm_mod)
+
+     @staticmethod
+     def make_amdgcn(src, metadata, options):
+         # Find kernel names (there should only be one).
+         # We get the name at the last possible step to accommodate `triton.compile`
+         # on user-provided LLVM.
+         names = re.findall(r"define amdgpu_kernel void @([a-zA-Z_][a-zA-Z0-9_]*)", src)
+         assert len(names) == 1
+         metadata["name"] = names[0]
+         # llvm -> hsaco
+         amdgcn = llvm.translate_to_asm(src, amd.TARGET_TRIPLE, options.arch, '', [], options.enable_fp_fusion, False)
+         if os.environ.get("AMDGCN_ENABLE_DUMP", "0") == "1":
+             print("// -----// AMDGCN Dump //----- //")
+             print(amdgcn)
+         return amdgcn
+
+     @staticmethod
+     def make_hsaco(src, metadata, options):
+         hsaco = amd.assemble_amdgcn(src, options.arch, '')
+
+         rocm_path = HIPBackend.path_to_rocm_lld()
+         with tempfile.NamedTemporaryFile() as tmp_out:
+             with tempfile.NamedTemporaryFile() as tmp_in:
+                 with open(tmp_in.name, 'wb') as fd_in:
+                     fd_in.write(hsaco)
+                 subprocess.check_call([rocm_path, '-flavor', 'gnu', '-shared', tmp_in.name, '-o', tmp_out.name])
+             with open(tmp_out.name, 'rb') as fd_out:
+                 ret = fd_out.read()
+         return ret
+
+     def add_stages(self, stages, options):
+         stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options)
+         stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options)
+         stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options)
+         stages["amdgcn"] = lambda src, metadata: self.make_amdgcn(src, metadata, options)
+         stages["hsaco"] = lambda src, metadata: self.make_hsaco(src, metadata, options)
+
+     @functools.lru_cache()
+     def hash(self):
+         version = subprocess.check_output([HIPBackend.path_to_rocm_lld(), "--version"], encoding='utf-8')
+         return f'{version}-{self.target}'
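
add_stages above registers the AMD lowering pipeline in order: ttir -> ttgir -> llir -> amdgcn -> hsaco. As a hedged sketch of how a caller walks that dictionary (stub lowering functions stand in for the real make_* methods; this illustrates the stage ordering only, not the actual triton.compile implementation):

    # Each stage consumes the previous stage's output and may record metadata.
    stages = {
        "ttir":   lambda src, md: f"{src}->ttir",
        "ttgir":  lambda src, md: f"{src}->ttgir",
        "llir":   lambda src, md: f"{src}->llir",
        "amdgcn": lambda src, md: f"{src}->amdgcn",
        "hsaco":  lambda src, md: f"{src}->hsaco",
    }
    module, metadata = "src", {}
    for name, lower in stages.items():   # dict preserves the insertion order used by add_stages
        module = lower(module, metadata)
    print(module)   # src->ttir->ttgir->llir->amdgcn->hsaco

In the real backend, make_llir and make_amdgcn also fill metadata (the shared-memory size and the kernel name) that the runtime later uses when launching the resulting hsaco binary.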