triton-windows 3.4.0.post20__cp310-cp310-win_amd64.whl → 3.5.0.post21__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


Files changed (107)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +8 -2
  3. triton/_filecheck.py +24 -14
  4. triton/_internal_testing.py +70 -4
  5. triton/_utils.py +3 -1
  6. triton/backends/amd/compiler.py +68 -60
  7. triton/backends/amd/driver.c +113 -44
  8. triton/backends/amd/driver.py +133 -57
  9. triton/backends/driver.py +13 -0
  10. triton/backends/nvidia/compiler.py +80 -22
  11. triton/backends/nvidia/driver.c +88 -15
  12. triton/backends/nvidia/driver.py +130 -123
  13. triton/compiler/__init__.py +5 -2
  14. triton/compiler/code_generator.py +270 -163
  15. triton/compiler/compiler.py +45 -62
  16. triton/experimental/gluon/__init__.py +3 -2
  17. triton/experimental/gluon/_runtime.py +9 -6
  18. triton/experimental/gluon/language/__init__.py +117 -16
  19. triton/experimental/gluon/language/_core.py +246 -68
  20. triton/experimental/gluon/language/_layouts.py +398 -45
  21. triton/experimental/gluon/language/_math.py +17 -9
  22. triton/experimental/gluon/language/_semantic.py +130 -37
  23. triton/experimental/gluon/language/_standard.py +55 -22
  24. triton/experimental/gluon/language/amd/__init__.py +4 -0
  25. triton/experimental/gluon/language/amd/_layouts.py +96 -0
  26. triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
  27. triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
  28. triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
  29. triton/experimental/gluon/language/extra/__init__.py +3 -0
  30. triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
  31. triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
  32. triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
  33. triton/experimental/gluon/language/nvidia/blackwell/__init__.py +192 -7
  34. triton/experimental/gluon/language/nvidia/blackwell/tma.py +20 -0
  35. triton/experimental/gluon/language/nvidia/hopper/__init__.py +124 -3
  36. triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +20 -37
  37. triton/experimental/gluon/language/nvidia/hopper/tma.py +4 -3
  38. triton/experimental/gluon/nvidia/hopper.py +6 -1
  39. triton/knobs.py +132 -67
  40. triton/language/__init__.py +16 -10
  41. triton/language/core.py +163 -83
  42. triton/language/extra/cuda/gdc.py +6 -6
  43. triton/language/extra/hip/__init__.py +3 -1
  44. triton/language/extra/hip/libdevice.py +7 -0
  45. triton/language/extra/hip/utils.py +35 -0
  46. triton/language/extra/libdevice.py +4 -0
  47. triton/language/semantic.py +76 -23
  48. triton/language/standard.py +14 -14
  49. triton/language/target_info.py +54 -0
  50. triton/runtime/_allocation.py +15 -3
  51. triton/runtime/_async_compile.py +55 -0
  52. triton/runtime/autotuner.py +4 -5
  53. triton/runtime/build.py +11 -9
  54. triton/runtime/cache.py +44 -1
  55. triton/runtime/driver.py +16 -41
  56. triton/runtime/interpreter.py +31 -23
  57. triton/runtime/jit.py +318 -157
  58. triton/runtime/tcc/include/_mingw.h +8 -10
  59. triton/runtime/tcc/include/assert.h +5 -0
  60. triton/runtime/tcc/include/errno.h +1 -1
  61. triton/runtime/tcc/include/float.h +21 -3
  62. triton/runtime/tcc/include/iso646.h +36 -0
  63. triton/runtime/tcc/include/limits.h +5 -0
  64. triton/runtime/tcc/include/malloc.h +2 -2
  65. triton/runtime/tcc/include/math.h +21 -261
  66. triton/runtime/tcc/include/stdalign.h +16 -0
  67. triton/runtime/tcc/include/stdarg.h +5 -70
  68. triton/runtime/tcc/include/stdatomic.h +171 -0
  69. triton/runtime/tcc/include/stddef.h +7 -19
  70. triton/runtime/tcc/include/stdlib.h +15 -4
  71. triton/runtime/tcc/include/stdnoreturn.h +7 -0
  72. triton/runtime/tcc/include/sys/stat.h +2 -2
  73. triton/runtime/tcc/include/sys/types.h +5 -0
  74. triton/runtime/tcc/include/tcc/tcc_libm.h +444 -27
  75. triton/runtime/tcc/include/tccdefs.h +342 -0
  76. triton/runtime/tcc/include/tgmath.h +89 -0
  77. triton/runtime/tcc/include/uchar.h +33 -0
  78. triton/runtime/tcc/include/unistd.h +1 -0
  79. triton/runtime/tcc/include/winapi/qos.h +72 -0
  80. triton/runtime/tcc/include/winapi/shellapi.h +59 -0
  81. triton/runtime/tcc/include/winapi/winbase.h +9 -2
  82. triton/runtime/tcc/include/winapi/wincon.h +8 -0
  83. triton/runtime/tcc/include/winapi/windows.h +1 -1
  84. triton/runtime/tcc/include/winapi/winnls.h +778 -0
  85. triton/runtime/tcc/include/winapi/winnt.h +9 -7
  86. triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
  87. triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
  88. triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
  89. triton/runtime/tcc/lib/libtcc1.a +0 -0
  90. triton/runtime/tcc/lib/python314.def +1800 -0
  91. triton/runtime/tcc/lib/python314t.def +1809 -0
  92. triton/runtime/tcc/libtcc.dll +0 -0
  93. triton/runtime/tcc/tcc.exe +0 -0
  94. triton/tools/compile.py +62 -14
  95. triton/tools/extra/cuda/compile.c +1 -0
  96. triton/tools/extra/hip/compile.cpp +66 -0
  97. triton/tools/extra/hip/compile.h +13 -0
  98. triton/tools/ragged_tma.py +92 -0
  99. triton/tools/tensor_descriptor.py +7 -9
  100. triton/windows_utils.py +42 -79
  101. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/METADATA +3 -4
  102. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/RECORD +106 -75
  103. triton/runtime/tcc/lib/libtcc1-64.a +0 -0
  104. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/WHEEL +0 -0
  105. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/entry_points.txt +0 -0
  106. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/licenses/LICENSE +0 -0
  107. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/top_level.txt +0 -0
triton/_C/libtriton.pyd CHANGED
Binary file
triton/__init__.py CHANGED
@@ -1,5 +1,5 @@
  """isort:skip_file"""
- __version__ = '3.4.0'
+ __version__ = '3.5.0'
 
  # ---------------------------------------
  # Note: import order is significant here.
@@ -17,7 +17,8 @@ from .runtime import (
  InterpreterError,
  MockTensor,
  )
- from .runtime.jit import jit
+ from .runtime.jit import constexpr_function, jit
+ from .runtime._async_compile import AsyncCompileMode, FutureKernel
  from .compiler import compile, CompilationError
  from .errors import TritonError
  from .runtime._allocation import set_allocator
@@ -29,11 +30,14 @@ from . import tools
  must_use_result = language.core.must_use_result
 
  __all__ = [
+ "AsyncCompileMode",
  "autotune",
  "cdiv",
  "CompilationError",
  "compile",
  "Config",
+ "constexpr_function",
+ "FutureKernel",
  "heuristics",
  "InterpreterError",
  "jit",
@@ -59,10 +63,12 @@ __all__ = [
  # -------------------------------------
 
 
+ @constexpr_function
  def cdiv(x: int, y: int):
  return (x + y - 1) // y
 
 
+ @constexpr_function
  def next_power_of_2(n: int):
  """Return the smallest power of 2 greater than or equal to n"""
  n -= 1
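The hunk above also marks `cdiv` and `next_power_of_2` as `constexpr_function`s. A minimal usage sketch (the kernel and sizes are illustrative, not taken from this package) of how these helpers are typically used for block-size and launch-grid math:

```python
import triton
import triton.language as tl

@triton.jit
def copy_kernel(src_ptr, dst_ptr, n_elements, BLOCK: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offsets < n_elements
    tl.store(dst_ptr + offsets, tl.load(src_ptr + offsets, mask=mask), mask=mask)

n = 10_000
BLOCK = triton.next_power_of_2(1000)   # smallest power of 2 >= 1000 -> 1024
grid = (triton.cdiv(n, BLOCK),)        # ceil(10_000 / 1024) -> 10 programs
# copy_kernel[grid](src, dst, n, BLOCK=BLOCK)   # launch, given CUDA tensors src and dst
```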
triton/_filecheck.py CHANGED
@@ -1,3 +1,4 @@
+ import functools
  import os
  import inspect
  import subprocess
@@ -7,6 +8,7 @@ import triton
  from triton.compiler import ASTSource, make_backend
  from triton.backends.compiler import GPUTarget
  from triton.experimental.gluon._runtime import GluonASTSource
+ from triton.runtime.jit import create_function_from_signature
  from triton._C.libtriton import ir
 
  # ===-----------------------------------------------------------------------===#
@@ -15,7 +17,6 @@ from triton._C.libtriton import ir
 
  # Stub target for testing the frontend.
  stub_target = GPUTarget("cuda", 100, 32)
- stub_backend = make_backend(stub_target)
 
  triton_dir = os.path.dirname(__file__)
  filecheck_path = os.path.join(triton_dir, "FileCheck")
@@ -42,29 +43,37 @@ def run_filecheck(name, module_str, check_template):
  temp.write(check_template)
 
  try:
- subprocess.check_output([filecheck_path, temp_expected, "--input-file", temp_module],
- stderr=subprocess.STDOUT)
+ subprocess.check_output(
+ [filecheck_path, temp_expected, "--input-file", temp_module, "--dump-input-context=50"],
+ stderr=subprocess.STDOUT)
  except subprocess.CalledProcessError as error:
  decoded = error.output.decode('unicode_escape')
  raise ValueError(decoded)
 
 
- def run_parser(kernel_fn):
- sigkeys = [x.name for x in kernel_fn.params]
- sigvals = [f"arg{i}" for i in range(len(sigkeys))]
- signature = {k: v for (k, v) in zip(sigkeys, sigvals)}
+ def run_parser(kernel_fn, args=(), kwargs={}, target=stub_target):
+ if "sanitize_overflow" not in kwargs:
+ kwargs = dict(kwargs)
+ kwargs["sanitize_overflow"] = False
+ backend = make_backend(target)
+ binder = create_function_from_signature(
+ kernel_fn.signature,
+ kernel_fn.params,
+ backend,
+ )
+
+ bound_args, specialization, options = binder(*args, **kwargs)
+ options, signature, constexprs, attrs = kernel_fn._pack_args(backend, kwargs, bound_args, specialization, options)
  source_cls = GluonASTSource if kernel_fn.is_gluon() else ASTSource
- src = source_cls(fn=kernel_fn, signature=signature)
+ src = source_cls(kernel_fn, signature, constexprs, attrs)
 
  context = ir.context()
  ir.load_dialects(context)
- stub_backend.load_dialects(context)
+ backend.load_dialects(context)
 
- extra_options = src.parse_options()
- options = stub_backend.parse_options(dict(**extra_options))
- codegen_fns = stub_backend.get_codegen_implementation(options)
- module_map = stub_backend.get_module_map()
- module = src.make_ir(options, codegen_fns, module_map, context)
+ codegen_fns = backend.get_codegen_implementation(options)
+ module_map = backend.get_module_map()
+ module = src.make_ir(target, options, codegen_fns, module_map, context)
  assert module.verify()
  return module
 
@@ -81,6 +90,7 @@ def run_filecheck_test(kernel_fn):
 
  def filecheck_test(fn):
 
+ @functools.wraps(fn)
  def test_fn():
  run_filecheck_test(fn)
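`run_parser` now binds real arguments through the JIT binder instead of a synthetic signature, and `filecheck_test` preserves the wrapped function's identity via `functools.wraps`. A hypothetical usage sketch; the assumption that the CHECK directives live as comments inside the decorated kernel is mine and is not stated in this diff:

```python
import triton
import triton.language as tl
from triton._filecheck import filecheck_test

@filecheck_test
@triton.jit
def test_add():
    # CHECK-LABEL: tt.func public @test_add
    # CHECK: arith.addi
    x = tl.full([16], 1, tl.int32)
    y = x + x  # noqa: F841
```

With `functools.wraps` applied, the generated `test_fn` now reports the kernel's own name to test collectors such as pytest.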
triton/_internal_testing.py CHANGED
@@ -5,10 +5,10 @@ import torch
  import triton
  import triton.language as tl
  from triton import knobs
+ from typing import Optional, Set, Union
  import pytest
 
  from numpy.random import RandomState
- from typing import Optional, Union
  from triton.runtime.jit import TensorWrapper, reinterpret, type_canonicalisation_dict
 
  int_dtypes = ['int8', 'int16', 'int32', 'int64']
@@ -38,10 +38,22 @@ def is_cuda():
  return False if target is None else target.backend == "cuda"
 
 
- def is_hopper():
+ def is_ampere_or_newer():
+ return is_cuda() and torch.cuda.get_device_capability()[0] >= 8
+
+
+ def is_blackwell():
+ return is_cuda() and torch.cuda.get_device_capability()[0] == 10
+
+
+ def is_hopper_or_newer():
  return is_cuda() and torch.cuda.get_device_capability()[0] >= 9
 
 
+ def is_hopper():
+ return is_cuda() and torch.cuda.get_device_capability()[0] == 9
+
+
  def is_hip():
  target = get_current_target()
  return False if target is None else target.backend == "hip"
@@ -62,9 +74,13 @@ def is_hip_cdna4():
  return target is not None and target.backend == 'hip' and target.arch == 'gfx950'
 
 
+ def is_hip_gfx11():
+ target = get_current_target()
+ return target is not None and target.backend == 'hip' and 'gfx11' in target.arch
+
+
  def is_hip_gfx12():
  target = get_current_target()
- print(target.arch)
  return target is not None and target.backend == 'hip' and 'gfx12' in target.arch
 
 
@@ -72,6 +88,10 @@ def is_hip_cdna():
  return is_hip_cdna2() or is_hip_cdna3() or is_hip_cdna4()
 
 
+ def get_hip_lds_size():
+ return 163840 if is_hip_cdna4() else 65536
+
+
  def is_xpu():
  target = get_current_target()
  return False if target is None else target.backend == "xpu"
@@ -132,7 +152,7 @@ def to_triton(x: np.ndarray, device, dst_type=None) -> Union[TensorWrapper, torc
 
 
  def str_to_triton_dtype(x: str) -> tl.dtype:
- return tl.str_to_ty(type_canonicalisation_dict[x])
+ return tl.str_to_ty(type_canonicalisation_dict[x], None)
 
 
  def torch_dtype_name(dtype) -> str:
@@ -187,3 +207,49 @@ def unwrap_tensor(t: Union[torch.Tensor, triton.runtime.jit.TensorWrapper]) -> t
  if isinstance(t, triton.runtime.jit.TensorWrapper):
  return t.base
  return t
+
+
+ def _fresh_knobs_impl(skipped_attr: Optional[Set[str]] = None):
+ from triton import knobs
+
+ if skipped_attr is None:
+ skipped_attr = set()
+
+ monkeypatch = pytest.MonkeyPatch()
+
+ knobs_map = {
+ name: knobset
+ for name, knobset in knobs.__dict__.items()
+ if isinstance(knobset, knobs.base_knobs) and knobset != knobs.base_knobs and name not in skipped_attr
+ }
+
+ # We store which variables we need to unset below in finally because
+ # monkeypatch doesn't appear to reset variables that were never set
+ # before the monkeypatch.delenv call below.
+ env_to_unset = []
+ prev_propagate_env = knobs.propagate_env
+
+ def fresh_function():
+ nonlocal env_to_unset
+ for name, knobset in knobs_map.items():
+ setattr(knobs, name, knobset.copy().reset())
+ for knob in knobset.knob_descriptors.values():
+ if knob.key in os.environ:
+ monkeypatch.delenv(knob.key, raising=False)
+ else:
+ env_to_unset.append(knob.key)
+ knobs.propagate_env = True
+ return knobs
+
+ def reset_function():
+ for name, knobset in knobs_map.items():
+ setattr(knobs, name, knobset)
+ # `undo` should be placed before `del os.environ`
+ # Otherwise, it may restore environment variables that monkeypatch deleted
+ monkeypatch.undo()
+ for k in env_to_unset:
+ if k in os.environ:
+ del os.environ[k]
+ knobs.propagate_env = prev_propagate_env
+
+ return fresh_function, reset_function
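`_fresh_knobs_impl` returns a pair of callables that swap in a pristine copy of every knob set and later restore both the knobs and any environment variables it touched. A minimal sketch of wiring it into a pytest fixture (the fixture name is illustrative, not part of this package):

```python
import pytest
from triton._internal_testing import _fresh_knobs_impl

@pytest.fixture
def fresh_knobs():
    fresh_function, reset_function = _fresh_knobs_impl()
    try:
        yield fresh_function()   # hand the test a reset copy of triton.knobs
    finally:
        reset_function()         # restore knobs and any touched env vars
```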
triton/_utils.py CHANGED
@@ -16,9 +16,11 @@ def get_iterable_path(iterable: IterableType, path: ObjPath) -> Any:
 
 
  def set_iterable_path(iterable: IterableType, path: tuple[int, ...], val: Any):
+ from .language import core
  assert len(path) != 0
  prev = iterable if len(path) == 1 else get_iterable_path(iterable, path[:-1])
- prev[path[-1]] = val  # type: ignore[index]
+ assert isinstance(prev, core.tuple)
+ prev._setitem(path[-1], val)
 
 
  def find_paths_if(iterable: Union[IterableType, Any], pred: Callable[[ObjPath, Any], bool]) -> list[ObjPath]:
triton/backends/amd/compiler.py CHANGED
@@ -7,8 +7,8 @@ from types import ModuleType
  import hashlib
  import tempfile
  import re
- import subprocess
  import functools
+ import warnings
  from pathlib import Path
 
 
@@ -18,8 +18,9 @@ def get_min_dot_size(target: GPUTarget):
  return lambda lhs_type, rhs_type: (1, 1, 1)
 
 
- def is_pingpong_schedule_enabled(arch):
- return (arch == "gfx942") if knobs.amd.use_block_pingpong is None else knobs.amd.use_block_pingpong
+ def is_pingpong_schedule_enabled(arch, use_async_copy):
+ return (arch == "gfx942" or (arch == "gfx950" and use_async_copy is True)
+ ) if knobs.amd.use_block_pingpong is None else knobs.amd.use_block_pingpong
 
 
  def is_in_thread_transpose_enabled(arch):
@@ -37,7 +38,11 @@ class HIPOptions:
  debug: bool = False
  sanitize_overflow: bool = True
  arch: str = None
- supported_fp8_dtypes: Tuple[str] = ("fp8e5", )
+ # We have native support for OCP fp8 variants since CDNA4/RDNA4. For earlier generations,
+ # we software emulate the support for them.
+ # UZ fp8 variants (fp8e4b8 and fp8e5b16) are natively supported for CDNA3. For other
+ # architectures they are software emulated.
+ supported_fp8_dtypes: Tuple[str] = ("fp8e4nv", "fp8e5", "fp8e5b16", "fp8e4b8")
  deprecated_fp8_dot_operand_dtypes: Tuple[str] = ()
  default_dot_input_precision: str = "ieee"
  allowed_dot_input_precisions: Tuple[str] = ("ieee", )
@@ -48,6 +53,7 @@ class HIPOptions:
  allow_flush_denorm: bool = False
  max_num_imprecise_acc_default: int = 0
  backend_name: str = 'hip'
+ instrumentation_mode: str = ""
 
  # The following option provides hints to the AMDGPU backend regarding instruction scheduling
  # for all `tt.dot` operations in a kernel. The "none" variant preserves the default
@@ -57,10 +63,6 @@ class HIPOptions:
  #
  # Current experimental scheduling variants:
  #
- # local-prefetch: implements instruction scheduling similar to the one from the ROCm Composable
- # Kernel library. Note, this variant requires the use of buffer load/store ops
- # and a special software pipelining style - i.e., 1x LDS and 1x register
- # prefetch buffers for each GEMM tile.
  # attention: enables a bunch of optimizations for attention kernels, including:
  # - iglp 2 and sched.barrier around it
  # - sink-insts-to-avoid-spills flag to avoid register spills
@@ -73,8 +75,11 @@ class HIPOptions:
  assert self.num_warps > 0 and (self.num_warps & (self.num_warps - 1)) == 0, \
  "num_warps must be a power of 2"
 
- if self.arch == 'gfx950':
- assert self.kpack == 1, "gfx950 only accepts kpack == 1"
+ if (self.arch == 'gfx950') and (self.kpack != 1):
+ warnings.warn(
+ f"kpack is deprecated starting from gfx950 and will be removed in later releases. So for now kpack = {self.kpack} will be overwritten to 1 to make transitioning easier."
+ )
+ object.__setattr__(self, 'kpack', 1)
 
  default_libdir = Path(__file__).parent / 'lib'
  extern_libs = {} if self.extern_libs is None else dict(self.extern_libs)
@@ -88,6 +93,7 @@ class HIPOptions:
 
 
  class HIPBackend(BaseBackend):
+ instrumentation = None
 
  @staticmethod
  def supports_target(target: GPUTarget):
@@ -104,6 +110,9 @@ class HIPBackend(BaseBackend):
  def parse_options(self, opts) -> Any:
  args = {'arch': knobs.runtime.override_arch or self.target.arch}
 
+ if opts.get("num_ctas", 1) > 1:
+ raise ValueError("num_ctas > 1 not supported for AMD GPUs")
+
  # Enable XF32 (TF32) for CDNA3 GPUs
  if self.target.arch == 'gfx942':
  allowed_dot_input_precisions = set(HIPOptions.allowed_dot_input_precisions)
@@ -111,14 +120,12 @@ class HIPBackend(BaseBackend):
  args["allowed_dot_input_precisions"] = tuple(sorted(allowed_dot_input_precisions))
 
  if "supported_fp8_dtypes" not in opts:
- supported_fp8_dtypes = set(HIPOptions.supported_fp8_dtypes)
- if self.target.arch == 'gfx942':
- supported_fp8_dtypes.update({'fp8e4nv', 'fp8e4b8', 'fp8e5b16'})
- elif self.target.arch == 'gfx950':
- supported_fp8_dtypes.update({'fp8e4nv', 'fp8e5'})
- elif 'gfx12' in self.target.arch:
- supported_fp8_dtypes.update({'fp8e4nv', 'fp8e5'})
- args["supported_fp8_dtypes"] = tuple(sorted(supported_fp8_dtypes))
+ args["supported_fp8_dtypes"] = tuple(sorted(HIPOptions.supported_fp8_dtypes))
+
+ if self.target.arch == 'gfx950':
+ deprecated_fp8_dot_operand_dtypes = set(HIPOptions.deprecated_fp8_dot_operand_dtypes)
+ deprecated_fp8_dot_operand_dtypes.update({"fp8e5b16", "fp8e4b8"})
+ args["deprecated_fp8_dot_operand_dtypes"] = tuple(sorted(deprecated_fp8_dot_operand_dtypes))
 
  if "enable_fp_fusion" not in opts:
  args["enable_fp_fusion"] = knobs.language.default_fp_fusion
@@ -146,6 +153,8 @@ class HIPBackend(BaseBackend):
 
  def load_dialects(self, ctx):
  amd.load_dialects(ctx)
+ if HIPBackend.instrumentation:
+ HIPBackend.instrumentation.load_dialects(ctx)
 
  @staticmethod
  def is_within_2gb(arg):
@@ -174,26 +183,6 @@ class HIPBackend(BaseBackend):
  ret += "S"
  return ret
 
- @staticmethod
- def path_to_rocm_lld():
- # Check env path for ld.lld
- lld_env_path = knobs.amd.lld_path
- if lld_env_path is not None:
- lld = Path(lld_env_path)
- if lld.is_file():
- return lld
- # Check backend for ld.lld (used for pytorch wheels)
- lld = Path(__file__).parent / "llvm/bin/ld.lld"
- if lld.is_file():
- return lld
- lld = Path("/opt/rocm/llvm/bin/ld.lld")
- if lld.is_file():
- return lld
- lld = Path("/usr/bin/ld.lld")
- if lld.is_file():
- return lld
- raise Exception("ROCm linker /opt/rocm/llvm/bin/ld.lld not found. Set 'TRITON_HIP_LLD_PATH' to its path.")
-
  @staticmethod
  def make_ttir(mod, metadata, options):
  pm = ir.pass_manager(mod.context)
@@ -237,12 +226,10 @@ class HIPBackend(BaseBackend):
  global_prefetch = knobs.amd.global_prefetch
  local_prefetch = knobs.amd.local_prefetch
  use_async_copy = knobs.amd.use_async_copy
+ use_block_pingpong = is_pingpong_schedule_enabled(options.arch, use_async_copy)
 
- # The `local-prefetch` scheduling variant requires turning on buffer ops.
- if options.schedule_hint == "local-prefetch":
- global_prefetch = local_prefetch = 1
-
- amd.passes.ttgpuir.add_stream_pipeline(pm, options.num_stages, global_prefetch, local_prefetch, use_async_copy)
+ amd.passes.ttgpuir.add_stream_pipeline(pm, options.num_stages, global_prefetch, local_prefetch, use_async_copy,
+ use_block_pingpong)
  if use_async_copy:
  amd.passes.ttgpuir.add_coalesce_async_copy(pm, options.arch)
  passes.common.add_canonicalizer(pm)
@@ -255,14 +242,13 @@ class HIPBackend(BaseBackend):
  amd.passes.ttgpuir.add_in_thread_transpose(pm)
  passes.ttgpuir.add_remove_layout_conversions(pm)
  amd.passes.ttgpuir.add_reorder_instructions(pm)
- use_block_pingpong = is_pingpong_schedule_enabled(options.arch)
- if use_block_pingpong and options.num_stages == 2:
+ if use_block_pingpong and options.num_stages > 1:
  amd.passes.ttgpuir.add_block_pingpong(pm, options.num_stages)
 
  if knobs.amd.use_buffer_ops:
  amd.passes.ttgpuir.add_canonicalize_pointers(pm)
  passes.common.add_canonicalizer(pm)
- amd.passes.ttgpuir.add_convert_to_buffer_ops(pm, options.arch)
+ amd.passes.ttgpuir.add_convert_to_buffer_ops(pm, options.arch, knobs.amd.use_buffer_atomics)
 
  amd.passes.ttgpuir.add_fold_true_cmpi(pm)
  passes.common.add_canonicalizer(pm)
@@ -274,15 +260,16 @@ class HIPBackend(BaseBackend):
  return mod
 
  @staticmethod
- def ttgir_opt(src, metadata, options):
+ def gluon_to_ttgir(src, metadata, options):
  mod = src
  pm = ir.pass_manager(mod.context)
  pm.enable_debug()
 
- passes.ttgpuir.add_inliner(pm)
+ passes.gluon.add_inliner(pm)
+ passes.gluon.add_resolve_auto_encodings(pm)
  passes.common.add_sccp(pm)
  passes.ttir.add_loop_aware_cse(pm)
- passes.ttgpuir.add_canonicalizer(pm)
+ passes.gluon.add_canonicalizer(pm)
  passes.ttgpuir.add_combine_tensor_select_and_if(pm)
 
  pm.run(mod)
@@ -304,7 +291,10 @@ class HIPBackend(BaseBackend):
  passes.convert.add_scf_to_cf(pm)
  passes.convert.add_index_to_llvmir(pm)
 
- passes.ttgpuir.add_allocate_shared_memory(pm)
+ amd.passes.ttgpuir.add_allocate_shared_memory(pm)
+ # instrumentation point here so we can override IRs above (e.g., ttir and ttgir)
+ if HIPBackend.instrumentation:
+ HIPBackend.instrumentation.patch("ttgpuir_to_llvmir", pm, mod.context)
  ## __HIP_FTZ is used to control the denorm flushing behavior of exp2 op as follows:
  ## 1. If __HIP_FTZ = 1, exp2 flushes denorms in input and output regardless
  ## of the value of kernel arg `allow_flush_denorm`.
@@ -322,10 +312,17 @@ class HIPBackend(BaseBackend):
  passes.common.add_canonicalizer(pm)
  passes.common.add_cse(pm)
  passes.common.add_symbol_dce(pm)
+
  if options.schedule_hint.lower() != "none":
  amd.passes.ttgpuir.lower_instruction_sched_hints(pm, options.arch, options.num_stages)
+
+ # This can not be moved below the di_scope pass
+ if HIPBackend.instrumentation:
+ HIPBackend.instrumentation.patch("llvmir_to_llvm", pm, mod.context)
+
  if not knobs.compilation.disable_line_info:
  passes.llvmir.add_di_scope(pm)
+
  amd.passes.ttgpuir.add_builtin_func_to_llvmir(pm, __HIP_FTZ)
  pm.run(mod)
 
@@ -382,15 +379,27 @@ class HIPBackend(BaseBackend):
  llvm.link_extern_libs(llvm_mod, paths)
  elif options.extern_libs:
  paths = [path for (name, path) in options.extern_libs if amd.need_extern_lib(llvm_mod, name)]
- llvm.link_extern_libs(llvm_mod, paths)
+ if len(paths) > 0:
+ llvm.link_extern_libs(llvm_mod, paths)
 
  llvm.optimize_module(llvm_mod, llvm.OPTIMIZE_O3, options.arch, '', [], options.enable_fp_fusion)
 
+ # Architectures with architected SGPRs store the workgroup id in ttmp9 (X) and ttmp7 (Y[15:0], Z[31:16]).
+ # These attributes are used to determine if Z should be masked out when loading Y. They are inferred during
+ # optimize_module from calls to @llvm.amdgcn.workgroup.id.x/y/z(). We cannot rely on this because a
+ # dispatch dimensions might be used even if there is no program_id() call for it.
+ if amd.has_architected_sgprs(options.arch):
+ fns[0].remove_fn_attr("amdgpu-no-workgroup-id-x")
+ fns[0].remove_fn_attr("amdgpu-no-workgroup-id-y")
+ fns[0].remove_fn_attr("amdgpu-no-workgroup-id-z")
+
  if knobs.amd.scalarize_packed_fops:
  amd.add_scalarize_packed_fops_llvm_pass(fns[0])
 
  # Get some metadata
  metadata["shared"] = src.get_int_attr("ttg.shared")
+ metadata["profile_scratch_size"] = src.get_int_attr("ttg.profile_scratch_memory_size") or 0
+ metadata["profile_scratch_align"] = src.get_int_attr("ttg.profile_scratch_memory_alignment") or 1
 
  amd.cleanup_bitcode_metadata(llvm_mod)
  # Disable inlining of print related functions,
@@ -414,7 +423,9 @@ class HIPBackend(BaseBackend):
  # the regression is not significant. It would be better to have some heuristics.
  if options.schedule_hint == 'attention':
  flags.append('sink-insts-to-avoid-spills')
- amdgcn = llvm.translate_to_asm(src, amd.TARGET_TRIPLE, options.arch, '', flags, options.enable_fp_fusion, False)
+ features = '-real-true16' if 'gfx11' in options.arch else ''
+ amdgcn = llvm.translate_to_asm(src, amd.TARGET_TRIPLE, options.arch, features, flags, options.enable_fp_fusion,
+ False)
  if knobs.amd.dump_amdgcn:
  print("// -----// AMDGCN Dump //----- //")
  print(amdgcn)
@@ -426,14 +437,12 @@ class HIPBackend(BaseBackend):
  if knobs.compilation.enable_asan:
  target_features = '+xnack'
  hsaco = amd.assemble_amdgcn(src, options.arch, target_features)
-
- rocm_path = HIPBackend.path_to_rocm_lld()
  with tempfile.NamedTemporaryFile() as tmp_out:
  with tempfile.NamedTemporaryFile() as tmp_in:
- with open(tmp_in.name, 'wb') as fd_in:
+ with open(tmp_in.name, "wb") as fd_in:
  fd_in.write(hsaco)
- subprocess.check_call([rocm_path, '-flavor', 'gnu', '-shared', tmp_in.name, '-o', tmp_out.name])
- with open(tmp_out.name, 'rb') as fd_out:
+ amd.link_hsaco(tmp_in.name, tmp_out.name)
+ with open(tmp_out.name, "rb") as fd_out:
  ret = fd_out.read()
  return ret
 
@@ -442,12 +451,11 @@ class HIPBackend(BaseBackend):
  stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options)
  stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options)
  elif language == Language.GLUON:
- stages["ttgir"] = lambda src, metadata: self.ttgir_opt(src, metadata, options)
+ stages["ttgir"] = lambda src, metadata: self.gluon_to_ttgir(src, metadata, options)
  stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options)
  stages["amdgcn"] = lambda src, metadata: self.make_amdgcn(src, metadata, options)
  stages["hsaco"] = lambda src, metadata: self.make_hsaco(src, metadata, options)
 
  @functools.lru_cache()
  def hash(self):
- version = subprocess.check_output([HIPBackend.path_to_rocm_lld(), "--version"], encoding='utf-8')
- return f'{version}-{self.target}'
+ return f'{self.target}'
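Per the `__post_init__` hunk above, requesting `kpack != 1` on gfx950 now emits a deprecation warning and overrides the value instead of failing an assertion. A hedged sketch of the new behavior (assumes the other `HIPOptions` fields keep their defaults and that this AMD backend is importable on the host; illustrative only):

```python
import warnings
from triton.backends.amd.compiler import HIPOptions

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    opts = HIPOptions(arch="gfx950", kpack=2)   # previously: assertion error

print(opts.kpack)          # -> 1, overwritten after the deprecation warning
print(caught[0].message)   # -> "kpack is deprecated starting from gfx950 ..."
```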