triton-windows 3.4.0.post20__cp310-cp310-win_amd64.whl → 3.5.0.post21__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of triton-windows might be problematic.

Files changed (107)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +8 -2
  3. triton/_filecheck.py +24 -14
  4. triton/_internal_testing.py +70 -4
  5. triton/_utils.py +3 -1
  6. triton/backends/amd/compiler.py +68 -60
  7. triton/backends/amd/driver.c +113 -44
  8. triton/backends/amd/driver.py +133 -57
  9. triton/backends/driver.py +13 -0
  10. triton/backends/nvidia/compiler.py +80 -22
  11. triton/backends/nvidia/driver.c +88 -15
  12. triton/backends/nvidia/driver.py +130 -123
  13. triton/compiler/__init__.py +5 -2
  14. triton/compiler/code_generator.py +270 -163
  15. triton/compiler/compiler.py +45 -62
  16. triton/experimental/gluon/__init__.py +3 -2
  17. triton/experimental/gluon/_runtime.py +9 -6
  18. triton/experimental/gluon/language/__init__.py +117 -16
  19. triton/experimental/gluon/language/_core.py +246 -68
  20. triton/experimental/gluon/language/_layouts.py +398 -45
  21. triton/experimental/gluon/language/_math.py +17 -9
  22. triton/experimental/gluon/language/_semantic.py +130 -37
  23. triton/experimental/gluon/language/_standard.py +55 -22
  24. triton/experimental/gluon/language/amd/__init__.py +4 -0
  25. triton/experimental/gluon/language/amd/_layouts.py +96 -0
  26. triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
  27. triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
  28. triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
  29. triton/experimental/gluon/language/extra/__init__.py +3 -0
  30. triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
  31. triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
  32. triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
  33. triton/experimental/gluon/language/nvidia/blackwell/__init__.py +192 -7
  34. triton/experimental/gluon/language/nvidia/blackwell/tma.py +20 -0
  35. triton/experimental/gluon/language/nvidia/hopper/__init__.py +124 -3
  36. triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +20 -37
  37. triton/experimental/gluon/language/nvidia/hopper/tma.py +4 -3
  38. triton/experimental/gluon/nvidia/hopper.py +6 -1
  39. triton/knobs.py +132 -67
  40. triton/language/__init__.py +16 -10
  41. triton/language/core.py +163 -83
  42. triton/language/extra/cuda/gdc.py +6 -6
  43. triton/language/extra/hip/__init__.py +3 -1
  44. triton/language/extra/hip/libdevice.py +7 -0
  45. triton/language/extra/hip/utils.py +35 -0
  46. triton/language/extra/libdevice.py +4 -0
  47. triton/language/semantic.py +76 -23
  48. triton/language/standard.py +14 -14
  49. triton/language/target_info.py +54 -0
  50. triton/runtime/_allocation.py +15 -3
  51. triton/runtime/_async_compile.py +55 -0
  52. triton/runtime/autotuner.py +4 -5
  53. triton/runtime/build.py +11 -9
  54. triton/runtime/cache.py +44 -1
  55. triton/runtime/driver.py +16 -41
  56. triton/runtime/interpreter.py +31 -23
  57. triton/runtime/jit.py +318 -157
  58. triton/runtime/tcc/include/_mingw.h +8 -10
  59. triton/runtime/tcc/include/assert.h +5 -0
  60. triton/runtime/tcc/include/errno.h +1 -1
  61. triton/runtime/tcc/include/float.h +21 -3
  62. triton/runtime/tcc/include/iso646.h +36 -0
  63. triton/runtime/tcc/include/limits.h +5 -0
  64. triton/runtime/tcc/include/malloc.h +2 -2
  65. triton/runtime/tcc/include/math.h +21 -261
  66. triton/runtime/tcc/include/stdalign.h +16 -0
  67. triton/runtime/tcc/include/stdarg.h +5 -70
  68. triton/runtime/tcc/include/stdatomic.h +171 -0
  69. triton/runtime/tcc/include/stddef.h +7 -19
  70. triton/runtime/tcc/include/stdlib.h +15 -4
  71. triton/runtime/tcc/include/stdnoreturn.h +7 -0
  72. triton/runtime/tcc/include/sys/stat.h +2 -2
  73. triton/runtime/tcc/include/sys/types.h +5 -0
  74. triton/runtime/tcc/include/tcc/tcc_libm.h +444 -27
  75. triton/runtime/tcc/include/tccdefs.h +342 -0
  76. triton/runtime/tcc/include/tgmath.h +89 -0
  77. triton/runtime/tcc/include/uchar.h +33 -0
  78. triton/runtime/tcc/include/unistd.h +1 -0
  79. triton/runtime/tcc/include/winapi/qos.h +72 -0
  80. triton/runtime/tcc/include/winapi/shellapi.h +59 -0
  81. triton/runtime/tcc/include/winapi/winbase.h +9 -2
  82. triton/runtime/tcc/include/winapi/wincon.h +8 -0
  83. triton/runtime/tcc/include/winapi/windows.h +1 -1
  84. triton/runtime/tcc/include/winapi/winnls.h +778 -0
  85. triton/runtime/tcc/include/winapi/winnt.h +9 -7
  86. triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
  87. triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
  88. triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
  89. triton/runtime/tcc/lib/libtcc1.a +0 -0
  90. triton/runtime/tcc/lib/python314.def +1800 -0
  91. triton/runtime/tcc/lib/python314t.def +1809 -0
  92. triton/runtime/tcc/libtcc.dll +0 -0
  93. triton/runtime/tcc/tcc.exe +0 -0
  94. triton/tools/compile.py +62 -14
  95. triton/tools/extra/cuda/compile.c +1 -0
  96. triton/tools/extra/hip/compile.cpp +66 -0
  97. triton/tools/extra/hip/compile.h +13 -0
  98. triton/tools/ragged_tma.py +92 -0
  99. triton/tools/tensor_descriptor.py +7 -9
  100. triton/windows_utils.py +42 -79
  101. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/METADATA +3 -4
  102. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/RECORD +106 -75
  103. triton/runtime/tcc/lib/libtcc1-64.a +0 -0
  104. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/WHEEL +0 -0
  105. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/entry_points.txt +0 -0
  106. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/licenses/LICENSE +0 -0
  107. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/top_level.txt +0 -0
triton/experimental/gluon/language/nvidia/hopper/__init__.py CHANGED
@@ -1,11 +1,132 @@
-from . import mbarrier
-from . import tma
+from __future__ import annotations
+from triton.compiler.code_generator import unflatten_ir_values
+from ..ampere import async_copy
+from . import mbarrier, tma
 from ... import _core
 
-__all__ = ["fence_async_shared", "mbarrier", "tma"]
+from typing import List, Tuple, TYPE_CHECKING
+if TYPE_CHECKING:
+    from triton._C.libtriton import ir
+
+__all__ = ["async_copy", "fence_async_shared", "mbarrier", "tma", "warpgroup_mma", "warpgroup_mma_wait"]
 
 
 @_core.builtin
 def fence_async_shared(cluster=False, _semantic=None):
+    """
+    Issue a fence to complete asynchronous shared memory operations.
+
+    Args:
+        cluster (bool): Whether to fence across cluster. Defaults to False.
+    """
     cluster = _core._unwrap_if_constexpr(cluster)
     _semantic.builder.create_fence_async_shared(cluster)
+
+
+class warpgroup_mma_accumulator_type(_core.base_type):
+    tensor_type: _core.dtype
+
+    def __init__(self, tensor_type: _core.dtype):
+        self.tensor_type = tensor_type
+
+    def __str__(self) -> str:
+        return f"warpgroup_mma_accumulator<{self.tensor_type}>"
+
+    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[warpgroup_mma_accumulator, int]:
+        return warpgroup_mma_accumulator(handles[cursor], self.tensor_type), cursor + 1
+
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        self.tensor_type._flatten_ir_types(builder, out)
+
+    def __eq__(self, other) -> bool:
+        return type(self) is type(other) and self.tensor_type == other.tensor_type
+
+    def mangle(self) -> str:
+        return f"FT{self.tensor_type.mangle()}FT"
+
+
+class warpgroup_mma_accumulator(_core.base_value):
+    handle: ir.value
+    type: warpgroup_mma_accumulator_type
+
+    def __init__(self, handle, tensor_type: _core.dtype):
+        self.handle = handle
+        self.type = warpgroup_mma_accumulator_type(tensor_type)
+
+    def _flatten_ir(self, handles: List[ir.value]) -> None:
+        handles.append(self.handle)
+
+
+@_core.builtin
+def warpgroup_mma_init(value, _semantic):
+    assert isinstance(value, _core.tensor)
+    return warpgroup_mma_accumulator(value.handle, value.type)
+
+
+@_core.builtin
+def warpgroup_mma(a, b, acc, *, use_acc=True, precision=None, max_num_imprecise_acc=None, is_async=False,
+                  _semantic=None):
+    """
+    Perform warpgroup MMA (Tensor Core) operations.
+    acc = a * b + (acc if use_acc else 0)
+
+    Args:
+        a (tensor or shared_memory_descriptor): Left hand side operand.
+        b (shared_memory_descriptor): Right hand side operand.
+        acc (tensor): Accumulator tensor.
+        use_acc (bool): Whether to use the initial value of the accumulator. Defaults to True.
+        precision (str, optional): Dot input precision. Defaults to builder default.
+        max_num_imprecise_acc (int): Max imprecise accumulations. Used for fp8 -> fp32 dot. Determines how many accumulation are done in limited precision. Defaults to None, which means no upcasting is done.
+        is_async (bool): Whether operation is asynchronous. Defaults to False.
+
+    Returns:
+        tensor or warpgroup_mma_accumulator: Returns the result if synchronous, or a token to load the value once computed if asynchronous.
+    """
+    use_acc = _semantic.to_tensor(use_acc)
+
+    if precision is None:
+        precision = _semantic.builder.options.default_dot_input_precision
+
+    precision = _semantic._str_to_dot_input_precision(precision)
+
+    K = a.type.shape[-1]
+    if max_num_imprecise_acc is None:
+        if a.dtype.is_fp8() and b.dtype.is_fp8():
+            max_num_imprecise_acc = _semantic.builder.options.max_num_imprecise_acc_default
+        else:
+            max_num_imprecise_acc = 0
+    else:
+        if a.dtype.is_fp8() and b.dtype.is_fp8() and max_num_imprecise_acc > K:
+            raise ValueError(f"max_num_imprecise_acc ({max_num_imprecise_acc}) must be <= K ({K})")
+
+    max_num_imprecise_acc = _core._unwrap_if_constexpr(max_num_imprecise_acc)
+    is_async = _core._unwrap_if_constexpr(is_async)
+
+    handle = _semantic.builder.create_warpgroup_mma(a.handle, b.handle, acc.handle, use_acc.handle, precision,
+                                                    max_num_imprecise_acc, is_async)
+    tensor_ty = acc.type.tensor_type if isinstance(acc, warpgroup_mma_accumulator) else acc.type
+    if is_async:
+        return warpgroup_mma_accumulator(handle, tensor_ty)
+    else:
+        return _core.tensor(handle, tensor_ty)
+
+
+@_core.builtin
+def warpgroup_mma_wait(num_outstanding=0, deps=None, _semantic=None):
+    """
+    Wait until `num_outstanding` or less warpgroup MMA operations are in-flight.
+
+    Args:
+        num_outstanding (int): Number of outstanding warpgroup MMA operations to wait for. Defaults to 0.
+        deps (Sequence[tensor]): List of dependencies that need to be kept alive while the mma is unfinished.
+    """
+    if deps is None:
+        raise ValueError("warpgroup_mma_wait deps must be given")
+    deps_handles = [x.handle for x in deps] if deps is not None else []
+    num_outstanding = _core._unwrap_if_constexpr(num_outstanding)
+    results = _semantic.builder.create_warpgroup_mma_wait(deps_handles, num_outstanding)
+    result_types = [dep.type.tensor_type if isinstance(dep, warpgroup_mma_accumulator) else dep.type for dep in deps]
+    results = unflatten_ir_values(results, result_types)
+    if len(deps) == 1:
+        return next(results)
+    return tuple(results)
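
For orientation, a minimal sketch of how the asynchronous warpgroup MMA API added above might be used. This is illustrative only: it shows the call sequence as it would appear inside a Gluon JIT kernel, and the kernel scaffolding (decorator, grid launch, shared-memory allocation and loads) plus the helper name are assumptions, not code from this package.

from triton.experimental.gluon.language.nvidia import hopper

def mma_pipeline_step(acc, a_smem, b_smem):
    # Start the MMA asynchronously; with is_async=True the builtin returns a
    # warpgroup_mma_accumulator token instead of a plain tensor.
    token = hopper.warpgroup_mma(a_smem, b_smem, acc, is_async=True)
    # ... independent work can overlap with the in-flight MMA here ...
    # Block until no MMAs remain outstanding; `deps` keeps the token alive and,
    # with a single dependency, the call returns the accumulated tensor directly.
    return hopper.warpgroup_mma_wait(num_outstanding=0, deps=[token])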
triton/experimental/gluon/language/nvidia/hopper/mbarrier.py CHANGED
@@ -1,51 +1,34 @@
-from triton.experimental.gluon.language._layouts import SwizzledSharedLayout
-from triton.experimental.gluon.language._core import builtin, _unwrap_if_constexpr
+from ..ampere.mbarrier import MBarrierLayout, init, invalidate, wait
+from ..._core import _unwrap_if_constexpr, builtin
 
-__all__ = ["MBarrierLayout", "init", "invalidate", "expect", "wait", "arrive"]
-
-
-class MBarrierLayout(SwizzledSharedLayout):
-
-    def __init__(self, ctas_per_cga: int = 1, cta_split_num: int = 1):
-        super().__init__(
-            vec=1,
-            per_phase=1,
-            max_phase=1,
-            order=[0],
-            ctas_per_cga=[ctas_per_cga],
-            cta_split_num=[cta_split_num],
-            cta_order=[0],
-        )
-
-
-@builtin
-def init(mbarrier, count, _semantic=None):
-    count = _unwrap_if_constexpr(count)
-    _semantic.builder.create_mbarrier_init(mbarrier.handle, count)
-
-
-@builtin
-def invalidate(mbarrier, _semantic=None):
-    _semantic.builder.create_mbarrier_inval(mbarrier.handle)
+__all__ = ["arrive", "expect", "init", "invalidate", "MBarrierLayout", "wait"]
 
 
 @builtin
 def expect(mbarrier, bytes, pred=True, _semantic=None):
+    """
+    Expect a specific number of bytes being copied. When they are copied, the barrier is signaled.
+
+    Args:
+        mbarrier (shared_memory_descriptor): Barrier that will be signaled when the operation is complete.
+        bytes (int): Expected byte count.
+        pred (bool): Scalar predicate. Operation is skipped if predicate is False. Defaults to True.
+    """
     bytes = _unwrap_if_constexpr(bytes)
     pred = _semantic.to_tensor(pred)
     _semantic.builder.create_mbarrier_expect(mbarrier.handle, bytes, pred.handle)
 
 
 @builtin
-def wait(mbarrier, phase, pred=True, deps=(), _semantic=None):
-    phase = _semantic.to_tensor(phase)
-    pred = _semantic.to_tensor(pred)
-    deps = [x.handle for x in deps]
-    _semantic.builder.create_mbarrier_wait(mbarrier.handle, phase.handle, pred.handle, deps)
-
-
-@builtin
-def arrive(mbarrier, count, pred=True, _semantic=None):
+def arrive(mbarrier, *, count=1, pred=True, _semantic=None):
+    """
+    Arrive at an mbarrier with a specified count.
+
+    Args:
+        mbarrier (shared_memory_descriptor): Barrier to be signalled.
+        count (int): Count to arrive with. Defaults to 1.
+        pred (bool): Scalar predicate. Operation is skipped if predicate is False. Defaults to True.
+    """
     count = _unwrap_if_constexpr(count)
     pred = _semantic.to_tensor(pred)
     _semantic.builder.create_mbarrier_arrive(mbarrier.handle, count, pred.handle)
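
A hedged sketch of the producer/consumer pattern these mbarrier builtins support, again as it would appear inside a Gluon kernel body; the barrier allocation, the asynchronous copy itself, and the `wait` signature (now re-exported from the Ampere module) are assumptions based on this diff, not verified against the wheel.

from triton.experimental.gluon.language.nvidia.hopper import mbarrier

def barrier_roundtrip(bar, smem_bytes, phase):
    mbarrier.init(bar, count=1)        # init/invalidate/wait now come from ..ampere.mbarrier
    mbarrier.expect(bar, smem_bytes)   # barrier is signaled once smem_bytes have been copied
    # ... issue the asynchronous copy that targets `bar` here ...
    mbarrier.wait(bar, phase)          # block until the barrier flips to the expected phase
    mbarrier.arrive(bar, count=1)      # note: count is keyword-only in the new signature
    mbarrier.invalidate(bar)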
triton/experimental/gluon/language/nvidia/hopper/tma.py CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 from typing import List, Tuple, TYPE_CHECKING
 from dataclasses import dataclass
+from triton.language.core import base_type, base_value
 import triton.experimental.gluon.language._core as ttgl
 from triton.experimental.gluon.language._layouts import NVMMASharedLayout
 from triton.experimental.gluon.language._core import builtin, _unwrap_if_constexpr
@@ -12,7 +13,7 @@ __all__ = ["async_copy_global_to_shared", "async_copy_shared_to_global", "store_
 
 
 @dataclass(eq=True)
-class tensor_descriptor_type:
+class tensor_descriptor_type(base_type):
     block_type: ttgl.block_type
     shape_type: ttgl.tuple_type
     strides_type: ttgl.tuple_type
@@ -41,10 +42,10 @@ class tensor_descriptor_type:
         self.strides_type._flatten_ir_types(builder, out)
 
     def mangle(self) -> str:
-        return f"TD{self.block_type.mangle}_{self.layout.mangle()}TD"
+        return f"TD{self.block_type.mangle()}_{self.layout.mangle()}TD"
 
 
-class tensor_descriptor:
+class tensor_descriptor(base_value):
 
     def __init__(self, handle, shape: List[ttgl.tensor], strides: List[ttgl.tensor], block_type: ttgl.block_type,
                  layout: NVMMASharedLayout):
triton/experimental/gluon/nvidia/hopper.py CHANGED
@@ -13,6 +13,7 @@ class TensorDescriptor:
     strides: List[int]
     block_shape: List[int]
     layout: NVMMASharedLayout
+    padding: str = "zero"
 
     def __post_init__(self):
         rank = len(self.shape)
@@ -28,13 +29,17 @@ class TensorDescriptor:
             assert (stride * elem_bytes) % 16 == 0, "strides must be 16-byte aligned"
         assert self.strides[-1] == 1, "Last dimension must be contiguous"
         assert isinstance(self.layout, NVMMASharedLayout), "Layout must be NVMMASharedLayout"
+        assert self.padding == "zero" or self.padding == "nan", "Illegal value for padding"
+        if self.padding == "nan":
+            assert self.base.dtype.is_floating_point, "Padding option `nan` is only supported for floating point tensors"
 
     @staticmethod
-    def from_tensor(tensor: Any, block_shape: List[int], layout: NVMMASharedLayout):
+    def from_tensor(tensor: Any, block_shape: List[int], layout: NVMMASharedLayout, padding="zero"):
         return TensorDescriptor(
             tensor,
             tensor.shape,
             tensor.stride(),
             block_shape,
             layout,
+            padding,
         )
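
Assuming this hunk belongs to triton/experimental/gluon/nvidia/hopper.py (the +6/-1 entry in the file list above), a small sketch of the new padding option; the layout value and block shape below are placeholders chosen for illustration:

import torch
from triton.experimental.gluon.nvidia.hopper import TensorDescriptor

def make_descriptor(x: torch.Tensor, layout):
    # `layout` must be an NVMMASharedLayout instance; its parameters are omitted here.
    # padding="nan" is only accepted for floating-point tensors (see the assert above);
    # the default stays "zero", so existing callers are unaffected.
    return TensorDescriptor.from_tensor(x, block_shape=[64, 64], layout=layout, padding="nan")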
triton/knobs.py CHANGED
@@ -1,15 +1,19 @@
 from __future__ import annotations
 
+import functools
 import importlib
 import os
 import re
 import subprocess
 import sysconfig
+import warnings
 
 from dataclasses import dataclass
 from contextlib import contextmanager
 from typing import cast, Any, Callable, Generator, Generic, Optional, Protocol, Type, TypeVar, TypedDict, TYPE_CHECKING, Union
 
+from triton._C.libtriton import getenv, getenv_bool  # type: ignore
+
 if TYPE_CHECKING:
     from .runtime.cache import CacheManager, RemoteCacheBackend
     from .runtime.jit import JitFunctionInfo, KernelParam
@@ -25,11 +29,6 @@ env = Env()
 propagate_env: bool = True
 
 
-def getenv(key: str) -> Optional[str]:
-    res = os.getenv(key)
-    return res.strip() if res is not None else res
-
-
 def setenv(key: str, value: Optional[str]) -> None:
     if not propagate_env:
         return
@@ -62,32 +61,25 @@ def toenv(val: Any) -> Union[None, tuple[Optional[str]]]:
 SetType = TypeVar("SetType")
 GetType = TypeVar("GetType")
 
+_NOTHING = object()
+
 
 class env_base(Generic[SetType, GetType]):
 
-    def __init__(self, key: str, default: Union[SetType, Callable[[], SetType]]) -> None:
+    def __init__(self, key: str) -> None:
         self.key = key
-        self.default: Callable[[], SetType] = default if callable(default) else lambda: default
 
     def __set_name__(self, objclass: Type[object], name: str) -> None:
         self.name = name
 
     def __get__(self, obj: Optional[object], objclass: Optional[Type[object]]) -> GetType:
-        if obj is None:
-            raise AttributeError(f"Cannot access {type(self)} on non-instance")
-
-        if self.name in obj.__dict__:
-            return self.transform(obj.__dict__[self.name])
-        else:
+        py_val = obj.__dict__.get(self.name, _NOTHING)
+        if py_val is _NOTHING:
             return self.get()
-
-    @property
-    def env_val(self) -> str | None:
-        return getenv(self.key)
+        return self.transform(py_val)
 
     def get(self) -> GetType:
-        env = self.env_val
-        return self.transform(self.default() if env is None else self.from_env(env))
+        raise NotImplementedError()
 
     def __set__(self, obj: object, value: Union[SetType, Env]) -> None:
         if isinstance(value, Env):
@@ -105,54 +97,70 @@ class env_base(Generic[SetType, GetType]):
         # if GetType != SetType.
         return cast(GetType, val)
 
-    def from_env(self, val: str) -> SetType:
-        raise NotImplementedError()
-
 
 
 class env_str(env_base[str, str]):
 
-    def from_env(self, val: str) -> str:
-        return val
+    def __init__(self, key: str, default: str):
+        super().__init__(key)
+        self.default = default
+
+    def get(self) -> str:
+        return getenv(self.key, self.default)
+
+
+class env_str_callable_default(env_base[str, str]):
+
+    def __init__(self, key: str, default_factory: Callable[[], str]):
+        super().__init__(key)
+        self.default_factory = default_factory
+
+    def get(self) -> str:
+        env_val = getenv(self.key)
+        if env_val is None:
+            return self.default_factory()
+        return env_val
 
 
 class env_bool(env_base[bool, bool]):
 
-    def __init__(self, key: str, default: Union[bool, Callable[[], bool]] = False) -> None:
-        super().__init__(key, default)
+    def __init__(self, key: str, default: bool = False) -> None:
+        super().__init__(key)
+        self.default = default
 
-    def from_env(self, val: str) -> bool:
-        return val.lower() in ("1", "true", "yes", "on", "y")
+    def get(self) -> bool:
+        return getenv_bool(self.key, self.default)
 
 
 class env_int(env_base[int, int]):
 
-    def __init__(self, key: str, default: Union[int, Callable[[], int]] = 0) -> None:
-        super().__init__(key, default)
+    def __init__(self, key: str, default: int = 0) -> None:
+        super().__init__(key)
+        self.default = default
 
-    def from_env(self, val: str) -> int:
+    def get(self) -> int:
+        val = getenv(self.key)
+        if val is None:
+            return self.default
         try:
             return int(val)
         except ValueError as exc:
             raise RuntimeError(f"Unable to use {self.key}={val}: expected int") from exc
 
 
-class env_opt_base(Generic[GetType, SetType], env_base[Optional[GetType], Optional[SetType]]):
-
-    def __init__(self, key: str) -> None:
-        super().__init__(key, None)
-
-
 ClassType = TypeVar("ClassType")
 
 
-class env_class(Generic[ClassType], env_opt_base[Type[ClassType], Type[ClassType]]):
+class env_class(Generic[ClassType], env_base[Optional[Type[ClassType]], Optional[Type[ClassType]]]):
 
     def __init__(self, key: str, type: str) -> None:
         super().__init__(key)
         # We can't pass the type directly to avoid import cycles
         self.type = type
 
-    def from_env(self, val: str) -> Type[ClassType]:
+    def get(self) -> Optional[Type[ClassType]]:
+        val = getenv(self.key)
+        if val is None:
+            return None
         comps = val.split(":", 1)
         if len(comps) != 2:
             raise RuntimeError(f"Unable to read {self.key}: '{val}' isn't of the form MODULE:CLASS")
@@ -170,16 +178,15 @@ class NvidiaTool:
     version: str
 
     @staticmethod
+    @functools.lru_cache
     def from_path(path: str) -> Optional[NvidiaTool]:
         try:
             result = subprocess.check_output([path, "--version"], stderr=subprocess.STDOUT)
-            if result is None:
-                return None
             version = re.search(r".*release (\d+\.\d+).*", result.decode("utf-8"), flags=re.MULTILINE)
             if version is None:
                 return None
             return NvidiaTool(path, version.group(1))
-        except subprocess.CalledProcessError:
+        except (subprocess.CalledProcessError, FileNotFoundError):
             return None
 
 
@@ -202,6 +209,7 @@ def find_nvidia_tool(binary: str) -> str:
         if os.access(path, os.X_OK):
             return path
 
+    warnings.warn(f"Failed to find {binary}")
    return ""
 
 
@@ -210,34 +218,38 @@ class env_nvidia_tool(env_base[str, NvidiaTool]):
     def __init__(self, binary: str) -> None:
         binary += sysconfig.get_config_var("EXE")
         self.binary = binary
-        super().__init__(f"TRITON_{binary.upper()}_PATH", lambda: find_nvidia_tool(self.binary))
+        self.default_path = find_nvidia_tool(binary)
+        super().__init__(f"TRITON_{binary.upper()}_PATH")
+
+    def get(self) -> NvidiaTool:
+        return self.transform(getenv(self.key))
 
     def transform(self, path: str) -> NvidiaTool:
-        paths = [
-            path,
-            # We still add default as fallback in case the pointed binary isn't
-            # accessible.
-            self.default(),
-        ]
+        # We still add default as fallback in case the pointed binary isn't
+        # accessible.
+        if path is not None:
+            paths = [path, self.default_path]
+        else:
+            paths = [self.default_path]
+
         for path in paths:
-            if not path or not os.access(path, os.X_OK):
-                continue
             if tool := NvidiaTool.from_path(path):
                 return tool
 
         raise RuntimeError(f"Cannot find {self.binary}")
 
-    def from_env(self, val: str) -> str:
-        return val
-
 
 # Separate classes so that types are correct
-class env_opt_str(env_opt_base[str, str], env_str):
-    pass
+class env_opt_str(env_base[Optional[str], Optional[str]]):
+
+    def get(self) -> Optional[str]:
+        return getenv(self.key)
 
 
-class env_opt_bool(env_opt_base[bool, bool], env_bool):
-    pass
+class env_opt_bool(env_base):
+
+    def get(self) -> Optional[str]:
+        return getenv_bool(self.key, None)
 
 
 @dataclass(frozen=True)
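
A short sketch of the precedence these env_* descriptors implement: an explicit assignment on a knobs object overrides the environment variable, assigning the module-level Env sentinel reverts to environment lookup, and scope() restores both. The knob names used below are real, but the behavior shown is inferred from this diff rather than tested against the released wheel:

import triton.knobs as knobs

knobs.amd.use_buffer_ops = True        # explicit value wins over AMDGCN_USE_BUFFER_OPS
assert knobs.amd.use_buffer_ops

knobs.amd.use_buffer_ops = knobs.env   # assigning the Env sentinel falls back to the env var / default

with knobs.amd.scope():                # scope() snapshots env vars and overrides, restoring them on exit
    knobs.amd.use_buffer_ops = False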
@@ -305,7 +317,7 @@ class base_knobs:
     @contextmanager
     def scope(self) -> Generator[None, None, None]:
         try:
-            initial_env = {knob.key: knob.env_val for knob in self.knob_descriptors.values()}
+            initial_env = {knob.key: getenv(knob.key) for knob in self.knob_descriptors.values()}
             orig = dict(self.__dict__)
             yield
         finally:
@@ -350,11 +362,11 @@ cache: cache_knobs
 
 
 class cache_knobs(base_knobs):
-    home_dir: env_str = env_str("TRITON_HOME", lambda: os.path.expanduser("~/"))
+    home_dir: env_str = env_str("TRITON_HOME", os.path.expanduser("~/"))
 
-    dump_dir: env_str = env_str("TRITON_DUMP_DIR", lambda: cache.get_triton_dir("dump"))
-    override_dir: env_str = env_str("TRITON_OVERRIDE_DIR", lambda: cache.get_triton_dir("override"))
-    dir: env_str = env_str("TRITON_CACHE_DIR", lambda: cache.get_triton_dir("cache"))
+    dump_dir = env_str_callable_default("TRITON_DUMP_DIR", lambda: cache.get_triton_dir("dump"))
+    override_dir = env_str_callable_default("TRITON_OVERRIDE_DIR", lambda: cache.get_triton_dir("override"))
+    dir = env_str_callable_default("TRITON_CACHE_DIR", lambda: cache.get_triton_dir("cache"))
 
     manager_class: env_class[CacheManager] = env_class("TRITON_CACHE_MANAGER", "CacheManager")
     remote_manager_class: env_class[RemoteCacheBackend] = env_class("TRITON_REMOTE_CACHE_BACKEND", "RemoteCacheBackend")
@@ -374,6 +386,7 @@ class compilation_knobs(base_knobs):
     disable_line_info: env_bool = env_bool("TRITON_DISABLE_LINE_INFO")
     front_end_debugging: env_bool = env_bool("TRITON_FRONT_END_DEBUGGING")
     allow_non_constexpr_globals: env_bool = env_bool("TRITON_ALLOW_NON_CONSTEXPR_GLOBALS")
+    enable_experimental_consan: env_bool = env_bool("TRITON_ENABLE_EXPERIMENTAL_CONSAN")
     listener: Union[CompilationListener, None] = None
 
 
@@ -383,11 +396,53 @@ class autotuning_knobs(base_knobs):
 
 
 class LaunchHook(Protocol):
+    """Hook invoked before and after kernel launching
+    """
 
     def __call__(self, metadata: LazyDict) -> None:
         ...
 
 
+class InitHandleHook(Protocol):
+    """Hook invoked around kernel binary/module loading.
+    module/function can be None for the *start* hook (before loading).
+    """
+
+    def __call__(
+        self,
+        module: Optional[object],
+        function: Optional[Callable],
+        name: str,
+        metadata_group: dict[str, str],
+        hash: str,
+    ) -> None:
+        ...
+
+
+F = TypeVar("F", bound=Callable)
+
+
+class HookChain(Generic[F]):
+    """A chain of hooks of the same type F to be called in order.
+    """
+
+    def __init__(self, reversed: bool = False):
+        self.calls: list[F] = []
+        self.reversed = reversed
+
+    def add(self, func: F) -> None:
+        if func not in self.calls:
+            self.calls.append(func)
+
+    def remove(self, func: F) -> None:
+        if func in self.calls:
+            self.calls.remove(func)
+
+    def __call__(self, *args, **kwargs):
+        for call in self.calls if not self.reversed else reversed(self.calls):
+            call(*args, **kwargs)
+
+
 # This is of the form [attr_name, attr_val]
 # TODO: Use tuple instead of list for better typing.
 KernelAttr = list[Union[str, int]]
@@ -418,11 +473,15 @@ class JITHook(Protocol):
 
 
 class runtime_knobs(base_knobs):
     interpret: env_bool = env_bool("TRITON_INTERPRET")
-    debug: env_bool = env_bool("TRITON_DEBUG")
+    # debug is on critical path for kernel launches
+    # avoid repeated reads from env-var by calling get directly
+    debug: bool = env_bool("TRITON_DEBUG").get()
     override_arch: env_opt_str = env_opt_str("TRITON_OVERRIDE_ARCH")
 
-    launch_enter_hook: Optional[LaunchHook] = None
-    launch_exit_hook: Optional[LaunchHook] = None
+    launch_enter_hook: HookChain[LaunchHook] = HookChain()
+    launch_exit_hook: HookChain[LaunchHook] = HookChain(reversed=True)
+    kernel_load_start_hook: HookChain[InitHandleHook] = HookChain()
+    kernel_load_end_hook: HookChain[InitHandleHook] = HookChain(reversed=True)
 
     # Hook for inspecting compiled functions and modules
@@ -444,6 +503,7 @@ class nvidia_knobs(base_knobs):
     dump_nvptx: env_bool = env_bool("NVPTX_ENABLE_DUMP")
     disable_ptxas_opt: env_bool = env_bool("DISABLE_PTXAS_OPT")
     mock_ptx_version: env_opt_str = env_opt_str("TRITON_MOCK_PTX_VERSION")
+    dump_ptxas_log: env_bool = env_bool("TRITON_DUMP_PTXAS_LOG")
 
     libdevice_path: env_opt_str = env_opt_str("TRITON_LIBDEVICE_PATH")
     libcuda_path: env_opt_str = env_opt_str("TRITON_LIBCUDA_PATH")
@@ -451,9 +511,10 @@
 
 class amd_knobs(base_knobs):
     use_buffer_ops: env_bool = env_bool("AMDGCN_USE_BUFFER_OPS")
+    # Note: This requires use_buffer_ops be true to have any effect
+    use_buffer_atomics: env_bool = env_bool("AMDGCN_USE_BUFFER_ATOMICS", True)
     dump_amdgcn: env_bool = env_bool("AMDGCN_ENABLE_DUMP")
     libhip_path: env_opt_str = env_opt_str("TRITON_LIBHIP_PATH")
-    lld_path: env_opt_str = env_opt_str("TRITON_HIP_LLD_PATH")
 
     # We use strs so that we can have a default value based on other runtime info
     use_block_pingpong: env_opt_bool = env_opt_bool("TRITON_HIP_USE_BLOCK_PINGPONG")
@@ -479,3 +540,7 @@ language = language_knobs()
 nvidia = nvidia_knobs()
 amd = amd_knobs()
 proton = proton_knobs()
+
+
+def refresh_knobs():
+    runtime.debug = env_bool("TRITON_DEBUG").get()
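
Since launch hooks are now HookChain instances rather than a single Optional callable, multiple tools can register side by side. A hedged registration sketch follows; the hook body is illustrative, and `metadata` is the LazyDict every LaunchHook receives:

import triton.knobs as knobs

def log_launch(metadata):
    # Called around each kernel launch once registered on a chain.
    print("kernel launch:", metadata)

knobs.runtime.launch_enter_hook.add(log_launch)
knobs.runtime.launch_exit_hook.add(log_launch)    # exit chains run their hooks in reverse order
# kernel_load_start_hook / kernel_load_end_hook accept InitHandleHook callables the same way.

knobs.runtime.launch_enter_hook.remove(log_launch)

# runtime.debug is now snapshotted at import time; pick up a changed TRITON_DEBUG with:
knobs.refresh_knobs()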