PyPI - triton-windows - Versions diffs - 3.3.1.post19__cp39-cp39-win_amd64.whl → 3.4.0.post20__cp39-cp39-win_amd64.whl - Mend

triton-windows 3.3.1.post19__cp39-cp39-win_amd64.whl → 3.4.0.post20__cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of triton-windows might be problematic. Click here for more details.

Files changed (166)

triton/_C/libtriton.pyd +0 -0
triton/__init__.py +4 -1
triton/_filecheck.py +87 -0
triton/_internal_testing.py +26 -15
triton/_utils.py +110 -21
triton/backends/__init__.py +20 -23
triton/backends/amd/__init__.py +0 -0
triton/backends/amd/compiler.py +112 -78
triton/backends/amd/driver.c +5 -2
triton/backends/amd/driver.py +149 -47
triton/backends/compiler.py +7 -21
triton/backends/nvidia/bin/ptxas.exe +0 -0
triton/backends/nvidia/compiler.py +92 -93
triton/backends/nvidia/driver.c +90 -98
triton/backends/nvidia/driver.py +303 -128
triton/compiler/code_generator.py +212 -111
triton/compiler/compiler.py +110 -25
triton/experimental/__init__.py +0 -0
triton/experimental/gluon/__init__.py +4 -0
triton/experimental/gluon/_compiler.py +0 -0
triton/experimental/gluon/_runtime.py +99 -0
triton/experimental/gluon/language/__init__.py +18 -0
triton/experimental/gluon/language/_core.py +312 -0
triton/experimental/gluon/language/_layouts.py +230 -0
triton/experimental/gluon/language/_math.py +12 -0
triton/experimental/gluon/language/_semantic.py +287 -0
triton/experimental/gluon/language/_standard.py +47 -0
triton/experimental/gluon/language/nvidia/__init__.py +4 -0
triton/experimental/gluon/language/nvidia/blackwell/__init__.py +202 -0
triton/experimental/gluon/language/nvidia/blackwell/tma.py +32 -0
triton/experimental/gluon/language/nvidia/hopper/__init__.py +11 -0
triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +51 -0
triton/experimental/gluon/language/nvidia/hopper/tma.py +96 -0
triton/experimental/gluon/nvidia/__init__.py +4 -0
triton/experimental/gluon/nvidia/blackwell.py +3 -0
triton/experimental/gluon/nvidia/hopper.py +40 -0
triton/knobs.py +481 -0
triton/language/__init__.py +39 -14
triton/language/core.py +794 -537
triton/language/extra/cuda/__init__.py +10 -7
triton/language/extra/cuda/gdc.py +42 -0
triton/language/extra/cuda/libdevice.py +394 -394
triton/language/extra/cuda/utils.py +21 -21
triton/language/extra/hip/libdevice.py +113 -104
triton/language/math.py +65 -66
triton/language/random.py +12 -2
triton/language/semantic.py +1706 -1770
triton/language/standard.py +116 -51
triton/runtime/autotuner.py +117 -59
triton/runtime/build.py +76 -12
triton/runtime/cache.py +18 -47
triton/runtime/driver.py +32 -29
triton/runtime/interpreter.py +72 -35
triton/runtime/jit.py +146 -110
triton/testing.py +16 -12
triton/tools/disasm.py +3 -4
triton/tools/tensor_descriptor.py +36 -0
triton/windows_utils.py +14 -6
{triton_windows-3.3.1.post19.dist-info → triton_windows-3.4.0.post20.dist-info}/METADATA +7 -2
triton_windows-3.4.0.post20.dist-info/RECORD +186 -0
triton_windows-3.4.0.post20.dist-info/entry_points.txt +3 -0
triton_windows-3.4.0.post20.dist-info/licenses/LICENSE +23 -0
triton_windows-3.4.0.post20.dist-info/top_level.txt +1 -0
triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h +0 -358
triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +0 -1010
triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +0 -1638
triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +0 -1814
triton/backends/amd/include/hip/amd_detail/amd_hip_bfloat16.h +0 -293
triton/backends/amd/include/hip/amd_detail/amd_hip_common.h +0 -32
triton/backends/amd/include/hip/amd_detail/amd_hip_complex.h +0 -174
triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +0 -835
triton/backends/amd/include/hip/amd_detail/amd_hip_fp16.h +0 -1809
triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +0 -1391
triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +0 -108
triton/backends/amd/include/hip/amd_detail/amd_hip_math_constants.h +0 -124
triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h +0 -405
triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +0 -196
triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +0 -565
triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h +0 -2226
triton/backends/amd/include/hip/amd_detail/amd_math_functions.h +0 -104
triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h +0 -244
triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +0 -538
triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +0 -288
triton/backends/amd/include/hip/amd_detail/concepts.hpp +0 -30
triton/backends/amd/include/hip/amd_detail/device_library_decls.h +0 -133
triton/backends/amd/include/hip/amd_detail/functional_grid_launch.hpp +0 -218
triton/backends/amd/include/hip/amd_detail/grid_launch.h +0 -67
triton/backends/amd/include/hip/amd_detail/grid_launch.hpp +0 -50
triton/backends/amd/include/hip/amd_detail/grid_launch_GGL.hpp +0 -26
triton/backends/amd/include/hip/amd_detail/helpers.hpp +0 -137
triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +0 -1446
triton/backends/amd/include/hip/amd_detail/hip_assert.h +0 -101
triton/backends/amd/include/hip/amd_detail/hip_cooperative_groups_helper.h +0 -242
triton/backends/amd/include/hip/amd_detail/hip_fp16_gcc.h +0 -254
triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h +0 -96
triton/backends/amd/include/hip/amd_detail/hip_ldg.h +0 -100
triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +0 -10570
triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +0 -78
triton/backends/amd/include/hip/amd_detail/host_defines.h +0 -184
triton/backends/amd/include/hip/amd_detail/hsa_helpers.hpp +0 -102
triton/backends/amd/include/hip/amd_detail/macro_based_grid_launch.hpp +0 -798
triton/backends/amd/include/hip/amd_detail/math_fwd.h +0 -698
triton/backends/amd/include/hip/amd_detail/ockl_image.h +0 -177
triton/backends/amd/include/hip/amd_detail/program_state.hpp +0 -107
triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h +0 -491
triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h +0 -478
triton/backends/amd/include/hip/channel_descriptor.h +0 -39
triton/backends/amd/include/hip/device_functions.h +0 -38
triton/backends/amd/include/hip/driver_types.h +0 -468
triton/backends/amd/include/hip/hip_bf16.h +0 -36
triton/backends/amd/include/hip/hip_bfloat16.h +0 -44
triton/backends/amd/include/hip/hip_common.h +0 -100
triton/backends/amd/include/hip/hip_complex.h +0 -38
triton/backends/amd/include/hip/hip_cooperative_groups.h +0 -46
triton/backends/amd/include/hip/hip_deprecated.h +0 -95
triton/backends/amd/include/hip/hip_ext.h +0 -161
triton/backends/amd/include/hip/hip_fp16.h +0 -36
triton/backends/amd/include/hip/hip_fp8.h +0 -33
triton/backends/amd/include/hip/hip_gl_interop.h +0 -32
triton/backends/amd/include/hip/hip_hcc.h +0 -24
triton/backends/amd/include/hip/hip_math_constants.h +0 -36
triton/backends/amd/include/hip/hip_profile.h +0 -27
triton/backends/amd/include/hip/hip_runtime.h +0 -75
triton/backends/amd/include/hip/hip_runtime_api.h +0 -9261
triton/backends/amd/include/hip/hip_texture_types.h +0 -29
triton/backends/amd/include/hip/hip_vector_types.h +0 -41
triton/backends/amd/include/hip/hip_version.h +0 -17
triton/backends/amd/include/hip/hiprtc.h +0 -421
triton/backends/amd/include/hip/library_types.h +0 -78
triton/backends/amd/include/hip/math_functions.h +0 -42
triton/backends/amd/include/hip/surface_types.h +0 -63
triton/backends/amd/include/hip/texture_types.h +0 -194
triton/backends/amd/include/hsa/Brig.h +0 -1131
triton/backends/amd/include/hsa/amd_hsa_common.h +0 -91
triton/backends/amd/include/hsa/amd_hsa_elf.h +0 -462
triton/backends/amd/include/hsa/amd_hsa_kernel_code.h +0 -269
triton/backends/amd/include/hsa/amd_hsa_queue.h +0 -109
triton/backends/amd/include/hsa/amd_hsa_signal.h +0 -80
triton/backends/amd/include/hsa/hsa.h +0 -5738
triton/backends/amd/include/hsa/hsa_amd_tool.h +0 -91
triton/backends/amd/include/hsa/hsa_api_trace.h +0 -579
triton/backends/amd/include/hsa/hsa_api_trace_version.h +0 -68
triton/backends/amd/include/hsa/hsa_ext_amd.h +0 -3146
triton/backends/amd/include/hsa/hsa_ext_finalize.h +0 -531
triton/backends/amd/include/hsa/hsa_ext_image.h +0 -1454
triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +0 -488
triton/backends/amd/include/hsa/hsa_ven_amd_loader.h +0 -667
triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +0 -416
triton/backends/amd/include/roctracer/ext/prof_protocol.h +0 -107
triton/backends/amd/include/roctracer/hip_ostream_ops.h +0 -4515
triton/backends/amd/include/roctracer/hsa_ostream_ops.h +0 -1727
triton/backends/amd/include/roctracer/hsa_prof_str.h +0 -3059
triton/backends/amd/include/roctracer/roctracer.h +0 -779
triton/backends/amd/include/roctracer/roctracer_ext.h +0 -81
triton/backends/amd/include/roctracer/roctracer_hcc.h +0 -24
triton/backends/amd/include/roctracer/roctracer_hip.h +0 -37
triton/backends/amd/include/roctracer/roctracer_hsa.h +0 -112
triton/backends/amd/include/roctracer/roctracer_plugin.h +0 -137
triton/backends/amd/include/roctracer/roctracer_roctx.h +0 -67
triton/backends/amd/include/roctracer/roctx.h +0 -229
triton/language/_utils.py +0 -21
triton/language/extra/cuda/_experimental_tma.py +0 -106
triton/tools/experimental_descriptor.py +0 -32
triton_windows-3.3.1.post19.dist-info/RECORD +0 -260
triton_windows-3.3.1.post19.dist-info/top_level.txt +0 -14
{triton_windows-3.3.1.post19.dist-info → triton_windows-3.4.0.post20.dist-info}/WHEEL +0 -0

triton/language/core.py CHANGED Viewed

@@ -6,14 +6,14 @@ from enum import Enum
 from functools import partial, wraps
 import typing
 from typing import Union, Callable, List, Sequence, TypeVar, Optional, Tuple
+from dataclasses import dataclass
 import builtins
-from ..runtime.jit import jit
+from .. import knobs
+from ..runtime.jit import jit, JITFunction
 import inspect
-import os
 from .._C.libtriton import ir
-from . import semantic
-from ._utils import TRITON_MAX_TENSOR_NUMEL, validate_block_shape
+from .._utils import TRITON_MAX_TENSOR_NUMEL, validate_block_shape, get_primitive_bitwidth
 T = TypeVar('T')
@@ -22,15 +22,23 @@ TRITON_BUILTIN = "__triton_builtin__"
 PropagateNan = ir.PROPAGATE_NAN
+def must_use_result(x, s=True):
+    """If the result of this function is unused, throw an error."""
+    if isinstance(x, str):
+        return (lambda fn: must_use_result(fn, x))
+    x._must_use_result = s
+    return x
 def builtin(fn: T) -> T:
     """Mark a function as a builtin."""
     assert callable(fn)
     @wraps(fn)
     def wrapper(*args, **kwargs):
-        if "_builder" not in kwargs or kwargs["_builder"] is None:
+        if "_semantic" not in kwargs or kwargs["_semantic"] is None:
             raise ValueError("Did you forget to add @triton.jit ? "
-                             "(`_builder` argument must be provided outside of JIT functions.)")
+                             "(`_semantic` argument must be provided outside of JIT functions.)")
         return fn(*args, **kwargs)
     setattr(wrapper, TRITON_BUILTIN, True)
@@ -53,8 +61,8 @@ def _tensor_member_fn(fn: T) -> T:
     """
     assert callable(fn)
     orig_sig = inspect.signature(fn)
-    # Does fn take args other than _builder, _generator, and the tensor itself?
-    has_args = len(orig_sig.parameters.keys() - {"_builder", "_generator"}) > 1
+    # Does fn take args other than _semantic, _generator, and the tensor itself?
+    has_args = len(orig_sig.parameters.keys() - {"_semantic", "_generator"}) > 1
     if not fn.__doc__:
         fn.__doc__ = ""
@@ -78,7 +86,7 @@ def _tensor_member_fn(fn: T) -> T:
     if is_builtin(fn):
         setattr(wrapper, TRITON_BUILTIN, True)
-    setattr(tensor, fn.__name__, wrapper)
+    setattr(tensor, fn.__name__, fn if isinstance(fn, JITFunction) else wrapper)
     return fn
@@ -110,8 +118,8 @@ def is_builtin(fn) -> bool:
 @builtin
-def to_tensor(x, _builder=None):
-    return semantic.to_tensor(x, _builder)
+def to_tensor(x, _semantic=None):
+    return _semantic.to_tensor(x)
 # -----------------------
@@ -130,7 +138,62 @@ class const:
     pass
-class constexpr:
+class base_value:
+    """Base class of values that exist in the triton IR (i.e. not constexprs).
+    """
+    type: base_type
+    def _flatten_ir(self, handles: List[ir.value]) -> None:
+        """Flatten frontend value into a sequence of mlir handles, which are appended
+        to the output list
+        """
+        raise NotImplementedError
+class base_type:
+    def __eq__(self, other):
+        raise NotImplementedError("Types must implement __eq__")
+    def __ne__(self, other):
+        return not (self == other)
+    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[base_value, int]:
+        """Build a frontend value with the current dtype, wrapping a list of existing handles.
+        cursor is the index of the first handle relevant to this value, and the function
+        should return the updated cursor position after any handles consumed by the created value.
+        """
+        raise NotImplementedError
+    def mangle(self) -> str:
+        raise NotImplementedError(f"NYI: Type mangling for type {self.__class__}")
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        raise NotImplementedError
+class constexpr_type(base_type):
+    def __init__(self, value):
+        self.value = value
+    def __eq__(self, other):
+        # An operand that is not a constexpr_type compares unequal rather than raising
+        # AttributeError on the missing `.value`; dtype.__eq__ and block_type.__eq__ guard the same way.
+        return isinstance(other, constexpr_type) and self.value == other.value
+    def __repr__(self) -> str:
+        return f"constexpr[{self.value}]"
+    def mangle(self) -> str:
+        return repr(self)
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        return
+    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[base_value, int]:
+        return constexpr(self.value), cursor
+class constexpr(base_value):
     """
     This class is used to store a value that is known at compile-time.
     """
@@ -140,80 +203,83 @@ class constexpr:
             self.value = value.value
         else:
             self.value = value
-        self.type = constexpr
+        self.type = constexpr_type(value)
     def __repr__(self) -> str:
         return f"constexpr[{self.value}]"
+    def _flatten_ir(self, handles: List[ir.value]) -> None:
+        return
     def __index__(self):
         return self.value
     # In interpreter mode, constant values are not wrapped in constexpr,
     # and therefore do not have a .value attribute.
-    # As a result, from here and below, we need to call the _constexpr_to_value
+    # As a result, from here and below, we need to call the _unwrap_if_constexpr
     # function to obtain either constexpr.value or the value itself.
     def __add__(self, other):
-        return constexpr(self.value + _constexpr_to_value(other))
+        return constexpr(self.value + _unwrap_if_constexpr(other))
     def __radd__(self, other):
-        return constexpr(_constexpr_to_value(other) + self.value)
+        return constexpr(_unwrap_if_constexpr(other) + self.value)
     def __sub__(self, other):
-        return constexpr(self.value - _constexpr_to_value(other))
+        return constexpr(self.value - _unwrap_if_constexpr(other))
     def __rsub__(self, other):
-        return constexpr(_constexpr_to_value(other) - self.value)
+        return constexpr(_unwrap_if_constexpr(other) - self.value)
     def __mul__(self, other):
-        return constexpr(self.value * _constexpr_to_value(other))
+        return constexpr(self.value * _unwrap_if_constexpr(other))
     def __mod__(self, other):
-        return constexpr(self.value % _constexpr_to_value(other))
+        return constexpr(self.value % _unwrap_if_constexpr(other))
     def __rmul__(self, other):
-        return constexpr(_constexpr_to_value(other) * self.value)
+        return constexpr(_unwrap_if_constexpr(other) * self.value)
     def __truediv__(self, other):
-        return constexpr(self.value / _constexpr_to_value(other))
+        return constexpr(self.value / _unwrap_if_constexpr(other))
     def __rtruediv__(self, other):
-        return constexpr(_constexpr_to_value(other) / self.value)
+        return constexpr(_unwrap_if_constexpr(other) / self.value)
     def __floordiv__(self, other):
-        return constexpr(self.value // _constexpr_to_value(other))
+        return constexpr(self.value // _unwrap_if_constexpr(other))
     def __rfloordiv__(self, other):
-        return constexpr(_constexpr_to_value(other) // self.value)
+        return constexpr(_unwrap_if_constexpr(other) // self.value)
     def __gt__(self, other):
-        return constexpr(self.value > _constexpr_to_value(other))
+        return constexpr(self.value > _unwrap_if_constexpr(other))
     def __rgt__(self, other):
-        return constexpr(_constexpr_to_value(other) > self.value)
+        return constexpr(_unwrap_if_constexpr(other) > self.value)
     def __ge__(self, other):
-        return constexpr(self.value >= _constexpr_to_value(other))
+        return constexpr(self.value >= _unwrap_if_constexpr(other))
     def __rge__(self, other):
-        return constexpr(_constexpr_to_value(other) >= self.value)
+        return constexpr(_unwrap_if_constexpr(other) >= self.value)
     def __lt__(self, other):
-        return constexpr(self.value < _constexpr_to_value(other))
+        return constexpr(self.value < _unwrap_if_constexpr(other))
     def __rlt__(self, other):
-        return constexpr(_constexpr_to_value(other) < self.value)
+        return constexpr(_unwrap_if_constexpr(other) < self.value)
     def __le__(self, other):
-        return constexpr(self.value <= _constexpr_to_value(other))
+        return constexpr(self.value <= _unwrap_if_constexpr(other))
     def __rle__(self, other):
-        return constexpr(_constexpr_to_value(other) <= self.value)
+        return constexpr(_unwrap_if_constexpr(other) <= self.value)
     def __eq__(self, other):
-        return constexpr(self.value == _constexpr_to_value(other))
+        return constexpr(self.value == _unwrap_if_constexpr(other))
     def __ne__(self, other):
-        return constexpr(self.value != _constexpr_to_value(other))
+        return constexpr(self.value != _unwrap_if_constexpr(other))
     def __bool__(self):
         return bool(self.value)
@@ -222,19 +288,19 @@ class constexpr:
         return constexpr(-self.value)
     def __and__(self, other):
-        return constexpr(self.value & _constexpr_to_value(other))
+        return constexpr(self.value & _unwrap_if_constexpr(other))
     def logical_and(self, other):
-        return constexpr(self.value and _constexpr_to_value(other))
+        return constexpr(self.value and _unwrap_if_constexpr(other))
     def __or__(self, other):
-        return constexpr(self.value | _constexpr_to_value(other))
+        return constexpr(self.value | _unwrap_if_constexpr(other))
     def __xor__(self, other):
-        return constexpr(self.value ^ _constexpr_to_value(other))
+        return constexpr(self.value ^ _unwrap_if_constexpr(other))
     def logical_or(self, other):
-        return constexpr(self.value or _constexpr_to_value(other))
+        return constexpr(self.value or _unwrap_if_constexpr(other))
     def __pos__(self):
         return constexpr(+self.value)
@@ -243,16 +309,16 @@ class constexpr:
         return constexpr(~self.value)
     def __pow__(self, other):
-        return constexpr(self.value**_constexpr_to_value(other))
+        return constexpr(self.value**_unwrap_if_constexpr(other))
     def __rpow__(self, other):
-        return constexpr(_constexpr_to_value(other)**self.value)
+        return constexpr(_unwrap_if_constexpr(other)**self.value)
     def __rshift__(self, other):
-        return constexpr(self.value >> _constexpr_to_value(other))
+        return constexpr(self.value >> _unwrap_if_constexpr(other))
     def __lshift__(self, other):
-        return constexpr(self.value << _constexpr_to_value(other))
+        return constexpr(self.value << _unwrap_if_constexpr(other))
     def __not__(self):
         return constexpr(not self.value)
@@ -263,14 +329,57 @@ class constexpr:
     def __call__(self, *args, **kwds):
         return self.value(*args, **kwds)
+    def __getitem__(self, *args):
+        args = (_unwrap_if_constexpr(x) for x in _normalize_tuple(args))
+        return self.value.__getitem__(*args)
+def constexpr_function(f):
+    """
+    Wraps an arbitrary Python function so that it can be called at
+    compile-time on constexpr arguments in a Triton function and
+    returns a constexpr result.
+    """
+    @wraps(f)
+    def wrapper(*args, _semantic=None, **kwargs):
+        # de-constexpr arguments and discard the _semantic keyword argument:
+        args = [_unwrap_if_constexpr(x) for x in args]
+        kwargs = {k: _unwrap_if_constexpr(v) for (k, v) in kwargs.items()}
+        # call the raw Python function f:
+        res = f(*args, **kwargs)
+        # convert result back to a Triton constexpr:
+        return constexpr(res)
+    # disguise the function as a Triton builtin to avoid raising an error
+    # that we're calling a non-JIT function from within a Triton kernel:
+    wrapper.__triton_builtin__ = True
+    wrapper.__module__ = constexpr_function.__module__
+    return wrapper
 CONSTEXPR_0 = constexpr(0)
 def _unwrap_if_constexpr(o):
+    if isinstance(o, list):
+        return [_unwrap_if_constexpr(x) for x in o]
+    if isinstance(o, builtins.tuple):
+        return builtins.tuple(_unwrap_if_constexpr(x) for x in o)
+    if isinstance(o, tuple):
+        return tuple(_unwrap_if_constexpr(x) for x in o)
     return o.value if isinstance(o, constexpr) else o
+def _normalize_tuple(t):
+    normalized_tuple = _unwrap_if_constexpr(t)
+    if isinstance(normalized_tuple, (list, builtins.tuple)):
+        normalized_tuple = tuple(normalized_tuple)
+    return normalized_tuple
 def check_bit_width(value, shift_value):
     if isinstance(value, tensor) and isinstance(shift_value, constexpr):
         bitwidth = value.type.scalar.primitive_bitwidth
@@ -280,34 +389,6 @@ def check_bit_width(value, shift_value):
             )
-class base_value:
-    """Base class of values that exist in the triton IR (i.e. not constexprs).
-    """
-    type: base_type
-    def _flatten_ir(self, handles: List[ir.value]) -> None:
-        """Flatten frontend value into a sequence of mlir handles, which are appended
-        to the output list
-        """
-        raise NotImplementedError
-class base_type:
-    def __eq__(self, other):
-        raise NotImplementedError("Types must implement __eq__")
-    def __ne__(self, other):
-        return not (self == other)
-    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[base_value, int]:
-        """Build a frontend value with the current dtype, wrapping a list of existing handles.
-        cursor is the index of the first handle relevant to this value, and the function
-        should return the updated cursor position after any handles consumed by the created value.
-        """
-        raise NotImplementedError
 # -----------------------
 # dtype
 # -----------------------
@@ -333,55 +414,44 @@ class dtype(base_type):
         name = _unwrap_if_constexpr(name)
         self.name = name
         assert name in dtype.SINT_TYPES + dtype.UINT_TYPES + dtype.FP_TYPES + dtype.OTHER_TYPES, name
+        self.primitive_bitwidth = get_primitive_bitwidth(name)
+        self.itemsize = self.primitive_bitwidth // 8
         if name in dtype.SINT_TYPES:
             self.int_signedness = dtype.SIGNEDNESS.SIGNED
-            self.int_bitwidth = int(name.split('int')[-1])
-            self.primitive_bitwidth = self.int_bitwidth
+            self.int_bitwidth = self.primitive_bitwidth
         elif name in dtype.UINT_TYPES:
             self.int_signedness = dtype.SIGNEDNESS.UNSIGNED
-            self.int_bitwidth = int(name.split('int')[-1])
-            self.primitive_bitwidth = self.int_bitwidth
+            self.int_bitwidth = self.primitive_bitwidth
         elif name in dtype.FP_TYPES:
             if name == 'fp8e4b15':
                 self.fp_mantissa_width = 3
-                self.primitive_bitwidth = 8
                 self.exponent_bias = 15
             elif name == 'fp8e4nv':
                 self.fp_mantissa_width = 3
-                self.primitive_bitwidth = 8
                 self.exponent_bias = 7
             elif name == 'fp8e4b8':
                 self.fp_mantissa_width = 3
-                self.primitive_bitwidth = 8
                 self.exponent_bias = 8
             elif name == 'fp8e5':
                 self.fp_mantissa_width = 2
-                self.primitive_bitwidth = 8
                 self.exponent_bias = 15
             elif name == 'fp8e5b16':
                 self.fp_mantissa_width = 2
-                self.primitive_bitwidth = 8
                 self.exponent_bias = 16
             elif name == 'fp16':
                 self.fp_mantissa_width = 10
-                self.primitive_bitwidth = 16
                 self.exponent_bias = 15
             elif name == 'bf16':
                 self.fp_mantissa_width = 7
-                self.primitive_bitwidth = 16
                 self.exponent_bias = 127
             elif name == 'fp32':
                 self.fp_mantissa_width = 23
-                self.primitive_bitwidth = 32
                 self.exponent_bias = 127
             elif name == 'fp64':
                 self.fp_mantissa_width = 52
-                self.primitive_bitwidth = 64
                 self.exponent_bias = 1023
             else:
                 raise RuntimeError(f'Unsupported floating-point type {name}')
-        elif name == 'void':
-            self.primitive_bitwidth = 0
     def is_fp8(self):
         return 'fp8' in self.name
@@ -502,10 +572,6 @@ class dtype(base_type):
     def is_const():
         return False
-    @staticmethod
-    def is_tuple():
-        return False
     def __eq__(self, other: dtype):
         if not isinstance(other, dtype):
             return False
@@ -518,13 +584,14 @@ class dtype(base_type):
     def scalar(self):
         return self
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        out.append(self.to_ir(builder))
     def to_ir(self, builder: ir.builder) -> ir.type:
         if self.name.startswith("fp8"):
             if self.name not in builder.options.supported_fp8_dtypes:
                 raise ValueError(f'type {self} not supported in this architecture. '
                                  f'The supported fp8 dtypes are {builder.options.supported_fp8_dtypes}')
-            if self.name in builder.options.deprecated_fp8_dtypes:
-                warn(f"{self.name} is deprecated in this architecture and will be removed in a future triton release")
         if self.name == 'void':
             return builder.get_void_ty()
@@ -581,6 +648,21 @@ class dtype(base_type):
     def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[base_value, int]:
         return tensor(handles[cursor], self), cursor + 1
+    def mangle(self) -> str:
+        if self.is_int():
+            SIGNED = dtype.SIGNEDNESS.SIGNED
+            prefix = 'i' if self.int_signedness == SIGNED else 'u'
+            return prefix + str(self.int_bitwidth)
+        if self.is_floating():
+            return str(self)
+        if self.is_void():
+            return 'V'
+        return super().mangle()
+    def with_element_ty(self, element_ty: dtype):
+        assert not self.is_block()
+        return element_ty
 # Some functions have a param named `dtype`, which shadows the `dtype` class.
 # We can't change the param name because it is part of function's public API.
@@ -623,12 +705,8 @@ class pointer_type(dtype):
     def scalar(self):
         return self
-class nv_tma_desc_type(pointer_type):
-    def __init__(self, const=True, address_space=0):
-        super().__init__(uint8, const=const, address_space=address_space)
-        self.name = 'nv_tma_desc_type'
+    def mangle(self) -> str:
+        return f"P{self.element_ty.mangle()}"
 class block_type(dtype):
@@ -660,9 +738,12 @@ class block_type(dtype):
     def is_block(self):
         return True
-    def get_block_shapes(self) -> List[int]:
+    def get_block_shapes(self) -> Tuple[int]:
         return self.shape
+    def with_element_ty(self, scalar_ty: dtype) -> block_type:
+        return block_type(scalar_ty, self.shape)
     def __eq__(self, other) -> bool:
         if not isinstance(other, block_type):
             return False
@@ -672,6 +753,11 @@ class block_type(dtype):
     def scalar(self):
         return self.element_ty
+    def mangle(self) -> str:
+        elt = self.scalar.mangle()
+        shape = '_'.join(map(str, self.shape))
+        return f'{elt}S{shape}S'
 class tuple_type(base_type):
@@ -686,15 +772,14 @@ class tuple_type(base_type):
     def __iter__(self):
         return iter(self.types)
-    def to_ir(self, builder: ir.builder):
-        return [ty.to_ir(builder) for ty in self.types]
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]):
+        for ty in self.types:
+            if not isinstance(ty, constexpr):
+                ty._flatten_ir_types(builder, out)
     def __getitem__(self, index: int) -> dtype:
         return self.types[index]
-    def is_tuple(self):
-        return True
     def __eq__(self, other):
         return type(self) is type(other) and self.types == other.types and self.fields == other.fields
@@ -705,6 +790,9 @@ class tuple_type(base_type):
             values.append(value)
         return tuple(values, self), cursor
+    def mangle(self) -> str:
+        """Return the mangled name: element manglings joined by '_' and wrapped in 'T' ... 'T'."""
+        # mangle is a method and must be called; str.join() raises TypeError on bound methods.
+        return 'T' + '_'.join(ty.mangle() for ty in self.types) + 'T'
 class slice_type(dtype):
@@ -808,224 +896,224 @@ class tensor(base_value):
         return str(self.dtype) + '[' + ', '.join(str(s) for s in self.shape) + ']'
     @builtin
-    def __add__(self, other, _builder=None):
-        return add(self, other, sanitize_overflow=True, _builder=_builder)
+    def __add__(self, other, _semantic=None):
+        return add(self, other, sanitize_overflow=True, _semantic=_semantic)
     @builtin
-    def __radd__(self, other, _builder=None):
-        return add(other, self, sanitize_overflow=True, _builder=_builder)
+    def __radd__(self, other, _semantic=None):
+        return add(other, self, sanitize_overflow=True, _semantic=_semantic)
     @builtin
-    def __sub__(self, other, _builder=None):
-        return sub(self, other, sanitize_overflow=True, _builder=_builder)
+    def __sub__(self, other, _semantic=None):
+        return sub(self, other, sanitize_overflow=True, _semantic=_semantic)
     @builtin
-    def __rsub__(self, other, _builder=None):
-        return sub(other, self, sanitize_overflow=True, _builder=_builder)
+    def __rsub__(self, other, _semantic=None):
+        return sub(other, self, sanitize_overflow=True, _semantic=_semantic)
     @builtin
-    def __mul__(self, other, _builder=None):
-        return mul(self, other, sanitize_overflow=True, _builder=_builder)
+    def __mul__(self, other, _semantic=None):
+        return mul(self, other, sanitize_overflow=True, _semantic=_semantic)
     @builtin
-    def __rmul__(self, other, _builder=None):
-        return mul(other, self, sanitize_overflow=True, _builder=_builder)
+    def __rmul__(self, other, _semantic=None):
+        return mul(other, self, sanitize_overflow=True, _semantic=_semantic)
     @builtin
-    def __truediv__(self, other, _builder=None):
+    def __truediv__(self, other, _semantic=None):
         other = _unwrap_if_constexpr(other)
-        return semantic.truediv(self, other, _builder)
+        return _semantic.truediv(self, other)
     @builtin
-    def __rtruediv__(self, other, _builder=None):
+    def __rtruediv__(self, other, _semantic=None):
         other = _unwrap_if_constexpr(other)
-        return semantic.truediv(other, self, _builder)
+        return _semantic.truediv(other, self)
     @builtin
-    def __floordiv__(self, other, _builder=None):
+    def __floordiv__(self, other, _semantic=None):
         other = _unwrap_if_constexpr(other)
-        return semantic.floordiv(self, other, _builder)
+        return _semantic.floordiv(self, other)
     @builtin
-    def __rfloordiv__(self, other, _builder=None):
+    def __rfloordiv__(self, other, _semantic=None):
         other = _unwrap_if_constexpr(other)
-        return semantic.floordiv(other, self, _builder)
+        return _semantic.floordiv(other, self)
     @builtin
-    def __mod__(self, other, _builder=None):
+    def __mod__(self, other, _semantic=None):
         other = _unwrap_if_constexpr(other)
-        return semantic.mod(self, other, _builder)
+        return _semantic.mod(self, other)
     @builtin
-    def __rmod__(self, other, _builder=None):
+    def __rmod__(self, other, _semantic=None):
         other = _unwrap_if_constexpr(other)
-        return semantic.mod(other, self, _builder)
+        return _semantic.mod(other, self)
     # unary operators
     @builtin
-    def __neg__(self, _builder=None):
-        return semantic.minus(self, _builder)
+    def __neg__(self, _semantic=None):
+        return _semantic.minus(self)
     @builtin
-    def __invert__(self, _builder=None):
-        return semantic.invert(self, _builder)
+    def __invert__(self, _semantic=None):
+        return _semantic.invert(self)
     # bitwise operators
     @builtin
-    def __and__(self, other, _builder=None):
+    def __and__(self, other, _semantic=None):
         other = _unwrap_if_constexpr(other)
-        return semantic.and_(self, other, _builder)
+        return _semantic.and_(self, other)
     @builtin
-    def __rand__(self, other, _builder=None):
+    def __rand__(self, other, _semantic=None):
         other = _unwrap_if_constexpr(other)
-        return semantic.and_(other, self, _builder)
+        return _semantic.and_(other, self)
     @builtin
-    def __or__(self, other, _builder=None):
+    def __or__(self, other, _semantic=None):
         other = _unwrap_if_constexpr(other)
-        return semantic.or_(self, other, _builder)
+        return _semantic.or_(self, other)
     @builtin
-    def __ror__(self, other, _builder=None):
+    def __ror__(self, other, _semantic=None):
         other = _unwrap_if_constexpr(other)
-        return semantic.or_(other, self, _builder)
+        return _semantic.or_(other, self)
     @builtin
-    def __xor__(self, other, _builder=None):
+    def __xor__(self, other, _semantic=None):
         other = _unwrap_if_constexpr(other)
-        return semantic.xor_(self, other, _builder)
+        return _semantic.xor_(self, other)
     @builtin
-    def __rxor__(self, other, _builder=None):
+    def __rxor__(self, other, _semantic=None):
         other = _unwrap_if_constexpr(other)
-        return semantic.xor_(other, self, _builder)
+        return _semantic.xor_(other, self)
     @builtin
-    def __lshift__(self, other, _builder=None):
+    def __lshift__(self, other, _semantic=None):
         check_bit_width(self, other)
         other = _unwrap_if_constexpr(other)
-        return semantic.shl(self, other, _builder)
+        return _semantic.shl(self, other)
     @builtin
-    def __rlshift__(self, other, _builder=None):
+    def __rlshift__(self, other, _semantic=None):
         check_bit_width(other, self)
         other = _unwrap_if_constexpr(other)
-        return semantic.shl(other, self, _builder)
+        return _semantic.shl(other, self)
     @builtin
-    def __rshift__(self, other, _builder=None):
+    def __rshift__(self, other, _semantic=None):
         check_bit_width(self, other)
         other = _unwrap_if_constexpr(other)
         if self.dtype.is_int_signed():
-            return semantic.ashr(self, other, _builder)
+            return _semantic.ashr(self, other)
         else:
-            return semantic.lshr(self, other, _builder)
+            return _semantic.lshr(self, other)
     @builtin
-    def __rrshift__(self, other, _builder=None):
+    def __rrshift__(self, other, _semantic=None):
         check_bit_width(other, self)
         other = _unwrap_if_constexpr(other)
         if self.dtype.is_int_signed():
-            return semantic.ashr(other, self, _builder)
+            return _semantic.ashr(other, self)
         else:
-            return semantic.lshr(other, self, _builder)
+            return _semantic.lshr(other, self)
     # >
     @builtin
-    def __gt__(self, other, _builder=None):
-        other = semantic.to_tensor(other, _builder)
-        return semantic.greater_than(self, other, _builder)
+    def __gt__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.greater_than(self, other)
     @builtin
-    def __rgt__(self, other, _builder=None):
-        other = semantic.to_tensor(other, _builder)
-        return semantic.greater_than(other, self, _builder)
+    def __rgt__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.greater_than(other, self)
     # >=
     @builtin
-    def __ge__(self, other, _builder=None):
-        other = semantic.to_tensor(other, _builder)
-        return semantic.greater_equal(self, other, _builder)
+    def __ge__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.greater_equal(self, other)
     @builtin
-    def __rge__(self, other, _builder=None):
-        other = semantic.to_tensor(other, _builder)
-        return semantic.greater_equal(other, self, _builder)
+    def __rge__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.greater_equal(other, self)
     # <
     @builtin
-    def __lt__(self, other, _builder=None):
-        other = semantic.to_tensor(other, _builder)
-        return semantic.less_than(self, other, _builder)
+    def __lt__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.less_than(self, other)
     @builtin
-    def __rlt__(self, other, _builder=None):
-        other = semantic.to_tensor(other, _builder)
-        return semantic.less_than(other, self, _builder)
+    def __rlt__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.less_than(other, self)
     # <=
     @builtin
-    def __le__(self, other, _builder=None):
-        other = semantic.to_tensor(other, _builder)
-        return semantic.less_equal(self, other, _builder)
+    def __le__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.less_equal(self, other)
     @builtin
-    def __rle__(self, other, _builder=None):
-        other = semantic.to_tensor(other, _builder)
-        return semantic.less_equal(other, self, _builder)
+    def __rle__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.less_equal(other, self)
     # ==
     @builtin
-    def __eq__(self, other, _builder=None):
-        other = semantic.to_tensor(other, _builder)
-        return semantic.equal(self, other, _builder)
+    def __eq__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.equal(self, other)
     @builtin
-    def __req__(self, other, _builder=None):
-        other = semantic.to_tensor(other, _builder)
-        return semantic.equal(other, self, _builder)
+    def __req__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.equal(other, self)
     @builtin
-    def __ne__(self, other, _builder=None):
-        other = semantic.to_tensor(other, _builder)
-        return semantic.not_equal(self, other, _builder)
+    def __ne__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.not_equal(self, other)
     @builtin
-    def __rne__(self, other, _builder=None):
-        other = semantic.to_tensor(other, _builder)
-        return semantic.not_equal(other, self, _builder)
+    def __rne__(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.not_equal(other, self)
     @builtin
-    def logical_and(self, other, _builder=None):
-        other = semantic.to_tensor(other, _builder)
-        return semantic.logical_and(self, other, _builder)
+    def logical_and(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.logical_and(self, other)
     @builtin
-    def logical_or(self, other, _builder=None):
-        other = semantic.to_tensor(other, _builder)
-        return semantic.logical_or(self, other, _builder)
+    def logical_or(self, other, _semantic=None):
+        other = _semantic.to_tensor(other)
+        return _semantic.logical_or(self, other)
     # note: __not__ isn't actually a magic method in python
     # but it's ok because our ASTVisitor handles it
     @builtin
-    def __not__(self, _builder=None):
-        return semantic.not_(self, _builder)
+    def __not__(self, _semantic=None):
+        return _semantic.not_(self)
     @builtin
-    def __getitem__(self, slices, _builder=None):
-        import builtins
+    def __getitem__(self, slices, _semantic=None):
         if isinstance(slices, (builtins.slice, slice, constexpr)) or slices is None:
             slices = [slices]
         if isinstance(slices, tuple):
             slices = slices.values
         ret = self
         for dim, sl in enumerate(slices):
-            if sl is None or isinstance(sl, constexpr) and sl.value is None:
-                ret = semantic.expand_dims(ret, dim, _builder)
-            elif isinstance(sl, (builtins.slice, slice)) and sl.start is None and sl.stop is None and sl.step is None:
-                pass
+            if _unwrap_if_constexpr(sl) is None:
+                ret = _semantic.expand_dims(ret, dim)
+            elif isinstance(sl, (builtins.slice, slice)) and all(
+                    _unwrap_if_constexpr(arg) is None for arg in (sl.start, sl.stop, sl.step)):
+                pass  # an unsqueeze
             else:
                 raise ValueError(f"unsupported tensor index: {sl}")
         return ret
@@ -1036,11 +1124,11 @@ class tensor(base_value):
         assert False, "Transposition must be created by the AST Visitor"
     @builtin
-    def to(self, dtype: dtype, fp_downcast_rounding: Optional[str] = None, bitcast: bool = False, _builder=None):
+    def to(self, dtype: dtype, fp_downcast_rounding: Optional[str] = None, bitcast: bool = False, _semantic=None):
         """
         Alias for :py:func:`tensor.cast`.
         """
-        return cast(self, dtype, fp_downcast_rounding, bitcast, _builder=_builder)
+        return cast(self, dtype, fp_downcast_rounding, bitcast, _semantic=_semantic)
     # Type stubs for functions added by the _tensor_member_fn decorator.
     # (Unfortunately these can't be created automatically.)
@@ -1140,7 +1228,7 @@ class tensor(base_value):
     def sigmoid(self) -> tensor:
         ...
-    def softmax(self, ieee_rounding=False) -> tensor:
+    def softmax(self, dim=None, keep_dims=False, ieee_rounding=False) -> tensor:
         ...
     def ravel(self) -> tensor:
@@ -1164,6 +1252,9 @@ class tensor(base_value):
     def xor_sum(self, axis=None, keep_dims=False) -> tensor:
         ...
+    def reduce_or(self, axis=None, keep_dims=False) -> tensor:
+        ...
     def cumsum(self, axis=0, reverse=False) -> tensor:
         ...
@@ -1179,13 +1270,13 @@ class tensor(base_value):
 class tuple(base_value):
-    def __init__(self, args: list, type: tuple_type = None):
+    def __init__(self, args: Sequence, type: tuple_type = None):
         self.values = [i for i in args]
         def get_type(x):
             if isinstance(x, dtype):
                 return dtype
-            if isinstance(x, int):
+            if isinstance(x, (int, float)):
                 return constexpr
             return x.type
@@ -1197,7 +1288,6 @@ class tuple(base_value):
         if isinstance(idx, constexpr):
             return self.values[idx]
         else:
-            import builtins
             assert isinstance(idx, (slice, builtins.slice))
             return tuple(self.values[idx.start:idx.stop:idx.step])
@@ -1212,8 +1302,7 @@ class tuple(base_value):
         self.values[idx] = value
     def __add__(self, other):
-        if isinstance(other, list):
-            other = tuple(other)
+        other = _normalize_tuple(other)
         return tuple(self.values + other.values)
         # return tuple(a + b for a, b in zip(self.values, other.values))
@@ -1222,13 +1311,10 @@ class tuple(base_value):
         return tuple(self.values * other.value)
     def __eq__(self, other):
-        import builtins
-        if isinstance(other, (list, builtins.tuple)):
-            other = tuple(other)
+        other = _normalize_tuple(other)
         return constexpr(self.values == other.values)
     def __hash__(self):
-        import builtins
         return hash(builtins.tuple(self.values))
     def __str__(self):
@@ -1244,6 +1330,9 @@ class tuple(base_value):
         for v in self.values:
             v._flatten_ir(handles)
+    def __repr__(self):
+        """Return the ``repr`` of every element in ``self.values``, comma-separated, inside parentheses."""
+        # Separator is ", " (comma then space), matching conventional sequence reprs.
+        return f"({', '.join(repr(x) for x in self.values)})"
 class slice:
@@ -1259,12 +1348,13 @@ class tensor_descriptor_base_type(base_type):
     def __init__(self, block_type: block_type):
         self.block_type = block_type
-    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[_experimental_tensor_descriptor_base, int]:
-        value = _experimental_tensor_descriptor_base(handles[cursor], self.block_type)
+    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[tensor_descriptor_base, int]:
+        value = tensor_descriptor_base(handles[cursor], self.block_type)
         return value, cursor + 1
-    def to_ir(self, builder: ir.builder):
-        return builder.create_tensor_descriptor_type(self.block_type.to_ir(builder))
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        is_signed = self.block_type.element_ty.is_int_signed()
+        out.append(builder.create_tensor_descriptor_type(self.block_type.to_ir(builder), is_signed))
     def __str__(self) -> str:
         # ex. "tensor_descriptor<float32[16, 32]>"
@@ -1278,8 +1368,11 @@ class tensor_descriptor_base_type(base_type):
     def __neq__(self, other) -> bool:
         return not (self == other)
+    def mangle(self) -> str:
+        """Return the mangled name: ``TD`` followed by the mangling of ``self.block_type``."""
+        block_tag = self.block_type.mangle()
+        return f"TD{block_tag}"
-class _experimental_tensor_descriptor_base(base_value):
+class tensor_descriptor_base(base_value):
     """"
     A tensor descriptor with unknown shape and strides
     """
@@ -1310,40 +1403,64 @@ class _experimental_tensor_descriptor_base(base_value):
         return str(self.type)
     @builtin
-    def load(self, offsets: Sequence[constexpr | tensor], _builder=None) -> tensor:
+    def load(self, offsets: Sequence[constexpr | tensor], _semantic=None) -> tensor:
         """Load a block from the descriptor starting at the given element offsets.
         Values outside of the tensor bounds will be filled with zeros.
         :note: Offset must be a multiple of 16-bytes
         """
-        return semantic.descriptor_load(self, offsets, "", "", _builder)
+        return _semantic.descriptor_load(self, offsets, "", "")
     @builtin
-    def store(self, offsets: Sequence[constexpr | tensor], value: tensor, _builder=None) -> tensor:
+    def store(self, offsets: Sequence[constexpr | tensor], value: tensor, _semantic=None) -> tensor:
         """Store a block from the descriptor starting at the given element offsets.
         Values outside of the tensor bounds will be ignored.
         :note: Offset must be a multiple of 16-bytes
         """
-        return semantic.descriptor_store(self, value, offsets, _builder)
+        return _semantic.descriptor_store(self, value, offsets)
+    @builtin
+    def atomic_add(self, offsets: Sequence[constexpr | tensor], value: tensor, _semantic=None) -> tensor:
+        """Forward to ``_semantic.descriptor_atomic_add`` with this descriptor, ``value`` and ``offsets``."""
+        # The semantic call takes ``value`` before ``offsets`` (reverse of this signature).
+        # NOTE(review): presumably an atomic add of ``value`` into the block at ``offsets`` - confirm in the semantic layer.
+        return _semantic.descriptor_atomic_add(self, value, offsets)
+    @builtin
+    def atomic_min(self, offsets: Sequence[constexpr | tensor], value: tensor, _semantic=None) -> tensor:
+        """Forward to ``_semantic.descriptor_atomic_min`` with this descriptor, ``value`` and ``offsets``."""
+        # The semantic call takes ``value`` before ``offsets`` (reverse of this signature).
+        # NOTE(review): presumably an atomic minimum against the block at ``offsets`` - confirm in the semantic layer.
+        return _semantic.descriptor_atomic_min(self, value, offsets)
     @builtin
-    def gather(self, *args, _builder=None) -> tensor:
+    def atomic_max(self, offsets: Sequence[constexpr | tensor], value: tensor, _semantic=None) -> tensor:
+        """Forward to ``_semantic.descriptor_atomic_max`` with this descriptor, ``value`` and ``offsets``."""
+        # The semantic call takes ``value`` before ``offsets`` (reverse of this signature).
+        # NOTE(review): presumably an atomic maximum against the block at ``offsets`` - confirm in the semantic layer.
+        return _semantic.descriptor_atomic_max(self, value, offsets)
+    @builtin
+    def atomic_and(self, offsets: Sequence[constexpr | tensor], value: tensor, _semantic=None) -> tensor:
+        """Forward to ``_semantic.descriptor_atomic_and`` with this descriptor, ``value`` and ``offsets``."""
+        # The semantic call takes ``value`` before ``offsets`` (reverse of this signature).
+        # NOTE(review): presumably an atomic bitwise AND into the block at ``offsets`` - confirm in the semantic layer.
+        return _semantic.descriptor_atomic_and(self, value, offsets)
+    @builtin
+    def atomic_or(self, offsets: Sequence[constexpr | tensor], value: tensor, _semantic=None) -> tensor:
+        """Forward to ``_semantic.descriptor_atomic_or`` with this descriptor, ``value`` and ``offsets``."""
+        # The semantic call takes ``value`` before ``offsets`` (reverse of this signature).
+        # NOTE(review): presumably an atomic bitwise OR into the block at ``offsets`` - confirm in the semantic layer.
+        return _semantic.descriptor_atomic_or(self, value, offsets)
+    @builtin
+    def atomic_xor(self, offsets: Sequence[constexpr | tensor], value: tensor, _semantic=None) -> tensor:
+        """Forward to ``_semantic.descriptor_atomic_xor`` with this descriptor, ``value`` and ``offsets``."""
+        # The semantic call takes ``value`` before ``offsets`` (reverse of this signature).
+        # NOTE(review): presumably an atomic bitwise XOR into the block at ``offsets`` - confirm in the semantic layer.
+        return _semantic.descriptor_atomic_xor(self, value, offsets)
+    @builtin
+    def gather(self, *args, _semantic=None) -> tensor:
         """Gather multiple descriptors worth of data"""
         assert len(args) == 2, f"descriptor gather only supports 2D indexing, but got {len(args)}"
         x_offsets = args[0]
         y_offset = args[1]
-        return semantic.descriptor_gather(self, x_offsets, y_offset, "", "", _builder)
+        return _semantic.descriptor_gather(self, x_offsets, y_offset, "", "")
     @builtin
-    def scatter(self, value, *args, _builder=None) -> tensor:
+    def scatter(self, value, *args, _semantic=None) -> tensor:
         """Scatter multiple descriptors worth of data"""
         assert len(args) == 2, f"descriptor scatter only supports 2D indexing, but got {len(args)}"
         x_offsets = args[0]
         y_offset = args[1]
-        return semantic.descriptor_scatter(self, value, x_offsets, y_offset, _builder)
+        return _semantic.descriptor_scatter(self, value, x_offsets, y_offset)
 class tensor_descriptor_type(tensor_descriptor_base_type):
@@ -1353,25 +1470,27 @@ class tensor_descriptor_type(tensor_descriptor_base_type):
         self.shape_type = shape_type
         self.strides_type = strides_type
-    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[_experimental_tensor_descriptor_base, int]:
+    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[tensor_descriptor_base, int]:
         handle = handles[cursor]
         cursor += 1
         shape, cursor = self.shape_type._unflatten_ir(handles, cursor)
         strides, cursor = self.strides_type._unflatten_ir(handles, cursor)
         shape = shape.values
         strides = strides.values
-        value = _experimental_tensor_descriptor(handle, shape, strides, self.block_type)
+        value = tensor_descriptor(handle, shape, strides, self.block_type)
         return value, cursor
-    def to_ir(self, builder: ir.builder):
-        return [super().to_ir(builder), *self.shape_type.to_ir(builder), *self.strides_type.to_ir(builder)]
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        super()._flatten_ir_types(builder, out)
+        self.shape_type._flatten_ir_types(builder, out)
+        self.strides_type._flatten_ir_types(builder, out)
     def __eq__(self, other):
         return super().__eq__(other) and (self.shape_type == other.shape_type) and (self.strides_type
                                                                                     == other.strides_type)
-class _experimental_tensor_descriptor(_experimental_tensor_descriptor_base):
+class tensor_descriptor(tensor_descriptor_base):
     """A descriptor representing a tensor in global memory.
     """
@@ -1379,37 +1498,121 @@ class _experimental_tensor_descriptor(_experimental_tensor_descriptor_base):
         """Not called by user code."""
         # IR handle
         super().__init__(handle, block_type)
+        # Global shape
+        self.shape = tuple(shape)
+        self.strides = tuple(strides)
         self.type = tensor_descriptor_type(
             block_type,
-            shape_type=tuple_type([s.type for s in shape]),
-            strides_type=tuple_type([s.type for s in strides]),
+            shape_type=self.shape.type,
+            strides_type=self.strides.type,
         )
-        # Global shape
-        self.shape = shape
-        self.strides = strides
     def _flatten_ir(self, handles: List[ir.value]) -> None:
         handles.append(self.handle)
-        handles.extend(s.handle for s in self.shape)
-        handles.extend(s.handle for s in self.strides)
+        self.shape._flatten_ir(handles)
+        self.strides._flatten_ir(handles)
+# -----------------------
+# aggregate
+# -----------------------
+@dataclass(frozen=True)
+class _aggregate_type(base_type):
+    """A generic base type for all Triton aggregate types.
-def get_bool_env_var(var_name):
-    v = os.getenv(var_name, "0")
-    return v == "1" or v == "true" or v == "on"
+    This class contains a reference to the original user-defined Python class
+    and a list of class fields with their Triton types.
+    """
+    base_cls: type
+    fields: List[Tuple[str, base_type]]
+    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[ir.value, int]:
+        """Rebuild an aggregate from ``handles`` starting at ``cursor``; return it with the advanced cursor."""
+        rebuilt = self.base_cls._get_instance()
+        position = cursor
+        # Each field type consumes its own handles and reports where the next field starts.
+        for field_name, field_ty in self.fields:
+            field_value, position = field_ty._unflatten_ir(handles, position)
+            setattr(rebuilt, field_name, field_value)
+        return rebuilt, position
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        """Let every field type, in ``self.fields`` order, append its flattened IR types to ``out``."""
+        for _, field_ty in self.fields:
+            field_ty._flatten_ir_types(builder, out)
+    def mangle(self) -> str:
+        """Return ``<module>.<qualname><f0, f1, ...>`` where each ``f`` is a field type's mangling."""
+        owner = self.base_cls
+        prefix = f"{owner.__module__}.{owner.__qualname__}"
+        field_tags = ', '.join(field_ty.mangle() for _, field_ty in self.fields)
+        return f"{prefix}<{field_tags}>"
+def _aggregate(cls):
+    """Wrap the user-defined class ``cls`` in a generated ``base_value`` subclass and return that subclass.
+    The generated class runs ``cls.__init__`` on construction, restricts attribute writes to the names in
+    ``cls.__annotations__``, and receives every function/method/``JITFunction`` member of ``cls`` except
+    ``__init__``, together with ``cls``'s ``__name__``, ``__module__``, ``__qualname__`` and ``__doc__``.
+    """
+    # Define the wrapped Triton value type.
+    class aggregate_value(base_value):
+        __triton_builtin__ = True
+        __triton_aggregate__ = True
+        # Allocate a bare instance without running the user-defined constructor
+        # (``__new__`` below and ``_aggregate_type._unflatten_ir`` both start from this).
+        @classmethod
+        def _get_instance(this_cls):
+            return super().__new__(this_cls)
+        # Raises ValueError when ``cls.__init__`` is a JITFunction and AttributeError when the
+        # constructor leaves an annotated attribute unset.
+        def __new__(this_cls, *args, _semantic=None, _generator=None, **kwargs):
+            # Call into the user-defined constructor.
+            instance = this_cls._get_instance()
+            if isinstance(cls.__init__, JITFunction):
+                raise ValueError(f"{cls.__name__}.__init__ cannot be a @triton.jit function")
+            # Forward ``_semantic`` / ``_generator`` only when the constructor's signature names them.
+            extra_kwargs = {}
+            if "_semantic" in inspect.signature(cls.__init__).parameters:
+                extra_kwargs["_semantic"] = _semantic
+            if "_generator" in inspect.signature(cls.__init__).parameters:
+                extra_kwargs["_generator"] = _generator
+            cls.__init__(instance, *args, **extra_kwargs, **kwargs)
+            # Require that the user-defined constructor initialized all fields.
+            for name in cls.__annotations__.keys():
+                if not hasattr(instance, name):
+                    raise AttributeError(f"constructor for {cls.__name__} did not initialize attribute '{name}'")
+            return instance
+        # Only allow setting attributes defined in the class annotations.
+        # The value must also pass ``isinstance`` against the annotation, otherwise TypeError is raised.
+        def __setattr__(self, name, value):
+            if name not in cls.__annotations__:
+                raise AttributeError(f"{cls.__name__} has no attribute '{name}'")
+            if not isinstance(value, cls.__annotations__[name]):
+                raise TypeError(f"Expected {cls.__annotations__[name]} for attribute '{name}', got {type(value)}")
+            super().__setattr__(name, value)
+        # Flatten every annotated attribute, in annotation order, into ``handles``.
+        def _flatten_ir(self, handles: List[ir.value]) -> None:
+            for name in cls.__annotations__.keys():
+                getattr(self, name)._flatten_ir(handles)
+        # Build an ``_aggregate_type`` from the current ``.type`` of each annotated attribute.
+        @property
+        def type(self):
+            return _aggregate_type(aggregate_value,
+                                   [(name, getattr(self, name).type) for name in cls.__annotations__.keys()])
+    # Copy the user's functions/methods/JITFunctions onto the wrapper; ``__init__`` is excluded
+    # because ``__new__`` above invokes it explicitly.
+    for (name, member) in inspect.getmembers(cls):
+        if inspect.isfunction(member) or inspect.ismethod(member) or isinstance(member, JITFunction):
+            if name != "__init__":
+                setattr(aggregate_value, name, member)
+    # Present the wrapper under the user class's identity.
+    aggregate_value.__name__ = cls.__name__
+    aggregate_value.__module__ = cls.__module__
+    aggregate_value.__qualname__ = cls.__qualname__
+    aggregate_value.__doc__ = cls.__doc__
+    return aggregate_value
 # -----------------------
 # SPMD Programming Model
 # -----------------------
-def _constexpr_to_value(v):
-    if isinstance(v, constexpr):
-        return v.value
-    return v
 @builtin
-def program_id(axis, _builder=None):
+def program_id(axis, _semantic=None):
     """
     Returns the id of the current program instance along the given :code:`axis`.
@@ -1417,26 +1620,26 @@ def program_id(axis, _builder=None):
     :type axis: int
     """
     # if axis == -1:
-    #     pid0 = program_id(0, _builder)
-    #     pid1 = program_id(1, _builder)
-    #     pid2 = program_id(2, _builder)
-    #     npg0 = num_programs(0, _builder)
-    #     npg1 = num_programs(1, _builder)
+    #     pid0 = _semantic.program_id(0)
+    #     pid1 = _semantic.program_id(1)
+    #     pid2 = _semantic.program_id(2)
+    #     npg0 = _semantic.num_programs(0)
+    #     npg1 = _semantic.num_programs(1)
     #     return pid0 + pid1*npg0 + pid2*npg0*npg1
-    axis = _constexpr_to_value(axis)
-    return semantic.program_id(axis, _builder)
+    axis = _unwrap_if_constexpr(axis)
+    return _semantic.program_id(axis)
 @builtin
-def num_programs(axis, _builder=None):
+def num_programs(axis, _semantic=None):
     """
     Returns the number of program instances launched along the given :code:`axis`.
     :param axis: The axis of the 3D launch grid. Must be 0, 1 or 2.
     :type axis: int
     """
-    axis = _constexpr_to_value(axis)
-    return semantic.num_programs(axis, _builder)
+    axis = _unwrap_if_constexpr(axis)
+    return _semantic.num_programs(axis)
 # -----------------------
@@ -1445,10 +1648,10 @@ def num_programs(axis, _builder=None):
 @builtin
-def arange(start, end, _builder=None):
-    start = _constexpr_to_value(start)
-    end = _constexpr_to_value(end)
-    return semantic.arange(start, end, _builder)
+def arange(start, end, _semantic=None):
+    start = _unwrap_if_constexpr(start)
+    end = _unwrap_if_constexpr(end)
+    return _semantic.arange(start, end)
 arange.__doc__ = f"""
@@ -1465,8 +1668,8 @@ arange.__doc__ = f"""
 def _unwrap_shape(shape):
-    shape = _constexpr_to_value(shape)
-    return [_constexpr_to_value(s) for s in shape]
+    shape = _unwrap_if_constexpr(shape)
+    return [_unwrap_if_constexpr(s) for s in shape]
 def _shape_check_impl(shape):
@@ -1476,7 +1679,7 @@ def _shape_check_impl(shape):
 @builtin
-def full(shape, value, dtype, _builder=None):
+def full(shape, value, dtype, _semantic=None):
     """
     Returns a tensor filled with the scalar value for the given :code:`shape` and :code:`dtype`.
@@ -1488,9 +1691,9 @@ def full(shape, value, dtype, _builder=None):
     :type dtype: tl.dtype
     """
     shape = _shape_check_impl(shape)
-    value = _constexpr_to_value(value)
-    dtype = _constexpr_to_value(dtype)
-    return semantic.full(shape, value, dtype, _builder)
+    value = _unwrap_if_constexpr(value)
+    dtype = _unwrap_if_constexpr(dtype)
+    return _semantic.full(shape, value, dtype)
 # -----------------------
@@ -1499,7 +1702,7 @@ def full(shape, value, dtype, _builder=None):
 @builtin
-def broadcast(input, other, _builder=None):
+def broadcast(input, other, _semantic=None):
     """
     Tries to broadcast the two given blocks to a common compatible shape.
@@ -1508,12 +1711,12 @@ def broadcast(input, other, _builder=None):
     :param other: The second input tensor.
     :type other: Block
     """
-    return semantic.broadcast_impl_value(input, other, _builder)
+    return _semantic.broadcast_impl_value(input, other)
 @_tensor_member_fn
 @builtin
-def broadcast_to(input, *shape, _builder=None):
+def broadcast_to(input, *shape, _semantic=None):
     """
     Tries to broadcast the given tensor to a new :code:`shape`.
@@ -1529,12 +1732,12 @@ def broadcast_to(input, *shape, _builder=None):
         broadcast_to(x, 32, 32)
     """
     shape = _shape_check_impl(_unwrap_iterable(shape))
-    return semantic.broadcast_impl_shape(input, shape, _builder)
+    return _semantic.broadcast_impl_shape(input, shape)
 @_tensor_member_fn
 @builtin
-def trans(input: tensor, *dims, _builder=None):
+def trans(input: tensor, *dims, _semantic=None):
     """
     Permutes the dimensions of a tensor.
@@ -1543,7 +1746,7 @@ def trans(input: tensor, *dims, _builder=None):
     :param input: The input tensor.
     :param dims: The desired ordering of dimensions.  For example,
-        :code:`(2, 1, 0)` reverses the order dims in a a 3D tensor.
+        :code:`(2, 1, 0)` reverses the order dims in a 3D tensor.
     :code:`dims` can be passed as a tuple or as individual parameters: ::
@@ -1557,19 +1760,19 @@ def trans(input: tensor, *dims, _builder=None):
     dims = _unwrap_iterable(dims)
     if not dims:
         dims = (1, 0)
-    return semantic.permute(input, dims, _builder)
+    return _semantic.permute(input, dims)
 @_tensor_member_fn
 @builtin
-def permute(input, *dims, _builder=None):
+def permute(input, *dims, _semantic=None):
     """
     Permutes the dimensions of a tensor.
     :param input: The input tensor.
     :type input: Block
     :param dims: The desired ordering of dimensions.  For example,
-        :code:`(2, 1, 0)` reverses the order dims in a a 3D tensor.
+        :code:`(2, 1, 0)` reverses the order dims in a 3D tensor.
     :code:`dims` can be passed as a tuple or as individual parameters: ::
@@ -1581,11 +1784,11 @@ def permute(input, *dims, _builder=None):
     :code:`dims` is empty, it tries to do a (1,0) permutation.
     """
     dims = _unwrap_iterable(dims)
-    return semantic.permute(input, dims, _builder)
+    return _semantic.permute(input, dims)
 @builtin
-def cat(input, other, can_reorder=False, _builder=None):
+def cat(input, other, can_reorder=False, _semantic=None):
     """
     Concatenate the given blocks
@@ -1598,11 +1801,11 @@ def cat(input, other, can_reorder=False, _builder=None):
         order does not matter (e.g., result is only used in reduction ops).
         Current implementation of `cat` supports only can_reorder=True.
     """
-    return semantic.cat(input, other, can_reorder, _builder)
+    return _semantic.cat(input, other, can_reorder)
 @builtin
-def join(a, b, _builder=None):
+def join(a, b, _semantic=None):
     """
     Join the given tensors in a new, minor dimension.
@@ -1622,7 +1825,7 @@ def join(a, b, _builder=None):
     :param b: The second input tensor.
     :type b: Tensor
     """
-    return semantic.join(a, b, _builder)
+    return _semantic.join(a, b)
 @jit
@@ -1630,9 +1833,25 @@ def _take_first(a, b):
     return a
+def _unsplat(x, _semantic=None, _generator=None):
+    """
+    Convert a single-element tensor to a scalar.
+    """
+    if len(x.shape) == 0:
+        return x
+    numel = 1
+    for d in x.shape:
+        numel *= d
+    assert numel == 1, "can only unsplat single-element tensors"
+    if len(x.shape) >= 2:
+        x = _semantic.reshape(x, [1])
+    x = typing.cast(tensor, reduce(x, 0, _take_first, _semantic=_semantic, _generator=_generator))
+    return x
 @_tensor_member_fn
 @builtin
-def split(a, _builder=None, _generator=None) -> tuple[tensor, tensor]:
+def split(a, _semantic=None, _generator=None) -> tuple[tensor, tensor]:
     """
     Split a tensor in two along its last dim, which must have size 2.
@@ -1649,25 +1868,25 @@ def split(a, _builder=None, _generator=None) -> tuple[tensor, tensor]:
     :type a: Tensor
     """
     # If len(a.shape) == 1, i.e. a.shape == [2], we should return two scalars.
-    # But semantic.split can only handle returning tensors.  Work around this by
+    # But _semantic.split can only handle returning tensors.  Work around this by
     # expanding the input to shape [1,2] and then reducing the result.
     was_rank_1 = len(a.shape) == 1
     if was_rank_1:
-        a = semantic.expand_dims(a, 0, _builder)
+        a = _semantic.expand_dims(a, 0)
-    out_lhs, out_rhs = semantic.split(a, _builder)
+    out_lhs, out_rhs = _semantic.split(a)
     if was_rank_1:
         # Currently `reduce` is the best way to convert a tensor of shape [1] to a scalar.
-        out_lhs = typing.cast(tensor, reduce(out_lhs, None, _take_first, _builder=_builder, _generator=_generator))
-        out_rhs = typing.cast(tensor, reduce(out_rhs, None, _take_first, _builder=_builder, _generator=_generator))
+        out_lhs = _unsplat(out_lhs, _semantic=_semantic, _generator=_generator)
+        out_rhs = _unsplat(out_rhs, _semantic=_semantic, _generator=_generator)
     return out_lhs, out_rhs
 @_tensor_member_fn
 @builtin
-def view(input, *shape, _builder=None):
+def view(input, *shape, _semantic=None):
     """
     Returns a tensor with the same elements as `input` but a different shape.
     The order of the elements may not be preserved.
@@ -1684,12 +1903,21 @@ def view(input, *shape, _builder=None):
     """
     warn("view is deprecated, please use reshape with can_reorder being true.")
     shape = _shape_check_impl(_unwrap_iterable(shape))
-    return semantic.reshape(input, shape, can_reorder=True, builder=_builder)
+    return _semantic.reshape(input, shape, can_reorder=True)
 @_tensor_member_fn
 @builtin
-def reshape(input, *shape, can_reorder=False, _builder=None):
+def item(input, _semantic=None, _generator=None):
+    """
+    Converts a single-element tensor into a scalar.
+    """
+    return _unsplat(input, _semantic=_semantic, _generator=_generator)
+@_tensor_member_fn
+@builtin
+def reshape(input, *shape, can_reorder=False, _semantic=None, _generator=None):
     """
     Returns a tensor with the same number of elements as input but with the
     provided shape.
@@ -1705,7 +1933,9 @@ def reshape(input, *shape, can_reorder=False, _builder=None):
         reshape(x, 32, 32)
     """
     shape = _shape_check_impl(_unwrap_iterable(shape))
-    return semantic.reshape(input, shape, can_reorder, _builder)
+    if len(shape) == 0:
+        return _unsplat(input, _semantic=_semantic, _generator=_generator)
+    return _semantic.reshape(input, shape, can_reorder)
 def _wrap_axis(axis, ndim):
@@ -1717,7 +1947,7 @@ def _wrap_axis(axis, ndim):
 @_tensor_member_fn
 @builtin
-def expand_dims(input, axis, _builder=None):
+def expand_dims(input, axis, _semantic=None):
     """
     Expand the shape of a tensor, by inserting new length-1 dimensions.
@@ -1730,24 +1960,24 @@ def expand_dims(input, axis, _builder=None):
     :type axis: int | Sequence[int]
     """
-    input = semantic.to_tensor(input, _builder)
-    axis = _constexpr_to_value(axis)
+    input = _semantic.to_tensor(input)
+    axis = _unwrap_if_constexpr(axis)
     axes = list(axis) if isinstance(axis, (Sequence, tuple)) else [axis]
     new_ndim = len(input.shape) + len(axes)
-    axes = [_wrap_axis(_constexpr_to_value(d), new_ndim) for d in axes]
+    axes = [_wrap_axis(_unwrap_if_constexpr(d), new_ndim) for d in axes]
     if len(set(axes)) != len(axes):
         raise ValueError(f"expand_dims received duplicate axes, normalized axes = {axes}")
     ret = input
     for a in sorted(axes):
-        ret = semantic.expand_dims(ret, a, _builder)
+        ret = _semantic.expand_dims(ret, a)
     return ret
 @_tensor_member_fn
 @builtin
-def cast(input, dtype: dtype, fp_downcast_rounding: Optional[str] = None, bitcast: bool = False, _builder=None):
+def cast(input, dtype: dtype, fp_downcast_rounding: Optional[str] = None, bitcast: bool = False, _semantic=None):
     """
     Casts a tensor to the given :code:`dtype`.
@@ -1763,13 +1993,13 @@ def cast(input, dtype: dtype, fp_downcast_rounding: Optional[str] = None, bitcas
         :code:`dtype`, instead of being numerically casted.
     :type bitcast: bool, optional
     """
-    input = semantic.to_tensor(input, _builder)
-    dtype = _constexpr_to_value(dtype)
-    fp_downcast_rounding = _constexpr_to_value(fp_downcast_rounding)
-    bitcast = _constexpr_to_value(bitcast)
+    input = _semantic.to_tensor(input)
+    dtype = _unwrap_if_constexpr(dtype)
+    fp_downcast_rounding = _unwrap_if_constexpr(fp_downcast_rounding)
+    bitcast = _unwrap_if_constexpr(bitcast)
     if bitcast:
-        return semantic.bitcast(input, dtype, _builder)
-    return semantic.cast(input, dtype, _builder, fp_downcast_rounding)
+        return _semantic.bitcast(input, dtype)
+    return _semantic.cast(input, dtype, fp_downcast_rounding)
 # -----------------------
@@ -1779,7 +2009,7 @@ def cast(input, dtype: dtype, fp_downcast_rounding: Optional[str] = None, bitcas
 @builtin
 def dot(input, other, acc=None, input_precision=None, allow_tf32=None, max_num_imprecise_acc=None, out_dtype=float32,
-        _builder=None):
+        _semantic=None):
     """
     Returns the matrix product of two blocks.
@@ -1804,19 +2034,20 @@ def dot(input, other, acc=None, input_precision=None, allow_tf32=None, max_num_i
     """
     assert input_precision is None or allow_tf32 is None, "Only one of input_precision and allow_tf32 can be specified"
     if input_precision is None:
-        supports_tf32 = _builder and "tf32" in _builder.options.allowed_dot_input_precisions
-        default_precision = "tf32" if (supports_tf32 and (allow_tf32 or allow_tf32 is None)) else "ieee"
-        input_precision = os.getenv("TRITON_F32_DEFAULT", default_precision)
+        supports_tf32 = "tf32" in _semantic.builder.options.allowed_dot_input_precisions
+        input_precision = knobs.language.fp32_default or ("tf32" if (supports_tf32 and
+                                                                     (allow_tf32 or allow_tf32 is None)) else "ieee")
-    input_precision = _constexpr_to_value(input_precision)
-    out_dtype = _constexpr_to_value(out_dtype)
-    max_num_imprecise_acc = _constexpr_to_value(max_num_imprecise_acc)
-    return semantic.dot(input, other, acc, input_precision, max_num_imprecise_acc, out_dtype, _builder)
+    input_precision = _unwrap_if_constexpr(input_precision)
+    out_dtype = _unwrap_if_constexpr(out_dtype)
+    max_num_imprecise_acc = _unwrap_if_constexpr(max_num_imprecise_acc)
+    acc = _unwrap_if_constexpr(acc)
+    return _semantic.dot(input, other, acc, input_precision, max_num_imprecise_acc, out_dtype)
 @builtin
-def dot_scaled(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format, acc=None, fast_math=False, out_dtype=float32,
-               _builder=None):
+def dot_scaled(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format, acc=None, fast_math=False, lhs_k_pack=True,
+               rhs_k_pack=True, out_dtype=float32, _semantic=None):
     """
     Returns the matrix product of two blocks in microscaling format.
@@ -1843,11 +2074,15 @@ def dot_scaled(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format, acc=None,
     :param rhs_format: format of the rhs tensor. Available formats: {:code:`e2m1`, :code:`e4m3`, :code:`e5m2`, :code:`bf16`, :code:`fp16`}.
     :type rhs_format: str
     :param acc: The accumulator tensor. If not None, the result is added to this tensor.
+    :param lhs_k_pack: If false, the lhs tensor is packed into uint8 along M dimension.
+    :type lhs_k_pack: bool, optional
+    :param rhs_k_pack: If false, the rhs tensor is packed into uint8 along N dimension.
+    :type rhs_k_pack: bool, optional
     """
-    out_dtype = _constexpr_to_value(out_dtype)
+    out_dtype = _unwrap_if_constexpr(out_dtype)
     assert out_dtype == float32, "Only float32 is supported for out_dtype at the moment"
-    return semantic.dot_scaled(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format, acc, fast_math, out_dtype,
-                               _builder)
+    return _semantic.dot_scaled(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format, acc, fast_math, lhs_k_pack,
+                                rhs_k_pack, out_dtype)
 # -----------------------
@@ -1857,7 +2092,7 @@ def dot_scaled(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format, acc=None,
 @builtin
 def load(pointer, mask=None, other=None, boundary_check=(), padding_option="", cache_modifier="", eviction_policy="",
-         volatile=False, _builder=None):
+         volatile=False, _semantic=None):
     """
     Return a tensor of data whose values are loaded from memory at location defined by `pointer`:
@@ -1892,8 +2127,9 @@ def load(pointer, mask=None, other=None, boundary_check=(), padding_option="", c
     :type boundary_check: tuple of ints, optional
     :param padding_option: should be one of {"", "zero", "nan"}, the padding value to use while out of bounds. "" means an undefined value.
     :param cache_modifier: changes cache option in NVIDIA PTX
-    :type cache_modifier: str, optional, should be one of {"", "ca", "cg"}, where "ca" stands for
-        cache at all levels and "cg" stands for cache at global level (cache in L2 and below, not L1), see
+    :type cache_modifier: str, optional, should be one of {"", ".ca", ".cg", ".cv"}, where ".ca" stands for
+        cache at all levels, ".cg" stands for cache at global level (cache in L2 and below, not L1),
+        and ".cv" means don’t cache and fetch again. See
         `cache operator <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cache-operators>`_ for more details.
     :param eviction_policy: changes eviction policy in NVIDIA PTX
     :type eviction_policy: str, optional
@@ -1901,57 +2137,37 @@ def load(pointer, mask=None, other=None, boundary_check=(), padding_option="", c
     :type volatile: bool, optional
     """
     # `mask` and `other` can be constexpr
-    mask = _constexpr_to_value(mask)
-    other = _constexpr_to_value(other)
+    mask = _unwrap_if_constexpr(mask)
+    other = _unwrap_if_constexpr(other)
     if mask is not None:
-        mask = semantic.to_tensor(mask, _builder)
+        mask = _semantic.to_tensor(mask)
     if other is not None:
-        other = semantic.to_tensor(other, _builder)
-    padding_option = _constexpr_to_value(padding_option)
-    cache_modifier = _constexpr_to_value(cache_modifier)
-    eviction_policy = _constexpr_to_value(eviction_policy)
-    volatile = _constexpr_to_value(volatile)
-    return semantic.load(pointer, mask, other, boundary_check, padding_option, cache_modifier, eviction_policy,
-                         volatile, _builder)
-@builtin
-def _experimental_reinterpret_tensor_descriptor(desc_ptr, block_shape, dtype,
-                                                _builder=None) -> _experimental_tensor_descriptor_base:
-    """
-    Reinterpret a generic pointer as a TMA-backed tensor descriptor object.
-    """
-    block_ty = block_type(_constexpr_to_value(dtype), block_shape)
-    return semantic.reinterpret_tensor_descriptor(desc_ptr, block_ty, _builder)
+        other = _semantic.to_tensor(other)
+    padding_option = _unwrap_if_constexpr(padding_option)
+    cache_modifier = _unwrap_if_constexpr(cache_modifier)
+    eviction_policy = _unwrap_if_constexpr(eviction_policy)
+    volatile = _unwrap_if_constexpr(volatile)
+    return _semantic.load(pointer, mask, other, boundary_check, padding_option, cache_modifier, eviction_policy,
+                          volatile)
 @builtin
-def _experimental_descriptor_load(desc_pointer, offsets, shape, dtype, _builder=None):
-    """
-    Experimental feature to access TMA descriptors loads. This is an escape hatch to easily exercise TTGIR operations.
-    This will be removed in the future and shouldn't be used in production code.
-    This loads a tensor of data based on the descriptor and offsets.
-    """
-    desc = _experimental_reinterpret_tensor_descriptor(desc_pointer, shape, dtype, _builder=_builder)
-    return desc.load(offsets, _builder=_builder)
+def load_tensor_descriptor(desc: tensor_descriptor_base, offsets: Sequence[constexpr | tensor],
+                           _semantic=None) -> tensor:
+    """Load a block of data from a tensor descriptor."""
+    return desc.load(offsets, _semantic=_semantic)
 @builtin
-def _experimental_descriptor_store(desc_pointer, value, offsets, _builder=None):
-    """
-    Experimental feature to access TMA descriptors stores. This is an escape hatch to easily exercise TTGIR operations.
-    This will be removed in the future and shouldn't be used in production code.
-    This stores a tensor of data based on the descriptor and offsets.
-    """
-    desc = _experimental_reinterpret_tensor_descriptor(desc_pointer, value.shape, value.dtype, _builder=_builder)
-    return desc.store(offsets, value, _builder=_builder)
+def store_tensor_descriptor(desc: tensor_descriptor_base, offsets: Sequence[constexpr | tensor], value: tensor,
+                            _semantic=None) -> tensor:
+    """Store a block of data to a tensor descriptor."""
+    return desc.store(offsets, value, _semantic=_semantic)
 @_tensor_member_fn
 @builtin
-def store(pointer, value, mask=None, boundary_check=(), cache_modifier="", eviction_policy="", _builder=None):
+def store(pointer, value, mask=None, boundary_check=(), cache_modifier="", eviction_policy="", _semantic=None):
     """
     Store a tensor of data into memory locations defined by `pointer`.
@@ -1991,17 +2207,17 @@ def store(pointer, value, mask=None, boundary_check=(), cache_modifier="", evict
     :type eviction_policy: str, optional, should be one of {"", "evict_first", "evict_last"}
     """
     # `value` can be constexpr
-    value = semantic.to_tensor(value, _builder)
-    mask = _constexpr_to_value(mask)
+    value = _semantic.to_tensor(value)
+    mask = _unwrap_if_constexpr(mask)
     if mask is not None:
-        mask = semantic.to_tensor(mask, _builder)
-    cache_modifier = _constexpr_to_value(cache_modifier)
-    eviction_policy = _constexpr_to_value(eviction_policy)
-    return semantic.store(pointer, value, mask, boundary_check, cache_modifier, eviction_policy, _builder)
+        mask = _semantic.to_tensor(mask)
+    cache_modifier = _unwrap_if_constexpr(cache_modifier)
+    eviction_policy = _unwrap_if_constexpr(eviction_policy)
+    return _semantic.store(pointer, value, mask, boundary_check, cache_modifier, eviction_policy)
 @builtin
-def make_block_ptr(base: tensor, shape, strides, offsets, block_shape, order, _builder=None):
+def make_block_ptr(base: tensor, shape, strides, offsets, block_shape, order, _semantic=None):
     """
     Returns a pointer to a block in a parent tensor
@@ -2012,30 +2228,33 @@ def make_block_ptr(base: tensor, shape, strides, offsets, block_shape, order, _b
     :param block_shape: The shape of the block
     :param order: The order of the original data format
     """
-    return semantic.make_block_ptr(base, shape, strides, offsets, block_shape, order, _builder)
+    return _semantic.make_block_ptr(base, shape, strides, offsets, block_shape, order)
+@must_use_result(
+    "Note that tl.advance does not have any side effects. To move the block pointer, you need to assign the result of tl.advance to a variable."
+)
 @_tensor_member_fn
 @builtin
-def advance(base, offsets, _builder=None):
+def advance(base, offsets, _semantic=None):
     """
     Advance a block pointer
     :param base: the block pointer to advance
     :param offsets: the offsets to advance, a tuple by dimension
     """
-    return semantic.advance(base, offsets, _builder)
+    return _semantic.advance(base, offsets)
 @builtin
-def _experimental_make_tensor_descriptor(
+def make_tensor_descriptor(
     base: tensor,
     shape: List[tensor],
     strides: List[tensor],
     block_shape: List[constexpr],
-    _builder=None,
-) -> _experimental_tensor_descriptor:
-    """Make an experimental tensor descriptor object
+    _semantic=None,
+) -> tensor_descriptor:
+    """Make a tensor descriptor object
     :param base: the base pointer of the tensor, must be 16-byte aligned
     :param shape: A list of non-negative integers representing the tensor shape
@@ -2056,7 +2275,7 @@ def _experimental_make_tensor_descriptor(
         @triton.jit
         def inplace_abs(in_out_ptr, M, N, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr):
-            desc = tl._experimental_make_tensor_descriptor(
+            desc = tl.make_tensor_descriptor(
                 in_out_ptr,
                 shape=[M, N],
                 strides=[N, 1],
@@ -2082,7 +2301,7 @@ def _experimental_make_tensor_descriptor(
         inplace_abs[grid](x, M, N, M_BLOCK, N_BLOCK)
     """
-    return semantic.make_tensor_descriptor(base, shape, strides, block_shape, _builder)
+    return _semantic.make_tensor_descriptor(base, shape, strides, block_shape)
 # -----------------------
@@ -2124,89 +2343,89 @@ def _add_atomic_docstr(name: str, has_cmp: bool = False) -> Callable[[T], T]:
 @_tensor_member_fn
 @builtin
 @_add_atomic_docstr("compare-and-swap", has_cmp=True)
-def atomic_cas(pointer, cmp, val, sem=None, scope=None, _builder=None):
-    cmp = semantic.to_tensor(cmp, _builder)
-    val = semantic.to_tensor(val, _builder)
-    sem = _constexpr_to_value(sem)
-    scope = _constexpr_to_value(scope)
-    return semantic.atomic_cas(pointer, cmp, val, sem, scope, _builder)
+def atomic_cas(pointer, cmp, val, sem=None, scope=None, _semantic=None):
+    cmp = _semantic.to_tensor(cmp)
+    val = _semantic.to_tensor(val)
+    sem = _unwrap_if_constexpr(sem)
+    scope = _unwrap_if_constexpr(scope)
+    return _semantic.atomic_cas(pointer, cmp, val, sem, scope)
 @_tensor_member_fn
 @builtin
 @_add_atomic_docstr("exchange")
-def atomic_xchg(pointer, val, mask=None, sem=None, scope=None, _builder=None):
-    val = semantic.to_tensor(val, _builder)
-    sem = _constexpr_to_value(sem)
-    scope = _constexpr_to_value(scope)
-    mask = _constexpr_to_value(mask)
-    return semantic.atomic_xchg(pointer, val, mask, sem, scope, _builder)
+def atomic_xchg(pointer, val, mask=None, sem=None, scope=None, _semantic=None):
+    val = _semantic.to_tensor(val)
+    sem = _unwrap_if_constexpr(sem)
+    scope = _unwrap_if_constexpr(scope)
+    mask = _unwrap_if_constexpr(mask)
+    return _semantic.atomic_xchg(pointer, val, mask, sem, scope)
 @_tensor_member_fn
 @builtin
 @_add_atomic_docstr("add")
-def atomic_add(pointer, val, mask=None, sem=None, scope=None, _builder=None):
-    val = semantic.to_tensor(val, _builder)
-    sem = _constexpr_to_value(sem)
-    scope = _constexpr_to_value(scope)
-    mask = _constexpr_to_value(mask)
-    return semantic.atomic_add(pointer, val, mask, sem, scope, _builder)
+def atomic_add(pointer, val, mask=None, sem=None, scope=None, _semantic=None):
+    val = _semantic.to_tensor(val)
+    sem = _unwrap_if_constexpr(sem)
+    scope = _unwrap_if_constexpr(scope)
+    mask = _unwrap_if_constexpr(mask)
+    return _semantic.atomic_add(pointer, val, mask, sem, scope)
 @_tensor_member_fn
 @builtin
 @_add_atomic_docstr("max")
-def atomic_max(pointer, val, mask=None, sem=None, scope=None, _builder=None):
-    val = semantic.to_tensor(val, _builder)
-    sem = _constexpr_to_value(sem)
-    scope = _constexpr_to_value(scope)
-    mask = _constexpr_to_value(mask)
-    return semantic.atomic_max(pointer, val, mask, sem, scope, _builder)
+def atomic_max(pointer, val, mask=None, sem=None, scope=None, _semantic=None):
+    val = _semantic.to_tensor(val)
+    sem = _unwrap_if_constexpr(sem)
+    scope = _unwrap_if_constexpr(scope)
+    mask = _unwrap_if_constexpr(mask)
+    return _semantic.atomic_max(pointer, val, mask, sem, scope)
 @_tensor_member_fn
 @builtin
 @_add_atomic_docstr("min")
-def atomic_min(pointer, val, mask=None, sem=None, scope=None, _builder=None):
-    val = semantic.to_tensor(val, _builder)
-    sem = _constexpr_to_value(sem)
-    scope = _constexpr_to_value(scope)
-    mask = _constexpr_to_value(mask)
-    return semantic.atomic_min(pointer, val, mask, sem, scope, _builder)
+def atomic_min(pointer, val, mask=None, sem=None, scope=None, _semantic=None):
+    val = _semantic.to_tensor(val)
+    sem = _unwrap_if_constexpr(sem)
+    scope = _unwrap_if_constexpr(scope)
+    mask = _unwrap_if_constexpr(mask)
+    return _semantic.atomic_min(pointer, val, mask, sem, scope)
 @_tensor_member_fn
 @builtin
 @_add_atomic_docstr("logical and")
-def atomic_and(pointer, val, mask=None, sem=None, scope=None, _builder=None):
-    val = semantic.to_tensor(val, _builder)
-    sem = _constexpr_to_value(sem)
-    scope = _constexpr_to_value(scope)
-    mask = _constexpr_to_value(mask)
-    return semantic.atomic_and(pointer, val, mask, sem, scope, _builder)
+def atomic_and(pointer, val, mask=None, sem=None, scope=None, _semantic=None):
+    val = _semantic.to_tensor(val)
+    sem = _unwrap_if_constexpr(sem)
+    scope = _unwrap_if_constexpr(scope)
+    mask = _unwrap_if_constexpr(mask)
+    return _semantic.atomic_and(pointer, val, mask, sem, scope)
 @_tensor_member_fn
 @builtin
 @_add_atomic_docstr("logical or")
-def atomic_or(pointer, val, mask=None, sem=None, scope=None, _builder=None):
-    val = semantic.to_tensor(val, _builder)
-    sem = _constexpr_to_value(sem)
-    scope = _constexpr_to_value(scope)
-    mask = _constexpr_to_value(mask)
-    return semantic.atomic_or(pointer, val, mask, sem, scope, _builder)
+def atomic_or(pointer, val, mask=None, sem=None, scope=None, _semantic=None):
+    val = _semantic.to_tensor(val)
+    sem = _unwrap_if_constexpr(sem)
+    scope = _unwrap_if_constexpr(scope)
+    mask = _unwrap_if_constexpr(mask)
+    return _semantic.atomic_or(pointer, val, mask, sem, scope)
 @_tensor_member_fn
 @builtin
 @_add_atomic_docstr("logical xor")
-def atomic_xor(pointer, val, mask=None, sem=None, scope=None, _builder=None):
-    val = semantic.to_tensor(val, _builder)
-    sem = _constexpr_to_value(sem)
-    scope = _constexpr_to_value(scope)
-    mask = _constexpr_to_value(mask)
-    return semantic.atomic_xor(pointer, val, mask, sem, scope, _builder)
+def atomic_xor(pointer, val, mask=None, sem=None, scope=None, _semantic=None):
+    val = _semantic.to_tensor(val)
+    sem = _unwrap_if_constexpr(sem)
+    scope = _unwrap_if_constexpr(scope)
+    mask = _unwrap_if_constexpr(mask)
+    return _semantic.atomic_xor(pointer, val, mask, sem, scope)
 # -----------------------
@@ -2215,7 +2434,7 @@ def atomic_xor(pointer, val, mask=None, sem=None, scope=None, _builder=None):
 @builtin
-def where(condition, x, y, _builder=None):
+def where(condition, x, y, _semantic=None):
     """
     Returns a tensor of elements from either :code:`x` or :code:`y`, depending on :code:`condition`.
@@ -2231,10 +2450,10 @@ def where(condition, x, y, _builder=None):
     :param x: values selected at indices where condition is True.
     :param y: values selected at indices where condition is False.
     """
-    condition = semantic.to_tensor(condition, _builder)
+    condition = _semantic.to_tensor(condition)
     x = _unwrap_if_constexpr(x)
     y = _unwrap_if_constexpr(y)
-    return semantic.where(condition, x, y, _builder)
+    return _semantic.where(condition, x, y)
 # -----------------------
@@ -2243,28 +2462,28 @@ def where(condition, x, y, _builder=None):
 @builtin
-def add(x, y, sanitize_overflow: constexpr = True, _builder=None):
+def add(x, y, sanitize_overflow: constexpr = True, _semantic=None):
     x = _unwrap_if_constexpr(x)
     y = _unwrap_if_constexpr(y)
-    return semantic.add(x, y, sanitize_overflow, _builder)
+    return _semantic.add(x, y, sanitize_overflow)
 @builtin
-def sub(x, y, sanitize_overflow: constexpr = True, _builder=None):
+def sub(x, y, sanitize_overflow: constexpr = True, _semantic=None):
     x = _unwrap_if_constexpr(x)
     y = _unwrap_if_constexpr(y)
-    return semantic.sub(x, y, sanitize_overflow, _builder)
+    return _semantic.sub(x, y, sanitize_overflow)
 @builtin
-def mul(x, y, sanitize_overflow: constexpr = True, _builder=None):
+def mul(x, y, sanitize_overflow: constexpr = True, _semantic=None):
     x = _unwrap_if_constexpr(x)
     y = _unwrap_if_constexpr(y)
-    return semantic.mul(x, y, sanitize_overflow, _builder)
+    return _semantic.mul(x, y, sanitize_overflow)
 @builtin
-def minimum(x, y, propagate_nan: constexpr = PropagateNan.NONE, _builder=None):
+def minimum(x, y, propagate_nan: constexpr = PropagateNan.NONE, _semantic=None):
     """
     Computes the element-wise minimum of :code:`x` and :code:`y`.
@@ -2277,16 +2496,16 @@ def minimum(x, y, propagate_nan: constexpr = PropagateNan.NONE, _builder=None):
     .. seealso:: :class:`tl.PropagateNan`
     """
-    x = semantic.to_tensor(x, _builder)
-    y = semantic.to_tensor(y, _builder)
-    x = _promote_bfloat16_to_float32(x, _builder=_builder)
-    y = _promote_bfloat16_to_float32(y, _builder=_builder)
-    propagate_nan = _constexpr_to_value(propagate_nan)
-    return semantic.minimum(x, y, propagate_nan, _builder)
+    x = _semantic.to_tensor(x)
+    y = _semantic.to_tensor(y)
+    x = _promote_bfloat16_to_float32(x, _semantic=_semantic)
+    y = _promote_bfloat16_to_float32(y, _semantic=_semantic)
+    propagate_nan = _unwrap_if_constexpr(propagate_nan)
+    return _semantic.minimum(x, y, propagate_nan)
 @builtin
-def maximum(x, y, propagate_nan: constexpr = PropagateNan.NONE, _builder=None):
+def maximum(x, y, propagate_nan: constexpr = PropagateNan.NONE, _semantic=None):
     """
     Computes the element-wise maximum of :code:`x` and :code:`y`.
@@ -2299,16 +2518,16 @@ def maximum(x, y, propagate_nan: constexpr = PropagateNan.NONE, _builder=None):
     .. seealso:: :class:`tl.PropagateNan`
     """
-    x = semantic.to_tensor(x, _builder)
-    y = semantic.to_tensor(y, _builder)
-    x = _promote_bfloat16_to_float32(x, _builder=_builder)
-    y = _promote_bfloat16_to_float32(y, _builder=_builder)
-    propagate_nan = _constexpr_to_value(propagate_nan)
-    return semantic.maximum(x, y, propagate_nan, _builder)
+    x = _semantic.to_tensor(x)
+    y = _semantic.to_tensor(y)
+    x = _promote_bfloat16_to_float32(x, _semantic=_semantic)
+    y = _promote_bfloat16_to_float32(y, _semantic=_semantic)
+    propagate_nan = _unwrap_if_constexpr(propagate_nan)
+    return _semantic.maximum(x, y, propagate_nan)
 @builtin
-def clamp(x, min, max, propagate_nan: constexpr = PropagateNan.NONE, _builder=None):
+def clamp(x, min, max, propagate_nan: constexpr = PropagateNan.NONE, _semantic=None):
     """
     Clamps the input tensor :code:`x` within the range [min, max].
     Behavior when :code:`min` > :code:`max` is undefined.
@@ -2325,16 +2544,16 @@ def clamp(x, min, max, propagate_nan: constexpr = PropagateNan.NONE, _builder=No
     .. seealso:: :class:`tl.PropagateNan`
     """
-    x = semantic.to_tensor(x, _builder)
-    min = semantic.to_tensor(min, _builder)
-    max = semantic.to_tensor(max, _builder)
-    x = _promote_bfloat16_to_float32(x, _builder=_builder)
-    min = _promote_bfloat16_to_float32(min, _builder=_builder)
-    max = _promote_bfloat16_to_float32(max, _builder=_builder)
+    x = _semantic.to_tensor(x)
+    min = _semantic.to_tensor(min)
+    max = _semantic.to_tensor(max)
+    x = _promote_bfloat16_to_float32(x, _semantic=_semantic)
+    min = _promote_bfloat16_to_float32(min, _semantic=_semantic)
+    max = _promote_bfloat16_to_float32(max, _semantic=_semantic)
-    propagate_nan = _constexpr_to_value(propagate_nan)
+    propagate_nan = _unwrap_if_constexpr(propagate_nan)
-    return semantic.clamp(x, min, max, propagate_nan, _builder)
+    return _semantic.clamp(x, min, max, propagate_nan)
 # -----------------------
@@ -2383,7 +2602,7 @@ def _insertion_guard(builder):
 @_tensor_member_fn
 @builtin
-def reduce(input, axis, combine_fn, keep_dims=False, _builder=None, _generator=None):
+def reduce(input, axis, combine_fn, keep_dims=False, _semantic=None, _generator=None):
     """Applies the combine_fn to all elements in :code:`input` tensors along the provided :code:`axis`
     :param input: the input tensor, or tuple of tensors
@@ -2397,64 +2616,65 @@ def reduce(input, axis, combine_fn, keep_dims=False, _builder=None, _generator=N
     """
     if isinstance(input, tensor):
-        return reduce((input, ), axis, combine_fn, keep_dims=keep_dims, _builder=_builder, _generator=_generator)[0]
+        return reduce((input, ), axis, combine_fn, keep_dims=keep_dims, _semantic=_semantic, _generator=_generator)[0]
     def make_combine_region(reduce_op):
         param_types = [t.type.scalar for t in input] * 2
         region = reduce_op.get_region(0)
-        with _insertion_guard(_builder):
-            to_ir = lambda T: T.to_ir(_builder)
-            block = _builder.create_block_with_parent(region, list(map(to_ir, param_types)))
+        builder = _semantic.builder
+        with _insertion_guard(builder):
+            to_ir = lambda T: T.to_ir(builder)
+            block = builder.create_block_with_parent(region, list(map(to_ir, param_types)))
             args = [tensor(block.arg(i), ty) for i, ty in enumerate(param_types)]
             results = _generator.call_JitFunction(combine_fn, args, kwargs={})
             if isinstance(results, tensor):
                 handles = [results.handle]
             else:
                 handles = [r.handle for r in results]
-            _builder.create_reduce_ret(*handles)
+            builder.create_reduce_ret(*handles)
     def expand_ndims(t, ndims):
         for _ in builtins.range(ndims):
-            t = expand_dims(t, 0, _builder=_builder)
+            t = expand_dims(t, 0, _semantic=_semantic)
         return t
-    axis = _constexpr_to_value(axis)
-    keep_dims = _constexpr_to_value(keep_dims)
+    axis = _unwrap_if_constexpr(axis)
+    keep_dims = _unwrap_if_constexpr(keep_dims)
     if axis is not None:
         axis = _wrap_axis(axis, len(input[0].shape))
-    ret = semantic.reduction(input, axis, make_combine_region, _builder)
+    ret = _semantic.reduction(input, axis, make_combine_region)
     if keep_dims:
         if axis is not None:
-            ret = tuple(expand_dims(t, axis, _builder=_builder) for t in ret)
+            ret = tuple(expand_dims(t, axis, _semantic=_semantic) for t in ret)
         else:
             ret = tuple(expand_ndims(t, len(input[0].shape)) for t in ret)
     return ret
 @builtin
-def _promote_bfloat16_to_float32(t, _builder=None):
+def _promote_bfloat16_to_float32(t, _semantic=None):
     scalar_ty = t.type.scalar
     # hardware doesn't support FMAX, FMIN, CMP for bfloat16
     if scalar_ty is bfloat16:
-        return t.to(float32, _builder=_builder)
+        return t.to(float32, _semantic=_semantic)
     return t
 @builtin
-def _reduce_with_indices(input, axis, combine_fn, keep_dims=False, _builder=None, _generator=None):
-    axis = _constexpr_to_value(axis)
+def _reduce_with_indices(input, axis, combine_fn, keep_dims=False, _semantic=None, _generator=None):
+    axis = _unwrap_if_constexpr(axis)
     n = input.shape[axis]
-    index = arange(0, n, _builder=_builder)
+    index = arange(0, n, _semantic=_semantic)
     if len(input.shape) > 1:
         # Broadcast index across the non-reduced axes
         axes_to_expand = [constexpr(d) for d in builtins.range(len(input.shape))]
         del axes_to_expand[axis]
-        index = expand_dims(index, axes_to_expand, _builder=_builder)
-        index = broadcast_to(index, input.shape, _builder=_builder)
+        index = expand_dims(index, axes_to_expand, _semantic=_semantic)
+        index = broadcast_to(index, input.shape, _semantic=_semantic)
-    rvalue, rindices = reduce((input, index), axis, combine_fn, keep_dims=keep_dims, _builder=_builder,
+    rvalue, rindices = reduce((input, index), axis, combine_fn, keep_dims=keep_dims, _semantic=_semantic,
                               _generator=_generator)
     return rvalue, rindices
@@ -2464,7 +2684,7 @@ def _reduce_with_indices(input, axis, combine_fn, keep_dims=False, _builder=None
 # -----------------------
-def _add_scan_docstr(name: str) -> Callable[[T], T]:
+def _add_scan_docstr(name: str, dtype_arg: str = None) -> Callable[[T], T]:
     def _decorator(func: T) -> T:
         docstr = """
@@ -2473,7 +2693,15 @@ def _add_scan_docstr(name: str) -> Callable[[T], T]:
     :param input: the input values
     :type input: Tensor
     :param axis: the dimension along which the scan should be done
-    :type axis: int"""
+    :type axis: int
+    :param reverse: if true, the scan is performed in the reverse direction
+    :type reverse: bool"""
+        if dtype_arg is not None:
+            docstr += f"""
+    :param {dtype_arg}: the desired data type of the returned tensor. If specified, the input tensor is cast to :code:`{dtype_arg}` before the operation is performed. If not specified, small integer types (< 32 bits) are upcast to prevent overflow. Note that :code:`tl.bfloat16` inputs are automatically promoted to :code:`tl.float32`.
+    :type {dtype_arg}: tl.dtype"""
         func.__doc__ = docstr.format(name=name)
         return func
@@ -2482,7 +2710,7 @@ def _add_scan_docstr(name: str) -> Callable[[T], T]:
 @_tensor_member_fn
 @builtin
-def associative_scan(input, axis, combine_fn, reverse=False, _builder=None, _generator=None):
+def associative_scan(input, axis, combine_fn, reverse=False, _semantic=None, _generator=None):
     """Applies the combine_fn to each element with a carry in :code:`input` tensors along the provided :code:`axis` and updates the carry
     :param input: the input tensor, or tuple of tensors
@@ -2496,46 +2724,52 @@ def associative_scan(input, axis, combine_fn, reverse=False, _builder=None, _gen
     """
     if isinstance(input, tensor):
-        return associative_scan((input, ), axis, combine_fn, reverse, _builder=_builder, _generator=_generator)[0]
+        return associative_scan((input, ), axis, combine_fn, reverse, _semantic=_semantic, _generator=_generator)[0]
     def make_combine_region(scan_op):
         param_types = [t.type.scalar for t in input] * 2
         region = scan_op.get_region(0)
-        with _insertion_guard(_builder):
-            to_ir = lambda T: T.to_ir(_builder)
-            block = _builder.create_block_with_parent(region, list(map(to_ir, param_types)))
+        builder = _semantic.builder
+        with _insertion_guard(builder):
+            to_ir = lambda T: T.to_ir(builder)
+            block = builder.create_block_with_parent(region, list(map(to_ir, param_types)))
             args = [tensor(block.arg(i), ty) for i, ty in enumerate(param_types)]
             results = _generator.call_JitFunction(combine_fn, args, kwargs={})
             if isinstance(results, tensor):
                 handles = [results.handle]
             else:
                 handles = [r.handle for r in results]
-            _builder.create_scan_ret(*handles)
+            builder.create_scan_ret(*handles)
-    axis = _constexpr_to_value(axis)
+    axis = _unwrap_if_constexpr(axis)
     if axis is not None:
         axis = _wrap_axis(axis, len(input[0].shape))
-    return semantic.associative_scan(input, axis, make_combine_region, reverse, _builder)
+    return _semantic.associative_scan(input, axis, make_combine_region, reverse)
 @_tensor_member_fn
 @builtin
-def histogram(input, num_bins, _builder=None, _generator=None):
+def histogram(input, num_bins, mask=None, _semantic=None, _generator=None):
     """Computes a histogram of the input tensor with num_bins bins; the bins have a width of 1 and start at 0.
     :param input: the input tensor
     :type input: Tensor
     :param num_bins: number of histogram bins
     :type num_bins: int
+    :param mask: if `mask[idx]` is false, exclude `input[idx]` from histogram
+    :type mask: Block of `triton.int1`, optional
     """
-    num_bins = _constexpr_to_value(num_bins)
-    return semantic.histogram(input, num_bins, _builder)
+    num_bins = _unwrap_if_constexpr(num_bins)
+    mask = _unwrap_if_constexpr(mask)
+    if mask is not None:
+        mask = _semantic.to_tensor(mask)
+    return _semantic.histogram(input, num_bins, mask)
 @_tensor_member_fn
 @builtin
-def gather(src, index, axis, _builder=None):
+def gather(src, index, axis, _semantic=None):
     """Gather from a tensor along a given dimension.
     :param src: the source tensor
@@ -2546,8 +2780,8 @@ def gather(src, index, axis, _builder=None):
     :type axis: int
     """
-    axis = _constexpr_to_value(axis)
-    return semantic.gather(src, index, axis, _builder)
+    axis = _unwrap_if_constexpr(axis)
+    return _semantic.gather(src, index, axis)
 # -----------------------
@@ -2556,15 +2790,15 @@ def gather(src, index, axis, _builder=None):
 @builtin
-def debug_barrier(_builder=None):
+def debug_barrier(_semantic=None):
     '''
     Insert a barrier to synchronize all threads in a block.
     '''
-    return semantic.debug_barrier(_builder)
+    return _semantic.debug_barrier()
 @builtin
-def multiple_of(input, values, _builder=None):
+def multiple_of(input, values, _semantic=None):
     """
     Let the compiler know that the values in :code:`input` are all multiples of :code:`values`.
     """
@@ -2576,11 +2810,11 @@ def multiple_of(input, values, _builder=None):
         if not isinstance(d.value, int):
             raise TypeError(f"values element {i} must have type `constexpr[int]`, got `constexpr[{type(d.value)}]")
     values = [x.value for x in values]
-    return semantic.multiple_of(input, values)
+    return _semantic.multiple_of(input, values)
 @builtin
-def max_contiguous(input, values, _builder=None):
+def max_contiguous(input, values, _semantic=None):
     """
     Let the compiler know that the first :code:`values` values in :code:`input` are contiguous.
     """
@@ -2592,11 +2826,11 @@ def max_contiguous(input, values, _builder=None):
         if not isinstance(d.value, int):
             raise TypeError(f"values element {i} must have type `constexpr[int]`, got `constexpr[{type(d.value)}]")
     values = [x.value for x in values]
-    return semantic.max_contiguous(input, values)
+    return _semantic.max_contiguous(input, values)
 @builtin
-def max_constancy(input, values, _builder=None):
+def max_constancy(input, values, _semantic=None):
     """
     Let the compiler know that the first :code:`values` values in :code:`input` are constant.
@@ -2611,15 +2845,15 @@ def max_constancy(input, values, _builder=None):
         if not isinstance(d.value, int):
             raise TypeError(f"values element {i} must have type `constexpr[int]`, got `constexpr[{type(d.value)}]")
     values = [x.value for x in values]
-    return semantic.max_constancy(input, values)
+    return _semantic.max_constancy(input, values)
 @builtin
-def assume(cond, _builder=None):
+def assume(cond, _semantic=None):
     '''
     Allow compiler to assume the :code:`cond` is True.
     '''
-    return semantic.assume(semantic.to_tensor(cond, _builder), _builder)
+    return _semantic.assume(_semantic.to_tensor(cond))
 # -----------------------
@@ -2628,7 +2862,7 @@ def assume(cond, _builder=None):
 @builtin
-def static_print(*values, sep: str = " ", end: str = "\n", file=None, flush=False, _builder=None):
+def static_print(*values, sep: str = " ", end: str = "\n", file=None, flush=False, _semantic=None):
     '''
     Print the values at compile time.  The parameters are the same as the builtin :code:`print`.
@@ -2644,7 +2878,7 @@ def static_print(*values, sep: str = " ", end: str = "\n", file=None, flush=Fals
 @builtin
-def static_assert(cond, msg="", _builder=None):
+def static_assert(cond, msg="", _semantic=None):
     '''
     Assert the condition at compile time.  Does not require that the :code:`TRITON_DEBUG` environment variable
     is set.
@@ -2658,7 +2892,7 @@ def static_assert(cond, msg="", _builder=None):
 @builtin
-def device_print(prefix, *args, hex=False, _builder=None):
+def device_print(prefix, *args, hex=False, _semantic=None):
     '''
     Print the values at runtime from the device.  String formatting does not work for runtime values, so you should
     provide the values you want to print as arguments.  The first value must be a string, all following values must
@@ -2692,7 +2926,7 @@ def device_print(prefix, *args, hex=False, _builder=None):
     :param hex: print all values as hex instead of decimal
     '''
     import string
-    prefix = _constexpr_to_value(prefix)
+    prefix = _unwrap_if_constexpr(prefix)
     assert isinstance(prefix, str), f"{prefix} is not string"
     b_ascii = True
     for ch in prefix:
@@ -2702,12 +2936,12 @@ def device_print(prefix, *args, hex=False, _builder=None):
     assert b_ascii, f"{prefix} is not an ascii string"
     new_args = []
     for arg in args:
-        new_args.append(semantic.to_tensor(arg, _builder))
-    return semantic.device_print(prefix, new_args, hex, _builder)
+        new_args.append(_semantic.to_tensor(arg))
+    return _semantic.device_print(prefix, new_args, hex)
 @builtin
-def device_assert(cond, msg="", _builder=None):
+def device_assert(cond, msg="", _semantic=None):
     '''
     Assert the condition at runtime from the device.  Requires that the environment variable :code:`TRITON_DEBUG`
     is set to a value besides :code:`0` in order for this to have any effect.
@@ -2725,13 +2959,13 @@ def device_assert(cond, msg="", _builder=None):
     :param cond: the condition to assert. This is required to be a boolean tensor.
     :param msg: the message to print if the assertion fails. This is required to be a string literal.
     '''
-    msg = _constexpr_to_value(msg)
-    return semantic.device_assert(semantic.to_tensor(cond, _builder), msg, _builder)
+    msg = _unwrap_if_constexpr(msg)
+    return _semantic.device_assert(_semantic.to_tensor(cond), msg)
 @builtin
 def inline_asm_elementwise(asm: str, constraints: str, args: Sequence, dtype: Union[dtype, Sequence[dtype]],
-                           is_pure: bool, pack: int, _builder=None):
+                           is_pure: bool, pack: int, _semantic=None):
     '''
         Execute inline assembly over a tensor.  Essentially, this is :code:`map`
         where the function is inline assembly.
@@ -2816,13 +3050,12 @@ def inline_asm_elementwise(asm: str, constraints: str, args: Sequence, dtype: Un
         :param dtype: the element type(s) of the returned tensor(s)
         :param is_pure: if true, the compiler assumes the asm block has no side-effects
         :param pack: the number of elements to be processed by one instance of inline assembly
-        :param _builder: the builder
         :return: one tensor or a tuple of tensors of the given dtypes
     '''
-    asm = _constexpr_to_value(asm)
-    constraints = _constexpr_to_value(constraints)
-    pack = _constexpr_to_value(pack)
-    is_pure = _constexpr_to_value(is_pure)
+    asm = _unwrap_if_constexpr(asm)
+    constraints = _unwrap_if_constexpr(constraints)
+    pack = _unwrap_if_constexpr(pack)
+    is_pure = _unwrap_if_constexpr(is_pure)
     # Wrap `dtype` in a tuple if it's not already.
     try:
@@ -2835,10 +3068,9 @@ def inline_asm_elementwise(asm: str, constraints: str, args: Sequence, dtype: Un
     dtype = typing.cast(Sequence[_DtypeClass], dtype)
     res_tys = dtype
-    if dispatch_args := [semantic.to_tensor(arg, _builder) for arg in args]:
+    if dispatch_args := [_semantic.to_tensor(arg) for arg in args]:
         bin_op_type_checking = partial(
-            semantic.binary_op_type_checking_impl,
-            builder=_builder,
+            _semantic.binary_op_type_checking_impl,
             arithmetic_check=False,
             allow_lhs_ptr=True,
             allow_rhs_ptr=True,
@@ -2851,9 +3083,10 @@ def inline_asm_elementwise(asm: str, constraints: str, args: Sequence, dtype: Un
             # Change the shape of each argument based on the broadcast shape
             for i, item in enumerate(dispatch_args):
                 dispatch_args[i], _ = bin_op_type_checking(item, broadcast_arg)
-            res_tys = [block_type(dt, broadcast_arg.shape) for dt in dtype]
+            res_tys = [broadcast_arg.type.with_element_ty(dt) for dt in dtype]
     handles = [t.handle for t in dispatch_args]
-    call = _builder.create_inline_asm(asm, constraints, handles, [ty.to_ir(_builder) for ty in res_tys], is_pure, pack)
+    builder = _semantic.builder
+    call = builder.create_inline_asm(asm, constraints, handles, [ty.to_ir(builder) for ty in res_tys], is_pure, pack)
     if not has_multiple_outputs:
         return tensor(call.get_result(0), res_tys[0])
@@ -2905,6 +3138,22 @@ class static_range:
         raise RuntimeError("static_range can only be used in @triton.jit'd functions")
+class async_task:
+    """
+    Context manager to run code fragments asynchronously.
+    """
+    def __init__(self, task_ids, _builder=None):
+        self.task_ids = list({_unwrap_if_constexpr(tid) for tid in task_ids})
+        self.builder = _builder
+    def __enter__(self):
+        self.builder.set_async_task_ids(self.task_ids)
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.builder.unset_async_task_ids()
 class range:
     """
     Iterator that counts upward forever.
@@ -2936,10 +3185,18 @@ class range:
     :param flatten: automatically flatten the loop nest starting at this loop to
         create a single flattened loop. The compiler will try to pipeline the
         flattened loop which can avoid stage stalling.
+    :param warp_specialize: Enable automatic warp specialization on the loop.
+        The compiler will attempt to partition memory, MMA, and vector
+        operations in the loop into separate async partitions. This will
+        increase the total number of warps required by the kernel.
+        Note that warp specialization is only supported on Blackwell GPUs and
+        only works on simple matmul loops. Support for arbitrary loops will be
+        expanded over time.
     """
     def __init__(self, arg1, arg2=None, step=None, num_stages=None, loop_unroll_factor=None,
-                 disallow_acc_multi_buffer=False, flatten=False):
+                 disallow_acc_multi_buffer=False, flatten=False, warp_specialize=False):
         if step is None:
             self.step = constexpr(1)
         else:
@@ -2954,6 +3211,7 @@ class range:
         self.loop_unroll_factor = loop_unroll_factor
         self.disallow_acc_multi_buffer = disallow_acc_multi_buffer
         self.flatten = flatten
+        self.warp_specialize = warp_specialize
     def __iter__(self):
         raise RuntimeError("tl.range can only be used in @triton.jit'd functions")
@@ -2968,7 +3226,7 @@ class range:
 def dispatch(func, lib_name: str, lib_path: str, args: list, arg_type_symbol_dict: dict, ret_shape: tuple,
-             is_pure: bool, _builder=None):
+             is_pure: bool, _semantic):
     '''
         Dispatch a function to a library
         :param func: the function to dispatch
@@ -2977,7 +3235,6 @@ def dispatch(func, lib_name: str, lib_path: str, args: list, arg_type_symbol_dic
         :param args: the arguments of the function
         :param arg_type_symbol_dict: the type of the arguments
         :param ret_shape: the shape of the return value
-        :param _builder: the builder
         :return: the return value of the function
     '''
     if len(arg_type_symbol_dict) == 0:
@@ -3007,12 +3264,13 @@ def dispatch(func, lib_name: str, lib_path: str, args: list, arg_type_symbol_dic
         ret_type = arg_type_symbol_dict[arg_types][1]
         if ret_shape:
             ret_type = block_type(ret_type, ret_shape)
-        return tensor(func(lib_name, lib_path, symbol, arg_list, ret_type.to_ir(_builder), is_pure), ret_type)
+        builder = _semantic.builder
+        return tensor(func(lib_name, lib_path, symbol, arg_list, ret_type.to_ir(builder), is_pure), ret_type)
 @builtin
 def extern_elementwise(lib_name: str, lib_path: str, args: list, arg_type_symbol_dict: dict, is_pure: bool,
-                       _builder=None):
+                       _semantic=None):
     '''
         Dispatch an elementwise function to a library
         :param lib_name: the name of the library
@@ -3020,7 +3278,6 @@ def extern_elementwise(lib_name: str, lib_path: str, args: list, arg_type_symbol
         :param args: the arguments of the function
         :param arg_type_symbol_dict: the type of the arguments
         :param is_pure: whether the function is pure
-        :param _builder: the builder
         :return: the return value of the function
     '''
     dispatch_args = args.copy()
@@ -3028,7 +3285,7 @@ def extern_elementwise(lib_name: str, lib_path: str, args: list, arg_type_symbol
     ret_shape = None
     arg_types = []
     for i in builtins.range(len(dispatch_args)):
-        dispatch_args[i] = semantic.to_tensor(dispatch_args[i], _builder)
+        dispatch_args[i] = _semantic.to_tensor(dispatch_args[i])
         arg_types.append(dispatch_args[i].dtype)
         if dispatch_args[i].type.is_block():
             all_scalar = False
@@ -3041,26 +3298,26 @@ def extern_elementwise(lib_name: str, lib_path: str, args: list, arg_type_symbol
         broadcast_arg = dispatch_args[0]
         # Get the broadcast shape over all the arguments
         for item in dispatch_args:
-            _, broadcast_arg = semantic.binary_op_type_checking_impl(item, broadcast_arg, _builder,
-                                                                     arithmetic_check=arithmetic_check)
+            _, broadcast_arg = _semantic.binary_op_type_checking_impl(item, broadcast_arg,
+                                                                      arithmetic_check=arithmetic_check)
         # Change the shape of each argument based on the broadcast shape
         for i in builtins.range(len(dispatch_args)):
-            dispatch_args[i], _ = semantic.binary_op_type_checking_impl(dispatch_args[i], broadcast_arg, _builder,
-                                                                        arithmetic_check=arithmetic_check)
+            dispatch_args[i], _ = _semantic.binary_op_type_checking_impl(dispatch_args[i], broadcast_arg,
+                                                                         arithmetic_check=arithmetic_check)
         if not all_scalar:
             ret_shape = broadcast_arg.shape
-    func = _builder.create_extern_elementwise
-    return dispatch(func, lib_name, lib_path, dispatch_args, arg_type_symbol_dict, ret_shape, is_pure, _builder)
+    func = _semantic.builder.create_extern_elementwise
+    return dispatch(func, lib_name, lib_path, dispatch_args, arg_type_symbol_dict, ret_shape, is_pure, _semantic)
-def binary_op_type_legalization(lhs, rhs, builder):
+def binary_op_type_legalization(lhs, rhs, semantic):
     '''
         Convert both operands to a single common type
         :param lhs: the left operand
         :param rhs: the right operand
         :param semantic: the semantic object that provides :code:`binary_op_type_checking_impl`
     '''
-    return semantic.binary_op_type_checking_impl(lhs, rhs, builder)
+    return semantic.binary_op_type_checking_impl(lhs, rhs)
 def extern(fn):