triton-windows 3.2.0.post12__cp312-cp312-win_amd64.whl → 3.3.0a0.post12__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of triton-windows has been flagged as possibly problematic by the registry scanner.
- triton/_C/libtriton.pyd +0 -0
- triton/__init__.py +3 -3
- triton/_internal_testing.py +59 -4
- triton/_utils.py +35 -0
- triton/backends/amd/compiler.py +121 -74
- triton/backends/amd/driver.py +77 -43
- triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +28 -49
- triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +35 -9
- triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +761 -284
- triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +9 -3
- triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +1391 -0
- triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +3 -3
- triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +44 -0
- triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +288 -0
- triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +110 -14
- triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +504 -103
- triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +2 -1
- triton/backends/amd/include/hip/amd_detail/host_defines.h +4 -0
- triton/backends/amd/include/hip/hip_ext.h +4 -2
- triton/backends/amd/include/hip/hip_fp8.h +33 -0
- triton/backends/amd/include/hip/hip_runtime_api.h +375 -33
- triton/backends/amd/include/hip/hip_version.h +3 -3
- triton/backends/amd/include/hip/hiprtc.h +25 -25
- triton/backends/amd/include/hsa/amd_hsa_elf.h +40 -14
- triton/backends/amd/include/hsa/hsa.h +11 -2
- triton/backends/amd/include/hsa/hsa_api_trace.h +30 -17
- triton/backends/amd/include/hsa/hsa_api_trace_version.h +68 -0
- triton/backends/amd/include/hsa/hsa_ext_amd.h +83 -27
- triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +46 -46
- triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +416 -0
- triton/backends/amd/include/roctracer/hip_ostream_ops.h +84 -4
- triton/backends/amd/include/roctracer/hsa_ostream_ops.h +260 -0
- triton/backends/amd/include/roctracer/hsa_prof_str.h +51 -19
- triton/backends/amd/lib/asanrtl.bc +0 -0
- triton/backends/compiler.py +25 -225
- triton/backends/driver.py +7 -2
- triton/backends/nvidia/bin/ptxas.exe +0 -0
- triton/backends/nvidia/compiler.py +135 -90
- triton/backends/nvidia/driver.c +0 -1
- triton/backends/nvidia/driver.py +135 -49
- triton/backends/nvidia/include/cuda.h +2162 -241
- triton/backends/nvidia/lib/x64/cuda.lib +0 -0
- triton/compiler/__init__.py +2 -2
- triton/compiler/code_generator.py +334 -231
- triton/compiler/compiler.py +77 -66
- triton/language/__init__.py +22 -5
- triton/language/core.py +448 -74
- triton/language/extra/cuda/_experimental_tma.py +3 -5
- triton/language/math.py +1 -1
- triton/language/random.py +2 -1
- triton/language/semantic.py +206 -52
- triton/language/standard.py +35 -18
- triton/runtime/_allocation.py +32 -0
- triton/runtime/autotuner.py +27 -32
- triton/runtime/build.py +1 -48
- triton/runtime/cache.py +6 -6
- triton/runtime/errors.py +10 -0
- triton/runtime/interpreter.py +179 -45
- triton/runtime/jit.py +149 -190
- triton/testing.py +39 -11
- triton/tools/compile.py +27 -20
- triton/tools/{compile.c → extra/cuda/compile.c} +1 -0
- triton/tools/mxfp.py +301 -0
- {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/METADATA +5 -2
- {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/RECORD +68 -59
- {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/top_level.txt +2 -0
- /triton/tools/{compile.h → extra/cuda/compile.h} +0 -0
- {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/WHEEL +0 -0
triton/runtime/autotuner.py
CHANGED
@@ -4,10 +4,10 @@ import builtins
 import os
 import time
 import inspect
-from typing import Dict
+from typing import Dict, Tuple, List, Optional
 
 from .jit import KernelInterface
-from .errors import OutOfResources
+from .errors import OutOfResources, PTXASError
 from .driver import driver
 
 
@@ -23,7 +23,7 @@ class Autotuner(KernelInterface):
         restore_value,
         pre_hook=None,
         post_hook=None,
-        prune_configs_by: Dict = None,
+        prune_configs_by: Optional[Dict] = None,
         warmup=None,
         rep=None,
         use_cuda_graph=False,
@@ -36,14 +36,11 @@ class Autotuner(KernelInterface):
             'prune_num_stages_by'(optional): a function used to prune num_stages. It takes configs:List[Config] as its input, and returns pruned configs.
         """
         if not configs:
-            self.configs = [
-                Config({}, num_warps=4, num_stages=2, num_ctas=1, num_buffers_warp_spec=0, num_consumer_groups=0,
-                       reg_dec_producer=0, reg_inc_consumer=0)
-            ]
+            self.configs = [Config({}, num_warps=4, num_stages=3, num_ctas=1)]
         else:
             self.configs = configs
         self.keys = key
-        self.cache = {}
+        self.cache: Dict[Tuple, Config] = {}
         self.arg_names = arg_names
 
         # Reset to zero or restore values
@@ -134,6 +131,10 @@ class Autotuner(KernelInterface):
     def _bench(self, *args, config, **meta):
         from ..compiler.errors import CompileTimeAssertionFailure
 
+        verbose = os.environ.get("TRITON_PRINT_AUTOTUNING", None) == "1"
+        if verbose:
+            print(f"Autotuning kernel {self.base_fn.__name__} with config {config}")
+
         # check for conflicts, i.e. meta-parameters both provided
         # as kwargs and by the autotuner
         conflicts = meta.keys() & config.kwargs.keys()
@@ -164,7 +165,9 @@
 
         try:
             return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))
-        except (OutOfResources, CompileTimeAssertionFailure):
+        except (OutOfResources, CompileTimeAssertionFailure, PTXASError) as e:
+            if verbose:
+                print(f"Autotuning failed with {e}")
             return [float("inf"), float("inf"), float("inf")]
 
     def run(self, *args, **kwargs):
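The two hunks above add opt-in logging to the benchmarking loop: with TRITON_PRINT_AUTOTUNING=1, each candidate config is printed before it is timed, and configs that fail with OutOfResources, CompileTimeAssertionFailure, or the new PTXASError are reported and scored as infinitely slow instead of aborting the search. A minimal sketch of how this surfaces to users; the kernel itself is our illustration, not part of the diff:

```python
import os
os.environ["TRITON_PRINT_AUTOTUNING"] = "1"  # read inside Autotuner._bench

import triton
import triton.language as tl

@triton.autotune(
    configs=[
        triton.Config({"BLOCK_SIZE": 1024}, num_warps=4),
        triton.Config({"BLOCK_SIZE": 2048}, num_warps=8),
    ],
    key=["n_elements"],
)
@triton.jit
def add_one(x_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(0)
    offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offs < n_elements
    tl.store(x_ptr + offs, tl.load(x_ptr + offs, mask=mask) + 1, mask=mask)

# On first launch, each config is logged as it is benchmarked, e.g.
#   Autotuning kernel add_one with config BLOCK_SIZE: 1024, num_warps: 4, ...
```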
@@ -208,7 +211,7 @@
             self.nargs = None
         return ret
 
-    def prune_configs(self, kwargs):
+    def prune_configs(self, kwargs: Dict) -> List[Config]:
         pruned_configs = self.configs
         if self.early_config_prune:
             pruned_configs = self.early_config_prune(self.configs, self.nargs, **kwargs)
@@ -216,6 +219,10 @@
             top_k = self.configs_top_k
             if isinstance(top_k, float) and top_k <= 1.0:
                 top_k = int(len(self.configs) * top_k)
+            elif not isinstance(top_k, int):
+                # Slice index must be an integer
+                raise TypeError("Error while pruning configs, top_k must be either 1) a float <= 1.0 or 2) an int")
+
             if len(pruned_configs) > top_k:
                 est_timing = {
                     config: self.perf_model(
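prune_configs also gains validation of top_k: a float <= 1.0 keeps that fraction of the configs, an int keeps that many, and anything else now raises TypeError up front instead of failing later at the list slice. A hedged sketch of prune_configs_by usage; the perf_model here is a made-up scoring function, not part of Triton:

```python
import triton
import triton.language as tl

def perf_model(**all_args):
    # Hypothetical estimator: pretend fewer warps is always faster.
    return all_args["num_warps"]

@triton.autotune(
    configs=[triton.Config({"BLOCK": 64}, num_warps=w) for w in (1, 2, 4, 8)],
    key=["n"],
    # top_k must be an int, or a float <= 1.0 (a fraction of the config list).
    prune_configs_by={"perf_model": perf_model, "top_k": 2},
)
@triton.jit
def zero_fill(x_ptr, n, BLOCK: tl.constexpr):
    offs = tl.arange(0, BLOCK)
    tl.store(x_ptr + offs, 0.0, mask=offs < n)
```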
@@ -262,16 +269,11 @@ class Config:
                      function are args.
     """
 
-    def __init__(self, kwargs, num_warps=4, num_stages=3, num_ctas=1, num_buffers_warp_spec=0, num_consumer_groups=0,
-                 reg_dec_producer=0, reg_inc_consumer=0, maxnreg=None, pre_hook=None):
+    def __init__(self, kwargs, num_warps=4, num_stages=3, num_ctas=1, maxnreg=None, pre_hook=None):
         self.kwargs = kwargs
         self.num_warps = num_warps
         self.num_ctas = num_ctas
         self.num_stages = num_stages
-        self.num_buffers_warp_spec = num_buffers_warp_spec
-        self.num_consumer_groups = num_consumer_groups
-        self.reg_dec_producer = reg_dec_producer
-        self.reg_inc_consumer = reg_inc_consumer
         self.maxnreg = maxnreg
         self.pre_hook = pre_hook
 
@@ -283,10 +285,6 @@ class Config:
                 ("num_warps", self.num_warps),
                 ("num_ctas", self.num_ctas),
                 ("num_stages", self.num_stages),
-                ("num_buffers_warp_spec", self.num_buffers_warp_spec),
-                ("num_consumer_groups", self.num_consumer_groups),
-                ("reg_dec_producer", self.reg_dec_producer),
-                ("reg_inc_consumer", self.reg_inc_consumer),
                 ("maxnreg", self.maxnreg),
             ) if v is not None
         }
@@ -299,10 +297,6 @@ class Config:
         res.append(f"num_warps: {self.num_warps}")
         res.append(f"num_ctas: {self.num_ctas}")
         res.append(f"num_stages: {self.num_stages}")
-        res.append(f"num_buffers_warp_spec: {self.num_buffers_warp_spec}")
-        res.append(f"num_consumer_groups: {self.num_consumer_groups}")
-        res.append(f"reg_dec_producer: {self.reg_dec_producer}")
-        res.append(f"reg_inc_consumer: {self.reg_inc_consumer}")
         res.append(f"maxnreg: {self.maxnreg}")
         return ", ".join(res)
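Config drops the warp-specialization knobs in 3.3.0a0, so configs written against 3.2 need trimming; passing the removed keywords now raises TypeError. A migration sketch:

```python
import triton

# 3.2.x accepted (now a TypeError):
# triton.Config({"BLOCK": 128}, num_warps=4, num_stages=2,
#               num_buffers_warp_spec=0, num_consumer_groups=0,
#               reg_dec_producer=0, reg_inc_consumer=0)

# 3.3.0a0: only the core launch parameters remain.
cfg = triton.Config({"BLOCK": 128}, num_warps=4, num_stages=2, num_ctas=1)
print(cfg)  # BLOCK: 128, num_warps: 4, num_ctas: 1, num_stages: 2, maxnreg: None
```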
@@ -323,8 +317,8 @@ def autotune(configs, key, prune_configs_by=None, reset_to_zero=None, restore_va
             # the value of x_size changes
         )
         @triton.jit
-        def kernel(x_ptr, x_size, **META):
-            BLOCK_SIZE = META['BLOCK_SIZE']
+        def kernel(x_ptr, x_size, BLOCK_SIZE: tl.constexpr):
+            ...
     :note: When all the configurations are evaluated, the kernel will run multiple times.
            This means that whatever value the kernel updates will be updated multiple times.
            To avoid this undesired behavior, you can use the `reset_to_zero` argument, which
@@ -367,7 +361,7 @@ def autotune(configs, key, prune_configs_by=None, reset_to_zero=None, restore_va
     def decorator(fn):
         return Autotuner(fn, fn.arg_names, configs, key, reset_to_zero, restore_value, pre_hook=pre_hook,
                          post_hook=post_hook, prune_configs_by=prune_configs_by, warmup=warmup, rep=rep,
-                         use_cuda_graph=use_cuda_graph)
+                         use_cuda_graph=use_cuda_graph, do_bench=do_bench)
 
     return decorator
 
@@ -388,18 +382,19 @@ class Heuristics(KernelInterface):
 def heuristics(values):
     """
     Decorator for specifying how the values of certain meta-parameters may be computed.
-    This is useful for cases where auto-tuning is prohibitevely expensive, or just not applicable.
+    This is useful for cases where auto-tuning is prohibitively expensive, or just not applicable.
 
     .. highlight:: python
     .. code-block:: python
 
-        @triton.heuristics(values={'BLOCK_SIZE': lambda args: 2 ** int(math.ceil(math.log2(args[1])))})
+        # smallest power-of-two >= x_size
+        @triton.heuristics(values={'BLOCK_SIZE': lambda args: triton.next_power_of_2(args['x_size'])})
         @triton.jit
-        def kernel(x_ptr, x_size, **META):
-            BLOCK_SIZE = META['BLOCK_SIZE'] # smallest power-of-two >= x_size
+        def kernel(x_ptr, x_size, BLOCK_SIZE: tl.constexpr):
+            ...
     :param values: a dictionary of meta-parameter names and functions that compute the value of the meta-parameter.
                    each such function takes a list of positional arguments as input.
-    :type values: dict[str, Callable[[list[Any]], Any]]
+    :type values: dict[str, Callable[[dict[str, Any]], Any]]
     """
 
     def decorator(fn):
triton/runtime/build.py
CHANGED
@@ -1,26 +1,12 @@
-import contextlib
-import sys
-import io
 import sysconfig
 import os
 import shutil
 import subprocess
-import setuptools
 
 if os.name == "nt":
     from triton.windows_utils import find_msvc_winsdk, find_python
 
 
-@contextlib.contextmanager
-def quiet():
-    old_stdout, old_stderr = sys.stdout, sys.stderr
-    sys.stdout, sys.stderr = io.StringIO(), io.StringIO()
-    try:
-        yield
-    finally:
-        sys.stdout, sys.stderr = old_stdout, old_stderr
-
-
 def _cc_cmd(cc, src, out, include_dirs, library_dirs, libraries):
     if cc.lower().endswith("cl") or cc.lower().endswith("cl.exe"):
         out_base = os.path.splitext(out)[0]
@@ -74,38 +60,5 @@ def _build(name, src, srcdir, library_dirs, include_dirs, libraries):
     include_dirs += msvc_winsdk_inc_dirs
     library_dirs += msvc_winsdk_lib_dirs
     cc_cmd = _cc_cmd(cc, src, so, include_dirs, library_dirs, libraries)
-    ret = subprocess.check_call(cc_cmd)
-    if ret == 0:
-        return so
-    # fallback on setuptools
-    extra_compile_args = []
-    if cc.lower().endswith("cl") or cc.lower().endswith("cl.exe"):
-        extra_compile_args += ["/O2"]
-    else:
-        extra_compile_args += ["-O3"]
-    # extra arguments
-    extra_link_args = []
-    # create extension module
-    ext = setuptools.Extension(
-        name=name,
-        language='c',
-        sources=[src],
-        include_dirs=include_dirs,
-        extra_compile_args=extra_compile_args,
-        extra_link_args=extra_link_args,
-        library_dirs=library_dirs,
-        libraries=libraries,
-    )
-    # build extension module
-    args = ['build_ext']
-    args.append('--build-temp=' + srcdir)
-    args.append('--build-lib=' + srcdir)
-    args.append('-q')
-    args = dict(
-        name=name,
-        ext_modules=[ext],
-        script_args=args,
-    )
-    with quiet():
-        setuptools.setup(**args)
+    subprocess.check_call(cc_cmd, stdout=subprocess.DEVNULL)
     return so
triton/runtime/cache.py
CHANGED
@@ -256,9 +256,9 @@ __cache_cls = FileCacheManager
 __cache_cls_nme = "DEFAULT"
 
 
-def _base64(key):
+def _base32(key):
     # Assume key is a hex string.
-    return base64.b64encode(bytes.fromhex(key), altchars=b"-_").decode("utf-8").rstrip("=")
+    return base64.b32encode(bytes.fromhex(key)).decode("utf-8").rstrip("=")
 
 
 def get_cache_manager(key) -> CacheManager:
@@ -274,15 +274,15 @@ def get_cache_manager(key) -> CacheManager:
         __cache_cls = getattr(module, clz_nme)
         __cache_cls_nme = user_cache_manager
 
-    return __cache_cls(_base64(key))
+    return __cache_cls(_base32(key))
 
 
 def get_override_manager(key) -> CacheManager:
-    return __cache_cls(_base64(key), override=True)
+    return __cache_cls(_base32(key), override=True)
 
 
 def get_dump_manager(key) -> CacheManager:
-    return __cache_cls(_base64(key), dump=True)
+    return __cache_cls(_base32(key), dump=True)
 
 
 def make_so_cache_key(version_hash, signature, constants, ids, **kwargs):
@@ -292,4 +292,4 @@ def make_so_cache_key(version_hash, signature, constants, ids, **kwargs):
     for kw in kwargs:
         key = f"{key}-{kwargs.get(kw)}"
     key = hashlib.sha256(key.encode("utf-8")).hexdigest()
-    return _base64(key)
+    return _base32(key)
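Cache directory names switch from base64 to base32 encoding of the hex key. A plausible motivation (our inference; the diff only shows the encoding change) is that base64 is case-sensitive while Windows filesystems usually are not, so two distinct keys could collide in the same directory name; base32 uses only uppercase letters and the digits 2-7. The helper's behavior:

```python
import base64
import hashlib

def _base32(key):
    # Assume key is a hex string, as in triton/runtime/cache.py.
    return base64.b32encode(bytes.fromhex(key)).decode("utf-8").rstrip("=")

key = hashlib.sha256(b"some kernel source").hexdigest()  # 64 hex chars
print(_base32(key))  # 52 chars from the A-Z / 2-7 alphabet, no padding
```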
triton/runtime/errors.py
CHANGED
@@ -24,3 +24,13 @@ class OutOfResources(TritonError):
     def __reduce__(self):
         # this is necessary to make CompilationError picklable
         return (type(self), (self.required, self.limit, self.name))
+
+
+class PTXASError(TritonError):
+
+    def __init__(self, error_message: Optional[str] = None):
+        self.error_message = error_message
+
+    def __str__(self) -> str:
+        error_message = self.error_message or ""
+        return f"PTXAS error: {error_message}"
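PTXASError gives ptxas failures a dedicated exception type so callers such as the autotuner (see autotuner.py above) can skip a config whose PTX assembly fails rather than crash the tuning run. Basic behavior:

```python
from triton.runtime.errors import PTXASError

err = PTXASError("ptxas fatal: unexpected instruction")
print(err)           # PTXAS error: ptxas fatal: unexpected instruction
print(PTXASError())  # PTXAS error:   (message defaults to empty)
```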
triton/runtime/interpreter.py
CHANGED
@@ -1,7 +1,7 @@
 import ast
 import textwrap
 import inspect
-from typing import Tuple
+from typing import Tuple, List
 
 import math
 import numpy as np
@@ -21,7 +21,7 @@ class TensorHandle:
         '''
         data: numpy array
         dtype: triton type, either pointer_type or scalar_type.
-        we don't store block_type here because the shape information is already …
+        we don't store block_type here because the shape information is already available in the data field
         attr: a dictionary of attributes
         '''
         self.data = data
@@ -46,27 +46,63 @@
 
 class BlockPointerHandle:
 
-    def __init__(self, base, shape, strides, offsets, tensor_shape, order):
+    def __init__(self, base, shape, strides, offsets, block_shape, order):
         self.base = base
         self.shape = shape
         self.strides = strides
         self.offsets = offsets
-        self.tensor_shape = tensor_shape
+        self.block_shape = block_shape
         self.order = order
 
     def materialize_pointers(self, boundary_check):
         dtype_tt = self.base.get_element_ty()
         n_bytes = dtype_tt.primitive_bitwidth // 8
-        tensor_shape = self.tensor_shape
-        ptrs = np.broadcast_to(self.base.data, tensor_shape)
-        masks = np.ones(tensor_shape, dtype=bool)
-        for dim in range(len(tensor_shape)):
-            bcast_dims = [1] * len(tensor_shape)
-            bcast_dims[dim] = tensor_shape[dim]
-            off = (self.offsets[dim].data + np.arange(tensor_shape[dim])).reshape(bcast_dims)
+        ptrs = np.broadcast_to(self.base.data, self.block_shape)
+        masks = np.ones(self.block_shape, dtype=bool)
+        for dim in range(len(self.block_shape)):
+            bcast_dims = [1] * len(self.block_shape)
+            bcast_dims[dim] = self.block_shape[dim]
+            off = (self.offsets[dim].data + np.arange(self.block_shape[dim])).reshape(bcast_dims)
             ptrs = ptrs + (n_bytes * off * self.strides[dim].data).astype(np.uint64)
             if dim in boundary_check:
-                masks = …
+                masks = masks & (off < self.shape[dim].data) & (off >= 0)
+        ptrs = TensorHandle(ptrs, self.base.dtype.scalar)
+        return ptrs, masks
+
+
+class TensorDescHandle:
+
+    def __init__(self, base: TensorHandle, shape: List[TensorHandle], strides: List[TensorHandle],
+                 block_shape: List[int]):
+        self.base = base
+        self.ndim = len(shape)
+        self.shape = shape
+        self.strides = strides
+        self.block_shape = block_shape
+
+    def validate(self):
+        assert self.base.data.item() % 16 == 0, "base must be 16-byte aligned"
+        assert len(self.strides) == self.ndim
+        assert len(self.block_shape) == self.ndim
+
+        for stride in self.strides[:-1]:
+            assert stride.data.item() % 16 == 0, "stride must be 16-byte aligned"
+        assert self.strides[-1].data.item() == 1, "last dim must be contiguous"
+
+    def materialize_pointers(self, offsets: List[TensorHandle]):
+        assert len(offsets) == self.ndim
+        scalar_ty = self.base.dtype.element_ty
+        itemsize = scalar_ty.primitive_bitwidth // 8
+        assert (offsets[-1].data * itemsize) % 16 == 0, "block offset start must be 16-byte aligned"
+
+        ptrs = np.broadcast_to(self.base.data, self.block_shape)
+        masks = np.ones(self.block_shape, dtype=bool)
+        for dim in range(len(self.block_shape)):
+            bcast_dims = [1] * len(self.block_shape)
+            bcast_dims[dim] = self.block_shape[dim]
+            off = (offsets[dim].data + np.arange(self.block_shape[dim])).reshape(bcast_dims)
+            ptrs = ptrs + (itemsize * off * self.strides[dim].data).astype(np.uint64)
+            masks = masks & (0 <= off) & (off < self.shape[dim].data)
         ptrs = TensorHandle(ptrs, self.base.dtype.scalar)
         return ptrs, masks
 
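The new TensorDescHandle mirrors BlockPointerHandle's pointer materialization: each dimension contributes a broadcasted offset term, building an N-D block of pointers as base + sum(off_d * stride_d) * itemsize, with bounds masks accumulated the same way. A standalone numpy sketch of the broadcast trick, with made-up values:

```python
import numpy as np

base = np.uint64(0)       # pretend base address
block_shape = (4, 8)
strides = (8, 1)          # row-major: last dim contiguous
itemsize = 4              # e.g. float32
starts = (0, 0)           # block offsets per dim

ptrs = np.broadcast_to(base, block_shape)
for dim in range(len(block_shape)):
    bcast = [1] * len(block_shape)
    bcast[dim] = block_shape[dim]
    off = (starts[dim] + np.arange(block_shape[dim])).reshape(bcast)
    ptrs = ptrs + (itemsize * off * strides[dim]).astype(np.uint64)

print(ptrs[1, 2])  # (1*8 + 2*1) * 4 = 40
```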
@@ -242,7 +278,7 @@ class InterpreterBuilder:
         self.options = InterpreterOptions()
         self.codegen_fns = {}
         self.codegen_fns["convert_custom_types"] = ExtraFunctions._convert_custom_types
-        self.codegen_fns["min_dot_size"] = lambda lhsType, rhsType: (…)
+        self.codegen_fns["min_dot_size"] = lambda lhsType, rhsType: (1, 1, 1)
 
     def set_grid_idx(self, x, y, z):
         if not x < self.grid_dim[0]:
@@ -419,7 +455,7 @@ class InterpreterBuilder:
     create_fadd = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.add)
     create_fmul = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.multiply)
     create_fdiv = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.divide)
-    create_frem = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.remainder)
+    create_frem = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.fmod)
     create_fsub = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.subtract)
     create_mul = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.multiply)
     create_precise_divf = lambda self, lhs, rhs: self.binary_op(lhs, rhs, np.divide)
@@ -557,6 +593,9 @@ class InterpreterBuilder:
     def create_histogram(self, data, bins):
         return TensorHandle(np.histogram(data.data, bins=bins, range=(0, bins))[0], tl.int32)
 
+    def create_gather(self, src, indices, axis):
+        return TensorHandle(np.take_along_axis(src.data, indices.data, axis=axis), src.dtype.scalar)
+
     # pointer arithmetic
 
     def create_addptr(self, ptr, offset):
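Two semantic fixes land here. create_frem now uses np.fmod, whose result takes the sign of the dividend (matching frem on GPU), where np.remainder follows the divisor; and the new create_gather maps tl.gather onto np.take_along_axis. Both are easy to verify in plain numpy:

```python
import numpy as np

print(np.remainder(-7.0, 3.0))  # 2.0  (sign of divisor: the old, wrong frem)
print(np.fmod(-7.0, 3.0))       # -1.0 (sign of dividend: correct frem)

src = np.array([[10, 20, 30], [40, 50, 60]])
idx = np.array([[2, 0, 1], [0, 0, 2]])
print(np.take_along_axis(src, idx, axis=1))
# [[30 10 20]
#  [40 40 60]]
```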
@@ -655,21 +694,61 @@ class InterpreterBuilder:
         # Triton's barrier applies to each program in a grid, so it's a no-op in the interpreter
         pass
 
-    def create_make_block_ptr(self, base, shape, strides, offsets, tensor_shape, order):
+    def create_make_block_ptr(self, base, shape, strides, offsets, block_shape, order):
         # Create new offsets to avoid modifying the original
         new_offsets = [offset.clone() for offset in offsets]
-        return BlockPointerHandle(base, shape, strides, new_offsets, tensor_shape, order)
+        return BlockPointerHandle(base, shape, strides, new_offsets, block_shape, order)
 
     def create_advance(self, ptr, offsets):
         if len(ptr.offsets) != len(offsets):
             raise ValueError("len(ptr.offsets) != len(offsets)")
         # Create new offsets to avoid modifying the original
         new_offsets = [offset.clone() for offset in ptr.offsets]
-        ret = BlockPointerHandle(ptr.base, ptr.shape, ptr.strides, new_offsets, ptr.tensor_shape, ptr.order)
+        ret = BlockPointerHandle(ptr.base, ptr.shape, ptr.strides, new_offsets, ptr.block_shape, ptr.order)
         for i in range(len(offsets)):
             ret.offsets[i].data += offsets[i].data
         return ret
 
+    def create_make_tensor_descriptor(
+        self,
+        base: TensorHandle,
+        shape: List[TensorHandle],
+        strides: List[TensorHandle],
+        tensor_shape: List[int],
+    ):
+        desc = TensorDescHandle(base, shape, strides, tensor_shape)
+        desc.validate()
+        return desc
+
+    def create_descriptor_load(self, desc: TensorDescHandle, indices: List[TensorHandle], cache_modifier,
+                               eviction_policy):
+        assert isinstance(desc, TensorDescHandle)
+        ptrs, mask = desc.materialize_pointers(indices)
+        return self.create_masked_load(ptrs, mask, other=None, cache_modifier=cache_modifier,
+                                       eviction_policy=eviction_policy, is_volatile=False)
+
+    def create_descriptor_store(self, desc: TensorDescHandle, value: TensorHandle, indices: List[TensorHandle]):
+        ptrs, mask = desc.materialize_pointers(indices)
+        return self.create_masked_store(ptrs, value, mask, None, None)
+
+    def create_descriptor_gather(self, desc: TensorDescHandle, x_offsets: TensorHandle, y_offset: TensorHandle, type):
+        dtype = desc.base.dtype.element_ty
+        np_dtype = _get_np_dtype(dtype)
+        result = np.zeros([x_offsets.data.shape[0], desc.block_shape[-1]], dtype=np_dtype)
+        cache_modifier = None
+        eviction_policy = None
+        for i, x_offset in enumerate(x_offsets.data):
+            indices = [TensorHandle(x_offset, tl.int32), y_offset]
+            result[i, :] = self.create_descriptor_load(desc, indices, cache_modifier, eviction_policy).data
+        return TensorHandle(result, dtype)
+
+    def create_descriptor_scatter(self, desc: TensorDescHandle, value: TensorHandle, x_offsets: TensorHandle,
+                                  y_offset: TensorHandle):
+        for i, x_offset in enumerate(x_offsets.data):
+            slice = TensorHandle(value.data[i], value.dtype)
+            indices = [TensorHandle(x_offset, tl.int32), y_offset]
+            self.create_descriptor_store(desc, slice, indices)
+
     def get_all_ones_value(self, type):
         np_type = _get_np_dtype(type)
         if "int" in np_type.name:
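create_descriptor_gather loads one [1, block] row per x-offset and stacks the rows; create_descriptor_scatter is the symmetric store. The same picture in plain numpy (made-up values and variable names):

```python
import numpy as np

data = np.arange(24).reshape(4, 6)   # the tensor behind the descriptor
x_offsets = np.array([3, 0, 2])      # which rows to gather
y_offset = 2                         # starting column of each block row
block_cols = 4

out = np.stack([data[r, y_offset:y_offset + block_cols] for r in x_offsets])
print(out)
# [[20 21 22 23]
#  [ 2  3  4  5]
#  [14 15 16 17]]
```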
@@ -701,7 +780,12 @@ def _patch_lang_tensor(tensor):
         return bool(data) if data.size == 1 else True
 
     def _get_transpose(self):
-        return tl.core.tensor(TensorHandle(np.transpose(self.handle.data), self.handle.dtype), self.type)
+        handle = TensorHandle(np.transpose(self.handle.data), self.handle.dtype)
+        assert self.type.is_block()
+        block_shape = list(self.type.shape)
+        block_shape[-1], block_shape[-2] = block_shape[-2], block_shape[-1]
+        res_ty = tl.core.block_type(self.dtype, block_shape)
+        return tl.core.tensor(handle, res_ty)
 
     tensor.__index__ = lambda self: int(self.handle.data)
     tensor.__bool__ = lambda self: _get_bool(self)
@@ -710,7 +794,7 @@ def _patch_lang_tensor(tensor):
     tensor.T = property(_get_transpose)
 
 
-class ReduceScanOpIneterface:
+class ReduceScanOpInterface:
 
     def __init__(self, axis, combine_fn):
         self.axis = axis
@@ -727,10 +811,12 @@ class ReduceScanOpIneterface:
         self.check_axis(arg.shape, self.axis)
 
     def to_tensor(self, ret, dtype):
+        np_dtype = _get_np_dtype(dtype)
         if hasattr(ret, "shape") and ret.shape:
-            ret_type = tl.block_type(dtype, list(ret.shape))
+            ret = ret.astype(np_dtype)
+            ret_type = tl.block_type(dtype, list(ret.shape))
         else:
-            ret = np.array([ret])
+            ret = np.array([ret], dtype=np_dtype)
             ret_type = dtype
         return tl.core.tensor(TensorHandle(ret, dtype.scalar), ret_type)
 
@@ -744,7 +830,7 @@ class ReduceScanOpIneterface:
         raise NotImplementedError("apply_impl not implemented")
 
 
-class ReduceOps(ReduceScanOpIneterface):
+class ReduceOps(ReduceScanOpInterface):
 
     def __init__(self, axis, combine_fn, keep_dims):
         super().__init__(axis, combine_fn)
@@ -840,7 +926,7 @@ class ReduceOps(ReduceScanOpIneterface):
         return self.generic_reduce(input)
 
 
-class ScanOps(ReduceScanOpIneterface):
+class ScanOps(ReduceScanOpInterface):
 
     def __init__(self, axis, combine_fn, reverse):
         super().__init__(axis, combine_fn)
@@ -989,7 +1075,7 @@ def _patch_lang_core(lang):
     lang.static_assert = _new_static_assert
     lang.static_print = print
     lang.dtype.to_ir = _new_to_ir
-    lang.multiple_of = partial(_set_attr, name="tt.…")
+    lang.multiple_of = partial(_set_attr, name="tt.divisibility")
     lang.max_contiguous = partial(_set_attr, name="tt.contiguity")
    lang.max_constancy = partial(_set_attr, name="tt.constancy")
 
@@ -997,7 +1083,7 @@ def _patch_lang_core(lang):
 
 
 def _patch_lang(fn):
-    langs = [value for _, value in fn.__globals__.items() if value in [tl, tl.core]]
+    langs = [value for _, value in fn.__globals__.items() if inspect.ismodule(value) and value in [tl, tl.core]]
     assert len(langs) >= 1, "triton.language must be visible from within jit'd function"
     for lang in langs:
         _patch_builtin(lang, interpreter_builder)
@@ -1006,12 +1092,22 @@ def _patch_lang(fn):
     _patch_builtin(lang.math, interpreter_builder)
     _patch_lang_tensor(lang.tensor)
     _patch_lang_core(lang)
+    _patch_builtin(tl.core._experimental_tensor_descriptor_base, interpreter_builder)
+
+
+def _tuple_create(arg, contents):
+    # NamedTuples and tuples have different construction semantics. NamedTuple
+    # has a constructor that takes individual arguments, while tuple takes an
+    # iterable. Both have type "tuple" making it difficult to distinguish
+    # between them, but only NamedTuple has "_fields" and apparently this is how
+    # everyone does the check.
+    return type(arg)(*contents) if hasattr(arg, "_fields") else type(arg)(contents)
 
 
 # TODO: wrap everything in triton tensors
 def _implicit_cvt(arg):
     if isinstance(arg, int):
-        ty = tl.str_to_ty(triton.runtime.jit.…)
+        ty = tl.str_to_ty(triton.runtime.jit.mangle_type(arg))
         dtype = np.int32
         if -2**31 <= arg < 2**31:
             dtype = np.int32
@@ -1026,16 +1122,27 @@ def _implicit_cvt(arg):
         handle = TensorHandle(np.array([arg], dtype=dtype), ty)
         return tl.tensor(handle, ty)
     if hasattr(arg, "data_ptr"):
-        ty = tl.str_to_ty(triton.runtime.jit.…)
+        ty = tl.str_to_ty(triton.runtime.jit.mangle_type(arg))
         handle = TensorHandle(np.array([arg.data_ptr()], dtype=np.uint64), ty)
         return tl.tensor(handle, ty)
+    elif isinstance(arg, tuple):
+        return _tuple_create(arg, map(_implicit_cvt, arg))
     return arg
 
 
 interpreter_builder = InterpreterBuilder()
 
-# These keywords are not supported by the interpreter
-RESERVED_KWS = […]
+
+def _unwrap_tensor(t):
+    if isinstance(t, triton.runtime.jit.TensorWrapper):
+        return t.base
+    return t
+
+
+def _rewrap_tensor(t, original_tensor):
+    if isinstance(original_tensor, triton.runtime.jit.TensorWrapper):
+        return triton.runtime.jit.TensorWrapper(t, original_tensor.dtype)
+    return t
 
 
 class GridExecutor:
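_tuple_create exists because plain tuples are built from a single iterable while NamedTuple subclasses take individual field arguments, and checking for _fields is the conventional way to tell them apart. Its behavior in isolation:

```python
from collections import namedtuple

Point = namedtuple("Point", ["x", "y"])

def _tuple_create(arg, contents):
    # NamedTuple constructors take fields; plain tuple takes one iterable.
    return type(arg)(*contents) if hasattr(arg, "_fields") else type(arg)(contents)

print(_tuple_create(Point(1, 2), [10, 20]))  # Point(x=10, y=20)
print(_tuple_create((1, 2), [10, 20]))       # (10, 20)
```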
@@ -1050,37 +1157,64 @@ class GridExecutor:
         self.constexprs = [name for name in arg_names if __annotations__.get(name) == "constexpr"]
 
     def _init_args_hst(self, args_dev, kwargs):
-        args_hst = []
-        for arg in args_dev:
-            if hasattr(arg, "data_ptr"):
-                args_hst.append(arg.cpu())
-            else:
-                args_hst.append(arg)
+        storages = {}
+
+        def _to_cpu(arg):
+            if isinstance(arg, tuple):
+                return _tuple_create(arg, map(_to_cpu, arg))
+            elif not hasattr(arg, "data_ptr"):
+                return arg
+
+            unwrapped_arg = _unwrap_tensor(arg)
+            if unwrapped_arg.untyped_storage().data_ptr() not in storages:
+                storage = unwrapped_arg.untyped_storage()
+                storages[storage.data_ptr()] = storage.cpu()
+
+            storage = storages[unwrapped_arg.untyped_storage().data_ptr()]
+            cpu_arg = unwrapped_arg.new_empty(0, device='cpu')
+            cpu_arg.set_(storage, unwrapped_arg.storage_offset(), unwrapped_arg.size(), unwrapped_arg.stride())
+            cpu_arg = _rewrap_tensor(cpu_arg, original_tensor=arg)
+            return cpu_arg
+
+        args_hst = [_to_cpu(arg) for arg in args_dev]
+
         # Process keyword arguments
         kwargs_hst = {}
         for key, value in kwargs.items():
-            if hasattr(value, "data_ptr"):
-                kwargs_hst[key] = value.cpu()
-            else:
-                kwargs_hst[key] = value
+            kwargs_hst[key] = _to_cpu(value)
         return args_hst, kwargs_hst
 
     def _restore_args_dev(self, args_dev, args_hst, kwargs, kwargs_hst):
-        for arg_dev, arg_hst in zip(args_dev, args_hst):
+        storages = {}
+
+        def _from_cpu(arg_dev, arg_hst):
             if hasattr(arg_dev, "data_ptr"):
-                arg_dev.copy_(arg_hst.to(arg_dev.device))
+                # No need to rewrap because this just modifies internal
+                arg_dev, arg_hst = _unwrap_tensor(arg_dev), _unwrap_tensor(arg_hst)
+                storages[arg_dev.untyped_storage().data_ptr()] = (arg_dev.untyped_storage(), arg_hst.untyped_storage())
+            elif isinstance(arg_dev, tuple):
+                for (arg_dev, arg_hst) in zip(arg_dev, arg_hst):
+                    _from_cpu(arg_dev, arg_hst)
+
+        for arg_dev, arg_hst in zip(args_dev, args_hst):
+            _from_cpu(arg_dev, arg_hst)
 
         # Restore keyword arguments
         for key, kwarg_dev in kwargs.items():
             kwarg_hst = kwargs_hst[key]
-            if hasattr(kwarg_dev, "data_ptr"):
-                kwarg_dev.copy_(kwarg_hst.to(kwarg_dev.device))
+            _from_cpu(kwarg_dev, kwarg_hst)
+
+        for (arg_dev, arg_hst) in storages.values():
+            arg_dev.copy_(arg_hst)
 
     def __call__(self, *args_dev, **kwargs):
-        # removes reserved keywords from kwargs
-        kwargs = {k: v for k, v in kwargs.items() if k not in RESERVED_KWS}
         if kwargs.pop("warmup", False):
             return
+        # Removes not used reserved keywords from kwargs
+        # Triton doesn't support keyword-only, variable positional or variable keyword arguments
+        # It's safe to inspect only positional or keyword arguments (i.e., argspec.args)
+        argspec = inspect.getfullargspec(self.fn)
+        kwargs = {k: v for k, v in kwargs.items() if k in argspec.args}
         # copy arguments to the host
         args_hst, kwargs_hst = self._init_args_hst(args_dev, kwargs)
         # remaps core language functions to interpreted ones