triton-windows 3.3.1.post19__cp312-cp312-win_amd64.whl → 3.5.0.post21__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: the registry flags this version of triton-windows as potentially problematic.

Files changed (225)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +11 -2
  3. triton/_filecheck.py +97 -0
  4. triton/_internal_testing.py +95 -18
  5. triton/_utils.py +112 -21
  6. triton/backends/__init__.py +20 -23
  7. triton/backends/amd/__init__.py +0 -0
  8. triton/backends/amd/compiler.py +161 -119
  9. triton/backends/amd/driver.c +118 -46
  10. triton/backends/amd/driver.py +274 -96
  11. triton/backends/compiler.py +7 -21
  12. triton/backends/driver.py +13 -0
  13. triton/backends/nvidia/bin/ptxas.exe +0 -0
  14. triton/backends/nvidia/compiler.py +163 -106
  15. triton/backends/nvidia/driver.c +166 -101
  16. triton/backends/nvidia/driver.py +384 -202
  17. triton/compiler/__init__.py +5 -2
  18. triton/compiler/code_generator.py +439 -231
  19. triton/compiler/compiler.py +152 -84
  20. triton/experimental/__init__.py +0 -0
  21. triton/experimental/gluon/__init__.py +5 -0
  22. triton/experimental/gluon/_compiler.py +0 -0
  23. triton/experimental/gluon/_runtime.py +102 -0
  24. triton/experimental/gluon/language/__init__.py +119 -0
  25. triton/experimental/gluon/language/_core.py +490 -0
  26. triton/experimental/gluon/language/_layouts.py +583 -0
  27. triton/experimental/gluon/language/_math.py +20 -0
  28. triton/experimental/gluon/language/_semantic.py +380 -0
  29. triton/experimental/gluon/language/_standard.py +80 -0
  30. triton/experimental/gluon/language/amd/__init__.py +4 -0
  31. triton/experimental/gluon/language/amd/_layouts.py +96 -0
  32. triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
  33. triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
  34. triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
  35. triton/experimental/gluon/language/extra/__init__.py +3 -0
  36. triton/experimental/gluon/language/nvidia/__init__.py +4 -0
  37. triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
  38. triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
  39. triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
  40. triton/experimental/gluon/language/nvidia/blackwell/__init__.py +387 -0
  41. triton/experimental/gluon/language/nvidia/blackwell/tma.py +52 -0
  42. triton/experimental/gluon/language/nvidia/hopper/__init__.py +132 -0
  43. triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +34 -0
  44. triton/experimental/gluon/language/nvidia/hopper/tma.py +97 -0
  45. triton/experimental/gluon/nvidia/__init__.py +4 -0
  46. triton/experimental/gluon/nvidia/blackwell.py +3 -0
  47. triton/experimental/gluon/nvidia/hopper.py +45 -0
  48. triton/knobs.py +546 -0
  49. triton/language/__init__.py +50 -19
  50. triton/language/core.py +909 -572
  51. triton/language/extra/cuda/__init__.py +10 -7
  52. triton/language/extra/cuda/gdc.py +42 -0
  53. triton/language/extra/cuda/libdevice.py +394 -394
  54. triton/language/extra/cuda/utils.py +21 -21
  55. triton/language/extra/hip/__init__.py +3 -1
  56. triton/language/extra/hip/libdevice.py +120 -104
  57. triton/language/extra/hip/utils.py +35 -0
  58. triton/language/extra/libdevice.py +4 -0
  59. triton/language/math.py +65 -66
  60. triton/language/random.py +12 -2
  61. triton/language/semantic.py +1757 -1768
  62. triton/language/standard.py +127 -62
  63. triton/language/target_info.py +54 -0
  64. triton/runtime/_allocation.py +15 -3
  65. triton/runtime/_async_compile.py +55 -0
  66. triton/runtime/autotuner.py +117 -60
  67. triton/runtime/build.py +83 -17
  68. triton/runtime/cache.py +61 -47
  69. triton/runtime/driver.py +25 -47
  70. triton/runtime/interpreter.py +95 -50
  71. triton/runtime/jit.py +445 -248
  72. triton/runtime/tcc/include/_mingw.h +8 -10
  73. triton/runtime/tcc/include/assert.h +5 -0
  74. triton/runtime/tcc/include/errno.h +1 -1
  75. triton/runtime/tcc/include/float.h +21 -3
  76. triton/runtime/tcc/include/iso646.h +36 -0
  77. triton/runtime/tcc/include/limits.h +5 -0
  78. triton/runtime/tcc/include/malloc.h +2 -2
  79. triton/runtime/tcc/include/math.h +21 -261
  80. triton/runtime/tcc/include/stdalign.h +16 -0
  81. triton/runtime/tcc/include/stdarg.h +5 -70
  82. triton/runtime/tcc/include/stdatomic.h +171 -0
  83. triton/runtime/tcc/include/stddef.h +7 -19
  84. triton/runtime/tcc/include/stdlib.h +15 -4
  85. triton/runtime/tcc/include/stdnoreturn.h +7 -0
  86. triton/runtime/tcc/include/sys/stat.h +2 -2
  87. triton/runtime/tcc/include/sys/types.h +5 -0
  88. triton/runtime/tcc/include/tcc/tcc_libm.h +444 -27
  89. triton/runtime/tcc/include/tccdefs.h +342 -0
  90. triton/runtime/tcc/include/tgmath.h +89 -0
  91. triton/runtime/tcc/include/uchar.h +33 -0
  92. triton/runtime/tcc/include/unistd.h +1 -0
  93. triton/runtime/tcc/include/winapi/qos.h +72 -0
  94. triton/runtime/tcc/include/winapi/shellapi.h +59 -0
  95. triton/runtime/tcc/include/winapi/winbase.h +9 -2
  96. triton/runtime/tcc/include/winapi/wincon.h +8 -0
  97. triton/runtime/tcc/include/winapi/windows.h +1 -1
  98. triton/runtime/tcc/include/winapi/winnls.h +778 -0
  99. triton/runtime/tcc/include/winapi/winnt.h +9 -7
  100. triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
  101. triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
  102. triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
  103. triton/runtime/tcc/lib/libtcc1.a +0 -0
  104. triton/runtime/tcc/lib/python314.def +1800 -0
  105. triton/runtime/tcc/lib/python314t.def +1809 -0
  106. triton/runtime/tcc/libtcc.dll +0 -0
  107. triton/runtime/tcc/tcc.exe +0 -0
  108. triton/testing.py +16 -12
  109. triton/tools/compile.py +62 -14
  110. triton/tools/disasm.py +3 -4
  111. triton/tools/extra/cuda/compile.c +1 -0
  112. triton/tools/extra/hip/compile.cpp +66 -0
  113. triton/tools/extra/hip/compile.h +13 -0
  114. triton/tools/ragged_tma.py +92 -0
  115. triton/tools/tensor_descriptor.py +34 -0
  116. triton/windows_utils.py +52 -81
  117. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.5.0.post21.dist-info}/METADATA +8 -4
  118. triton_windows-3.5.0.post21.dist-info/RECORD +217 -0
  119. triton_windows-3.5.0.post21.dist-info/entry_points.txt +3 -0
  120. triton_windows-3.5.0.post21.dist-info/licenses/LICENSE +23 -0
  121. triton_windows-3.5.0.post21.dist-info/top_level.txt +1 -0
  122. triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h +0 -358
  123. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +0 -1010
  124. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +0 -1638
  125. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +0 -1814
  126. triton/backends/amd/include/hip/amd_detail/amd_hip_bfloat16.h +0 -293
  127. triton/backends/amd/include/hip/amd_detail/amd_hip_common.h +0 -32
  128. triton/backends/amd/include/hip/amd_detail/amd_hip_complex.h +0 -174
  129. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +0 -835
  130. triton/backends/amd/include/hip/amd_detail/amd_hip_fp16.h +0 -1809
  131. triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +0 -1391
  132. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +0 -108
  133. triton/backends/amd/include/hip/amd_detail/amd_hip_math_constants.h +0 -124
  134. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h +0 -405
  135. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +0 -196
  136. triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +0 -565
  137. triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h +0 -2226
  138. triton/backends/amd/include/hip/amd_detail/amd_math_functions.h +0 -104
  139. triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h +0 -244
  140. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +0 -538
  141. triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +0 -288
  142. triton/backends/amd/include/hip/amd_detail/concepts.hpp +0 -30
  143. triton/backends/amd/include/hip/amd_detail/device_library_decls.h +0 -133
  144. triton/backends/amd/include/hip/amd_detail/functional_grid_launch.hpp +0 -218
  145. triton/backends/amd/include/hip/amd_detail/grid_launch.h +0 -67
  146. triton/backends/amd/include/hip/amd_detail/grid_launch.hpp +0 -50
  147. triton/backends/amd/include/hip/amd_detail/grid_launch_GGL.hpp +0 -26
  148. triton/backends/amd/include/hip/amd_detail/helpers.hpp +0 -137
  149. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +0 -1446
  150. triton/backends/amd/include/hip/amd_detail/hip_assert.h +0 -101
  151. triton/backends/amd/include/hip/amd_detail/hip_cooperative_groups_helper.h +0 -242
  152. triton/backends/amd/include/hip/amd_detail/hip_fp16_gcc.h +0 -254
  153. triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h +0 -96
  154. triton/backends/amd/include/hip/amd_detail/hip_ldg.h +0 -100
  155. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +0 -10570
  156. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +0 -78
  157. triton/backends/amd/include/hip/amd_detail/host_defines.h +0 -184
  158. triton/backends/amd/include/hip/amd_detail/hsa_helpers.hpp +0 -102
  159. triton/backends/amd/include/hip/amd_detail/macro_based_grid_launch.hpp +0 -798
  160. triton/backends/amd/include/hip/amd_detail/math_fwd.h +0 -698
  161. triton/backends/amd/include/hip/amd_detail/ockl_image.h +0 -177
  162. triton/backends/amd/include/hip/amd_detail/program_state.hpp +0 -107
  163. triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h +0 -491
  164. triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h +0 -478
  165. triton/backends/amd/include/hip/channel_descriptor.h +0 -39
  166. triton/backends/amd/include/hip/device_functions.h +0 -38
  167. triton/backends/amd/include/hip/driver_types.h +0 -468
  168. triton/backends/amd/include/hip/hip_bf16.h +0 -36
  169. triton/backends/amd/include/hip/hip_bfloat16.h +0 -44
  170. triton/backends/amd/include/hip/hip_common.h +0 -100
  171. triton/backends/amd/include/hip/hip_complex.h +0 -38
  172. triton/backends/amd/include/hip/hip_cooperative_groups.h +0 -46
  173. triton/backends/amd/include/hip/hip_deprecated.h +0 -95
  174. triton/backends/amd/include/hip/hip_ext.h +0 -161
  175. triton/backends/amd/include/hip/hip_fp16.h +0 -36
  176. triton/backends/amd/include/hip/hip_fp8.h +0 -33
  177. triton/backends/amd/include/hip/hip_gl_interop.h +0 -32
  178. triton/backends/amd/include/hip/hip_hcc.h +0 -24
  179. triton/backends/amd/include/hip/hip_math_constants.h +0 -36
  180. triton/backends/amd/include/hip/hip_profile.h +0 -27
  181. triton/backends/amd/include/hip/hip_runtime.h +0 -75
  182. triton/backends/amd/include/hip/hip_runtime_api.h +0 -9261
  183. triton/backends/amd/include/hip/hip_texture_types.h +0 -29
  184. triton/backends/amd/include/hip/hip_vector_types.h +0 -41
  185. triton/backends/amd/include/hip/hip_version.h +0 -17
  186. triton/backends/amd/include/hip/hiprtc.h +0 -421
  187. triton/backends/amd/include/hip/library_types.h +0 -78
  188. triton/backends/amd/include/hip/math_functions.h +0 -42
  189. triton/backends/amd/include/hip/surface_types.h +0 -63
  190. triton/backends/amd/include/hip/texture_types.h +0 -194
  191. triton/backends/amd/include/hsa/Brig.h +0 -1131
  192. triton/backends/amd/include/hsa/amd_hsa_common.h +0 -91
  193. triton/backends/amd/include/hsa/amd_hsa_elf.h +0 -462
  194. triton/backends/amd/include/hsa/amd_hsa_kernel_code.h +0 -269
  195. triton/backends/amd/include/hsa/amd_hsa_queue.h +0 -109
  196. triton/backends/amd/include/hsa/amd_hsa_signal.h +0 -80
  197. triton/backends/amd/include/hsa/hsa.h +0 -5738
  198. triton/backends/amd/include/hsa/hsa_amd_tool.h +0 -91
  199. triton/backends/amd/include/hsa/hsa_api_trace.h +0 -579
  200. triton/backends/amd/include/hsa/hsa_api_trace_version.h +0 -68
  201. triton/backends/amd/include/hsa/hsa_ext_amd.h +0 -3146
  202. triton/backends/amd/include/hsa/hsa_ext_finalize.h +0 -531
  203. triton/backends/amd/include/hsa/hsa_ext_image.h +0 -1454
  204. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +0 -488
  205. triton/backends/amd/include/hsa/hsa_ven_amd_loader.h +0 -667
  206. triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +0 -416
  207. triton/backends/amd/include/roctracer/ext/prof_protocol.h +0 -107
  208. triton/backends/amd/include/roctracer/hip_ostream_ops.h +0 -4515
  209. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +0 -1727
  210. triton/backends/amd/include/roctracer/hsa_prof_str.h +0 -3059
  211. triton/backends/amd/include/roctracer/roctracer.h +0 -779
  212. triton/backends/amd/include/roctracer/roctracer_ext.h +0 -81
  213. triton/backends/amd/include/roctracer/roctracer_hcc.h +0 -24
  214. triton/backends/amd/include/roctracer/roctracer_hip.h +0 -37
  215. triton/backends/amd/include/roctracer/roctracer_hsa.h +0 -112
  216. triton/backends/amd/include/roctracer/roctracer_plugin.h +0 -137
  217. triton/backends/amd/include/roctracer/roctracer_roctx.h +0 -67
  218. triton/backends/amd/include/roctracer/roctx.h +0 -229
  219. triton/language/_utils.py +0 -21
  220. triton/language/extra/cuda/_experimental_tma.py +0 -106
  221. triton/runtime/tcc/lib/libtcc1-64.a +0 -0
  222. triton/tools/experimental_descriptor.py +0 -32
  223. triton_windows-3.3.1.post19.dist-info/RECORD +0 -260
  224. triton_windows-3.3.1.post19.dist-info/top_level.txt +0 -14
  225. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.5.0.post21.dist-info}/WHEEL +0 -0
triton/runtime/driver.py CHANGED
@@ -1,60 +1,38 @@
-from ..backends import backends
-from ..backends import DriverBase
+from __future__ import annotations

+from ..backends import backends, DriverBase

-def _create_driver():
-    actives = [x.driver for x in backends.values() if x.driver.is_active()]
-    if len(actives) != 1:
-        raise RuntimeError(f"{len(actives)} active drivers ({actives}). There should only be one.")
-    return actives[0]()

+def _create_driver() -> DriverBase:
+    active_drivers = [x.driver for x in backends.values() if x.driver.is_active()]
+    if len(active_drivers) != 1:
+        raise RuntimeError(f"{len(active_drivers)} active drivers ({active_drivers}). There should only be one.")
+    return active_drivers[0]()

-class LazyProxy:
-
-    def __init__(self, init_fn):
-        self._init_fn = init_fn
-        self._obj = None
-
-    def _initialize_obj(self):
-        if self._obj is None:
-            self._obj = self._init_fn()
-
-    def __getattr__(self, name):
-        self._initialize_obj()
-        return getattr(self._obj, name)
-
-    def __setattr__(self, name, value):
-        if name in ["_init_fn", "_obj"]:
-            super().__setattr__(name, value)
-        else:
-            self._initialize_obj()
-            setattr(self._obj, name, value)
-
-    def __delattr__(self, name):
-        self._initialize_obj()
-        delattr(self._obj, name)
-
-    def __repr__(self):
-        if self._obj is None:
-            return f"<{self.__class__.__name__} for {self._init_fn} not yet initialized>"
-        return repr(self._obj)
-
-    def __str__(self):
-        self._initialize_obj()
-        return str(self._obj)
+class DriverConfig:

+    def __init__(self) -> None:
+        self._default: DriverBase | None = None
+        self._active: DriverBase | None = None

-class DriverConfig:
+    @property
+    def default(self) -> DriverBase:
+        if self._default is None:
+            self._default = _create_driver()
+        return self._default

-    def __init__(self):
-        self.default = LazyProxy(_create_driver)
-        self.active = self.default
+    @property
+    def active(self) -> DriverBase:
+        if self._active is None:
+            self._active = self.default
+        return self._active

-    def set_active(self, driver: DriverBase):
-        self.active = driver
+    def set_active(self, driver: DriverBase) -> None:
+        self._active = driver

-    def reset_active(self):
-        self.active = self.default
+    def reset_active(self) -> None:
+        self._active = self.default


 driver = DriverConfig()
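
Note on the hunk above: the generic LazyProxy wrapper is gone, and DriverConfig now lazy-initializes through plain properties, so `driver.active` is a real DriverBase instance rather than a proxy object. A minimal sketch of the same pattern, with illustrative stand-in names (not the triton API):

from typing import Optional


class Driver:
    """Illustrative stand-in for a concrete DriverBase."""

    def __init__(self) -> None:
        print("expensive driver init runs exactly once")


class Config:
    """Same lazy-property shape as the new DriverConfig."""

    def __init__(self) -> None:
        self._default: Optional[Driver] = None
        self._active: Optional[Driver] = None

    @property
    def default(self) -> Driver:
        if self._default is None:  # construct on first access, then cache
            self._default = Driver()
        return self._default

    @property
    def active(self) -> Driver:
        if self._active is None:  # fall back to the default driver
            self._active = self.default
        return self._active

    def set_active(self, drv: Driver) -> None:
        self._active = drv


cfg = Config()                     # nothing is initialized yet
assert cfg.active is cfg.default   # first touch runs the one-time init

Unlike the proxy, attribute access and isinstance checks now hit the concrete driver directly, and construction still happens only on first use.
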
triton/runtime/interpreter.py CHANGED
@@ -1,32 +1,36 @@
+from __future__ import annotations
 import ast
 import textwrap
 import inspect
-from typing import Tuple, List
+from typing import Tuple, List, Dict, Callable

 import math
 import numpy as np

 import triton
 import triton.language as tl
+import dataclasses
 from dataclasses import dataclass
+
+from triton.language.semantic import TritonSemantic
+from triton.tools.tensor_descriptor import TensorDescriptor
 from .errors import InterpreterError
 from functools import partial
 from .._C.libtriton import interpreter as _interpreter
 from .._C.libtriton import ir as _ir


+@dataclass
 class TensorHandle:
-
-    def __init__(self, data, dtype):
-        '''
-        data: numpy array
-        dtype: triton type, either pointer_type or scalar_type.
-        we don't store block_type here because the shape information is already available in the data field
-        attr: a dictionary of attributes
-        '''
-        self.data = data
-        self.dtype = dtype
-        self.attr = {}
+    '''
+    data: numpy array
+    dtype: triton type, either pointer_type or scalar_type.
+    we don't store block_type here because the shape information is already available in the data field
+    attr: a dictionary of attributes
+    '''
+    data: np.array
+    dtype: tl.dtype
+    attr: Dict = dataclasses.field(default_factory=dict)

     def __bool__(self):
         return bool(self.data.all())
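
TensorHandle becomes a @dataclass above; the only subtlety is the mutable `attr` default, which needs `dataclasses.field(default_factory=dict)`. A quick illustration of why (the Handle class here is a hypothetical stand-in):

import dataclasses
from dataclasses import dataclass


@dataclass
class Handle:
    """Hypothetical stand-in for TensorHandle."""
    data: object
    dtype: str
    # A bare `attr: dict = {}` is rejected by @dataclass precisely because
    # the one dict would be shared by every instance; default_factory
    # builds a fresh dict per instance instead.
    attr: dict = dataclasses.field(default_factory=dict)


a, b = Handle(1, "int32"), Handle(2, "float32")
a.attr["x"] = 1
assert b.attr == {}  # each handle owns its own attribute dict
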
@@ -73,17 +77,19 @@ class BlockPointerHandle:
 class TensorDescHandle:

     def __init__(self, base: TensorHandle, shape: List[TensorHandle], strides: List[TensorHandle],
-                 block_shape: List[int]):
+                 block_shape: List[int], padding):
         self.base = base
         self.ndim = len(shape)
         self.shape = shape
         self.strides = strides
         self.block_shape = block_shape
+        self.padding = padding

     def validate(self):
         assert self.base.data.item() % 16 == 0, "base must be 16-byte aligned"
         assert len(self.strides) == self.ndim
         assert len(self.block_shape) == self.ndim
+        assert self.ndim >= 1, "descriptor cannot be 0 dimensional"

         for stride in self.strides[:-1]:
             assert stride.data.item() % 16 == 0, "stride must be 16-byte aligned"
@@ -103,6 +109,7 @@ class TensorDescHandle:
             off = (offsets[dim].data + np.arange(self.block_shape[dim])).reshape(bcast_dims)
             ptrs = ptrs + (itemsize * off * self.strides[dim].data).astype(np.uint64)
             masks = masks & (0 <= off) & (off < self.shape[dim].data)
+        assert ptrs.dtype == np.uint64
         ptrs = TensorHandle(ptrs, self.base.dtype.scalar)
         return ptrs, masks
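
For readers following materialize_pointers: each dimension contributes `itemsize * offset * stride` to a broadcast grid of pointers, and the mask tracks which offsets stay inside the logical shape. A toy 2-D rendition with made-up numbers, not the triton implementation:

import numpy as np

# Made-up descriptor: base address, logical shape, strides (in elements),
# block shape, and a per-dimension block offset.
base, itemsize = np.uint64(1024), 4
shape, strides, block = (5, 7), (7, 1), (4, 4)
offsets = (2, 4)

ptrs = np.full((1, 1), base, dtype=np.uint64)
masks = np.full((1, 1), True)
for dim in range(2):
    bcast_dims = [1, 1]
    bcast_dims[dim] = block[dim]
    off = (offsets[dim] + np.arange(block[dim])).reshape(bcast_dims)
    ptrs = ptrs + (itemsize * off * strides[dim]).astype(np.uint64)
    masks = masks & (0 <= off) & (off < shape[dim])

assert ptrs.dtype == np.uint64  # the invariant the new assert checks
print(masks)  # lanes past the (5, 7) bounds are masked off
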
@@ -114,7 +121,7 @@ class InterpreterOptions:
     sanitize_overflow: bool = True
     arch: str = None
     supported_fp8_dtypes: Tuple[str] = ("fp8e5", "fp8e5b16", "fp8e4nv", "fp8e4b8", "fp8e4b15")
-    deprecated_fp8_dtypes: Tuple[str] = ()
+    deprecated_fp8_dot_operand_dtypes: Tuple[str] = ()
     default_dot_input_precision: str = "tf32"
     allowed_dot_input_precisions: Tuple[str] = ("tf32", "tf32x3", "ieee")
     max_num_imprecise_acc_default: int = 0
@@ -248,8 +255,8 @@ np_umulhi_u64 = np.vectorize(_umulhi_64, otypes=[np.uint64])
 class ExtraFunctions:

     @staticmethod
-    def _convert_custom_types(input, dst_ty, fp_downcast_rounding, _builder):
-        return tl.tensor(_builder.create_fp_to_fp(input.handle, dst_ty, fp_downcast_rounding), dst_ty)
+    def _convert_custom_types(input, dst_ty, fp_downcast_rounding, _semantic):
+        return tl.tensor(_semantic.builder.create_fp_to_fp(input.handle, dst_ty, fp_downcast_rounding), dst_ty)


 class InterpreterBuilder:
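
The recurring `_builder` to `_semantic` renames in this file track one refactor: builtins now receive a semantic-layer object and reach the raw IR builder through its `.builder` attribute. A schematic of that calling-convention change, using dummy classes rather than the real triton types:

class DummyBuilder:
    """Stand-in for the interpreter's IR builder."""

    def create_fp_to_fp(self, handle, dst_ty):
        return ("fp_to_fp", handle, dst_ty)


class Semantic:
    """Stand-in for triton.language.semantic.TritonSemantic."""

    def __init__(self, builder):
        self.builder = builder


def convert_v1(x, dst_ty, _builder):   # old convention: raw builder
    return _builder.create_fp_to_fp(x, dst_ty)


def convert_v2(x, dst_ty, _semantic):  # new convention: semantic layer
    return _semantic.builder.create_fp_to_fp(x, dst_ty)


b = DummyBuilder()
assert convert_v1(1, "f16", _builder=b) == convert_v2(1, "f16", _semantic=Semantic(b))
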
@@ -306,6 +313,9 @@ class InterpreterBuilder:
     def get_double_ty(self):
         return tl.float64

+    def get_int1_ty(self):
+        return tl.int1
+
     def get_int8_ty(self):
         return tl.int8
@@ -587,11 +597,18 @@
         b_data = _convert_float(b_data, b.dtype, tl.float16, None).view(np.float16)
         return TensorHandle(np.matmul(a_data, b_data, dtype=d.data.dtype) + d.data, d.dtype.scalar)

-    def create_make_range(self, start, stop):
+    def create_make_range(self, ret_ty, start, stop):
         return TensorHandle(np.arange(start, stop, dtype=np.int32), tl.int32)

-    def create_histogram(self, data, bins):
-        return TensorHandle(np.histogram(data.data, bins=bins, range=(0, bins))[0], tl.int32)
+    def create_histogram(self, data, bins, mask):
+        if mask is None:
+            mask = TensorHandle(np.ones_like(data.data, dtype=bool), tl.int1)
+        # force all masked elements to zero
+        data = np.where(mask.data, data.data, np.zeros_like(data.data))
+        histogram = np.histogram(data, bins=bins, range=(0, bins))[0]
+        # remove overcounted elements
+        histogram[0] -= np.logical_not(mask.data).sum()
+        return TensorHandle(histogram, tl.int32)

     def create_gather(self, src, indices, axis):
         return TensorHandle(np.take_along_axis(src.data, indices.data, axis=axis), src.dtype.scalar)
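
The new masked histogram above works by zeroing the masked-out elements, which funnels them all into bin 0, and then subtracting exactly that many counts back out of bin 0. A worked numpy example with hypothetical data:

import numpy as np

data = np.array([0, 1, 1, 3, 2])
mask = np.array([True, True, False, True, False])
bins = 4

# Zeroing the masked-out elements funnels them all into bin 0...
zeroed = np.where(mask, data, 0)
hist = np.histogram(zeroed, bins=bins, range=(0, bins))[0]
# ...so exactly that many spurious counts are subtracted back out.
hist[0] -= np.logical_not(mask).sum()

expected = np.histogram(data[mask], bins=bins, range=(0, bins))[0]
assert (hist == expected).all()  # both give [1, 1, 0, 1]
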
@@ -641,12 +658,16 @@
         # Triton only supports splitting the original tensor into two along the last axis
         return (TensorHandle(val.data[..., 0], val.dtype.scalar), TensorHandle(val.data[..., 1], val.dtype.scalar))

-    def create_splat(self, arg, shape):
+    def create_splat(self, ret_ty, arg):
+        shape = ret_ty.shape
         if isinstance(arg.dtype, tl.block_type):
             return TensorHandle(np.full(shape, arg.data[0], dtype=_get_np_dtype(arg.dtype)), arg.dtype.scalar)
         else:  # scalar
             return TensorHandle(np.full(shape, arg.data, dtype=_get_np_dtype(arg.dtype)), arg.dtype.scalar)

+    def create_unsplat(self, arg):
+        return TensorHandle(np.full((1, ), arg.data[0], dtype=_get_np_dtype(arg.dtype)), arg.dtype.scalar)
+
     def create_atomic_cas(self, ptr, cmp, val, sem, scope):
         if sem not in self.ir_sem_to_interpreter_sem:
             raise ValueError(f"unsupported semantic {sem}")
@@ -709,14 +730,9 @@
             ret.offsets[i].data += offsets[i].data
         return ret

-    def create_make_tensor_descriptor(
-        self,
-        base: TensorHandle,
-        shape: List[TensorHandle],
-        strides: List[TensorHandle],
-        tensor_shape: List[int],
-    ):
-        desc = TensorDescHandle(base, shape, strides, tensor_shape)
+    def create_make_tensor_descriptor(self, base: TensorHandle, shape: List[TensorHandle], strides: List[TensorHandle],
+                                      tensor_shape: List[int], is_signed: bool, padding: str = "zero"):
+        desc = TensorDescHandle(base, shape, strides, tensor_shape, padding)
         desc.validate()
         return desc

@@ -724,7 +740,16 @@
                                eviction_policy):
         assert isinstance(desc, TensorDescHandle)
         ptrs, mask = desc.materialize_pointers(indices)
-        return self.create_masked_load(ptrs, mask, other=None, cache_modifier=cache_modifier,
+        dtype_tt = ptrs.get_element_ty()
+        dtype_np = _get_np_dtype(dtype_tt)
+        padding = desc.padding
+        if padding == _ir.PADDING_OPTION.PAD_ZERO:
+            other = TensorHandle(np.zeros_like(ptrs.data, dtype=dtype_np), dtype_tt)
+        elif padding == _ir.PADDING_OPTION.PAD_NAN:
+            other = TensorHandle(np.full_like(ptrs.data, float('nan'), dtype=dtype_np), dtype_tt)
+        else:
+            raise ValueError(f"unsupported padding {padding}")
+        return self.create_masked_load(ptrs, mask, other, cache_modifier=cache_modifier,
                                        eviction_policy=eviction_policy, is_volatile=False)

     def create_descriptor_store(self, desc: TensorDescHandle, value: TensorHandle, indices: List[TensorHandle]):
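
Descriptor loads now honor the padding mode: out-of-bounds lanes are filled with zeros or NaNs instead of an unspecified `other=None`. A standalone sketch of the selection logic, with plain strings standing in for the `_ir.PADDING_OPTION` enum values:

import numpy as np

values = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
in_bounds = np.array([True, True, False, False])


def padded_load(values, mask, padding):
    # Pick the fill tensor from the padding mode, then apply the mask.
    if padding == "zero":
        other = np.zeros_like(values)
    elif padding == "nan":
        other = np.full_like(values, np.nan)
    else:
        raise ValueError(f"unsupported padding {padding}")
    return np.where(mask, values, other)


print(padded_load(values, in_bounds, "zero"))  # [1. 2. 0. 0.]
print(padded_load(values, in_bounds, "nan"))   # [1. 2. nan nan]
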
@@ -753,15 +778,18 @@
     np_type = _get_np_dtype(type)
     if "int" in np_type.name:
         return TensorHandle(np.full(1, -1, dtype=np_type), type.scalar)
+    elif np_type == np.bool_:
+        return TensorHandle(np.full(1, True, dtype=np_type), type.scalar)
     else:
         raise TypeError(f"unsupported type {type}")


 def _patch_attr(obj, name, member, builder):
+    semantic = TritonSemantic(builder)
     new_member = lambda *args, member=member, **kwargs: (member(*args, **
                                                                {k: v
                                                                 for k, v in kwargs.items()
-                                                                if k != "_builder"}, _builder=builder))
+                                                                if k != "_semantic"}, _semantic=semantic))
     setattr(obj, name, new_member)


@@ -822,12 +850,10 @@ class ReduceScanOpInterface:

     def apply(self, input):
         if not isinstance(input, tuple):
-            input = (input, )
+            return self.apply((input, ))[0]
         self.check_tensor(input)
-        return self.apply_impl(input)
-
-    def apply_impl(self, input):
-        raise NotImplementedError("apply_impl not implemented")
+        ret = self.apply_impl(input)
+        return tuple(ret) if isinstance(ret, (list, tuple)) else (ret, )


 class ReduceOps(ReduceScanOpInterface):
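
The reworked `apply` normalizes everything to tuples: a scalar argument is routed through the tuple path once and unwrapped on the way out, so `apply_impl` implementations only ever see and return sequences. The shape of that pattern, with a stand-in `impl` instead of the real reduce machinery:

def apply(value, impl):
    """Scalars take the tuple path once, then unwrap on the way out."""
    if not isinstance(value, tuple):
        return apply((value, ), impl)[0]
    ret = impl(value)
    return tuple(ret) if isinstance(ret, (list, tuple)) else (ret, )


double_all = lambda xs: [x * 2 for x in xs]
assert apply(3, double_all) == 6            # scalar in, scalar out
assert apply((3, 4), double_all) == (6, 8)  # tuple in, tuple out
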
@@ -887,7 +913,7 @@ class ReduceOps(ReduceScanOpInterface):
                 # Take a scalar
                 data = data.item()
             ret.append(self.to_tensor(data, input[i].dtype))
-        return ret[0] if len(ret) == 1 else tuple(ret)
+        return ret

     def min_max(self, input, val_reduce_op, idx_reduce_op=None):
         # If input is a tuple, it must be (val, index), and we only take val
@@ -916,9 +942,9 @@
         elif self.combine_fn == tl.standard._argmax_combine_tie_break_left:
             return self.min_max(input[0], val_reduce_op=np.max, idx_reduce_op=np.argmax)
         elif self.combine_fn == tl.standard._elementwise_max:
-            return self.min_max(input[0], val_reduce_op=np.max, idx_reduce_op=None)
+            return self.min_max(input[0], val_reduce_op=np.nanmax, idx_reduce_op=None)
         elif self.combine_fn == tl.standard._elementwise_min:
-            return self.min_max(input[0], val_reduce_op=np.min, idx_reduce_op=None)
+            return self.min_max(input[0], val_reduce_op=np.nanmin, idx_reduce_op=None)
         elif self.combine_fn == tl.standard._sum_combine:
             return self.sum(input[0])
         else:
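
The switch to np.nanmax/np.nanmin changes how NaN elements reduce in the interpreter's elementwise min/max path. The numpy behavior it relies on:

import numpy as np

x = np.array([1.0, np.nan, 3.0])
print(np.max(x))     # nan  (a single NaN poisons the plain reduction)
print(np.nanmax(x))  # 3.0  (NaN entries are skipped)
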
@@ -985,7 +1011,7 @@ class ScanOps(ReduceScanOpInterface):
         if self.reverse:
             for arg in ret:
                 arg.handle.data = np.flip(arg.handle.data, axis=self.axis)
-        return len(ret) == 1 and ret[0] or tuple(ret)
+        return ret


 def _patch_reduce_scan():
@@ -1092,7 +1118,7 @@ def _patch_lang(fn):
     _patch_builtin(lang.math, interpreter_builder)
     _patch_lang_tensor(lang.tensor)
     _patch_lang_core(lang)
-    _patch_builtin(tl.core._experimental_tensor_descriptor_base, interpreter_builder)
+    _patch_builtin(tl.core.tensor_descriptor_base, interpreter_builder)


 def _tuple_create(arg, contents):
@@ -1107,7 +1133,7 @@ def _tuple_create(arg, contents):
 # TODO: wrap everything in triton tensors
 def _implicit_cvt(arg):
     if isinstance(arg, int):
-        ty = tl.str_to_ty(triton.runtime.jit.mangle_type(arg))
+        ty = tl.str_to_ty(triton.runtime.jit.mangle_type(arg), None)
         dtype = np.int32
         if -2**31 <= arg < 2**31:
             dtype = np.int32
@@ -1122,15 +1148,25 @@ def _implicit_cvt(arg):
         handle = TensorHandle(np.array([arg], dtype=dtype), ty)
         return tl.tensor(handle, ty)
     if hasattr(arg, "data_ptr"):
-        ty = tl.str_to_ty(triton.runtime.jit.mangle_type(arg))
+        ty = tl.str_to_ty(triton.runtime.jit.mangle_type(arg), None)
         handle = TensorHandle(np.array([arg.data_ptr()], dtype=np.uint64), ty)
         return tl.tensor(handle, ty)
     elif isinstance(arg, tuple):
         return _tuple_create(arg, map(_implicit_cvt, arg))
+    elif isinstance(arg, TensorDescriptor):
+        strides = [_implicit_cvt(s) for s in arg.strides]
+        assert arg.strides[-1] == 1
+        strides[-1] = tl.constexpr(1)
+        semantic = TritonSemantic(InterpreterBuilder())
+        return semantic.make_tensor_descriptor(base=_implicit_cvt(arg.base),
+                                               shape=[_implicit_cvt(s) for s in arg.shape], strides=strides,
+                                               block_shape=[tl.constexpr(b)
+                                                            for b in arg.block_shape], padding_option=arg.padding)
     return arg


 interpreter_builder = InterpreterBuilder()
+interpreter_semantic = TritonSemantic(interpreter_builder)


 def _unwrap_tensor(t):
@@ -1162,6 +1198,14 @@ class GridExecutor:
     def _to_cpu(arg):
         if isinstance(arg, tuple):
             return _tuple_create(arg, map(_to_cpu, arg))
+        elif isinstance(arg, TensorDescriptor):
+            return TensorDescriptor(
+                _to_cpu(arg.base),
+                arg.shape,
+                arg.strides,
+                arg.block_shape,
+                arg.padding,
+            )
         elif not hasattr(arg, "data_ptr"):
             return arg

@@ -1195,6 +1239,8 @@ class GridExecutor:
         elif isinstance(arg_dev, tuple):
             for (arg_dev, arg_hst) in zip(arg_dev, arg_hst):
                 _from_cpu(arg_dev, arg_hst)
+        elif isinstance(arg_dev, TensorDescriptor):
+            _from_cpu(arg_dev.base, arg_hst.base)

         for arg_dev, arg_hst in zip(args_dev, args_hst):
             _from_cpu(arg_dev, arg_hst)
@@ -1235,6 +1281,8 @@ class GridExecutor:
             interpreter_builder.set_grid_idx(x, y, z)
             self.fn(**args)
         except Exception as e:
+            if triton.knobs.compilation.front_end_debugging:
+                raise
             raise InterpreterError(repr(e)) from e
         # copy arguments back to propagate side-effects
         self._restore_args_dev(args_dev, args_hst, kwargs, kwargs_hst)
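
With the new knob, interpreter failures can surface the original traceback instead of the wrapped InterpreterError. The control flow reduced to its essentials (the module-level flag stands in for `triton.knobs.compilation.front_end_debugging`):

class InterpreterError(Exception):
    pass


FRONT_END_DEBUGGING = False  # stand-in for the triton knob


def run(fn):
    try:
        fn()
    except Exception as e:
        if FRONT_END_DEBUGGING:
            raise  # surface the original traceback unchanged
        raise InterpreterError(repr(e)) from e


try:
    run(lambda: 1 / 0)
except InterpreterError as err:
    print(err)  # ZeroDivisionError('division by zero'), wrapped
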
@@ -1249,14 +1297,10 @@ class ASTTransformer(ast.NodeTransformer):
         if len(names) > 1:
             raise ValueError("Multiple assignments are not supported")
         # Modify the assignment x = value to
-        # triton.language.semantic.to_tensor(value, interpreter_builder, False)
+        # interpreter_semantic.to_tensor(value, False)
         node.value = ast.Call(
-            func=ast.Attribute(
-                value=ast.Attribute(
-                    value=ast.Attribute(value=ast.Name(id='triton', ctx=ast.Load()), attr='language', ctx=ast.Load()),
-                    attr='semantic', ctx=ast.Load()), attr='to_tensor', ctx=ast.Load()),
-            args=[node.value, ast.Name(id='interpreter_builder', ctx=ast.Load()),
-                  ast.Constant(value=False)], keywords=[])
+            func=ast.Attribute(value=ast.Name(id="interpreter_semantic", ctx=ast.Load()), attr="to_tensor",
+                               ctx=ast.Load()), args=[node.value, ast.Constant(value=False)], keywords=[])
         return node

@@ -1331,11 +1375,12 @@ class FunctionRewriter:

 class InterpretedFunction:
     # Cache all rewritten functions
-    rewritten_fn = {}
+    rewritten_fn: Dict[Callable, Callable] = {}

     def __init__(self, fn, **kwargs) -> None:
         self.fn = fn
         self.rewriter = FunctionRewriter(fn, **kwargs)
+        self.kwargs = kwargs

     def run(*args, **kwargs):
         grid = kwargs["grid"]