triton-windows 3.3.1.post19__cp313-cp313-win_amd64.whl → 3.4.0.post20__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (166)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +4 -1
  3. triton/_filecheck.py +87 -0
  4. triton/_internal_testing.py +26 -15
  5. triton/_utils.py +110 -21
  6. triton/backends/__init__.py +20 -23
  7. triton/backends/amd/__init__.py +0 -0
  8. triton/backends/amd/compiler.py +112 -78
  9. triton/backends/amd/driver.c +5 -2
  10. triton/backends/amd/driver.py +149 -47
  11. triton/backends/compiler.py +7 -21
  12. triton/backends/nvidia/bin/ptxas.exe +0 -0
  13. triton/backends/nvidia/compiler.py +92 -93
  14. triton/backends/nvidia/driver.c +90 -98
  15. triton/backends/nvidia/driver.py +303 -128
  16. triton/compiler/code_generator.py +212 -111
  17. triton/compiler/compiler.py +110 -25
  18. triton/experimental/__init__.py +0 -0
  19. triton/experimental/gluon/__init__.py +4 -0
  20. triton/experimental/gluon/_compiler.py +0 -0
  21. triton/experimental/gluon/_runtime.py +99 -0
  22. triton/experimental/gluon/language/__init__.py +18 -0
  23. triton/experimental/gluon/language/_core.py +312 -0
  24. triton/experimental/gluon/language/_layouts.py +230 -0
  25. triton/experimental/gluon/language/_math.py +12 -0
  26. triton/experimental/gluon/language/_semantic.py +287 -0
  27. triton/experimental/gluon/language/_standard.py +47 -0
  28. triton/experimental/gluon/language/nvidia/__init__.py +4 -0
  29. triton/experimental/gluon/language/nvidia/blackwell/__init__.py +202 -0
  30. triton/experimental/gluon/language/nvidia/blackwell/tma.py +32 -0
  31. triton/experimental/gluon/language/nvidia/hopper/__init__.py +11 -0
  32. triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +51 -0
  33. triton/experimental/gluon/language/nvidia/hopper/tma.py +96 -0
  34. triton/experimental/gluon/nvidia/__init__.py +4 -0
  35. triton/experimental/gluon/nvidia/blackwell.py +3 -0
  36. triton/experimental/gluon/nvidia/hopper.py +40 -0
  37. triton/knobs.py +481 -0
  38. triton/language/__init__.py +39 -14
  39. triton/language/core.py +794 -537
  40. triton/language/extra/cuda/__init__.py +10 -7
  41. triton/language/extra/cuda/gdc.py +42 -0
  42. triton/language/extra/cuda/libdevice.py +394 -394
  43. triton/language/extra/cuda/utils.py +21 -21
  44. triton/language/extra/hip/libdevice.py +113 -104
  45. triton/language/math.py +65 -66
  46. triton/language/random.py +12 -2
  47. triton/language/semantic.py +1706 -1770
  48. triton/language/standard.py +116 -51
  49. triton/runtime/autotuner.py +117 -59
  50. triton/runtime/build.py +76 -12
  51. triton/runtime/cache.py +18 -47
  52. triton/runtime/driver.py +32 -29
  53. triton/runtime/interpreter.py +72 -35
  54. triton/runtime/jit.py +146 -110
  55. triton/testing.py +16 -12
  56. triton/tools/disasm.py +3 -4
  57. triton/tools/tensor_descriptor.py +36 -0
  58. triton/windows_utils.py +14 -6
  59. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.4.0.post20.dist-info}/METADATA +7 -2
  60. triton_windows-3.4.0.post20.dist-info/RECORD +186 -0
  61. triton_windows-3.4.0.post20.dist-info/entry_points.txt +3 -0
  62. triton_windows-3.4.0.post20.dist-info/licenses/LICENSE +23 -0
  63. triton_windows-3.4.0.post20.dist-info/top_level.txt +1 -0
  64. triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h +0 -358
  65. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +0 -1010
  66. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +0 -1638
  67. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +0 -1814
  68. triton/backends/amd/include/hip/amd_detail/amd_hip_bfloat16.h +0 -293
  69. triton/backends/amd/include/hip/amd_detail/amd_hip_common.h +0 -32
  70. triton/backends/amd/include/hip/amd_detail/amd_hip_complex.h +0 -174
  71. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +0 -835
  72. triton/backends/amd/include/hip/amd_detail/amd_hip_fp16.h +0 -1809
  73. triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +0 -1391
  74. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +0 -108
  75. triton/backends/amd/include/hip/amd_detail/amd_hip_math_constants.h +0 -124
  76. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h +0 -405
  77. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +0 -196
  78. triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +0 -565
  79. triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h +0 -2226
  80. triton/backends/amd/include/hip/amd_detail/amd_math_functions.h +0 -104
  81. triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h +0 -244
  82. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +0 -538
  83. triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +0 -288
  84. triton/backends/amd/include/hip/amd_detail/concepts.hpp +0 -30
  85. triton/backends/amd/include/hip/amd_detail/device_library_decls.h +0 -133
  86. triton/backends/amd/include/hip/amd_detail/functional_grid_launch.hpp +0 -218
  87. triton/backends/amd/include/hip/amd_detail/grid_launch.h +0 -67
  88. triton/backends/amd/include/hip/amd_detail/grid_launch.hpp +0 -50
  89. triton/backends/amd/include/hip/amd_detail/grid_launch_GGL.hpp +0 -26
  90. triton/backends/amd/include/hip/amd_detail/helpers.hpp +0 -137
  91. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +0 -1446
  92. triton/backends/amd/include/hip/amd_detail/hip_assert.h +0 -101
  93. triton/backends/amd/include/hip/amd_detail/hip_cooperative_groups_helper.h +0 -242
  94. triton/backends/amd/include/hip/amd_detail/hip_fp16_gcc.h +0 -254
  95. triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h +0 -96
  96. triton/backends/amd/include/hip/amd_detail/hip_ldg.h +0 -100
  97. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +0 -10570
  98. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +0 -78
  99. triton/backends/amd/include/hip/amd_detail/host_defines.h +0 -184
  100. triton/backends/amd/include/hip/amd_detail/hsa_helpers.hpp +0 -102
  101. triton/backends/amd/include/hip/amd_detail/macro_based_grid_launch.hpp +0 -798
  102. triton/backends/amd/include/hip/amd_detail/math_fwd.h +0 -698
  103. triton/backends/amd/include/hip/amd_detail/ockl_image.h +0 -177
  104. triton/backends/amd/include/hip/amd_detail/program_state.hpp +0 -107
  105. triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h +0 -491
  106. triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h +0 -478
  107. triton/backends/amd/include/hip/channel_descriptor.h +0 -39
  108. triton/backends/amd/include/hip/device_functions.h +0 -38
  109. triton/backends/amd/include/hip/driver_types.h +0 -468
  110. triton/backends/amd/include/hip/hip_bf16.h +0 -36
  111. triton/backends/amd/include/hip/hip_bfloat16.h +0 -44
  112. triton/backends/amd/include/hip/hip_common.h +0 -100
  113. triton/backends/amd/include/hip/hip_complex.h +0 -38
  114. triton/backends/amd/include/hip/hip_cooperative_groups.h +0 -46
  115. triton/backends/amd/include/hip/hip_deprecated.h +0 -95
  116. triton/backends/amd/include/hip/hip_ext.h +0 -161
  117. triton/backends/amd/include/hip/hip_fp16.h +0 -36
  118. triton/backends/amd/include/hip/hip_fp8.h +0 -33
  119. triton/backends/amd/include/hip/hip_gl_interop.h +0 -32
  120. triton/backends/amd/include/hip/hip_hcc.h +0 -24
  121. triton/backends/amd/include/hip/hip_math_constants.h +0 -36
  122. triton/backends/amd/include/hip/hip_profile.h +0 -27
  123. triton/backends/amd/include/hip/hip_runtime.h +0 -75
  124. triton/backends/amd/include/hip/hip_runtime_api.h +0 -9261
  125. triton/backends/amd/include/hip/hip_texture_types.h +0 -29
  126. triton/backends/amd/include/hip/hip_vector_types.h +0 -41
  127. triton/backends/amd/include/hip/hip_version.h +0 -17
  128. triton/backends/amd/include/hip/hiprtc.h +0 -421
  129. triton/backends/amd/include/hip/library_types.h +0 -78
  130. triton/backends/amd/include/hip/math_functions.h +0 -42
  131. triton/backends/amd/include/hip/surface_types.h +0 -63
  132. triton/backends/amd/include/hip/texture_types.h +0 -194
  133. triton/backends/amd/include/hsa/Brig.h +0 -1131
  134. triton/backends/amd/include/hsa/amd_hsa_common.h +0 -91
  135. triton/backends/amd/include/hsa/amd_hsa_elf.h +0 -462
  136. triton/backends/amd/include/hsa/amd_hsa_kernel_code.h +0 -269
  137. triton/backends/amd/include/hsa/amd_hsa_queue.h +0 -109
  138. triton/backends/amd/include/hsa/amd_hsa_signal.h +0 -80
  139. triton/backends/amd/include/hsa/hsa.h +0 -5738
  140. triton/backends/amd/include/hsa/hsa_amd_tool.h +0 -91
  141. triton/backends/amd/include/hsa/hsa_api_trace.h +0 -579
  142. triton/backends/amd/include/hsa/hsa_api_trace_version.h +0 -68
  143. triton/backends/amd/include/hsa/hsa_ext_amd.h +0 -3146
  144. triton/backends/amd/include/hsa/hsa_ext_finalize.h +0 -531
  145. triton/backends/amd/include/hsa/hsa_ext_image.h +0 -1454
  146. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +0 -488
  147. triton/backends/amd/include/hsa/hsa_ven_amd_loader.h +0 -667
  148. triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +0 -416
  149. triton/backends/amd/include/roctracer/ext/prof_protocol.h +0 -107
  150. triton/backends/amd/include/roctracer/hip_ostream_ops.h +0 -4515
  151. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +0 -1727
  152. triton/backends/amd/include/roctracer/hsa_prof_str.h +0 -3059
  153. triton/backends/amd/include/roctracer/roctracer.h +0 -779
  154. triton/backends/amd/include/roctracer/roctracer_ext.h +0 -81
  155. triton/backends/amd/include/roctracer/roctracer_hcc.h +0 -24
  156. triton/backends/amd/include/roctracer/roctracer_hip.h +0 -37
  157. triton/backends/amd/include/roctracer/roctracer_hsa.h +0 -112
  158. triton/backends/amd/include/roctracer/roctracer_plugin.h +0 -137
  159. triton/backends/amd/include/roctracer/roctracer_roctx.h +0 -67
  160. triton/backends/amd/include/roctracer/roctx.h +0 -229
  161. triton/language/_utils.py +0 -21
  162. triton/language/extra/cuda/_experimental_tma.py +0 -106
  163. triton/tools/experimental_descriptor.py +0 -32
  164. triton_windows-3.3.1.post19.dist-info/RECORD +0 -260
  165. triton_windows-3.3.1.post19.dist-info/top_level.txt +0 -14
  166. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.4.0.post20.dist-info}/WHEEL +0 -0
triton/experimental/gluon/language/_semantic.py
@@ -0,0 +1,287 @@
+ from typing import Sequence, List, TypeVar, Tuple, Callable
+ from triton.language.semantic import TritonSemantic
+ from . import _core as ttgl
+ from ._layouts import SliceLayout
+ from triton._C.libtriton.gluon_ir import GluonOpBuilder
+ from triton.compiler.code_generator import flatten_values_to_ir, unflatten_ir_values
+
+ TensorTy = TypeVar("TensorTy")
+
+
+ def _check(cond: bool, msg_fn: Callable[[], str], category=ValueError):
+     if not cond:
+         raise category(msg_fn())
+
+
+ class GluonSemantic(TritonSemantic[TensorTy]):
+     tensor = ttgl.tensor
+     lang = ttgl
+
+     builder: GluonOpBuilder
+
+     def __init__(self, builder: GluonOpBuilder):
+         self.builder = builder
+
+     def _wrap_tensor_infer_layout(self, tensor):
+         ty = ttgl.distributed_type(tensor.type.scalar, tensor.shape,
+                                    self.builder.get_gluon_layout_from_tensor(tensor.handle))
+         return self.tensor(tensor.handle, ty)
+
+     def _broadcast_shapes(self, lhs_shape: List[int], rhs_shape: List[int]):
+         if len(lhs_shape) != len(rhs_shape):
+             raise ValueError(f"Cannot broadcast, rank mismatch: {lhs_shape}, {rhs_shape}")
+
+         ret_shape = []
+         for i, left in enumerate(lhs_shape):
+             right = rhs_shape[i]
+             if left == 1:
+                 ret_shape.append(right)
+             elif (right == 1) or (right == left):
+                 ret_shape.append(left)
+             else:
+                 raise ValueError("Cannot make_shape_compatible: incompatible dimensions "
+                                  "at index " + str(i) + ": " + str(left) + " and " + str(right))
+         return ret_shape
+
+     def expand_dims(self, input: TensorTy, axis: int) -> TensorTy:
+         dst_shape = [ttgl._unwrap_if_constexpr(x) for x in input.shape]
+         dst_shape.insert(axis, 1)
+
+         if axis < 0:
+             axis += len(input.shape)
+
+         _check(isinstance(input.type, ttgl.distributed_type),
+                lambda: f"expected expand_dims input to be a distributed_type but got: {input.type!r}")
+         layout = input.type.layout
+         _check(isinstance(layout, SliceLayout),
+                lambda: f"expected expand_dims input to have a SliceLayout, but got: {layout}")
+         _check(layout.dim == axis,
+                lambda: f"expected expand_dims input layout to be sliced in axis {axis} but got {layout.dim}")
+
+         ret_ty = ttgl.distributed_type(input.type.scalar, dst_shape, layout.parent)
+         handle = self.builder.create_expand_dims(input.handle, axis, ret_ty.to_ir(self.builder))
+         return self.tensor(handle, ret_ty)
+
+     def join(self, a: TensorTy, b: TensorTy) -> TensorTy:
+         a, b = self.broadcast_impl_value(a, b)
+         _check(a.shape != [], "Cannot join scalars in gluon")
+         value = super().join(a, b)
+         return self._wrap_tensor_infer_layout(value)
+
+     def split(self, a: TensorTy) -> Tuple[TensorTy, TensorTy]:
+         lhs, rhs = super().split(a)
+         return self._wrap_tensor_infer_layout(lhs), self._wrap_tensor_infer_layout(rhs)
+
+     def permute(self, input: TensorTy, dims: Tuple[int]) -> TensorTy:
+         value = super().permute(input, dims)
+         return self._wrap_tensor_infer_layout(value)
+
+     def broadcast_impl_shape(self, input: TensorTy, shape: Tuple[int]) -> TensorTy:
+         _check(isinstance(input.type, ttgl.distributed_type),
+                lambda: f"expected expand_dims input to be a distributed_type but got: {input.type!r}")
+         src_shape = input.type.get_block_shapes()
+         _check(len(src_shape) == len(shape), lambda: f"Cannot broadcast, rank mismatch: {src_shape}, {shape}")
+         if shape == src_shape:
+             return input
+         for i, item in enumerate(src_shape):
+             if shape[i] != item and item != 1:
+                 raise ValueError(f"Cannot broadcast, the expanded size of the tensor ({shape[i]})"
+                                  f" must match the existing size ({item}) at non-singleton dimension"
+                                  f" {i}: {src_shape}, {shape}")
+         ret_ty = ttgl.distributed_type(input.type.scalar, shape, input.type.layout)
+         handle = self.builder.create_broadcast(input.handle, ret_ty.to_ir(self.builder))
+         return self.tensor(handle, ret_ty)
+
+     def broadcast_impl_value(self, lhs: TensorTy, rhs: TensorTy) -> TensorTy:
+         lhs_ty = lhs.type
+         rhs_ty = rhs.type
+
+         if not lhs_ty.is_block() or not rhs_ty.is_block():
+             return super().broadcast_impl_value(lhs, rhs)
+
+         _check(isinstance(lhs_ty, ttgl.distributed_type),
+                lambda: f"expected broadcast left input to be a distributed_type but got: {lhs_ty!r}")
+         _check(isinstance(rhs_ty, ttgl.distributed_type),
+                lambda: f"expected broadcast right input to be a distributed_type but got: {rhs_ty!r}")
+
+         lhs_shape = lhs_ty.get_block_shapes()
+         rhs_shape = rhs_ty.get_block_shapes()
+         ret_shape = self._broadcast_shapes(lhs_shape, rhs_shape)
+         if lhs_ty.layout != rhs_ty.layout:
+             raise ValueError(f"Layout mismatch in broadcast: {lhs_ty.layout} vs {rhs_ty.layout}")
+
+         lhs = self.broadcast_impl_shape(lhs, ret_shape)
+         rhs = self.broadcast_impl_shape(rhs, ret_shape)
+         return lhs, rhs
+
+     def arange(self, start, end, layout):
+         shape = [end - start]
+         ret_ty = ttgl.distributed_type(ttgl.int32, shape, layout)
+         return super().arange(start, end, ret_ty=ret_ty)
+
+     def reshape(self, input: TensorTy, dst_shape: List[int], can_reorder: bool):
+         _check(not can_reorder, "can_reorder is not supported in gluon")
+         value = super().reshape(input, dst_shape, can_reorder)
+         return self._wrap_tensor_infer_layout(value)
+
+     def splat(self, value, shape, layout):
+         ret_ty = ttgl.distributed_type(value.dtype, shape, layout)
+         handle = self.builder.create_splat(ret_ty.to_ir(self.builder), value.handle)
+         return ttgl.tensor(handle, ret_ty)
+
+     def full(self, shape, value, dtype, layout):
+         scalar = self.make_scalar(value, dtype)
+         return self.splat(scalar, shape, layout)
+
+     def convert_layout(self, value, layout):
+         ty = value.type
+         _check(isinstance(ty, ttgl.distributed_type),
+                lambda: f"expected convert_layout input to be a distributed_type but got: {ty!r}")
+         ret_ty = ttgl.distributed_type(ty.element_ty, ty.shape, layout)
+         handle = self.builder.create_convert_layout(ret_ty.to_ir(self.builder), value.handle)
+         return ttgl.tensor(handle, ret_ty)
+
+     def allocate_shared(self, element_ty, shape, layout, value):
+         ty = ttgl.shared_memory_descriptor_type(element_ty, shape, layout, shape)
+         if value is not None:
+             handle = self.builder.create_local_alloc(ty.to_ir(self.builder), value.handle)
+         else:
+             handle = self.builder.create_local_alloc(ty.to_ir(self.builder))
+         return ttgl.shared_memory_descriptor(handle, element_ty, shape, layout, shape)
+
+     def shared_load(self, mem_desc, layout):
+         ret_ty = ttgl.distributed_type(mem_desc.dtype, mem_desc.shape, layout)
+         handle = self.builder.create_local_load(ret_ty.to_ir(self.builder), mem_desc.handle)
+         return ttgl.tensor(handle, ret_ty)
+
+     def shared_store(self, mem_desc, value):
+         self.builder.create_local_store(mem_desc.handle, value.handle)
+
+     def shared_dealloc(self, mem_desc):
+         self.builder.create_local_dealloc(mem_desc.handle)
+
+     def _memdesc_subview(self, mem_desc, offsets, shape):
+         layout = mem_desc.layout
+         ty = ttgl.shared_memory_descriptor_type(mem_desc.dtype, shape, layout, mem_desc.type.alloc_shape)
+         builder = self.builder
+         handle = builder.create_memdesc_subview(ty.to_ir(builder), mem_desc.handle, offsets)
+         return ttgl.shared_memory_descriptor(handle, **ty.__dict__)
+
+     def memdesc_slice(self, mem_desc, start, length, dim):
+         offsets = [self.builder.get_int32(0)] * mem_desc.rank
+         offsets[dim] = self.to_tensor(start).handle
+         shape = list(mem_desc.shape)
+         shape[dim] = length
+         return self._memdesc_subview(mem_desc, offsets, shape)
+
+     def memdesc_index(self, mem_desc, index):
+         shape = mem_desc.shape[1:]
+         offsets = [self.builder.get_int32(0)] * mem_desc.rank
+         offsets[0] = self.to_tensor(index).handle
+         return self._memdesc_subview(mem_desc, offsets, shape)
+
+     def memdesc_trans(self, mem_desc, order):
+         assert len(order) == len(
+             mem_desc.shape), f"source rank ({mem_desc.rank}) and order length ({len(order)}) must match"
+
+         shape = [mem_desc.shape[i] for i in order]
+         alloc_shape = mem_desc.type.alloc_shape
+         new_alloc_shape = alloc_shape[:len(alloc_shape) - mem_desc.rank]
+         new_alloc_shape += [alloc_shape[len(alloc_shape) - mem_desc.rank:][i] for i in order]
+
+         handle = self.builder.create_memdesc_trans(mem_desc.handle, order)
+         layout = self.builder.get_gluon_layout_from_memdesc(handle)
+         return ttgl.shared_memory_descriptor(handle, element_ty=mem_desc.dtype, shape=shape,
+                                              alloc_shape=new_alloc_shape, layout=layout)
+
+     def memdesc_reshape(self, mem_desc, shape, layout):
+         ty = ttgl.shared_memory_descriptor_type(mem_desc.dtype, shape, layout, mem_desc.type.alloc_shape)
+         handle = self.builder.create_memdesc_reshape(ty.to_ir(self.builder), mem_desc.handle)
+         return ttgl.shared_memory_descriptor(handle, **ty.__dict__)
+
+     def memdesc_reinterpret(self, mem_desc, dtype, shape, layout):
+         ty = ttgl.shared_memory_descriptor_type(dtype, shape, layout, shape)
+         handle = self.builder.create_memdesc_reinterpret(ty.to_ir(self.builder), mem_desc.handle)
+         return ttgl.shared_memory_descriptor(handle, **ty.__dict__)
+
+     def wrap_tensor(self, x, scalar_ty, ret_shape, layout):
+         if ret_shape:
+             res_ty = ttgl.distributed_type(scalar_ty, ret_shape, layout)
+         else:
+             res_ty = scalar_ty
+         return self.tensor(x, res_ty)
+
+     @staticmethod
+     def _check_same_layout(xs):
+         for x in xs:
+             _check(isinstance(x.type, ttgl.distributed_type), lambda: f"expected distributed_type but got: {x.type!r}")
+         layouts = [x.type.layout for x in xs]
+         l0 = layouts[0]
+         _check(all(l == l0 for l in layouts[1:]),
+                lambda: f"Expected inputs to have matching layouts, but got: {layouts}")
+
+     def reduction(self, inputs: Sequence[TensorTy], axis: int, region_builder_fn) -> Tuple[TensorTy, ...]:
+         _check(axis is not None, lambda: "All-reduce is not yet implemented in gluon")
+         # get result shape
+         shape = inputs[0].type.shape
+         rank = len(shape)
+         _check(0 <= axis < rank, lambda: f"expected reduction axis to be in the range [0, {rank}) but got {axis}")
+         self._check_same_layout(inputs)
+         ret_shape = [s for i, s in enumerate(shape) if i != axis]
+         ret_layout = SliceLayout(axis, inputs[0].type.layout)
+         assert all(t.type.shape == shape for t in inputs), "all reduction inputs must have the same shape"
+
+         reduce_op = self.builder.create_reduce([t.handle for t in inputs], axis)
+         region_builder_fn(reduce_op)
+         assert reduce_op.verify()
+
+         return tuple(
+             self.wrap_tensor(reduce_op.get_result(i), inputs[i].type.scalar, ret_shape, ret_layout)
+             for i in range(len(inputs)))
+
+     def warp_specialize(self, args, default_partition, worker_partitions, worker_num_warps: Sequence[int],
+                         worker_num_regs: Sequence[int], generator):
+         num_partitions = len(worker_partitions)
+         assert num_partitions == len(
+             worker_num_warps
+         ), f"warp specialize got {num_partitions} partitions but {len(worker_num_warps)} warp counts"
+         assert num_partitions == len(
+             worker_num_regs
+         ), f"warp specialize got {num_partitions} partitions but {len(worker_num_regs)} register counts"
+
+         builder = self.builder
+         insert_pt = builder.get_insertion_point()
+
+         # Emit the default partition to get the result types.
+         default_block = builder.new_block()
+         builder.set_insertion_point_to_start(default_block)
+         default_results = generator.call_JitFunction(default_partition, args, kwargs={})
+         mlir_results = []
+         if default_results is not None:
+             mlir_results = flatten_values_to_ir(default_results)
+         builder.create_warp_yield(mlir_results)
+         result_types = [r.get_type() for r in mlir_results]
+
+         # Create the warp specialize op.
+         builder.restore_insertion_point(insert_pt)
+         mlir_args = flatten_values_to_ir(args)
+         ws_op = builder.create_warp_specialize(result_types, mlir_args, worker_num_warps)
+         ws_op.get_default_region().push_back(default_block)
+         ws_op.set_requested_registers(worker_num_regs)
+
+         # Emit the partition regions.
+         builder.create_block_with_parent(ws_op.get_partition_op_holder(), [])
+         partitions_op = builder.create_warp_specialize_partitions(num_partitions)
+         arg_types = [arg.get_type() for arg in mlir_args]
+         for i in range(num_partitions):
+             block = builder.create_block_with_parent(partitions_op.get_region(i), arg_types)
+             block_args = [block.get_argument(j) for j in range(len(mlir_args))]
+             block_args = unflatten_ir_values(block_args, [arg.type for arg in args])
+             generator.call_JitFunction(worker_partitions[i], block_args, kwargs={})
+             builder.create_warp_return()
+
+         builder.set_insertion_point_after(ws_op.get_operation())
+         mlir_results = [ws_op.get_result(i) for i in range(len(result_types))]
+         if default_results is None:
+             return
+         return tuple(unflatten_ir_values(mlir_results, [r.type for r in default_results]))
triton/experimental/gluon/language/_standard.py
@@ -0,0 +1,47 @@
+ # flake8: noqa
+ import triton
+ import triton.language.standard as tl_standard
+ from .._runtime import jit
+ from triton import knobs
+ from . import _core as ttgl
+
+ _IMPORT_FROM_TRITON = [
+     "sum",
+     "max",
+     "min",
+     "reduce_or",
+     "xor_sum",
+ ]
+
+ __all__ = [
+     "full_like",
+     "zeros",
+     "zeros_like",
+     *_IMPORT_FROM_TRITON,
+ ]
+
+ for name in _IMPORT_FROM_TRITON:
+     # Convert JITFunction -> GluonJitFunction
+     fn = getattr(tl_standard, name)
+     assert knobs.runtime.interpret or isinstance(fn, triton.runtime.JITFunction)
+     globals()[name] = jit(fn.fn)
+
+
+ @jit
+ def zeros(shape, dtype, layout):
+     return ttgl.full(shape, 0, dtype, layout)
+
+
+ @jit
+ def full_like(input, value, shape=None, dtype=None, layout=None):
+     return ttgl.full(
+         input.shape if shape is None else shape,
+         value,
+         input.dtype if dtype is None else dtype,
+         input.type.layout if layout is None else layout,
+     )
+
+
+ @jit
+ def zeros_like(input, shape=None, dtype=None, layout=None):
+     return full_like(input, 0, shape=shape, dtype=dtype, layout=layout)
triton/experimental/gluon/language/nvidia/__init__.py
@@ -0,0 +1,4 @@
+ from . import blackwell
+ from . import hopper
+
+ __all__ = ["blackwell", "hopper"]
triton/experimental/gluon/language/nvidia/blackwell/__init__.py
@@ -0,0 +1,202 @@
+ from __future__ import annotations
+ from typing import Optional, Tuple, List, TYPE_CHECKING
+
+ from dataclasses import dataclass
+ from triton.experimental.gluon.language import _core as ttgl
+ from triton.experimental.gluon.language._core import builtin, base_type, base_value, _unwrap_if_constexpr
+ from triton.experimental.gluon.language._semantic import _check
+
+ from . import tma
+ from ..hopper import mbarrier, fence_async_shared
+
+ if TYPE_CHECKING:
+     from triton._C.libtriton.gluon_ir import GluonOpBuilder
+     from triton._C.libtriton import gluon_ir as ir
+     from ..._semantic import GluonSemantic
+
+ __all__ = [
+     "allocate_tensor_memory",
+     "fence_async_shared",
+     "mbarrier",
+     "tensor_memory_descriptor",
+     "TensorMemoryLayout",
+     "tma",
+ ]
+
+
+ @dataclass(frozen=True, eq=True)
+ class TensorMemoryLayout:
+     block: Tuple[int, int]
+     unpacked: bool
+     cta_split_num: Optional[Tuple[int, int]] = None
+
+     def __post_init__(self):
+         assert len(self.block) == 2
+         assert self.cta_split_num is None or len(self.cta_split_num) == 2
+
+     def _to_ir(self, builder):
+         cta_split_num = self.cta_split_num or [1, 1]
+         return builder.get_tensor_memory_layout(
+             self.block,
+             self.unpacked,
+             cta_split_num,
+         )
+
+     def mangle(self) -> str:
+         block_str = f"{self.block[0]}x{self.block[1]}"
+         unpacked_str = "U" if self.unpacked else "P"
+         cta_split_str = f"CS{self.cta_split_num[0]}x{self.cta_split_num[1]}" if self.cta_split_num else ""
+         return f"TL{block_str}{unpacked_str}{cta_split_str}TL"
+
+
+ class tensor_memory_descriptor_type(base_type):
+
+     def __init__(self, element_ty, shape, layout, alloc_shape):
+         self.element_ty = element_ty
+         self.shape = shape
+         self.layout = layout
+         self.alloc_shape = alloc_shape
+         assert isinstance(layout, TensorMemoryLayout)
+
+     def to_ir(self, builder: GluonOpBuilder) -> None:
+         return builder.get_tensor_mem_desc_ty(
+             self.element_ty.to_ir(builder),
+             self.shape,
+             self.layout._to_ir(builder),
+             self.alloc_shape,
+         )
+
+     def _unflatten_ir(self, handles: List[ir.Value], cursor: int) -> Tuple[tensor_memory_descriptor, int]:
+         value = tensor_memory_descriptor(handles[cursor], self.element_ty, self.shape, self.layout, self.alloc_shape)
+         return value, cursor + 1
+
+     def _flatten_ir_types(self, builder: GluonOpBuilder, out: List[ir.type]) -> None:
+         out.append(self.to_ir(builder))
+
+     def __str__(self) -> str:
+         return f"tensor_memory_descriptor<{self.element_ty}, {self.shape}, {self.layout}>"
+
+     def __eq__(self, other) -> bool:
+         return (type(self) is type(other) and self.shape == other.shape and self.layout == other.layout
+                 and self.alloc_shape == other.alloc_shape)
+
+     def __neq__(self, other) -> bool:
+         return not (self == other)
+
+     def mangle(self) -> str:
+         shape_str = "_".join([str(s) for s in self.shape])
+         return f"MD{self.element_ty.mangle()}S{shape_str}SL{self.layout.mangle()}LAS{self.alloc_shape}ASMD"
+
+
+ class tensor_memory_descriptor(base_value):
+
+     def __init__(self, handle, element_ty, shape, layout, alloc_shape):
+         self.handle = handle
+         self.type = tensor_memory_descriptor_type(element_ty, shape, layout, alloc_shape)
+
+     def _flatten_ir(self, handles: List[ir.value]) -> None:
+         handles.append(self.handle)
+
+     @property
+     def dtype(self):
+         return self.type.element_ty
+
+     @property
+     def shape(self):
+         return self.type.shape
+
+     @property
+     def rank(self):
+         return len(self.shape)
+
+     @property
+     def layout(self):
+         return self.type.layout
+
+     def __str__(self) -> str:
+         return str(self.type)
+
+     @builtin
+     def load(self, layout, _semantic: GluonSemantic) -> ttgl.tensor:
+         layout = _unwrap_if_constexpr(layout)
+         ret_ty = ttgl.distributed_type(self.dtype, self.shape, layout)
+         builder = _semantic.builder
+         handle = builder.create_tmem_load(ret_ty.to_ir(builder), self.handle)
+         return ttgl.tensor(handle, ret_ty)
+
+     @builtin
+     def store(self, value, pred=True, _semantic: GluonSemantic = None) -> None:
+         pred = _unwrap_if_constexpr(pred)
+         pred = _semantic.to_tensor(pred)
+         _semantic.builder.create_tmem_store(self.handle, value.handle, pred.handle)
+
+     @builtin
+     def slice(self, start, length, _semantic: GluonSemantic) -> None:
+         start = _unwrap_if_constexpr(start)
+         length = _unwrap_if_constexpr(length)
+         _check(isinstance(start, int), lambda: "start must be a constant int")
+         _check(isinstance(length, int), lambda: "length must be a constant int")
+         shape = self.shape[:-1] + [length]
+         layout = self.type.layout
+         layout = TensorMemoryLayout((layout.block[0], min(layout.block[1], length)), layout.unpacked,
+                                     layout.cta_split_num)
+         ret = tensor_memory_descriptor(None, self.dtype, shape, layout, self.type.alloc_shape)
+         builder = _semantic.builder
+         ret.handle = builder.create_tmem_subslice(ret.type.to_ir(builder), self.handle, start)
+         return ret
+
+     @builtin
+     def index(self, index, _semantic: GluonSemantic = None) -> tensor_memory_descriptor:
+         index = _semantic.to_tensor(index)
+         builder = _semantic.builder
+         offsets = [builder.get_int32(0)] * self.rank
+         offsets[0] = index.handle
+         shape = self.shape[1:]
+         layout = self.layout
+         ret = tensor_memory_descriptor(None, self.dtype, shape, layout, self.type.alloc_shape)
+         ret.handle = builder.create_memdesc_subview(ret.type.to_ir(builder), self.handle, offsets)
+         return ret
+
+     @builtin
+     def _reinterpret(self, dtype, shape, layout, _semantic: GluonSemantic = None) -> tensor_memory_descriptor:
+         dtype = _unwrap_if_constexpr(dtype)
+         shape = [_unwrap_if_constexpr(s) for s in shape]
+         layout = _unwrap_if_constexpr(layout)
+
+         ty = tensor_memory_descriptor_type(dtype, shape, layout, shape)
+         handle = _semantic.builder.create_memdesc_reinterpret(ty.to_ir(_semantic.builder), self.handle)
+         return tensor_memory_descriptor(handle, **ty.__dict__)
+
+
+ @builtin
+ def allocate_tensor_memory(element_ty, shape, layout, value=None, _semantic=None):
+     element_ty = _unwrap_if_constexpr(element_ty)
+     shape = _unwrap_if_constexpr(shape)
+     layout = _unwrap_if_constexpr(layout)
+     value = value.handle if value is not None else None
+
+     ty = tensor_memory_descriptor_type(element_ty, shape, layout, shape)
+     builder = _semantic.builder
+     handle = builder.create_tmem_alloc(ty.to_ir(builder), value)
+     return tensor_memory_descriptor(handle, element_ty, shape, layout, shape)
+
+
+ @builtin
+ def tcgen05_mma(a, b, acc, *, use_acc=True, pred=True, mbarriers=None, mbarrier_preds=None, _semantic=None):
+     use_acc = _semantic.to_tensor(use_acc)
+     pred = _semantic.to_tensor(pred)
+
+     if mbarriers is None:
+         assert mbarrier_preds is None
+         mbarriers = []
+         mbarrier_preds = []
+     else:
+         mbarriers = [bar.handle for bar in mbarriers]
+         if mbarrier_preds is None:
+             true = _semantic.to_tensor(True)
+             mbarrier_preds = [true] * len(mbarriers)
+         else:
+             mbarrier_preds = _semantic._convert_to_ir_values(mbarrier_preds, require_i64=False)
+
+     _semantic.builder.create_tcgen05_mma(a.handle, b.handle, acc.handle, use_acc.handle, pred.handle, mbarriers,
+                                          mbarrier_preds)
triton/experimental/gluon/language/nvidia/blackwell/tma.py
@@ -0,0 +1,32 @@
+ from triton.experimental.gluon.language._core import builtin
+ from triton.experimental.gluon.language.nvidia.hopper.tma import (
+     async_copy_global_to_shared,
+     async_copy_shared_to_global,
+     store_wait,
+     tensor_descriptor,
+     tensor_descriptor_type,
+ )
+
+ __all__ = [
+     "async_gather",
+     "async_scatter",
+     "async_copy_global_to_shared",
+     "async_copy_shared_to_global",
+     "store_wait",
+     "tensor_descriptor",
+     "tensor_descriptor_type",
+ ]
+
+
+ @builtin
+ def async_gather(tensor_desc, x_offsets, y_offset, barrier, result, pred=True, _semantic=None):
+     pred = _semantic.to_tensor(pred)
+     y_offset = _semantic.to_tensor(y_offset)
+     _semantic.builder.create_async_tma_gather(tensor_desc.handle, x_offsets.handle, y_offset.handle, barrier.handle,
+                                               result.handle, pred.handle)
+
+
+ @builtin
+ def async_scatter(tensor_desc, x_offsets, y_offset, src, _semantic=None):
+     y_offset = _semantic.to_tensor(y_offset)
+     _semantic.builder.create_async_tma_scatter(tensor_desc.handle, x_offsets.handle, y_offset.handle, src.handle)
triton/experimental/gluon/language/nvidia/hopper/__init__.py
@@ -0,0 +1,11 @@
+ from . import mbarrier
+ from . import tma
+ from ... import _core
+
+ __all__ = ["fence_async_shared", "mbarrier", "tma"]
+
+
+ @_core.builtin
+ def fence_async_shared(cluster=False, _semantic=None):
+     cluster = _core._unwrap_if_constexpr(cluster)
+     _semantic.builder.create_fence_async_shared(cluster)
triton/experimental/gluon/language/nvidia/hopper/mbarrier.py
@@ -0,0 +1,51 @@
+ from triton.experimental.gluon.language._layouts import SwizzledSharedLayout
+ from triton.experimental.gluon.language._core import builtin, _unwrap_if_constexpr
+
+ __all__ = ["MBarrierLayout", "init", "invalidate", "expect", "wait", "arrive"]
+
+
+ class MBarrierLayout(SwizzledSharedLayout):
+
+     def __init__(self, ctas_per_cga: int = 1, cta_split_num: int = 1):
+         super().__init__(
+             vec=1,
+             per_phase=1,
+             max_phase=1,
+             order=[0],
+             ctas_per_cga=[ctas_per_cga],
+             cta_split_num=[cta_split_num],
+             cta_order=[0],
+         )
+
+
+ @builtin
+ def init(mbarrier, count, _semantic=None):
+     count = _unwrap_if_constexpr(count)
+     _semantic.builder.create_mbarrier_init(mbarrier.handle, count)
+
+
+ @builtin
+ def invalidate(mbarrier, _semantic=None):
+     _semantic.builder.create_mbarrier_inval(mbarrier.handle)
+
+
+ @builtin
+ def expect(mbarrier, bytes, pred=True, _semantic=None):
+     bytes = _unwrap_if_constexpr(bytes)
+     pred = _semantic.to_tensor(pred)
+     _semantic.builder.create_mbarrier_expect(mbarrier.handle, bytes, pred.handle)
+
+
+ @builtin
+ def wait(mbarrier, phase, pred=True, deps=(), _semantic=None):
+     phase = _semantic.to_tensor(phase)
+     pred = _semantic.to_tensor(pred)
+     deps = [x.handle for x in deps]
+     _semantic.builder.create_mbarrier_wait(mbarrier.handle, phase.handle, pred.handle, deps)
+
+
+ @builtin
+ def arrive(mbarrier, count, pred=True, _semantic=None):
+     count = _unwrap_if_constexpr(count)
+     pred = _semantic.to_tensor(pred)
+     _semantic.builder.create_mbarrier_arrive(mbarrier.handle, count, pred.handle)