triton-windows 3.3.1.post19__cp310-cp310-win_amd64.whl → 3.5.0.post21__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of triton-windows might be problematic.

Files changed (225)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +11 -2
  3. triton/_filecheck.py +97 -0
  4. triton/_internal_testing.py +95 -18
  5. triton/_utils.py +112 -21
  6. triton/backends/__init__.py +20 -23
  7. triton/backends/amd/__init__.py +0 -0
  8. triton/backends/amd/compiler.py +161 -119
  9. triton/backends/amd/driver.c +118 -46
  10. triton/backends/amd/driver.py +274 -96
  11. triton/backends/compiler.py +7 -21
  12. triton/backends/driver.py +13 -0
  13. triton/backends/nvidia/bin/ptxas.exe +0 -0
  14. triton/backends/nvidia/compiler.py +163 -106
  15. triton/backends/nvidia/driver.c +166 -101
  16. triton/backends/nvidia/driver.py +384 -202
  17. triton/compiler/__init__.py +5 -2
  18. triton/compiler/code_generator.py +439 -231
  19. triton/compiler/compiler.py +152 -84
  20. triton/experimental/__init__.py +0 -0
  21. triton/experimental/gluon/__init__.py +5 -0
  22. triton/experimental/gluon/_compiler.py +0 -0
  23. triton/experimental/gluon/_runtime.py +102 -0
  24. triton/experimental/gluon/language/__init__.py +119 -0
  25. triton/experimental/gluon/language/_core.py +490 -0
  26. triton/experimental/gluon/language/_layouts.py +583 -0
  27. triton/experimental/gluon/language/_math.py +20 -0
  28. triton/experimental/gluon/language/_semantic.py +380 -0
  29. triton/experimental/gluon/language/_standard.py +80 -0
  30. triton/experimental/gluon/language/amd/__init__.py +4 -0
  31. triton/experimental/gluon/language/amd/_layouts.py +96 -0
  32. triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
  33. triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
  34. triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
  35. triton/experimental/gluon/language/extra/__init__.py +3 -0
  36. triton/experimental/gluon/language/nvidia/__init__.py +4 -0
  37. triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
  38. triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
  39. triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
  40. triton/experimental/gluon/language/nvidia/blackwell/__init__.py +387 -0
  41. triton/experimental/gluon/language/nvidia/blackwell/tma.py +52 -0
  42. triton/experimental/gluon/language/nvidia/hopper/__init__.py +132 -0
  43. triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +34 -0
  44. triton/experimental/gluon/language/nvidia/hopper/tma.py +97 -0
  45. triton/experimental/gluon/nvidia/__init__.py +4 -0
  46. triton/experimental/gluon/nvidia/blackwell.py +3 -0
  47. triton/experimental/gluon/nvidia/hopper.py +45 -0
  48. triton/knobs.py +546 -0
  49. triton/language/__init__.py +50 -19
  50. triton/language/core.py +909 -572
  51. triton/language/extra/cuda/__init__.py +10 -7
  52. triton/language/extra/cuda/gdc.py +42 -0
  53. triton/language/extra/cuda/libdevice.py +394 -394
  54. triton/language/extra/cuda/utils.py +21 -21
  55. triton/language/extra/hip/__init__.py +3 -1
  56. triton/language/extra/hip/libdevice.py +120 -104
  57. triton/language/extra/hip/utils.py +35 -0
  58. triton/language/extra/libdevice.py +4 -0
  59. triton/language/math.py +65 -66
  60. triton/language/random.py +12 -2
  61. triton/language/semantic.py +1757 -1768
  62. triton/language/standard.py +127 -62
  63. triton/language/target_info.py +54 -0
  64. triton/runtime/_allocation.py +15 -3
  65. triton/runtime/_async_compile.py +55 -0
  66. triton/runtime/autotuner.py +117 -60
  67. triton/runtime/build.py +83 -17
  68. triton/runtime/cache.py +61 -47
  69. triton/runtime/driver.py +25 -47
  70. triton/runtime/interpreter.py +95 -50
  71. triton/runtime/jit.py +445 -248
  72. triton/runtime/tcc/include/_mingw.h +8 -10
  73. triton/runtime/tcc/include/assert.h +5 -0
  74. triton/runtime/tcc/include/errno.h +1 -1
  75. triton/runtime/tcc/include/float.h +21 -3
  76. triton/runtime/tcc/include/iso646.h +36 -0
  77. triton/runtime/tcc/include/limits.h +5 -0
  78. triton/runtime/tcc/include/malloc.h +2 -2
  79. triton/runtime/tcc/include/math.h +21 -261
  80. triton/runtime/tcc/include/stdalign.h +16 -0
  81. triton/runtime/tcc/include/stdarg.h +5 -70
  82. triton/runtime/tcc/include/stdatomic.h +171 -0
  83. triton/runtime/tcc/include/stddef.h +7 -19
  84. triton/runtime/tcc/include/stdlib.h +15 -4
  85. triton/runtime/tcc/include/stdnoreturn.h +7 -0
  86. triton/runtime/tcc/include/sys/stat.h +2 -2
  87. triton/runtime/tcc/include/sys/types.h +5 -0
  88. triton/runtime/tcc/include/tcc/tcc_libm.h +444 -27
  89. triton/runtime/tcc/include/tccdefs.h +342 -0
  90. triton/runtime/tcc/include/tgmath.h +89 -0
  91. triton/runtime/tcc/include/uchar.h +33 -0
  92. triton/runtime/tcc/include/unistd.h +1 -0
  93. triton/runtime/tcc/include/winapi/qos.h +72 -0
  94. triton/runtime/tcc/include/winapi/shellapi.h +59 -0
  95. triton/runtime/tcc/include/winapi/winbase.h +9 -2
  96. triton/runtime/tcc/include/winapi/wincon.h +8 -0
  97. triton/runtime/tcc/include/winapi/windows.h +1 -1
  98. triton/runtime/tcc/include/winapi/winnls.h +778 -0
  99. triton/runtime/tcc/include/winapi/winnt.h +9 -7
  100. triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
  101. triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
  102. triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
  103. triton/runtime/tcc/lib/libtcc1.a +0 -0
  104. triton/runtime/tcc/lib/python314.def +1800 -0
  105. triton/runtime/tcc/lib/python314t.def +1809 -0
  106. triton/runtime/tcc/libtcc.dll +0 -0
  107. triton/runtime/tcc/tcc.exe +0 -0
  108. triton/testing.py +16 -12
  109. triton/tools/compile.py +62 -14
  110. triton/tools/disasm.py +3 -4
  111. triton/tools/extra/cuda/compile.c +1 -0
  112. triton/tools/extra/hip/compile.cpp +66 -0
  113. triton/tools/extra/hip/compile.h +13 -0
  114. triton/tools/ragged_tma.py +92 -0
  115. triton/tools/tensor_descriptor.py +34 -0
  116. triton/windows_utils.py +52 -81
  117. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.5.0.post21.dist-info}/METADATA +8 -4
  118. triton_windows-3.5.0.post21.dist-info/RECORD +217 -0
  119. triton_windows-3.5.0.post21.dist-info/entry_points.txt +3 -0
  120. triton_windows-3.5.0.post21.dist-info/licenses/LICENSE +23 -0
  121. triton_windows-3.5.0.post21.dist-info/top_level.txt +1 -0
  122. triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h +0 -358
  123. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +0 -1010
  124. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +0 -1638
  125. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +0 -1814
  126. triton/backends/amd/include/hip/amd_detail/amd_hip_bfloat16.h +0 -293
  127. triton/backends/amd/include/hip/amd_detail/amd_hip_common.h +0 -32
  128. triton/backends/amd/include/hip/amd_detail/amd_hip_complex.h +0 -174
  129. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +0 -835
  130. triton/backends/amd/include/hip/amd_detail/amd_hip_fp16.h +0 -1809
  131. triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +0 -1391
  132. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +0 -108
  133. triton/backends/amd/include/hip/amd_detail/amd_hip_math_constants.h +0 -124
  134. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h +0 -405
  135. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +0 -196
  136. triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +0 -565
  137. triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h +0 -2226
  138. triton/backends/amd/include/hip/amd_detail/amd_math_functions.h +0 -104
  139. triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h +0 -244
  140. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +0 -538
  141. triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +0 -288
  142. triton/backends/amd/include/hip/amd_detail/concepts.hpp +0 -30
  143. triton/backends/amd/include/hip/amd_detail/device_library_decls.h +0 -133
  144. triton/backends/amd/include/hip/amd_detail/functional_grid_launch.hpp +0 -218
  145. triton/backends/amd/include/hip/amd_detail/grid_launch.h +0 -67
  146. triton/backends/amd/include/hip/amd_detail/grid_launch.hpp +0 -50
  147. triton/backends/amd/include/hip/amd_detail/grid_launch_GGL.hpp +0 -26
  148. triton/backends/amd/include/hip/amd_detail/helpers.hpp +0 -137
  149. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +0 -1446
  150. triton/backends/amd/include/hip/amd_detail/hip_assert.h +0 -101
  151. triton/backends/amd/include/hip/amd_detail/hip_cooperative_groups_helper.h +0 -242
  152. triton/backends/amd/include/hip/amd_detail/hip_fp16_gcc.h +0 -254
  153. triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h +0 -96
  154. triton/backends/amd/include/hip/amd_detail/hip_ldg.h +0 -100
  155. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +0 -10570
  156. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +0 -78
  157. triton/backends/amd/include/hip/amd_detail/host_defines.h +0 -184
  158. triton/backends/amd/include/hip/amd_detail/hsa_helpers.hpp +0 -102
  159. triton/backends/amd/include/hip/amd_detail/macro_based_grid_launch.hpp +0 -798
  160. triton/backends/amd/include/hip/amd_detail/math_fwd.h +0 -698
  161. triton/backends/amd/include/hip/amd_detail/ockl_image.h +0 -177
  162. triton/backends/amd/include/hip/amd_detail/program_state.hpp +0 -107
  163. triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h +0 -491
  164. triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h +0 -478
  165. triton/backends/amd/include/hip/channel_descriptor.h +0 -39
  166. triton/backends/amd/include/hip/device_functions.h +0 -38
  167. triton/backends/amd/include/hip/driver_types.h +0 -468
  168. triton/backends/amd/include/hip/hip_bf16.h +0 -36
  169. triton/backends/amd/include/hip/hip_bfloat16.h +0 -44
  170. triton/backends/amd/include/hip/hip_common.h +0 -100
  171. triton/backends/amd/include/hip/hip_complex.h +0 -38
  172. triton/backends/amd/include/hip/hip_cooperative_groups.h +0 -46
  173. triton/backends/amd/include/hip/hip_deprecated.h +0 -95
  174. triton/backends/amd/include/hip/hip_ext.h +0 -161
  175. triton/backends/amd/include/hip/hip_fp16.h +0 -36
  176. triton/backends/amd/include/hip/hip_fp8.h +0 -33
  177. triton/backends/amd/include/hip/hip_gl_interop.h +0 -32
  178. triton/backends/amd/include/hip/hip_hcc.h +0 -24
  179. triton/backends/amd/include/hip/hip_math_constants.h +0 -36
  180. triton/backends/amd/include/hip/hip_profile.h +0 -27
  181. triton/backends/amd/include/hip/hip_runtime.h +0 -75
  182. triton/backends/amd/include/hip/hip_runtime_api.h +0 -9261
  183. triton/backends/amd/include/hip/hip_texture_types.h +0 -29
  184. triton/backends/amd/include/hip/hip_vector_types.h +0 -41
  185. triton/backends/amd/include/hip/hip_version.h +0 -17
  186. triton/backends/amd/include/hip/hiprtc.h +0 -421
  187. triton/backends/amd/include/hip/library_types.h +0 -78
  188. triton/backends/amd/include/hip/math_functions.h +0 -42
  189. triton/backends/amd/include/hip/surface_types.h +0 -63
  190. triton/backends/amd/include/hip/texture_types.h +0 -194
  191. triton/backends/amd/include/hsa/Brig.h +0 -1131
  192. triton/backends/amd/include/hsa/amd_hsa_common.h +0 -91
  193. triton/backends/amd/include/hsa/amd_hsa_elf.h +0 -462
  194. triton/backends/amd/include/hsa/amd_hsa_kernel_code.h +0 -269
  195. triton/backends/amd/include/hsa/amd_hsa_queue.h +0 -109
  196. triton/backends/amd/include/hsa/amd_hsa_signal.h +0 -80
  197. triton/backends/amd/include/hsa/hsa.h +0 -5738
  198. triton/backends/amd/include/hsa/hsa_amd_tool.h +0 -91
  199. triton/backends/amd/include/hsa/hsa_api_trace.h +0 -579
  200. triton/backends/amd/include/hsa/hsa_api_trace_version.h +0 -68
  201. triton/backends/amd/include/hsa/hsa_ext_amd.h +0 -3146
  202. triton/backends/amd/include/hsa/hsa_ext_finalize.h +0 -531
  203. triton/backends/amd/include/hsa/hsa_ext_image.h +0 -1454
  204. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +0 -488
  205. triton/backends/amd/include/hsa/hsa_ven_amd_loader.h +0 -667
  206. triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +0 -416
  207. triton/backends/amd/include/roctracer/ext/prof_protocol.h +0 -107
  208. triton/backends/amd/include/roctracer/hip_ostream_ops.h +0 -4515
  209. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +0 -1727
  210. triton/backends/amd/include/roctracer/hsa_prof_str.h +0 -3059
  211. triton/backends/amd/include/roctracer/roctracer.h +0 -779
  212. triton/backends/amd/include/roctracer/roctracer_ext.h +0 -81
  213. triton/backends/amd/include/roctracer/roctracer_hcc.h +0 -24
  214. triton/backends/amd/include/roctracer/roctracer_hip.h +0 -37
  215. triton/backends/amd/include/roctracer/roctracer_hsa.h +0 -112
  216. triton/backends/amd/include/roctracer/roctracer_plugin.h +0 -137
  217. triton/backends/amd/include/roctracer/roctracer_roctx.h +0 -67
  218. triton/backends/amd/include/roctracer/roctx.h +0 -229
  219. triton/language/_utils.py +0 -21
  220. triton/language/extra/cuda/_experimental_tma.py +0 -106
  221. triton/runtime/tcc/lib/libtcc1-64.a +0 -0
  222. triton/tools/experimental_descriptor.py +0 -32
  223. triton_windows-3.3.1.post19.dist-info/RECORD +0 -260
  224. triton_windows-3.3.1.post19.dist-info/top_level.txt +0 -14
  225. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.5.0.post21.dist-info}/WHEEL +0 -0
triton/experimental/gluon/language/nvidia/blackwell/__init__.py (new file)
@@ -0,0 +1,387 @@
+ from __future__ import annotations
+ from typing import Optional, Tuple, List, TYPE_CHECKING
+
+ from dataclasses import dataclass
+ from triton.runtime.jit import constexpr_function
+ from triton.experimental.gluon.language import _core as ttgl
+ from triton.experimental.gluon.language._core import builtin, base_type, base_value, _unwrap_if_constexpr
+ from triton.experimental.gluon.language._layouts import BlockedLayout, _get_shape_per_cta
+ from triton.experimental.gluon.language._semantic import _check
+
+ from . import tma
+ from ..hopper import fence_async_shared, mbarrier
+ from ..ampere import async_copy
+
+ from triton._C.libtriton import ir
+ if TYPE_CHECKING:
+     from triton._C.libtriton.gluon_ir import GluonOpBuilder
+     from ..._semantic import GluonSemantic
+
+ __all__ = [
+     "allocate_tensor_memory",
+     "async_copy",
+     "fence_async_shared",
+     "get_tmem_32x32b_reg_layout",
+     "mbarrier",
+     "tensor_memory_descriptor",
+     "TensorMemoryLayout",
+     "tma",
+ ]
+
+
+ @dataclass(frozen=True, eq=True)
+ class TensorMemoryLayout:
+     """
+     Describes the layout for tensor memory in Blackwell architecture.
+
+     Args:
+         block (Tuple[int, int]): Tiling block dimensions (M/rows, N/cols).
+         unpacked (bool): For sub-32 bit elements, whether they are unpacked to 32 bits.
+         cta_split_num (Optional[Tuple[int, int]]): CTA split factors. Defaults to None.
+     """
+     block: Tuple[int, int]
+     unpacked: bool
+     cta_split_num: Optional[Tuple[int, int]] = None
+
+     def __post_init__(self):
+         assert len(self.block) == 2
+         assert self.cta_split_num is None or len(self.cta_split_num) == 2
+
+     def _to_ir(self, builder):
+         cta_split_num = self.cta_split_num or [1, 1]
+         return builder.get_tensor_memory_layout(
+             self.block,
+             self.unpacked,
+             cta_split_num,
+         )
+
+     def mangle(self) -> str:
+         block_str = f"{self.block[0]}x{self.block[1]}"
+         unpacked_str = "U" if self.unpacked else "P"
+         cta_split_str = f"CS{self.cta_split_num[0]}x{self.cta_split_num[1]}" if self.cta_split_num else ""
+         return f"TL{block_str}{unpacked_str}{cta_split_str}TL"
+
+
+ @dataclass(frozen=True, eq=True)
+ class TensorMemoryScalesLayout:
+     """
+     Describes the layout for tensor memory scales in Blackwell architecture.
+
+     Args:
+         cta_split_num (Optional[Tuple[int, int]]): CTA split factors. Defaults to None.
+     """
+     cta_split_num: Optional[Tuple[int, int]] = None
+
+     def __post_init__(self):
+         assert self.cta_split_num is None or len(self.cta_split_num) == 2
+
+     def _to_ir(self, builder):
+         cta_split_num = self.cta_split_num or [1, 1]
+         return builder.get_tensor_memory_scales_layout(cta_split_num, )
+
+     def mangle(self) -> str:
+         cta_split_str = f"CS{self.cta_split_num[0]}x{self.cta_split_num[1]}" if self.cta_split_num else ""
+         return f"TLS{cta_split_str}TLS"
+
+
+ @constexpr_function
+ def _cdiv(x, div):
+     return (x + div - 1) // div
+
+
+ @constexpr_function
+ def get_tmem_32x32b_reg_layout(M, N, shape, num_warps, ctas_per_cga=None, cta_split_num=None, cta_order=None):
+     """Returns a BlockedLayout compatible with load/store on tensor memory with the 32x32b instruction variant.
+     """
+     assert len(shape) == 2, "expected a 2D tensor"
+     assert num_warps in [4, 8], "expected 4 or 8 warps"
+
+     shape_per_cta = _get_shape_per_cta(shape, cta_split_num)
+     blocks_per_tile = [shape_per_cta[0] // M, shape_per_cta[1] // N]
+     num_blocks = blocks_per_tile[0] * blocks_per_tile[1]
+
+     num_warp_groups = num_warps // 4
+     if M == 64:
+         threads_per_warp = [16, 2]
+         if num_blocks == 1:
+             size_per_thread = [1, _cdiv(N, num_warp_groups * 2)]
+             warps_per_cta = [4, num_warp_groups]
+         else:
+             size_per_thread = [1, _cdiv(N, 2)]
+             warps_per_cta = [4 * min(blocks_per_tile[0], num_warp_groups)]
+             warps_per_cta.append(_cdiv(num_warp_groups, warps_per_cta[0] // 4))
+     else:
+         if shape[0] > 128:
+             size_per_thread = [1, N]
+             threads_per_warp = [32, 1]
+             warps_per_cta = [4 * num_warp_groups, 1]
+         else:
+             size_per_thread = [1, _cdiv(N, num_warp_groups)]
+             threads_per_warp = [32, 1]
+             warps_per_cta = [4, num_warp_groups]
+     return BlockedLayout(
+         size_per_thread=size_per_thread,
+         threads_per_warp=threads_per_warp,
+         warps_per_cta=warps_per_cta,
+         order=[0, 1],
+         ctas_per_cga=ctas_per_cga,
+         cta_split_num=cta_split_num,
+         cta_order=cta_order,
+     )
+
+
+ class tensor_memory_descriptor_type(base_type):
+
+     def __init__(self, element_ty, shape, layout, alloc_shape):
+         self.element_ty = element_ty
+         self.shape = shape
+         self.layout = layout
+         self.alloc_shape = alloc_shape
+         assert isinstance(layout, TensorMemoryLayout) or isinstance(layout, TensorMemoryScalesLayout)
+
+     def to_ir(self, builder: GluonOpBuilder) -> None:
+         return builder.get_tensor_mem_desc_ty(
+             self.element_ty.to_ir(builder),
+             self.shape,
+             self.layout._to_ir(builder),
+             self.alloc_shape,
+         )
+
+     def _unflatten_ir(self, handles: List[ir.Value], cursor: int) -> Tuple[tensor_memory_descriptor, int]:
+         value = tensor_memory_descriptor(handles[cursor], self.element_ty, self.shape, self.layout, self.alloc_shape)
+         return value, cursor + 1
+
+     def _flatten_ir_types(self, builder: GluonOpBuilder, out: List[ir.type]) -> None:
+         out.append(self.to_ir(builder))
+
+     def __str__(self) -> str:
+         return f"tensor_memory_descriptor<{self.element_ty}, {self.shape}, {self.layout}>"
+
+     def __eq__(self, other) -> bool:
+         return (type(self) is type(other) and self.shape == other.shape and self.layout == other.layout
+                 and self.alloc_shape == other.alloc_shape)
+
+     def __neq__(self, other) -> bool:
+         return not (self == other)
+
+     def mangle(self) -> str:
+         shape_str = "_".join([str(s) for s in self.shape])
+         return f"MD{self.element_ty.mangle()}S{shape_str}SL{self.layout.mangle()}LAS{self.alloc_shape}ASMD"
+
+
+ class tensor_memory_descriptor(base_value):
+     """
+     Represents a tensor memory descriptor handle for Tensor Core Gen5 operations.
+     """
+
+     def __init__(self, handle, element_ty, shape, layout, alloc_shape):
+         self.handle = handle
+         self.type = tensor_memory_descriptor_type(element_ty, shape, layout, alloc_shape)
+
+     def _flatten_ir(self, handles: List[ir.value]) -> None:
+         handles.append(self.handle)
+
+     @property
+     def dtype(self):
+         return self.type.element_ty
+
+     @property
+     def shape(self):
+         return self.type.shape
+
+     @property
+     def rank(self):
+         return len(self.shape)
+
+     @property
+     def layout(self):
+         return self.type.layout
+
+     def __str__(self) -> str:
+         return str(self.type)
+
+     @builtin
+     def load(self, layout, _semantic: GluonSemantic) -> ttgl.tensor:
+         """
+         Load a tensor from tensor memory.
+
+         Args:
+             layout (DistributedLayout): Destination layout of the tensor.
+
+         Returns:
+             tensor: A distributed tensor containing the loaded data.
+         """
+         layout = _unwrap_if_constexpr(layout)
+         ret_ty = ttgl.distributed_type(self.dtype, self.shape, layout)
+         builder = _semantic.builder
+         handle = builder.create_tmem_load(ret_ty.to_ir(builder), self.handle)
+         return ttgl.tensor(handle, ret_ty)
+
+     @builtin
+     def store(self, value, pred=True, _semantic: GluonSemantic = None) -> None:
+         """
+         Store a tensor into tensor memory.
+
+         Args:
+             value (tensor): The tensor to store.
+             pred (bool): Scalar predicate. Operation is skipped if predicate is False. Defaults to True.
+         """
+         pred = _unwrap_if_constexpr(pred)
+         pred = _semantic.to_tensor(pred)
+         assert value.shape == self.shape, f"source shape {value.shape} does not match destination shape {self.shape}"
+         assert value.dtype == self.dtype, f"source dtype {value.dtype} does not match destination dtype {self.dtype}"
+         _semantic.builder.create_tmem_store(self.handle, value.handle, pred.handle)
+
+     @builtin
+     def slice(self, start, length, _semantic: GluonSemantic) -> None:
+         """
+         Create a slice of the tensor memory descriptor along the last dimension.
+
+         Args:
+             start (int): The starting index for subslice.
+             length (int): The length of the subslice.
+
+         Returns:
+             tensor_memory_descriptor: Descriptor for the subslice.
+         """
+         start = _unwrap_if_constexpr(start)
+         length = _unwrap_if_constexpr(length)
+         _check(isinstance(start, int), lambda: "start must be a constant int")
+         _check(isinstance(length, int), lambda: "length must be a constant int")
+         shape = self.shape[:-1] + [length]
+         layout = self.type.layout
+         layout = TensorMemoryLayout((layout.block[0], min(layout.block[1], length)), layout.unpacked,
+                                     layout.cta_split_num)
+         ret = tensor_memory_descriptor(None, self.dtype, shape, layout, self.type.alloc_shape)
+         builder = _semantic.builder
+         ret.handle = builder.create_tmem_subslice(ret.type.to_ir(builder), self.handle, start)
+         return ret
+
+     @builtin
+     def index(self, index, _semantic: GluonSemantic = None) -> tensor_memory_descriptor:
+         """
+         Create a subview of tensor memory by indexing the first dimension.
+
+         Args:
+             index (tensor): The index tensor for the subview.
+
+         Returns:
+             tensor_memory_descriptor: Descriptor for the indexed subview.
+         """
+         index = _semantic.to_tensor(index)
+         builder = _semantic.builder
+         shape = self.shape[1:]
+         layout = self.layout
+         ret = tensor_memory_descriptor(None, self.dtype, shape, layout, self.type.alloc_shape)
+         ret.handle = builder.create_memdesc_index(ret.type.to_ir(builder), self.handle, index.handle)
+         return ret
+
+     @builtin
+     def _reinterpret(self, dtype, shape, layout, _semantic: GluonSemantic = None) -> tensor_memory_descriptor:
+         """
+         Reinterpret tensor memory descriptor with a new dtype, shape, and layout.
+
+         Args:
+             dtype (dtype): The new data type.
+             shape (Sequence[int]): The new shape.
+             layout (TensorMemoryLayout): The new layout.
+
+         Returns:
+             tensor_memory_descriptor: Descriptor with updated type and layout.
+         """
+         dtype = _unwrap_if_constexpr(dtype)
+         shape = [_unwrap_if_constexpr(s) for s in shape]
+         layout = _unwrap_if_constexpr(layout)
+
+         ty = tensor_memory_descriptor_type(dtype, shape, layout, shape)
+         handle = _semantic.builder.create_memdesc_reinterpret(ty.to_ir(_semantic.builder), self.handle)
+         return tensor_memory_descriptor(handle, **ty.__dict__)
+
+
+ @builtin
+ def allocate_tensor_memory(element_ty, shape, layout, value=None, _semantic=None):
+     """
+     Allocate tensor memory.
+
+     Args:
+         element_ty (dtype): The element data type.
+         shape (Sequence[int]): The descriptor shape.
+         layout (TensorMemoryLayout): The layout of the tensor memory.
+         value (tensor, optional): Initial tensor to copy. Defaults to None.
+
+     Returns:
+         tensor_memory_descriptor: Descriptor for the allocated memory.
+     """
+     element_ty = _unwrap_if_constexpr(element_ty)
+     shape = _unwrap_if_constexpr(shape)
+     layout = _unwrap_if_constexpr(layout)
+     value = value.handle if value is not None else None
+
+     ty = tensor_memory_descriptor_type(element_ty, shape, layout, shape)
+     builder = _semantic.builder
+     handle = builder.create_tmem_alloc(ty.to_ir(builder), value)
+     return tensor_memory_descriptor(handle, element_ty, shape, layout, shape)
+
+
+ @builtin
+ def tcgen05_copy(src, dst, _semantic=None):
+     """
+     Start an asynchronous copy from shared memory to tensor memory.
+
+     WARNING: The current semantics of the instruction are not well defined and
+     the API will change in the future. Use at your own risk.
+
+     Args:
+         src (shared_memory_descriptor): Shared memory to copy from.
+         dst (tensor_memory_descriptor): Tensor memory to copy to.
+     """
+     assert isinstance(src, ttgl.shared_memory_descriptor), "source must be a shared memory descriptor"
+     assert isinstance(dst, tensor_memory_descriptor), "destination must be a tensor memory descriptor"
+     _semantic.builder.create_tmem_copy(src.handle, dst.handle)
+
+
+ @builtin
+ def tcgen05_mma(a, b, acc, *, use_acc=True, pred=True, mbarriers=None, mbarrier_preds=None, _semantic=None):
+     """
+     Emit a 5th generation TensorCore MMA instruction.
+     acc = a * b + (acc if use_acc else 0)
+
+     Args:
+         a (shared_memory_descriptor): Left hand side operand in shared memory.
+         b (shared_memory_descriptor or tensor_memory_descriptor): Right hand side operand in shared or tensor memory.
+         acc (tensor_memory_descriptor): Accumulator value in tensor memory (mutated).
+         use_acc (bool): Whether to use the initial value of the accumulator. Defaults to True.
+         pred (bool): Scalar predicate. Operation is skipped if predicate is False. Defaults to True.
+         mbarriers (Sequence[shared_memory_descriptor], optional): Barriers to signal when the operation is complete. If None, mma is synchronous. Defaults to None.
+         mbarrier_preds (Sequence[bool], optional): Predicates for barriers. Defaults to None.
+     """
+     use_acc = _semantic.to_tensor(use_acc)
+     pred = _semantic.to_tensor(pred)
+
+     if mbarriers is None:
+         assert mbarrier_preds is None
+         mbarriers = []
+         mbarrier_preds = []
+     else:
+         mbarriers = [bar.handle for bar in mbarriers]
+         if mbarrier_preds is None:
+             true = _semantic.to_tensor(True)
+             mbarrier_preds = [true.handle] * len(mbarriers)
+         else:
+             mbarrier_preds = _semantic._convert_to_ir_values(mbarrier_preds, require_i64=False)
+
+     _semantic.builder.create_tcgen05_mma(a.handle, b.handle, acc.handle, use_acc.handle, pred.handle, mbarriers,
+                                          mbarrier_preds)
+
+
+ @builtin
+ def tcgen05_commit(barrier, _semantic=None):
+     """
+     This instruction causes the provided mbarrier to be arrived-on with a count
+     of 1 when all async tcgen05 MMA and copy instructions previously issued by
+     the thread are complete.
+
+     Args:
+         barrier (shared_memory_descriptor): The barrier to track completion of tcgen05 MMA and copy instructions.
+     """
+     _semantic.builder.create_tcgen05_commit(barrier.handle)
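
Taken together, the new Blackwell module gives Gluon kernels a small tensor-memory workflow: describe the allocation with TensorMemoryLayout, allocate it with allocate_tensor_memory, drive it with tcgen05_mma and tcgen05_commit, and read results back with load using a register layout from get_tmem_32x32b_reg_layout. The fragment below is a hedged sketch of how those pieces might compose; it is not part of the diff. It assumes a surrounding Gluon-JITed kernel, prebuilt shared-memory operands smem_a/smem_b, an initialized mbarrier bar, and that ttgl.float32 and the Ampere-style mbarrier.wait signature behave as in upstream Triton.

# Hypothetical fragment, not from this package: composes the APIs added in the
# hunk above. Must run inside a Gluon-JITed kernel; `smem_a`, `smem_b`, `bar`,
# and `num_warps` are assumed to be prepared by the caller.
from triton.experimental.gluon import language as ttgl
from triton.experimental.gluon.language.nvidia import blackwell as bw

BLOCK_M, BLOCK_N = 128, 128  # would be constexpr values in a real kernel

def blackwell_matmul_tile(smem_a, smem_b, bar, num_warps):
    # Accumulator lives in tensor memory (Tensor Core Gen5).
    tmem_layout = bw.TensorMemoryLayout(block=(BLOCK_M, BLOCK_N), unpacked=True)
    acc = bw.allocate_tensor_memory(ttgl.float32, [BLOCK_M, BLOCK_N], tmem_layout)

    # acc = smem_a @ smem_b; completion is reported through the mbarrier.
    bw.tcgen05_mma(smem_a, smem_b, acc, use_acc=False, mbarriers=[bar])
    bw.tcgen05_commit(bar)
    bw.mbarrier.wait(bar, 0)  # wait comes from the Ampere module (signature assumed)

    # Pull the result into registers with a compatible blocked layout.
    reg_layout = bw.get_tmem_32x32b_reg_layout(BLOCK_M, BLOCK_N, [BLOCK_M, BLOCK_N], num_warps)
    return acc.load(reg_layout)
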
triton/experimental/gluon/language/nvidia/blackwell/tma.py (new file)
@@ -0,0 +1,52 @@
+ from triton.experimental.gluon.language._core import builtin
+ from triton.experimental.gluon.language.nvidia.hopper.tma import (
+     async_copy_global_to_shared,
+     async_copy_shared_to_global,
+     store_wait,
+     tensor_descriptor,
+     tensor_descriptor_type,
+ )
+
+ __all__ = [
+     "async_gather",
+     "async_scatter",
+     "async_copy_global_to_shared",
+     "async_copy_shared_to_global",
+     "store_wait",
+     "tensor_descriptor",
+     "tensor_descriptor_type",
+ ]
+
+
+ @builtin
+ def async_gather(tensor_desc, x_offsets, y_offset, barrier, result, pred=True, _semantic=None):
+     """
+     Asynchronously gather elements from global memory to shared memory using TMA.
+
+     Args:
+         tensor_desc (tensor_descriptor): The tensor descriptor.
+         x_offsets (tensor): 1D tensor of X offsets.
+         y_offset (int): Scalar Y offset.
+         barrier (shared_memory_descriptor): Barrier that will be signaled when the operation is complete.
+         result (shared_memory_descriptor): Result shared memory, must have NVMMASharedLayout.
+         pred (bool): Scalar predicate. Operation is skipped if predicate is False. Defaults to True.
+     """
+     pred = _semantic.to_tensor(pred)
+     y_offset = _semantic.to_tensor(y_offset)
+     _semantic.builder.create_async_tma_gather(tensor_desc.handle, x_offsets.handle, y_offset.handle, barrier.handle,
+                                               result.handle, pred.handle)
+
+
+ @builtin
+ def async_scatter(tensor_desc, x_offsets, y_offset, src, _semantic=None):
+     """
+     Asynchronously scatter elements from shared memory to global memory using TMA.
+
+     Args:
+         tensor_desc (tensor_descriptor): The tensor descriptor.
+         x_offsets (tensor): 1D tensor of X offsets.
+         y_offset (int): Scalar Y offset.
+         src (shared_memory_descriptor): The source data, must be in NVMMASharedLayout.
+     """
+     y_offset = _semantic.to_tensor(y_offset)
+     _semantic.builder.create_async_tma_scatter(tensor_desc.handle, x_offsets.handle, y_offset.handle, src.handle)
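
async_gather and async_scatter extend the Hopper TMA helpers (re-exported above) with offset-driven row transfers. The sketch below shows the call pattern only; it is not part of the diff and assumes the kernel has already built a tensor_descriptor desc, a shared-memory tile smem in NVMMASharedLayout, an initialized mbarrier bar, and a 1D x_offsets tensor.

# Hypothetical fragment, not from this package: call pattern for the new
# gather/scatter builtins. `desc`, `x_offsets`, `smem`, and `bar` are assumed
# to be prepared by the surrounding Gluon kernel.
from triton.experimental.gluon.language.nvidia.blackwell import tma, mbarrier

def gather_modify_scatter(desc, x_offsets, y_offset, smem, bar, nbytes):
    # Tell the barrier how many bytes the gather will deposit, then issue it.
    mbarrier.expect(bar, nbytes)
    tma.async_gather(desc, x_offsets, y_offset, bar, smem)
    mbarrier.wait(bar, 0)  # wait comes from the Ampere module (signature assumed)

    # ... the tile in `smem` could be transformed here ...

    # Write the tile back out through the matching scatter and drain the stores.
    tma.async_scatter(desc, x_offsets, y_offset, smem)
    tma.store_wait(0)  # re-exported from hopper.tma; argument meaning assumed
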
triton/experimental/gluon/language/nvidia/hopper/__init__.py (new file)
@@ -0,0 +1,132 @@
+ from __future__ import annotations
+ from triton.compiler.code_generator import unflatten_ir_values
+ from ..ampere import async_copy
+ from . import mbarrier, tma
+ from ... import _core
+
+ from typing import List, Tuple, TYPE_CHECKING
+ if TYPE_CHECKING:
+     from triton._C.libtriton import ir
+
+ __all__ = ["async_copy", "fence_async_shared", "mbarrier", "tma", "warpgroup_mma", "warpgroup_mma_wait"]
+
+
+ @_core.builtin
+ def fence_async_shared(cluster=False, _semantic=None):
+     """
+     Issue a fence to complete asynchronous shared memory operations.
+
+     Args:
+         cluster (bool): Whether to fence across cluster. Defaults to False.
+     """
+     cluster = _core._unwrap_if_constexpr(cluster)
+     _semantic.builder.create_fence_async_shared(cluster)
+
+
+ class warpgroup_mma_accumulator_type(_core.base_type):
+     tensor_type: _core.dtype
+
+     def __init__(self, tensor_type: _core.dtype):
+         self.tensor_type = tensor_type
+
+     def __str__(self) -> str:
+         return f"warpgroup_mma_accumulator<{self.tensor_type}>"
+
+     def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[warpgroup_mma_accumulator, int]:
+         return warpgroup_mma_accumulator(handles[cursor], self.tensor_type), cursor + 1
+
+     def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+         self.tensor_type._flatten_ir_types(builder, out)
+
+     def __eq__(self, other) -> bool:
+         return type(self) is type(other) and self.tensor_type == other.tensor_type
+
+     def mangle(self) -> str:
+         return f"FT{self.tensor_type.mangle()}FT"
+
+
+ class warpgroup_mma_accumulator(_core.base_value):
+     handle: ir.value
+     type: warpgroup_mma_accumulator_type
+
+     def __init__(self, handle, tensor_type: _core.dtype):
+         self.handle = handle
+         self.type = warpgroup_mma_accumulator_type(tensor_type)
+
+     def _flatten_ir(self, handles: List[ir.value]) -> None:
+         handles.append(self.handle)
+
+
+ @_core.builtin
+ def warpgroup_mma_init(value, _semantic):
+     assert isinstance(value, _core.tensor)
+     return warpgroup_mma_accumulator(value.handle, value.type)
+
+
+ @_core.builtin
+ def warpgroup_mma(a, b, acc, *, use_acc=True, precision=None, max_num_imprecise_acc=None, is_async=False,
+                   _semantic=None):
+     """
+     Perform warpgroup MMA (Tensor Core) operations.
+     acc = a * b + (acc if use_acc else 0)
+
+     Args:
+         a (tensor or shared_memory_descriptor): Left hand side operand.
+         b (shared_memory_descriptor): Right hand side operand.
+         acc (tensor): Accumulator tensor.
+         use_acc (bool): Whether to use the initial value of the accumulator. Defaults to True.
+         precision (str, optional): Dot input precision. Defaults to builder default.
+         max_num_imprecise_acc (int): Max imprecise accumulations. Used for fp8 -> fp32 dot. Determines how many accumulations are done in limited precision. Defaults to None, which means no upcasting is done.
+         is_async (bool): Whether the operation is asynchronous. Defaults to False.
+
+     Returns:
+         tensor or warpgroup_mma_accumulator: Returns the result if synchronous, or a token to load the value once computed if asynchronous.
+     """
+     use_acc = _semantic.to_tensor(use_acc)
+
+     if precision is None:
+         precision = _semantic.builder.options.default_dot_input_precision
+
+     precision = _semantic._str_to_dot_input_precision(precision)
+
+     K = a.type.shape[-1]
+     if max_num_imprecise_acc is None:
+         if a.dtype.is_fp8() and b.dtype.is_fp8():
+             max_num_imprecise_acc = _semantic.builder.options.max_num_imprecise_acc_default
+         else:
+             max_num_imprecise_acc = 0
+     else:
+         if a.dtype.is_fp8() and b.dtype.is_fp8() and max_num_imprecise_acc > K:
+             raise ValueError(f"max_num_imprecise_acc ({max_num_imprecise_acc}) must be <= K ({K})")
+
+     max_num_imprecise_acc = _core._unwrap_if_constexpr(max_num_imprecise_acc)
+     is_async = _core._unwrap_if_constexpr(is_async)
+
+     handle = _semantic.builder.create_warpgroup_mma(a.handle, b.handle, acc.handle, use_acc.handle, precision,
+                                                     max_num_imprecise_acc, is_async)
+     tensor_ty = acc.type.tensor_type if isinstance(acc, warpgroup_mma_accumulator) else acc.type
+     if is_async:
+         return warpgroup_mma_accumulator(handle, tensor_ty)
+     else:
+         return _core.tensor(handle, tensor_ty)
+
+
+ @_core.builtin
+ def warpgroup_mma_wait(num_outstanding=0, deps=None, _semantic=None):
+     """
+     Wait until `num_outstanding` or fewer warpgroup MMA operations are in-flight.
+
+     Args:
+         num_outstanding (int): Number of outstanding warpgroup MMA operations to wait for. Defaults to 0.
+         deps (Sequence[tensor]): List of dependencies that need to be kept alive while the mma is unfinished.
+     """
+     if deps is None:
+         raise ValueError("warpgroup_mma_wait deps must be given")
+     deps_handles = [x.handle for x in deps] if deps is not None else []
+     num_outstanding = _core._unwrap_if_constexpr(num_outstanding)
+     results = _semantic.builder.create_warpgroup_mma_wait(deps_handles, num_outstanding)
+     result_types = [dep.type.tensor_type if isinstance(dep, warpgroup_mma_accumulator) else dep.type for dep in deps]
+     results = unflatten_ir_values(results, result_types)
+     if len(deps) == 1:
+         return next(results)
+     return tuple(results)
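
The asynchronous path is the interesting part of warpgroup_mma: with is_async=True it returns a warpgroup_mma_accumulator token instead of a tensor, and warpgroup_mma_wait later materializes the value while keeping its dependencies alive. A hedged sketch of that issue/wait pattern, not part of the diff, assuming shared-memory operands a_smem/b_smem and an accumulator tensor acc prepared by the surrounding Gluon kernel:

# Hypothetical fragment, not from this package: async warpgroup MMA pattern
# built from the two builtins added in the hunk above.
from triton.experimental.gluon.language.nvidia import hopper

def async_wgmma(a_smem, b_smem, acc):
    # Issue the MMA without blocking; the token stands in for the accumulator.
    token = hopper.warpgroup_mma(a_smem, b_smem, acc, is_async=True)

    # ... independent work can overlap with the in-flight MMA here ...

    # Block until no MMAs are outstanding; `deps` keeps the token alive and,
    # since there is a single dependency, the computed tensor is returned.
    return hopper.warpgroup_mma_wait(num_outstanding=0, deps=[token])
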
triton/experimental/gluon/language/nvidia/hopper/mbarrier.py (new file)
@@ -0,0 +1,34 @@
+ from ..ampere.mbarrier import MBarrierLayout, init, invalidate, wait
+ from ..._core import _unwrap_if_constexpr, builtin
+
+ __all__ = ["arrive", "expect", "init", "invalidate", "MBarrierLayout", "wait"]
+
+
+ @builtin
+ def expect(mbarrier, bytes, pred=True, _semantic=None):
+     """
+     Expect a specific number of bytes being copied. When they are copied, the barrier is signaled.
+
+     Args:
+         mbarrier (shared_memory_descriptor): Barrier that will be signaled when the operation is complete.
+         bytes (int): Expected byte count.
+         pred (bool): Scalar predicate. Operation is skipped if predicate is False. Defaults to True.
+     """
+     bytes = _unwrap_if_constexpr(bytes)
+     pred = _semantic.to_tensor(pred)
+     _semantic.builder.create_mbarrier_expect(mbarrier.handle, bytes, pred.handle)
+
+
+ @builtin
+ def arrive(mbarrier, *, count=1, pred=True, _semantic=None):
+     """
+     Arrive at an mbarrier with a specified count.
+
+     Args:
+         mbarrier (shared_memory_descriptor): Barrier to be signalled.
+         count (int): Count to arrive with. Defaults to 1.
+         pred (bool): Scalar predicate. Operation is skipped if predicate is False. Defaults to True.
+     """
+     count = _unwrap_if_constexpr(count)
+     pred = _semantic.to_tensor(pred)
+     _semantic.builder.create_mbarrier_arrive(mbarrier.handle, count, pred.handle)
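
expect and arrive complete the mbarrier lifecycle whose init/wait/invalidate half is re-exported from the Ampere module. Below is a hedged sketch of the usual handshake around an asynchronous copy; it is not part of the diff, and the barrier allocation, the copy itself, and the init/wait signatures are assumptions.

# Hypothetical fragment, not from this package: typical mbarrier handshake.
# `bar` is an mbarrier in shared memory and `nbytes` is the transfer size;
# both are assumed to be set up by the surrounding Gluon kernel.
from triton.experimental.gluon.language.nvidia.hopper import mbarrier

def barrier_handshake(bar, nbytes):
    mbarrier.init(bar, count=1)    # re-exported from Ampere (signature assumed)
    mbarrier.expect(bar, nbytes)   # arm the barrier with the expected byte count
    # ... issue an async copy (e.g. TMA) that signals `bar` here ...
    mbarrier.wait(bar, 0)          # wait for phase 0 (signature assumed)

    # A purely thread-driven phase can instead be completed explicitly:
    mbarrier.arrive(bar, count=1)

    mbarrier.invalidate(bar)       # re-exported from Ampere (signature assumed)
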