triton-windows 3.4.0.post20__cp311-cp311-win_amd64.whl → 3.5.0.post21__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of triton-windows might be problematic.
Files changed (107)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +8 -2
  3. triton/_filecheck.py +24 -14
  4. triton/_internal_testing.py +70 -4
  5. triton/_utils.py +3 -1
  6. triton/backends/amd/compiler.py +68 -60
  7. triton/backends/amd/driver.c +113 -44
  8. triton/backends/amd/driver.py +133 -57
  9. triton/backends/driver.py +13 -0
  10. triton/backends/nvidia/compiler.py +80 -22
  11. triton/backends/nvidia/driver.c +88 -15
  12. triton/backends/nvidia/driver.py +130 -123
  13. triton/compiler/__init__.py +5 -2
  14. triton/compiler/code_generator.py +270 -163
  15. triton/compiler/compiler.py +45 -62
  16. triton/experimental/gluon/__init__.py +3 -2
  17. triton/experimental/gluon/_runtime.py +9 -6
  18. triton/experimental/gluon/language/__init__.py +117 -16
  19. triton/experimental/gluon/language/_core.py +246 -68
  20. triton/experimental/gluon/language/_layouts.py +398 -45
  21. triton/experimental/gluon/language/_math.py +17 -9
  22. triton/experimental/gluon/language/_semantic.py +130 -37
  23. triton/experimental/gluon/language/_standard.py +55 -22
  24. triton/experimental/gluon/language/amd/__init__.py +4 -0
  25. triton/experimental/gluon/language/amd/_layouts.py +96 -0
  26. triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
  27. triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
  28. triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
  29. triton/experimental/gluon/language/extra/__init__.py +3 -0
  30. triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
  31. triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
  32. triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
  33. triton/experimental/gluon/language/nvidia/blackwell/__init__.py +192 -7
  34. triton/experimental/gluon/language/nvidia/blackwell/tma.py +20 -0
  35. triton/experimental/gluon/language/nvidia/hopper/__init__.py +124 -3
  36. triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +20 -37
  37. triton/experimental/gluon/language/nvidia/hopper/tma.py +4 -3
  38. triton/experimental/gluon/nvidia/hopper.py +6 -1
  39. triton/knobs.py +132 -67
  40. triton/language/__init__.py +16 -10
  41. triton/language/core.py +163 -83
  42. triton/language/extra/cuda/gdc.py +6 -6
  43. triton/language/extra/hip/__init__.py +3 -1
  44. triton/language/extra/hip/libdevice.py +7 -0
  45. triton/language/extra/hip/utils.py +35 -0
  46. triton/language/extra/libdevice.py +4 -0
  47. triton/language/semantic.py +76 -23
  48. triton/language/standard.py +14 -14
  49. triton/language/target_info.py +54 -0
  50. triton/runtime/_allocation.py +15 -3
  51. triton/runtime/_async_compile.py +55 -0
  52. triton/runtime/autotuner.py +4 -5
  53. triton/runtime/build.py +11 -9
  54. triton/runtime/cache.py +44 -1
  55. triton/runtime/driver.py +16 -41
  56. triton/runtime/interpreter.py +31 -23
  57. triton/runtime/jit.py +318 -157
  58. triton/runtime/tcc/include/_mingw.h +8 -10
  59. triton/runtime/tcc/include/assert.h +5 -0
  60. triton/runtime/tcc/include/errno.h +1 -1
  61. triton/runtime/tcc/include/float.h +21 -3
  62. triton/runtime/tcc/include/iso646.h +36 -0
  63. triton/runtime/tcc/include/limits.h +5 -0
  64. triton/runtime/tcc/include/malloc.h +2 -2
  65. triton/runtime/tcc/include/math.h +21 -261
  66. triton/runtime/tcc/include/stdalign.h +16 -0
  67. triton/runtime/tcc/include/stdarg.h +5 -70
  68. triton/runtime/tcc/include/stdatomic.h +171 -0
  69. triton/runtime/tcc/include/stddef.h +7 -19
  70. triton/runtime/tcc/include/stdlib.h +15 -4
  71. triton/runtime/tcc/include/stdnoreturn.h +7 -0
  72. triton/runtime/tcc/include/sys/stat.h +2 -2
  73. triton/runtime/tcc/include/sys/types.h +5 -0
  74. triton/runtime/tcc/include/tcc/tcc_libm.h +444 -27
  75. triton/runtime/tcc/include/tccdefs.h +342 -0
  76. triton/runtime/tcc/include/tgmath.h +89 -0
  77. triton/runtime/tcc/include/uchar.h +33 -0
  78. triton/runtime/tcc/include/unistd.h +1 -0
  79. triton/runtime/tcc/include/winapi/qos.h +72 -0
  80. triton/runtime/tcc/include/winapi/shellapi.h +59 -0
  81. triton/runtime/tcc/include/winapi/winbase.h +9 -2
  82. triton/runtime/tcc/include/winapi/wincon.h +8 -0
  83. triton/runtime/tcc/include/winapi/windows.h +1 -1
  84. triton/runtime/tcc/include/winapi/winnls.h +778 -0
  85. triton/runtime/tcc/include/winapi/winnt.h +9 -7
  86. triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
  87. triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
  88. triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
  89. triton/runtime/tcc/lib/libtcc1.a +0 -0
  90. triton/runtime/tcc/lib/python314.def +1800 -0
  91. triton/runtime/tcc/lib/python314t.def +1809 -0
  92. triton/runtime/tcc/libtcc.dll +0 -0
  93. triton/runtime/tcc/tcc.exe +0 -0
  94. triton/tools/compile.py +62 -14
  95. triton/tools/extra/cuda/compile.c +1 -0
  96. triton/tools/extra/hip/compile.cpp +66 -0
  97. triton/tools/extra/hip/compile.h +13 -0
  98. triton/tools/ragged_tma.py +92 -0
  99. triton/tools/tensor_descriptor.py +7 -9
  100. triton/windows_utils.py +42 -79
  101. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/METADATA +3 -4
  102. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/RECORD +106 -75
  103. triton/runtime/tcc/lib/libtcc1-64.a +0 -0
  104. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/WHEEL +0 -0
  105. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/entry_points.txt +0 -0
  106. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/licenses/LICENSE +0 -0
  107. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/top_level.txt +0 -0

triton/experimental/gluon/language/_semantic.py
@@ -1,7 +1,8 @@
 from typing import Sequence, List, TypeVar, Tuple, Callable
+import math
 from triton.language.semantic import TritonSemantic
 from . import _core as ttgl
-from ._layouts import SliceLayout
+from ._layouts import AutoLayout, DistributedLayout, SliceLayout
 from triton._C.libtriton.gluon_ir import GluonOpBuilder
 from triton.compiler.code_generator import flatten_values_to_ir, unflatten_ir_values
 
@@ -13,6 +14,18 @@ def _check(cond: bool, msg_fn: Callable[[], str], category=ValueError):
         raise category(msg_fn())
 
 
+class GluonCallerContext:
+
+    def __init__(self, num_warps: int):
+        self.num_warps = num_warps
+
+    def mangle(self):
+        return f"_NW{self.num_warps}"
+
+    def initialize_callee(self, fn, builder):
+        fn.set_attr("ttg.num-warps", builder.get_int32_attr(self.num_warps))
+
+
 class GluonSemantic(TritonSemantic[TensorTy]):
     tensor = ttgl.tensor
     lang = ttgl
@@ -22,10 +35,15 @@ class GluonSemantic(TritonSemantic[TensorTy]):
     def __init__(self, builder: GluonOpBuilder):
         self.builder = builder
 
+    def _wrap_handle_infer_layout(self, handle, scalar_ty, shape):
+        if shape == []:
+            ty = scalar_ty
+        else:
+            ty = ttgl.distributed_type(scalar_ty, shape, self.builder.get_gluon_layout_from_tensor(handle))
+        return self.tensor(handle, ty)
+
     def _wrap_tensor_infer_layout(self, tensor):
-        ty = ttgl.distributed_type(tensor.type.scalar, tensor.shape,
-                                   self.builder.get_gluon_layout_from_tensor(tensor.handle))
-        return self.tensor(tensor.handle, ty)
+        return self._wrap_handle_infer_layout(tensor.handle, tensor.type.scalar, tensor.shape)
 
     def _broadcast_shapes(self, lhs_shape: List[int], rhs_shape: List[int]):
         if len(lhs_shape) != len(rhs_shape):
@@ -53,14 +71,14 @@ class GluonSemantic(TritonSemantic[TensorTy]):
        _check(isinstance(input.type, ttgl.distributed_type),
               lambda: f"expected expand_dims input to be a distributed_type but got: {input.type!r}")
        layout = input.type.layout
-       _check(isinstance(layout, SliceLayout),
+       _check(isinstance(layout, (SliceLayout, AutoLayout)),
              lambda: f"expected expand_dims input to have a SliceLayout, but got: {layout}")
-       _check(layout.dim == axis,
-             lambda: f"expected expand_dims input layout to be sliced in axis {axis} but got {layout.dim}")
+       _check(
+           isinstance(layout, AutoLayout) or layout.dim == axis,
+           lambda: f"expected expand_dims input layout to be sliced in axis {axis} but got {layout.dim}")
 
-       ret_ty = ttgl.distributed_type(input.type.scalar, dst_shape, layout.parent)
-       handle = self.builder.create_expand_dims(input.handle, axis, ret_ty.to_ir(self.builder))
-       return self.tensor(handle, ret_ty)
+       handle = self.builder.create_expand_dims(input.handle, axis)
+       return self._wrap_handle_infer_layout(handle, input.type.scalar, dst_shape)
 
     def join(self, a: TensorTy, b: TensorTy) -> TensorTy:
         a, b = self.broadcast_impl_value(a, b)
@@ -107,7 +125,14 @@ class GluonSemantic(TritonSemantic[TensorTy]):
         lhs_shape = lhs_ty.get_block_shapes()
         rhs_shape = rhs_ty.get_block_shapes()
         ret_shape = self._broadcast_shapes(lhs_shape, rhs_shape)
-        if lhs_ty.layout != rhs_ty.layout:
+
+        is_lhs_auto = isinstance(lhs_ty.layout, AutoLayout)
+        is_rhs_auto = isinstance(rhs_ty.layout, AutoLayout)
+        if is_lhs_auto and not is_rhs_auto:
+            lhs = self.set_auto_layout(lhs, rhs_ty.layout)
+        elif is_rhs_auto and not is_lhs_auto:
+            rhs = self.set_auto_layout(rhs, lhs_ty.layout)
+        elif lhs_ty.layout != rhs_ty.layout:
             raise ValueError(f"Layout mismatch in broadcast: {lhs_ty.layout} vs {rhs_ty.layout}")
 
         lhs = self.broadcast_impl_shape(lhs, ret_shape)
@@ -116,6 +141,8 @@ class GluonSemantic(TritonSemantic[TensorTy]):
 
     def arange(self, start, end, layout):
         shape = [end - start]
+        if layout is None:
+            layout = AutoLayout()
         ret_ty = ttgl.distributed_type(ttgl.int32, shape, layout)
         return super().arange(start, end, ret_ty=ret_ty)
 
@@ -131,14 +158,19 @@ class GluonSemantic(TritonSemantic[TensorTy]):
 
     def full(self, shape, value, dtype, layout):
         scalar = self.make_scalar(value, dtype)
+        if layout is None:
+            layout = AutoLayout()
         return self.splat(scalar, shape, layout)
 
-    def convert_layout(self, value, layout):
+    def convert_layout(self, value, layout, assert_trivial=False):
         ty = value.type
         _check(isinstance(ty, ttgl.distributed_type),
                lambda: f"expected convert_layout input to be a distributed_type but got: {ty!r}")
         ret_ty = ttgl.distributed_type(ty.element_ty, ty.shape, layout)
-        handle = self.builder.create_convert_layout(ret_ty.to_ir(self.builder), value.handle)
+        ret_ty_ir = ret_ty.to_ir(self.builder)
+        if assert_trivial and not self.builder.is_convert_layout_trivial(ret_ty_ir, value.handle):
+            raise TypeError(f"layout conversion from {ty.layout} to {layout} is not trivial")
+        handle = self.builder.create_convert_layout(ret_ty_ir, value.handle)
         return ttgl.tensor(handle, ret_ty)
 
     def allocate_shared(self, element_ty, shape, layout, value):
@@ -155,30 +187,42 @@ class GluonSemantic(TritonSemantic[TensorTy]):
         return ttgl.tensor(handle, ret_ty)
 
     def shared_store(self, mem_desc, value):
+        assert value.shape == mem_desc.shape, f"source shape {value.shape} and destination shape {mem_desc.shape} must match"
+        assert value.dtype == mem_desc.dtype, f"source dtype {value.dtype} and destination dtype {mem_desc.dtype} must match"
        self.builder.create_local_store(mem_desc.handle, value.handle)
 
     def shared_dealloc(self, mem_desc):
         self.builder.create_local_dealloc(mem_desc.handle)
 
-    def _memdesc_subview(self, mem_desc, offsets, shape):
-        layout = mem_desc.layout
-        ty = ttgl.shared_memory_descriptor_type(mem_desc.dtype, shape, layout, mem_desc.type.alloc_shape)
-        builder = self.builder
-        handle = builder.create_memdesc_subview(ty.to_ir(builder), mem_desc.handle, offsets)
-        return ttgl.shared_memory_descriptor(handle, **ty.__dict__)
+    def set_auto_layout(self, value, layout):
+        src_ty = value.type
+        assert isinstance(layout,
+                          DistributedLayout), f"set_auto_layout must set to a distributed layout but got {layout}"
+        assert isinstance(src_ty.layout,
+                          AutoLayout), f"set_auto_layout input must have auto layout but got {value.type.layout}"
+        handle = self.builder.create_set_auto_layout(layout._to_ir(self.builder), value.handle)
+        res_ty = ttgl.distributed_type(src_ty.element_ty, src_ty.shape, layout)
+        return self.tensor(handle, res_ty)
 
     def memdesc_slice(self, mem_desc, start, length, dim):
-        offsets = [self.builder.get_int32(0)] * mem_desc.rank
-        offsets[dim] = self.to_tensor(start).handle
+        offsets = [0] * mem_desc.rank
+        offsets[dim] = start
         shape = list(mem_desc.shape)
         shape[dim] = length
-        return self._memdesc_subview(mem_desc, offsets, shape)
+        layout = mem_desc.layout
+        ty = ttgl.shared_memory_descriptor_type(mem_desc.dtype, shape, layout, mem_desc.type.alloc_shape)
+        builder = self.builder
+        handle = builder.create_memdesc_subslice(ty.to_ir(builder), mem_desc.handle, offsets)
+        return ttgl.shared_memory_descriptor(handle, **ty.__dict__)
 
     def memdesc_index(self, mem_desc, index):
         shape = mem_desc.shape[1:]
-        offsets = [self.builder.get_int32(0)] * mem_desc.rank
-        offsets[0] = self.to_tensor(index).handle
-        return self._memdesc_subview(mem_desc, offsets, shape)
+        index = self.to_tensor(index).handle
+        layout = mem_desc.layout
+        ty = ttgl.shared_memory_descriptor_type(mem_desc.dtype, shape, layout, mem_desc.type.alloc_shape)
+        builder = self.builder
+        handle = builder.create_memdesc_index(ty.to_ir(builder), mem_desc.handle, index)
+        return ttgl.shared_memory_descriptor(handle, **ty.__dict__)
 
     def memdesc_trans(self, mem_desc, order):
         assert len(order) == len(
@@ -194,10 +238,26 @@ class GluonSemantic(TritonSemantic[TensorTy]):
         return ttgl.shared_memory_descriptor(handle, element_ty=mem_desc.dtype, shape=shape,
                                              alloc_shape=new_alloc_shape, layout=layout)
 
-    def memdesc_reshape(self, mem_desc, shape, layout):
-        ty = ttgl.shared_memory_descriptor_type(mem_desc.dtype, shape, layout, mem_desc.type.alloc_shape)
-        handle = self.builder.create_memdesc_reshape(ty.to_ir(self.builder), mem_desc.handle)
-        return ttgl.shared_memory_descriptor(handle, **ty.__dict__)
+    def memdesc_reshape(self, mem_desc, shape):
+        _check(
+            math.prod(shape) == math.prod(mem_desc.shape),
+            lambda: (f"memdesc_reshape total elements mismatch: "
+                     f"{mem_desc.shape} -> {shape}"),
+        )
+
+        handle = self.builder.create_memdesc_reshape(mem_desc.handle, shape)
+        layout = self.builder.get_gluon_layout_from_memdesc(handle)
+        alloc_shape = mem_desc.type.alloc_shape
+        prefix_len = len(alloc_shape) - mem_desc.rank
+        new_alloc_shape = alloc_shape[:prefix_len] + list(shape)
+
+        return ttgl.shared_memory_descriptor(
+            handle,
+            element_ty=mem_desc.dtype,
+            shape=shape,
+            alloc_shape=new_alloc_shape,
+            layout=layout,
+        )
 
     def memdesc_reinterpret(self, mem_desc, dtype, shape, layout):
         ty = ttgl.shared_memory_descriptor_type(dtype, shape, layout, shape)
@@ -220,6 +280,27 @@ class GluonSemantic(TritonSemantic[TensorTy]):
         _check(all(l == l0 for l in layouts[1:]),
                lambda: f"Expected inputs to have matching layouts, but got: {layouts}")
 
+    def associative_scan(self, inputs: Sequence[TensorTy], axis: int, region_builder_fn,
+                         reverse: bool) -> Tuple[TensorTy, ...]:
+        shape = inputs[0].type.shape
+        rank = len(shape)
+
+        assert -rank <= axis < rank, f"scan axis {axis} must be < inputs rank ({rank})"
+
+        if axis < 0:
+            axis += rank
+
+        for t in inputs:
+            assert t.type.shape == shape, "all scan inputs must have the same shape"
+
+        scan_op = self.builder.create_scan([t.handle for t in inputs], axis, reverse)
+        region_builder_fn(scan_op)
+        assert scan_op.verify()
+
+        return tuple(
+            self._wrap_handle_infer_layout(scan_op.get_result(i), inputs[i].type.scalar, shape)
+            for i in range(len(inputs)))
+
     def reduction(self, inputs: Sequence[TensorTy], axis: int, region_builder_fn) -> Tuple[TensorTy, ...]:
         _check(axis is not None, lambda: "All-reduce is not yet implemented in gluon")
         # get result shape
@@ -228,7 +309,6 @@ class GluonSemantic(TritonSemantic[TensorTy]):
         _check(0 <= axis < rank, lambda: f"expected reduction axis to be in the range [0, {rank}) but got {axis}")
         self._check_same_layout(inputs)
         ret_shape = [s for i, s in enumerate(shape) if i != axis]
-        ret_layout = SliceLayout(axis, inputs[0].type.layout)
         assert all(t.type.shape == shape for t in inputs), "all reduction inputs must have the same shape"
 
         reduce_op = self.builder.create_reduce([t.handle for t in inputs], axis)
@@ -236,11 +316,23 @@ class GluonSemantic(TritonSemantic[TensorTy]):
         assert reduce_op.verify()
 
         return tuple(
-            self.wrap_tensor(reduce_op.get_result(i), inputs[i].type.scalar, ret_shape, ret_layout)
+            self._wrap_handle_infer_layout(reduce_op.get_result(i), inputs[i].type.scalar, ret_shape)
             for i in range(len(inputs)))
 
-    def warp_specialize(self, args, default_partition, worker_partitions, worker_num_warps: Sequence[int],
-                        worker_num_regs: Sequence[int], generator):
+    def histogram(self, input: TensorTy, num_bins: int, mask: TensorTy, layout) -> TensorTy:
+        _check(len(input.shape) == 1, lambda: "histogram only supports 1D input")
+        _check(input.dtype.is_int(), lambda: "histogram only supports integer input")
+        _check(layout is not None, lambda: "histogram requires a destination layout")
+        if mask is not None:
+            mask, input = self.broadcast_impl_value(mask, input)
+            _check(mask.type.scalar.is_bool(), lambda: "Mask must have boolean scalar type")
+            mask = mask.handle
+        layout_attr = layout._to_ir(self.builder)
+        handle = self.builder.create_histogram(input.handle, num_bins, mask, layout_attr)
+        return self.wrap_tensor(handle, ttgl.int32, [num_bins], layout)
+
+    def warp_specialize(self, default_args, default_partition, worker_args, worker_partitions,
+                        worker_num_warps: Sequence[int], worker_num_regs: Sequence[int], generator):
         num_partitions = len(worker_partitions)
         assert num_partitions == len(
             worker_num_warps
@@ -255,7 +347,7 @@ class GluonSemantic(TritonSemantic[TensorTy]):
         # Emit the default partition to get the result types.
         default_block = builder.new_block()
         builder.set_insertion_point_to_start(default_block)
-        default_results = generator.call_JitFunction(default_partition, args, kwargs={})
+        default_results = generator.call_JitFunction(default_partition, default_args, kwargs={})
         mlir_results = []
         if default_results is not None:
             mlir_results = flatten_values_to_ir(default_results)
@@ -264,7 +356,7 @@ class GluonSemantic(TritonSemantic[TensorTy]):
 
         # Create the warp specialize op.
         builder.restore_insertion_point(insert_pt)
-        mlir_args = flatten_values_to_ir(args)
+        mlir_args = flatten_values_to_ir(worker_args)
         ws_op = builder.create_warp_specialize(result_types, mlir_args, worker_num_warps)
         ws_op.get_default_region().push_back(default_block)
         ws_op.set_requested_registers(worker_num_regs)
@@ -274,10 +366,11 @@ class GluonSemantic(TritonSemantic[TensorTy]):
         partitions_op = builder.create_warp_specialize_partitions(num_partitions)
         arg_types = [arg.get_type() for arg in mlir_args]
         for i in range(num_partitions):
+            caller_context = GluonCallerContext(num_warps=worker_num_warps[i])
            block = builder.create_block_with_parent(partitions_op.get_region(i), arg_types)
            block_args = [block.get_argument(j) for j in range(len(mlir_args))]
-           block_args = unflatten_ir_values(block_args, [arg.type for arg in args])
-           generator.call_JitFunction(worker_partitions[i], block_args, kwargs={})
+           block_args = unflatten_ir_values(block_args, [arg.type for arg in worker_args])
+           generator.call_JitFunction(worker_partitions[i], block_args, kwargs={}, caller_context=caller_context)
            builder.create_warp_return()
 
        builder.set_insertion_point_after(ws_op.get_operation())
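
The _semantic.py changes above wire AutoLayout through broadcasting, arange, and full, and add set_auto_layout, associative_scan, and histogram entry points. A rough usage sketch follows; it assumes ttgl.set_auto_layout, ttgl.BlockedLayout, and ttgl.store are exposed on the public Gluon namespace in this release, so treat it as illustrative rather than an exact API reference.

```python
# Illustrative only: ttgl.set_auto_layout, ttgl.BlockedLayout and ttgl.store are
# assumed to be re-exported by the public Gluon namespace.
from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl


@gluon.jit
def auto_layout_demo(out_ptr, BLOCK: ttgl.constexpr):
    layout: ttgl.constexpr = ttgl.BlockedLayout(size_per_thread=[1], threads_per_warp=[32],
                                                warps_per_cta=[4], order=[0])
    offs = ttgl.arange(0, BLOCK)               # no layout given -> AutoLayout()
    ones = ttgl.full([BLOCK], 1, ttgl.int32)   # also AutoLayout()
    # Broadcasting two AutoLayout tensors keeps AutoLayout; pin it down explicitly
    # before the value reaches an op that needs a concrete distributed layout.
    vals = ttgl.set_auto_layout(offs + ones, layout)
    ttgl.store(out_ptr + ttgl.arange(0, BLOCK, layout=layout), vals)
```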

triton/experimental/gluon/language/_standard.py
@@ -1,39 +1,60 @@
-# flake8: noqa
-import triton
+from typing import TypeVar
+from triton.runtime.jit import JITFunction
 import triton.language.standard as tl_standard
-from .._runtime import jit
+from .._runtime import GluonJITFunction, jit
 from triton import knobs
 from . import _core as ttgl
 
-_IMPORT_FROM_TRITON = [
-    "sum",
-    "max",
-    "min",
-    "reduce_or",
-    "xor_sum",
-]
+T = TypeVar("T")
 
-__all__ = [
-    "full_like",
-    "zeros",
-    "zeros_like",
-    *_IMPORT_FROM_TRITON,
-]
 
-for name in _IMPORT_FROM_TRITON:
-    # Convert JITFunction -> GluonJitFunction
-    fn = getattr(tl_standard, name)
-    assert knobs.runtime.interpret or isinstance(fn, triton.runtime.JITFunction)
-    globals()[name] = jit(fn.fn)
+def _import_from_triton(fn: JITFunction[T]) -> GluonJITFunction[T]:
+    assert knobs.runtime.interpret or isinstance(fn, JITFunction)
+    # Wrap the function and preserve its original docstring
+    gluon_fn = jit(fn.fn)
+    gluon_fn.__doc__ = fn.__doc__
+    return gluon_fn
+
+
+cdiv = _import_from_triton(tl_standard.cdiv)
+sum = _import_from_triton(tl_standard.sum)
+max = _import_from_triton(tl_standard.max)
+min = _import_from_triton(tl_standard.min)
+reduce_or = _import_from_triton(tl_standard.reduce_or)
+xor_sum = _import_from_triton(tl_standard.xor_sum)
 
 
 @jit
-def zeros(shape, dtype, layout):
+def zeros(shape, dtype, layout=None):
+    """
+    Create a tensor filled with zeros.
+
+    Args:
+        shape (Sequence[int]): The shape of the tensor.
+        dtype (dtype): The data type for the tensor.
+        layout (Optional[DistributedLayout]): The distributed layout of the tensor, defaults to AutoLayout().
+
+    Returns:
+        tensor: A tensor where every element is zero.
+    """
     return ttgl.full(shape, 0, dtype, layout)
 
 
 @jit
 def full_like(input, value, shape=None, dtype=None, layout=None):
+    """
+    Create a tensor with the same properties as a given tensor, filled with a specified value.
+
+    Args:
+        input (tensor): Reference tensor to infer default shape, dtype, and layout.
+        value (int or float): The fill value.
+        shape (Sequence[int], optional): Target shape. Defaults to input.shape.
+        dtype (dtype, optional): Target data type. Defaults to input.dtype.
+        layout (DistributedLayout, optional): Target layout. Defaults to input.layout.
+
+    Returns:
+        tensor: A tensor where every element equals value.
+    """
     return ttgl.full(
         input.shape if shape is None else shape,
         value,
@@ -44,4 +65,16 @@ def full_like(input, value, shape=None, dtype=None, layout=None):
 
 @jit
 def zeros_like(input, shape=None, dtype=None, layout=None):
+    """
+    Create a tensor with the same properties as a given tensor, filled with zeros.
+
+    Args:
+        input (tensor): Reference tensor to infer default shape, dtype, and layout.
+        shape (Sequence[int], optional): Target shape. Defaults to input.shape.
+        dtype (dtype, optional): Target data type. Defaults to input.dtype.
+        layout (DistributedLayout, optional): Target layout. Defaults to input.layout.
+
+    Returns:
+        tensor: A tensor where every element is zero.
+    """
     return full_like(input, 0, shape=shape, dtype=dtype, layout=layout)
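
With these defaults, zeros, full_like, and zeros_like can be called without an explicit layout, and cdiv is now wrapped from triton.language.standard. A hedged sketch, assuming these helpers are re-exported on the Gluon language namespace as in earlier releases:

```python
# Sketch only: assumes zeros/full_like/cdiv are reachable through the public
# Gluon language namespace (ttgl) as in previous releases.
from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl


@gluon.jit
def zeros_demo(N: ttgl.constexpr, BLOCK: ttgl.constexpr):
    x = ttgl.zeros([BLOCK], ttgl.float32)   # layout omitted -> AutoLayout()
    y = ttgl.full_like(x, 3.0)              # inherits shape, dtype and layout from x
    num_blocks = ttgl.cdiv(N, BLOCK)        # cdiv is wrapped via _import_from_triton
    return x, y, num_blocks
```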

triton/experimental/gluon/language/amd/__init__.py
@@ -0,0 +1,4 @@
+from ._layouts import AMDMFMALayout
+from . import cdna3, cdna4
+
+__all__ = ["AMDMFMALayout", "cdna3", "cdna4"]

triton/experimental/gluon/language/amd/_layouts.py
@@ -0,0 +1,96 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import List, Optional
+from triton.language.core import _unwrap_if_constexpr
+
+from triton.experimental.gluon.language._layouts import _realize_cta_layout, DistributedLayout
+from triton.experimental.gluon import language as ttgl
+
+__all__ = [
+    "AMDMFMALayout",
+]
+
+
+@dataclass(frozen=True)
+class AMDMFMALayout(DistributedLayout):
+    """
+    Represents a layout for AMD MFMA (matrix core) operations.
+
+    Args:
+        version (int): Major and minor identifier for the MFMA instruction.
+        instr_shape: (M, N) dimension for the instrinsic shape.
+        transposed (bool): indicates the result tensor is transposed so that each thread holds consecutive elements in the same row instead of column, which is good for chained dot and global write.
+        warps_per_cta (List[int]): Number of warps per CTA.
+        elem_type Optional(ttgl.dtype): Supported types are int32, fp32 and fp64. Default is fp32.
+        tiles_per_warp Optional(List[int]): Number of tiles per WARP. For mfma layout, if missing, use the default where we have unit tile size on all dimensions.
+        ctas_per_cga (Optional[List[int]]): CTAs per CGA grouping.
+        cta_split_num (Optional[List[int]]): Split factors for CTAs.
+        cta_order (Optional[List[int]]): CTA ordering.
+    """
+    version: int
+    instr_shape: List[int]
+    transposed: bool
+    warps_per_cta: List[int]
+    elem_type: ttgl.dtype = ttgl.float32
+    tiles_per_warp: Optional[List[int]] = None
+    ctas_per_cga: Optional[List[int]] = None
+    cta_split_num: Optional[List[int]] = None
+    cta_order: Optional[List[int]] = None
+
+    def __post_init__(self):
+        super().__setattr__("version", _unwrap_if_constexpr(self.version))
+        super().__setattr__("instr_shape", _unwrap_if_constexpr(self.instr_shape))
+        super().__setattr__("transposed", _unwrap_if_constexpr(self.transposed))
+        super().__setattr__("warps_per_cta", _unwrap_if_constexpr(self.warps_per_cta))
+        super().__setattr__("tiles_per_warp", _unwrap_if_constexpr(self.tiles_per_warp))
+        super().__setattr__("elem_type", _unwrap_if_constexpr(self.elem_type))
+        super().__setattr__("ctas_per_cga", _unwrap_if_constexpr(self.ctas_per_cga))
+        super().__setattr__("cta_split_num", _unwrap_if_constexpr(self.cta_split_num))
+        super().__setattr__("cta_order", _unwrap_if_constexpr(self.cta_order))
+
+        if self.tiles_per_warp is None:
+            object.__setattr__(self, "tiles_per_warp", [1] * len(self.warps_per_cta))
+
+        self.verify()
+
+    def _to_ir(self, builder):
+        type = self.elem_type.to_ir(builder)
+        return builder.get_amd_mfma_layout(self.version, self.instr_shape, self.transposed, self.warps_per_cta, type,
+                                           self.tiles_per_warp, self.ctas_per_cga, self.cta_split_num, self.cta_order)
+
+    def mangle(self) -> str:
+
+        def stringify(x):
+            if x is None:
+                return ""
+            return "_".join(map(str, x))
+
+        return f"MFMA_{self.version}_{stringify(self.instr_shape)}_{self.transposed}_{stringify(self.warps_per_cta)}_{stringify(self.tiles_per_warp)}_{self.elem_type}_{stringify(self.ctas_per_cga)}_{stringify(self.cta_split_num)}_{stringify(self.cta_order)}_MFMA"
+
+    def verify(self):
+        assert self.version >= 1 and self.version <= 4, "version must be in the [1, 4] range"
+        valid_shapes = [[32, 32], [16, 16], [64, 4], [4, 64]]
+        assert self.instr_shape in valid_shapes, "invalid intrinsic shape; accepted shapes are " + str(valid_shapes)
+
+        assert self.elem_type.is_fp32() or self.elem_type.is_fp64() \
+            or self.elem_type.is_int32(), "element type must be float32, float64, or int32"
+
+        rank = len(self.warps_per_cta)
+        _realize_cta_layout(self, rank)
+        assert len(self.ctas_per_cga) == rank
+        assert len(self.cta_split_num) == rank
+        assert len(self.cta_order) == rank
+
+    def __hash__(self):
+        return hash((
+            self.version,
+            tuple(self.instr_shape),
+            self.transposed,
+            tuple(self.warps_per_cta),
+            self.elem_type,
+            tuple(self.tiles_per_warp) if self.tiles_per_warp else None,
+            tuple(self.ctas_per_cga) if self.ctas_per_cga else None,
+            tuple(self.cta_split_num) if self.cta_split_num else None,
+            tuple(self.cta_order) if self.cta_order else None,
+        ))
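
AMDMFMALayout is a frozen dataclass, so it can be constructed and inspected on the host. A minimal construction sketch, with illustrative values (version 3 corresponding roughly to CDNA3, within the [1, 4] range checked by verify()):

```python
# Host-side construction sketch; the concrete values are illustrative.
from triton.experimental.gluon.language.amd import AMDMFMALayout

# 32x32 intrinsic tile, transposed result, 4 warps along M and 1 along N.
# elem_type defaults to fp32, tiles_per_warp defaults to [1, 1], and the CTA
# fields are filled in via _realize_cta_layout() during verify().
layout = AMDMFMALayout(version=3, instr_shape=[32, 32], transposed=True,
                       warps_per_cta=[4, 1])
print(layout.mangle())   # e.g. "MFMA_3_32_32_True_4_1_..._MFMA"
```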

triton/experimental/gluon/language/amd/cdna3/__init__.py
@@ -0,0 +1,100 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+
+from triton import knobs
+from triton.experimental.gluon.language import _core as ttgl
+from triton._C.libtriton import ir
+from ..._core import builtin, _unwrap_if_constexpr
+
+if TYPE_CHECKING:
+    from ..._semantic import GluonSemantic
+
+__all__ = ["buffer_load", "buffer_store", "mfma"]
+
+
+def _verify_buffer_ops(ptr, offsets, mask=None, other=None):
+    assert ptr.type.is_ptr(), "ptr must be a scalar pointer type"
+
+    assert isinstance(offsets.type, ttgl.distributed_type), "expected offsets type to be a distributed_type"
+    assert offsets.dtype.is_int32() or offsets.dtype.is_uint32(), "offsets element type must be int32 or uint32"
+
+    element_type = ptr.type.scalar.element_ty
+
+    if other is not None:
+        assert mask is not None, "when other is not None, mask should not be None"
+        assert other.dtype == element_type, "other must have the same data type as ptr scalar type"
+
+
+@builtin
+def buffer_load(ptr, offsets, mask=None, other=None, cache=None, _semantic=None):
+    """
+    AMD buffer load from global memory via a scalar base pointer and a tensor of
+    offsets instead of a tensor of pointers. This operation will load data
+    directly into registers.
+
+    Args:
+        ptr (pointer to scalar): Global memory scalar base pointer to load from.
+        offsets (tensor): Offsets tensor for the load operation.
+        mask (tensor, optional): Mask tensor for predicated loads. Defaults to None.
+        other (tensor, optional): Tensor providing default values for masked elements. Defaults to None.
+        cache_modifier (str): Cache modifier specifier. Defaults to "".
+    """
+    _verify_buffer_ops(ptr, offsets, mask, other)
+
+    mask = _unwrap_if_constexpr(mask)
+    if mask is not None:
+        offsets, mask = _semantic.broadcast_impl_value(offsets, mask)
+
+    other = _unwrap_if_constexpr(other)
+    if other is not None:
+        offsets, other = _semantic.broadcast_impl_value(offsets, other)
+
+    other = other.handle if other is not None else ir.value()
+    mask = mask.handle if mask is not None else ir.value()
+    cache_modifier = _semantic._str_to_load_cache_modifier(cache) if cache is not None else ir.CACHE_MODIFIER.NONE
+
+    ret_ty = offsets.type.with_element_ty(ptr.type.scalar.element_ty)
+    builder = _semantic.builder
+    handle = builder.create_buffer_load(ret_ty.to_ir(builder), ptr.handle, offsets.handle, mask, other, cache_modifier)
+    return ttgl.tensor(handle, ret_ty)
+
+
+@builtin
+def buffer_store(stored_value, ptr, offsets, mask=None, cache=None, _semantic: GluonSemantic = None):
+    """
+    AMD buffer store a tensor directly to global memory via a scalar base pointer and a tensor of
+    offsets instead of a tensor of pointers.
+    Args:
+        stored_value (tensor to be stored): The tensor to be stored to global memory.
+        ptr (pointer to scalar): Global memory scalar base pointer to store to.
+        offsets (tensor): Offsets tensor for the store operation.
+        mask (tensor, optional): Mask tensor for predicated store. Defaults to None.
+        cache_modifier (str): Cache modifier specifier. Defaults to "".
+    """
+    _verify_buffer_ops(ptr, offsets, mask)
+
+    if mask is not None:
+        offsets, mask = _semantic.broadcast_impl_value(offsets, mask)
+
+    mask = mask.handle if mask is not None else ir.value()
+    cache_modifier = _semantic._str_to_store_cache_modifier(cache) if cache is not None else ir.CACHE_MODIFIER.NONE
+
+    _semantic.builder.create_buffer_store(stored_value.handle, ptr.handle, offsets.handle, mask, cache_modifier)
+
+
+@builtin
+def mfma(a, b, acc, _semantic: GluonSemantic = None):
+    """
+    Computes matrix-multiplication of a * b + acc using AMD native matrix core units.
+    Args:
+        a (tensor): The first operand of mfma.
+        b (tensor): The second operand of mfma.
+        acc (tensor): The accumulator tensor.
+    """
+    assert acc is not None, "acc is required"
+    ret_type = acc.type
+    acc = ttgl._unwrap_if_constexpr(acc)
+
+    handle = _semantic.dot(a, b, acc, input_precision=knobs.language.fp32_default, max_num_imprecise_acc=None,
+                           out_dtype=acc.dtype).handle
+    return ttgl.tensor(handle, ret_type)
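
A hedged sketch of the new cdna3 buffer ops in a Gluon kernel. The buffer_load and buffer_store signatures come from the diff above; the layout choice, ttgl.program_id, and the 64-lane warp size are assumptions about the surrounding Gluon API.

```python
# Illustrative kernel; only the cdna3.buffer_load/buffer_store calls are taken
# from this diff, the rest is assumed Gluon API.
from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl
from triton.experimental.gluon.language.amd import cdna3


@gluon.jit
def copy_kernel(src_ptr, dst_ptr, n, BLOCK: ttgl.constexpr):
    layout: ttgl.constexpr = ttgl.BlockedLayout(size_per_thread=[1], threads_per_warp=[64],
                                                warps_per_cta=[4], order=[0])
    pid = ttgl.program_id(0)
    offsets = pid * BLOCK + ttgl.arange(0, BLOCK, layout=layout)
    mask = offsets < n
    # Scalar base pointer plus an int32 offsets tensor; data lands in registers.
    vals = cdna3.buffer_load(src_ptr, offsets, mask=mask, other=0.0)
    cdna3.buffer_store(vals, dst_ptr, offsets, mask=mask)
```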

triton/experimental/gluon/language/amd/cdna4/__init__.py
@@ -0,0 +1,48 @@
+from triton.experimental.gluon.language import _core as ttgl
+from ..._core import builtin, float32
+from ..._layouts import DotOperandLayout
+from .._layouts import AMDMFMALayout
+from ..cdna3 import *  # NOQA: F403
+from ..cdna3 import __all__ as __cdna3_all
+from . import async_copy
+
+__all__ = [*__cdna3_all, "async_copy", "mfma_scaled"]
+
+
+@builtin
+def mfma_scaled(a, a_scale, a_format, b, b_scale, b_format, acc, _semantic=None):
+    """
+    AMD Scaled MFMA operation.
+
+    ```
+    c = a * a_scale @ b * b_scale + acc
+    ```
+
+    `a` and `b` use microscaling formats described in
+    "OCP Microscaling Formats (MX) Specification":
+    https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf.
+    Currently supported only on CDNA4 hardware.
+
+    Args:
+        a (tensor): The operand A to be multiplied.
+        a_scale (tensor): Scale factor for operand A.
+        a_format (str): Format of the operand A. Available formats: `e2m1`, `e4m3`, `e5m2`.
+        b (tensor): The operand B to be multiplied.
+        b_scale (tensor): Scale factor for operand B. Available formats: `e2m1`, `e4m3`, `e5m2`.
+        b_format (str): Format of the operand B.
+        acc (tensor): Accumulator tensor.
+    """
+    layout = acc.type.layout
+    assert isinstance(layout, AMDMFMALayout), "Expected layout to be an instance of AMDMFMALayout"
+    assert (isinstance(a.type.layout, DotOperandLayout) and a.type.layout.parent == layout), \
+        "Expected lhs layout to be a DotOperandLayout with parent matching MFMA layout"
+    assert (isinstance(b.type.layout, DotOperandLayout) and b.type.layout.parent == layout), \
+        "Expected rhs layout to be a DotOperandLayout with parent matching MFMA layout"
+
+    assert a_format.value in {"e2m1", "e4m3", "e5m2"}, f"Unsupported lhs_format: {a_format.value}"
+    assert b_format.value in {"e2m1", "e4m3", "e5m2"}, f"Unsupported rhs_format: {b_format.value}"
+
+    tensor = _semantic.dot_scaled(a, a_scale, a_format, b, b_scale, b_format, acc, False, True, True, float32)
+
+    ret_ty = ttgl.distributed_type(tensor.dtype, tensor.shape, layout)
+    return ttgl.tensor(tensor.handle, ret_ty)
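
mfma_scaled enforces a specific layout contract: the accumulator carries an AMDMFMALayout, and both operands carry DotOperandLayouts whose parent is that same MFMA layout. The sketch below only illustrates that relationship; the DotOperandLayout constructor arguments and the k_width value are assumptions, and the operand/scale packing for the MX formats is not shown.

```python
# Layout-contract sketch only; DotOperandLayout's argument names and the
# k_width value are assumptions, not taken from this diff.
from triton.experimental.gluon.language.amd import AMDMFMALayout
from triton.experimental.gluon.language._layouts import DotOperandLayout

mfma_layout = AMDMFMALayout(version=4, instr_shape=[16, 16], transposed=True,
                            warps_per_cta=[4, 1])
a_layout = DotOperandLayout(operand_index=0, parent=mfma_layout, k_width=16)
b_layout = DotOperandLayout(operand_index=1, parent=mfma_layout, k_width=16)

# Inside a @gluon.jit kernel, with a/b/acc carrying the layouts above, the call
# would then look like:
#     acc = cdna4.mfma_scaled(a, a_scale, "e2m1", b, b_scale, "e4m3", acc)
```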