triton-windows 3.3.1.post19__cp312-cp312-win_amd64.whl → 3.4.0.post20__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of triton-windows has been flagged as potentially problematic; consult the release advisory for details.

Files changed (166)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +4 -1
  3. triton/_filecheck.py +87 -0
  4. triton/_internal_testing.py +26 -15
  5. triton/_utils.py +110 -21
  6. triton/backends/__init__.py +20 -23
  7. triton/backends/amd/__init__.py +0 -0
  8. triton/backends/amd/compiler.py +112 -78
  9. triton/backends/amd/driver.c +5 -2
  10. triton/backends/amd/driver.py +149 -47
  11. triton/backends/compiler.py +7 -21
  12. triton/backends/nvidia/bin/ptxas.exe +0 -0
  13. triton/backends/nvidia/compiler.py +92 -93
  14. triton/backends/nvidia/driver.c +90 -98
  15. triton/backends/nvidia/driver.py +303 -128
  16. triton/compiler/code_generator.py +212 -111
  17. triton/compiler/compiler.py +110 -25
  18. triton/experimental/__init__.py +0 -0
  19. triton/experimental/gluon/__init__.py +4 -0
  20. triton/experimental/gluon/_compiler.py +0 -0
  21. triton/experimental/gluon/_runtime.py +99 -0
  22. triton/experimental/gluon/language/__init__.py +18 -0
  23. triton/experimental/gluon/language/_core.py +312 -0
  24. triton/experimental/gluon/language/_layouts.py +230 -0
  25. triton/experimental/gluon/language/_math.py +12 -0
  26. triton/experimental/gluon/language/_semantic.py +287 -0
  27. triton/experimental/gluon/language/_standard.py +47 -0
  28. triton/experimental/gluon/language/nvidia/__init__.py +4 -0
  29. triton/experimental/gluon/language/nvidia/blackwell/__init__.py +202 -0
  30. triton/experimental/gluon/language/nvidia/blackwell/tma.py +32 -0
  31. triton/experimental/gluon/language/nvidia/hopper/__init__.py +11 -0
  32. triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +51 -0
  33. triton/experimental/gluon/language/nvidia/hopper/tma.py +96 -0
  34. triton/experimental/gluon/nvidia/__init__.py +4 -0
  35. triton/experimental/gluon/nvidia/blackwell.py +3 -0
  36. triton/experimental/gluon/nvidia/hopper.py +40 -0
  37. triton/knobs.py +481 -0
  38. triton/language/__init__.py +39 -14
  39. triton/language/core.py +794 -537
  40. triton/language/extra/cuda/__init__.py +10 -7
  41. triton/language/extra/cuda/gdc.py +42 -0
  42. triton/language/extra/cuda/libdevice.py +394 -394
  43. triton/language/extra/cuda/utils.py +21 -21
  44. triton/language/extra/hip/libdevice.py +113 -104
  45. triton/language/math.py +65 -66
  46. triton/language/random.py +12 -2
  47. triton/language/semantic.py +1706 -1770
  48. triton/language/standard.py +116 -51
  49. triton/runtime/autotuner.py +117 -59
  50. triton/runtime/build.py +76 -12
  51. triton/runtime/cache.py +18 -47
  52. triton/runtime/driver.py +32 -29
  53. triton/runtime/interpreter.py +72 -35
  54. triton/runtime/jit.py +146 -110
  55. triton/testing.py +16 -12
  56. triton/tools/disasm.py +3 -4
  57. triton/tools/tensor_descriptor.py +36 -0
  58. triton/windows_utils.py +14 -6
  59. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.4.0.post20.dist-info}/METADATA +7 -2
  60. triton_windows-3.4.0.post20.dist-info/RECORD +186 -0
  61. triton_windows-3.4.0.post20.dist-info/entry_points.txt +3 -0
  62. triton_windows-3.4.0.post20.dist-info/licenses/LICENSE +23 -0
  63. triton_windows-3.4.0.post20.dist-info/top_level.txt +1 -0
  64. triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h +0 -358
  65. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +0 -1010
  66. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +0 -1638
  67. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +0 -1814
  68. triton/backends/amd/include/hip/amd_detail/amd_hip_bfloat16.h +0 -293
  69. triton/backends/amd/include/hip/amd_detail/amd_hip_common.h +0 -32
  70. triton/backends/amd/include/hip/amd_detail/amd_hip_complex.h +0 -174
  71. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +0 -835
  72. triton/backends/amd/include/hip/amd_detail/amd_hip_fp16.h +0 -1809
  73. triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +0 -1391
  74. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +0 -108
  75. triton/backends/amd/include/hip/amd_detail/amd_hip_math_constants.h +0 -124
  76. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h +0 -405
  77. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +0 -196
  78. triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +0 -565
  79. triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h +0 -2226
  80. triton/backends/amd/include/hip/amd_detail/amd_math_functions.h +0 -104
  81. triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h +0 -244
  82. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +0 -538
  83. triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +0 -288
  84. triton/backends/amd/include/hip/amd_detail/concepts.hpp +0 -30
  85. triton/backends/amd/include/hip/amd_detail/device_library_decls.h +0 -133
  86. triton/backends/amd/include/hip/amd_detail/functional_grid_launch.hpp +0 -218
  87. triton/backends/amd/include/hip/amd_detail/grid_launch.h +0 -67
  88. triton/backends/amd/include/hip/amd_detail/grid_launch.hpp +0 -50
  89. triton/backends/amd/include/hip/amd_detail/grid_launch_GGL.hpp +0 -26
  90. triton/backends/amd/include/hip/amd_detail/helpers.hpp +0 -137
  91. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +0 -1446
  92. triton/backends/amd/include/hip/amd_detail/hip_assert.h +0 -101
  93. triton/backends/amd/include/hip/amd_detail/hip_cooperative_groups_helper.h +0 -242
  94. triton/backends/amd/include/hip/amd_detail/hip_fp16_gcc.h +0 -254
  95. triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h +0 -96
  96. triton/backends/amd/include/hip/amd_detail/hip_ldg.h +0 -100
  97. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +0 -10570
  98. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +0 -78
  99. triton/backends/amd/include/hip/amd_detail/host_defines.h +0 -184
  100. triton/backends/amd/include/hip/amd_detail/hsa_helpers.hpp +0 -102
  101. triton/backends/amd/include/hip/amd_detail/macro_based_grid_launch.hpp +0 -798
  102. triton/backends/amd/include/hip/amd_detail/math_fwd.h +0 -698
  103. triton/backends/amd/include/hip/amd_detail/ockl_image.h +0 -177
  104. triton/backends/amd/include/hip/amd_detail/program_state.hpp +0 -107
  105. triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h +0 -491
  106. triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h +0 -478
  107. triton/backends/amd/include/hip/channel_descriptor.h +0 -39
  108. triton/backends/amd/include/hip/device_functions.h +0 -38
  109. triton/backends/amd/include/hip/driver_types.h +0 -468
  110. triton/backends/amd/include/hip/hip_bf16.h +0 -36
  111. triton/backends/amd/include/hip/hip_bfloat16.h +0 -44
  112. triton/backends/amd/include/hip/hip_common.h +0 -100
  113. triton/backends/amd/include/hip/hip_complex.h +0 -38
  114. triton/backends/amd/include/hip/hip_cooperative_groups.h +0 -46
  115. triton/backends/amd/include/hip/hip_deprecated.h +0 -95
  116. triton/backends/amd/include/hip/hip_ext.h +0 -161
  117. triton/backends/amd/include/hip/hip_fp16.h +0 -36
  118. triton/backends/amd/include/hip/hip_fp8.h +0 -33
  119. triton/backends/amd/include/hip/hip_gl_interop.h +0 -32
  120. triton/backends/amd/include/hip/hip_hcc.h +0 -24
  121. triton/backends/amd/include/hip/hip_math_constants.h +0 -36
  122. triton/backends/amd/include/hip/hip_profile.h +0 -27
  123. triton/backends/amd/include/hip/hip_runtime.h +0 -75
  124. triton/backends/amd/include/hip/hip_runtime_api.h +0 -9261
  125. triton/backends/amd/include/hip/hip_texture_types.h +0 -29
  126. triton/backends/amd/include/hip/hip_vector_types.h +0 -41
  127. triton/backends/amd/include/hip/hip_version.h +0 -17
  128. triton/backends/amd/include/hip/hiprtc.h +0 -421
  129. triton/backends/amd/include/hip/library_types.h +0 -78
  130. triton/backends/amd/include/hip/math_functions.h +0 -42
  131. triton/backends/amd/include/hip/surface_types.h +0 -63
  132. triton/backends/amd/include/hip/texture_types.h +0 -194
  133. triton/backends/amd/include/hsa/Brig.h +0 -1131
  134. triton/backends/amd/include/hsa/amd_hsa_common.h +0 -91
  135. triton/backends/amd/include/hsa/amd_hsa_elf.h +0 -462
  136. triton/backends/amd/include/hsa/amd_hsa_kernel_code.h +0 -269
  137. triton/backends/amd/include/hsa/amd_hsa_queue.h +0 -109
  138. triton/backends/amd/include/hsa/amd_hsa_signal.h +0 -80
  139. triton/backends/amd/include/hsa/hsa.h +0 -5738
  140. triton/backends/amd/include/hsa/hsa_amd_tool.h +0 -91
  141. triton/backends/amd/include/hsa/hsa_api_trace.h +0 -579
  142. triton/backends/amd/include/hsa/hsa_api_trace_version.h +0 -68
  143. triton/backends/amd/include/hsa/hsa_ext_amd.h +0 -3146
  144. triton/backends/amd/include/hsa/hsa_ext_finalize.h +0 -531
  145. triton/backends/amd/include/hsa/hsa_ext_image.h +0 -1454
  146. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +0 -488
  147. triton/backends/amd/include/hsa/hsa_ven_amd_loader.h +0 -667
  148. triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +0 -416
  149. triton/backends/amd/include/roctracer/ext/prof_protocol.h +0 -107
  150. triton/backends/amd/include/roctracer/hip_ostream_ops.h +0 -4515
  151. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +0 -1727
  152. triton/backends/amd/include/roctracer/hsa_prof_str.h +0 -3059
  153. triton/backends/amd/include/roctracer/roctracer.h +0 -779
  154. triton/backends/amd/include/roctracer/roctracer_ext.h +0 -81
  155. triton/backends/amd/include/roctracer/roctracer_hcc.h +0 -24
  156. triton/backends/amd/include/roctracer/roctracer_hip.h +0 -37
  157. triton/backends/amd/include/roctracer/roctracer_hsa.h +0 -112
  158. triton/backends/amd/include/roctracer/roctracer_plugin.h +0 -137
  159. triton/backends/amd/include/roctracer/roctracer_roctx.h +0 -67
  160. triton/backends/amd/include/roctracer/roctx.h +0 -229
  161. triton/language/_utils.py +0 -21
  162. triton/language/extra/cuda/_experimental_tma.py +0 -106
  163. triton/tools/experimental_descriptor.py +0 -32
  164. triton_windows-3.3.1.post19.dist-info/RECORD +0 -260
  165. triton_windows-3.3.1.post19.dist-info/top_level.txt +0 -14
  166. {triton_windows-3.3.1.post19.dist-info → triton_windows-3.4.0.post20.dist-info}/WHEEL +0 -0
@@ -0,0 +1,312 @@
1
+ from __future__ import annotations
2
+ from typing import TypeVar, List, TYPE_CHECKING, Tuple
3
+ from functools import wraps
4
+
5
+ if TYPE_CHECKING:
6
+ from triton._C.libtriton.gluon_ir import GluonOpBuilder
7
+ from ._semantic import GluonSemantic
8
+
9
+ from ._layouts import SharedLayout, DistributedLayout
10
+ from triton._C.libtriton import ir
11
+ import triton.language.core as tl_core
12
+ from triton.language.core import (
13
+ constexpr,
14
+ base_value,
15
+ base_type,
16
+ dtype,
17
+ block_type, # TODO: block type with layout info
18
+ pointer_type,
19
+ void,
20
+ int1,
21
+ int8,
22
+ int16,
23
+ int32,
24
+ int64,
25
+ uint8,
26
+ uint16,
27
+ uint32,
28
+ uint64,
29
+ float8e5,
30
+ float8e5b16,
31
+ float8e4nv,
32
+ float8e4b8,
33
+ float8e4b15,
34
+ float16,
35
+ bfloat16,
36
+ float32,
37
+ float64,
38
+ _unwrap_if_constexpr,
39
+ _unwrap_shape,
40
+ tensor,
41
+ tuple,
42
+ tuple_type,
43
+ )
44
+
45
+ _IMPORT_FROM_TRITON: List[str] = [
46
+ "expand_dims",
47
+ "join",
48
+ "load",
49
+ "maximum",
50
+ "minimum",
51
+ "permute",
52
+ "program_id",
53
+ "reduce",
54
+ "reshape",
55
+ "split",
56
+ "static_assert",
57
+ "static_print",
58
+ "store",
59
+ "to_tensor",
60
+ "where",
61
+ "inline_asm_elementwise",
62
+ ]
63
+
64
+ __all__ = [
65
+ "constexpr",
66
+ "base_value",
67
+ "base_type",
68
+ "dtype",
69
+ "block_type",
70
+ "pointer_type",
71
+ "tuple_type",
72
+ "void",
73
+ "int1",
74
+ "int8",
75
+ "int16",
76
+ "int32",
77
+ "int64",
78
+ "uint8",
79
+ "uint16",
80
+ "uint32",
81
+ "uint64",
82
+ "float8e5",
83
+ "float8e5b16",
84
+ "float8e4nv",
85
+ "float8e4b8",
86
+ "float8e4b8",
87
+ "float8e4b15",
88
+ "float16",
89
+ "bfloat16",
90
+ "float32",
91
+ "float64",
92
+ "_unwrap_if_constexpr",
93
+ "tensor",
94
+ "tuple",
95
+ "tuple_type",
96
+ "thread_barrier",
97
+ "arange",
98
+ "full",
99
+ "convert_layout",
100
+ "allocate_shared_memory",
101
+ "shared_memory_descriptor",
102
+ "warp_specialize",
103
+ *_IMPORT_FROM_TRITON,
104
+ ]
105
+
106
+ T = TypeVar("T")
107
+
108
+ # TODO: split these
109
+ GLUON_BUILTIN = "__triton_builtin__"
110
+
111
+
112
class distributed_type(block_type):
    """A tensor (block) type that additionally carries a distributed layout.

    Extends triton's ``block_type`` with a ``layout`` attribute describing
    how the tensor's elements are distributed across threads and warps.
    """

    def __init__(self, element_ty: dtype, shape: List[int], layout):
        # BUGFIX: validate the layout up front. The original asserted
        # isinstance(...) only *after* the layout had already been used to
        # build self.name, so a bad layout produced a confusing late failure.
        assert isinstance(layout, DistributedLayout)
        super().__init__(element_ty, shape)
        self.layout = layout
        self.name = f"<{self.shape}, {self.element_ty}, {self.layout}>"

    def to_ir(self, builder: ir.builder) -> ir.type:
        # Lower to an IR tensor type carrying an explicit layout attribute.
        elem_ty = self.element_ty.to_ir(builder)
        layout = self.layout._to_ir(builder)
        return builder.get_distributed_ty(elem_ty, self.shape, layout)

    def mangle(self) -> str:
        # Encode element type, shape, and layout into a stable symbol suffix.
        elt = self.scalar.mangle()
        shape = "_".join(map(str, self.shape))
        layout = self.layout.mangle()
        return f"{elt}S{shape}SL{layout}L"

    def with_element_ty(self, scalar_ty: dtype) -> block_type:
        """Return a copy of this type with the element type replaced."""
        return distributed_type(scalar_ty, self.shape, self.layout)
133
+
134
+
135
def builtin(fn: T) -> T:
    """Mark a function as a builtin.

    The wrapper enforces that a ``_semantic`` keyword argument is supplied,
    which only happens when the function is invoked from JIT-compiled code.
    """
    assert callable(fn)

    @wraps(fn)
    def wrapper(*args, **kwargs):
        # kwargs.get(...) is None covers both "missing" and "explicitly None".
        if kwargs.get("_semantic") is None:
            raise ValueError("Did you forget to add @triton.gluon.jit ? "
                             "(`_semantic` argument must be provided outside of JIT functions.)")
        return fn(*args, **kwargs)

    # Tag the wrapper so the compiler can recognize it as a builtin.
    setattr(wrapper, GLUON_BUILTIN, True)
    return wrapper
149
+
150
+
151
class shared_memory_descriptor_type(base_type):
    """Type of a handle to a shared-memory allocation.

    ``shape`` is the shape of the current view, while ``alloc_shape`` is the
    shape of the underlying allocation; the two differ after slicing or
    reinterpreting the descriptor.
    """

    def __init__(self, element_ty, shape, layout, alloc_shape):
        self.element_ty = element_ty
        self.shape = shape
        self.layout = layout
        self.alloc_shape = alloc_shape
        assert isinstance(layout, SharedLayout)

    def to_ir(self, builder: GluonOpBuilder) -> None:
        return builder.get_shared_mem_desc_ty(
            self.element_ty.to_ir(builder),
            self.shape,
            self.layout._to_ir(builder),
            self.alloc_shape,
        )

    def _unflatten_ir(self, handles: List[ir.Value], cursor: int) -> Tuple[shared_memory_descriptor, int]:
        # Rebuild a descriptor value from the flat handle list; consumes one handle.
        value = shared_memory_descriptor(handles[cursor], self.element_ty, self.shape, self.layout, self.alloc_shape)
        return value, cursor + 1

    def _flatten_ir_types(self, builder: GluonOpBuilder, out: List[ir.type]) -> None:
        out.append(self.to_ir(builder))

    def __str__(self) -> str:
        return f"shared_memory_descriptor<{self.element_ty}, {self.shape}, {self.layout}, {self.alloc_shape}>"

    def __eq__(self, other) -> bool:
        # BUGFIX: also compare element_ty — descriptors with identical shape
        # and layout but different element types must not compare equal.
        return (type(self) is type(other) and self.element_ty == other.element_ty and self.shape == other.shape
                and self.layout == other.layout and self.alloc_shape == other.alloc_shape)

    # BUGFIX: the rich-comparison hook is __ne__, not __neq__.  The original
    # misspelled method was dead code and `!=` fell back to default identity
    # semantics for objects defining only __eq__ on older conventions.
    def __ne__(self, other) -> bool:
        return not (self == other)

    def mangle(self) -> str:
        shape_str = "_".join([str(s) for s in self.shape])
        return f"MD{self.element_ty.mangle()}S{shape_str}SL{self.layout.mangle()}LAS{self.alloc_shape}ASMD"
188
+
189
+
190
class shared_memory_descriptor(base_value):
    """Value wrapping an IR handle to a shared-memory allocation."""

    def __init__(self, handle, element_ty, shape, layout, alloc_shape):
        self.handle = handle
        self.type = shared_memory_descriptor_type(element_ty, shape, layout, alloc_shape)

    def _flatten_ir(self, handles: List[ir.value]) -> None:
        handles.append(self.handle)

    @property
    def dtype(self):
        # Element type of the allocation.
        return self.type.element_ty

    @property
    def shape(self):
        # Shape of the current view (not necessarily the allocation shape).
        return self.type.shape

    @property
    def rank(self):
        return len(self.shape)

    @property
    def layout(self):
        return self.type.layout

    def __str__(self) -> str:
        return str(self.type)

    # CONSISTENCY FIX: `_semantic` now defaults to None on every builtin
    # method below, matching slice/index/_reinterpret/_keep_alive.  The
    # @builtin wrapper still rejects a missing/None `_semantic`, so behavior
    # outside the JIT is unchanged and the change is backward compatible.

    @builtin
    def load(self, layout, _semantic: GluonSemantic = None) -> tensor:
        """Load this shared-memory view into a register tensor with `layout`."""
        layout = _unwrap_if_constexpr(layout)
        return _semantic.shared_load(self, layout)

    @builtin
    def store(self, value, _semantic: GluonSemantic = None) -> None:
        """Store a register tensor into this shared-memory view."""
        return _semantic.shared_store(self, value)

    @builtin
    def slice(self, start, length, dim=0, _semantic: GluonSemantic = None) -> shared_memory_descriptor:
        """Return a sub-view of `length` elements starting at `start` along `dim`."""
        start = _unwrap_if_constexpr(start)
        length = _unwrap_if_constexpr(length)
        dim = _unwrap_if_constexpr(dim)
        return _semantic.memdesc_slice(self, start, length, dim)

    @builtin
    def index(self, index, _semantic: GluonSemantic = None) -> shared_memory_descriptor:
        """Return the sub-view at position `index` along the leading dimension."""
        index = _unwrap_if_constexpr(index)
        return _semantic.memdesc_index(self, index)

    @builtin
    def permute(self, order, _semantic: GluonSemantic = None) -> shared_memory_descriptor:
        """Return a transposed view with dimensions reordered by `order`."""
        order = [_unwrap_if_constexpr(o) for o in order]
        return _semantic.memdesc_trans(self, order)

    @builtin
    def reshape(self, shape, layout, _semantic: GluonSemantic = None) -> shared_memory_descriptor:
        """Return a view with a new `shape` and shared `layout`."""
        shape = [_unwrap_if_constexpr(s) for s in shape]
        layout = _unwrap_if_constexpr(layout)

        return _semantic.memdesc_reshape(self, shape, layout)

    @builtin
    def _reinterpret(self, dtype, shape, layout, _semantic: GluonSemantic = None) -> shared_memory_descriptor:
        """Reinterpret the underlying bytes with a new dtype/shape/layout."""
        dtype = _unwrap_if_constexpr(dtype)
        shape = [_unwrap_if_constexpr(s) for s in shape]
        layout = _unwrap_if_constexpr(layout)

        return _semantic.memdesc_reinterpret(self, dtype, shape, layout)

    @builtin
    def _keep_alive(self, _semantic: GluonSemantic = None) -> None:
        # Emits a dealloc marker that keeps the allocation live up to this point.
        return _semantic.shared_dealloc(self)
262
+
263
+
264
# Wrap each whitelisted triton.language function as a Gluon builtin and
# install it in this module's namespace.
for name in _IMPORT_FROM_TRITON:
    globals()[name] = builtin(getattr(tl_core, name))
267
+
268
+
269
@builtin
def arange(start, end, layout, _semantic=None):
    """Return a 1-D tensor of [start, end) with the given distributed layout."""
    unwrapped = [_unwrap_if_constexpr(a) for a in (start, end, layout)]
    return _semantic.arange(*unwrapped)
275
+
276
+
277
@builtin
def convert_layout(value, layout, _semantic=None):
    """Convert `value` to an equivalent tensor with the given layout."""
    return _semantic.convert_layout(value, _unwrap_if_constexpr(layout))
281
+
282
+
283
@builtin
def full(shape, value, dtype, layout, _semantic=None):
    """Return a tensor of `shape` filled with `value`, typed `dtype`, laid out by `layout`."""
    shape = _unwrap_shape(shape)
    value, dtype, layout = (_unwrap_if_constexpr(x) for x in (value, dtype, layout))
    return _semantic.full(shape, value, dtype, layout)
290
+
291
+
292
@builtin
def allocate_shared_memory(element_ty, shape, layout, value=None, _semantic=None):
    """Allocate shared memory for `shape` elements of `element_ty`, optionally initialized from `value`."""
    element_ty = _unwrap_if_constexpr(element_ty)
    # Unwrap the shape container first, then each individual dimension.
    dims = [_unwrap_if_constexpr(d) for d in _unwrap_if_constexpr(shape)]
    return _semantic.allocate_shared(element_ty, dims, _unwrap_if_constexpr(layout), value)
299
+
300
+
301
@builtin
def warp_specialize(args, default_partition, worker_partitions, worker_num_warps, worker_num_regs, #
                    _semantic=None, _generator=None):
    """Run `default_partition` alongside worker partitions with per-worker warp/register budgets."""
    num_warps = [_unwrap_if_constexpr(v) for v in worker_num_warps]
    num_regs = [_unwrap_if_constexpr(v) for v in worker_num_regs]
    return _semantic.warp_specialize(args, default_partition, worker_partitions, num_warps, #
                                     num_regs, _generator)
308
+
309
+
310
@builtin
def thread_barrier(_semantic=None):
    """Emit a barrier via the semantic layer's debug_barrier hook."""
    return _semantic.debug_barrier()
@@ -0,0 +1,230 @@
1
+ from dataclasses import dataclass
2
+ from typing import List, Optional
3
+ from triton.language.core import _unwrap_if_constexpr, _unwrap_shape
4
+
5
+ __all__ = [
6
+ "BlockedLayout",
7
+ "SliceLayout",
8
+ "DistributedLinearLayout",
9
+ "NVMMASharedLayout",
10
+ "SwizzledSharedLayout",
11
+ ]
12
+
13
+
14
+ def _realize_cta_layout(rank, ctas_per_cga, cta_split_num, cta_order):
15
+ ctas_per_cga = ctas_per_cga or [1] * rank
16
+ cta_split_num = cta_split_num or [1] * rank
17
+ cta_order = cta_order or list(reversed(range(rank)))
18
+ return ctas_per_cga, cta_split_num, cta_order
19
+
20
+
21
class DistributedLayout:
    """Marker base class for layouts that distribute tensor elements across
    execution resources (see BlockedLayout, SliceLayout,
    DistributedLinearLayout)."""
    pass
23
+
24
+
25
@dataclass(frozen=True)
class BlockedLayout(DistributedLayout):
    """Blocked distributed layout.

    Describes how tensor elements are tiled over threads and warps; all list
    fields must share the same rank.  The three ``cta_*`` fields are optional
    and default to trivial values when lowered (see ``_realize_cta_layout``).
    """
    size_per_thread: List[int]
    threads_per_warp: List[int]
    warps_per_cta: List[int]
    order: List[int]
    ctas_per_cga: Optional[List[int]] = None
    cta_split_num: Optional[List[int]] = None
    cta_order: Optional[List[int]] = None

    def __post_init__(self):
        # Fields may arrive as triton constexpr wrappers; normalize them once.
        # object.__setattr__ via super() is required because the dataclass is frozen.
        super().__setattr__("size_per_thread", _unwrap_if_constexpr(self.size_per_thread))
        super().__setattr__("threads_per_warp", _unwrap_if_constexpr(self.threads_per_warp))
        super().__setattr__("warps_per_cta", _unwrap_if_constexpr(self.warps_per_cta))
        super().__setattr__("order", _unwrap_if_constexpr(self.order))
        super().__setattr__("ctas_per_cga", _unwrap_if_constexpr(self.ctas_per_cga))
        super().__setattr__("cta_split_num", _unwrap_if_constexpr(self.cta_split_num))
        super().__setattr__("cta_order", _unwrap_if_constexpr(self.cta_order))

        # All per-dimension lists must agree on rank.
        rank = len(self.size_per_thread)
        assert len(self.threads_per_warp) == rank
        assert len(self.warps_per_cta) == rank
        assert len(self.order) == rank
        assert self.ctas_per_cga is None or len(self.ctas_per_cga) == rank
        assert self.cta_split_num is None or len(self.cta_split_num) == rank
        assert self.cta_order is None or len(self.cta_order) == rank

    def _to_ir(self, builder):
        # Lower to the IR blocked-layout attribute, materializing CTA defaults.
        rank = len(self.size_per_thread)
        ctas_per_cga, cta_split_num, cta_order = _realize_cta_layout(rank, self.ctas_per_cga, self.cta_split_num,
                                                                     self.cta_order)
        return builder.get_blocked_layout(
            self.size_per_thread,
            self.threads_per_warp,
            self.warps_per_cta,
            self.order,
            ctas_per_cga,
            cta_split_num,
            cta_order,
        )

    def mangle(self) -> str:
        # Encode all fields into a compact, stable string used for symbol mangling.
        # None fields mangle to the empty string.

        def stringify(x):
            if x is None:
                return ""
            return "_".join(map(str, x))

        size_per_thread = stringify(self.size_per_thread)
        threads_per_warp = stringify(self.threads_per_warp)
        warps_per_cta = stringify(self.warps_per_cta)
        order = stringify(self.order)
        ctas_per_cga = stringify(self.ctas_per_cga)
        cta_split_num = stringify(self.cta_split_num)
        cta_order = stringify(self.cta_order)
        return f"B{size_per_thread}B{threads_per_warp}B{warps_per_cta}B{order}B{ctas_per_cga}B{cta_split_num}B{cta_order}B"
81
+
82
+
83
@dataclass(frozen=True)
class SliceLayout(DistributedLayout):
    """Layout of a tensor produced by slicing away dimension ``dim`` of a
    tensor with the ``parent`` distributed layout."""
    dim: int
    parent: DistributedLayout

    def __post_init__(self):
        # Normalize constexpr-wrapped fields; super().__setattr__ bypasses frozen.
        for field_name in ("dim", "parent"):
            super().__setattr__(field_name, _unwrap_if_constexpr(getattr(self, field_name)))

    def _to_ir(self, builder):
        return builder.get_slice_layout(self.dim, self.parent._to_ir(builder))

    def mangle(self) -> str:
        return f"SL{self.dim}_{self.parent.mangle()}SL"
100
+
101
+
102
@dataclass(frozen=True)
class DistributedLinearLayout(DistributedLayout):
    """Linear layout given directly by its basis vectors.

    Each basis vector must have the same rank as ``shape``.
    """
    reg_bases: List[List[int]]
    lane_bases: List[List[int]]
    warp_bases: List[List[int]]
    block_bases: List[List[int]]
    shape: List[int]

    def __post_init__(self):
        # Normalize constexpr-wrapped fields; the dataclass is frozen, so
        # mutate through super().__setattr__.
        for attr in ("reg_bases", "lane_bases", "warp_bases", "block_bases", "shape"):
            super().__setattr__(attr, _unwrap_shape(getattr(self, attr)))

        # Every basis vector must match the rank of the shape.
        rank = len(self.shape)
        for bases in (self.reg_bases, self.lane_bases, self.warp_bases, self.block_bases):
            for basis in bases:
                assert len(basis) == rank

    def _to_ir(self, builder):
        return builder.get_distributed_linear_layout(self.reg_bases, self.lane_bases, self.warp_bases, self.block_bases,
                                                     self.shape)

    def mangle(self):
        return f"DLL{self.reg_bases}_{self.lane_bases}_{self.warp_bases}_{self.block_bases}_{self.shape}DLL"
134
+
135
+
136
class SharedLayout:
    """Marker base class for shared-memory layouts (see NVMMASharedLayout,
    SwizzledSharedLayout)."""
    pass
138
+
139
+
140
@dataclass(frozen=True)
class NVMMASharedLayout(SharedLayout):
    """Shared-memory layout parameterized for NVIDIA MMA-style access.

    ``swizzle_byte_width`` must be one of 0/32/64/128 and
    ``element_bitwidth`` one of 8/16/32/64.  The ``cta_*`` fields are optional
    and default to trivial values when lowered (see ``_realize_cta_layout``).
    """
    swizzle_byte_width: int
    element_bitwidth: int
    rank: int
    transposed: bool = False
    fp4_padded: bool = False
    ctas_per_cga: Optional[List[int]] = None
    cta_split_num: Optional[List[int]] = None
    cta_order: Optional[List[int]] = None

    def __post_init__(self):
        # Fields may arrive as triton constexpr wrappers; normalize them once.
        # super().__setattr__ is required because the dataclass is frozen.
        super().__setattr__("swizzle_byte_width", _unwrap_if_constexpr(self.swizzle_byte_width))
        super().__setattr__("element_bitwidth", _unwrap_if_constexpr(self.element_bitwidth))
        super().__setattr__("rank", _unwrap_if_constexpr(self.rank))
        super().__setattr__("transposed", _unwrap_if_constexpr(self.transposed))
        super().__setattr__("fp4_padded", _unwrap_if_constexpr(self.fp4_padded))
        super().__setattr__("ctas_per_cga", _unwrap_if_constexpr(self.ctas_per_cga))
        super().__setattr__("cta_split_num", _unwrap_if_constexpr(self.cta_split_num))
        super().__setattr__("cta_order", _unwrap_if_constexpr(self.cta_order))

        assert self.element_bitwidth in [8, 16, 32, 64]
        assert self.swizzle_byte_width in [0, 32, 64, 128]
        rank = self.rank
        assert self.ctas_per_cga is None or len(self.ctas_per_cga) == rank
        assert self.cta_split_num is None or len(self.cta_split_num) == rank
        assert self.cta_order is None or len(self.cta_order) == rank

    def _to_ir(self, builder):
        # Lower to the IR NVMMA shared layout attribute, materializing CTA defaults.
        ctas_per_cga, cta_split_num, cta_order = _realize_cta_layout(self.rank, self.ctas_per_cga, self.cta_split_num,
                                                                     self.cta_order)
        return builder.get_nvmma_shared_layout(
            self.swizzle_byte_width,
            self.element_bitwidth,
            self.transposed,
            self.fp4_padded,
            ctas_per_cga,
            cta_split_num,
            cta_order,
        )

    def mangle(self) -> str:
        # NOTE(review): the CTA fields are not part of the mangled name here,
        # unlike BlockedLayout — presumably intentional; confirm upstream.
        return f"NVMMA_{self.swizzle_byte_width}_{self.element_bitwidth}_{self.transposed}_{self.fp4_padded}_NVMMA"
183
+
184
+
185
# CONSISTENCY FIX: dropped the redundant eq=True (it is the dataclass
# default) to match the decorators on the sibling layout classes.
@dataclass(frozen=True)
class SwizzledSharedLayout(SharedLayout):
    """Swizzled shared-memory layout (vec / per_phase / max_phase scheme).

    The ``cta_*`` fields are optional and default to trivial values when
    lowered (see ``_realize_cta_layout``).
    """
    vec: int
    per_phase: int
    max_phase: int
    order: List[int]
    ctas_per_cga: Optional[List[int]] = None
    cta_split_num: Optional[List[int]] = None
    cta_order: Optional[List[int]] = None

    def __post_init__(self):
        # Fields may arrive as triton constexpr wrappers; normalize them once.
        # super().__setattr__ is required because the dataclass is frozen.
        super().__setattr__("vec", _unwrap_if_constexpr(self.vec))
        super().__setattr__("per_phase", _unwrap_if_constexpr(self.per_phase))
        super().__setattr__("max_phase", _unwrap_if_constexpr(self.max_phase))
        super().__setattr__("order", _unwrap_if_constexpr(self.order))
        super().__setattr__("ctas_per_cga", _unwrap_if_constexpr(self.ctas_per_cga))
        super().__setattr__("cta_split_num", _unwrap_if_constexpr(self.cta_split_num))
        super().__setattr__("cta_order", _unwrap_if_constexpr(self.cta_order))

        rank = len(self.order)
        assert self.ctas_per_cga is None or len(self.ctas_per_cga) == rank
        assert self.cta_split_num is None or len(self.cta_split_num) == rank
        assert self.cta_order is None or len(self.cta_order) == rank

    def _to_ir(self, builder):
        rank = len(self.order)
        ctas_per_cga, cta_split_num, cta_order = _realize_cta_layout(rank, self.ctas_per_cga, self.cta_split_num,
                                                                     self.cta_order)
        # CONSISTENCY FIX: vec/per_phase/max_phase were already unwrapped in
        # __post_init__, so use the attributes directly instead of re-unwrapping
        # (matches how BlockedLayout and NVMMASharedLayout lower their fields).
        return builder.get_swizzled_shared_layout(
            self.vec,
            self.per_phase,
            self.max_phase,
            self.order,
            ctas_per_cga,
            cta_split_num,
            cta_order,
        )

    def mangle(self) -> str:
        # None fields mangle to the empty string.

        def stringify(x):
            if x is None:
                return ""
            return "_".join(map(str, x))

        return f"SSS_{self.vec}_{self.per_phase}_{self.max_phase}_{stringify(self.order)}_{stringify(self.ctas_per_cga)}_{stringify(self.cta_split_num)}_{stringify(self.cta_order)}_SSS"
@@ -0,0 +1,12 @@
1
+ # flake8: noqa
2
+ import triton.language.math as tl_math
3
+ from ._core import builtin
4
+
5
+ __all__ = [
6
+ "umulhi", "exp", "exp2", "fma", "log", "log2", "cos", "rsqrt", "sin", "sqrt", "sqrt_rn", "abs", "fdiv", "div_rn",
7
+ "erf", "floor", "ceil"
8
+ ]
9
+
10
+ for name in __all__:
11
+ fn = getattr(tl_math, name)
12
+ globals()[name] = builtin(fn)