triton-windows 3.5.1.post21__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- triton/_C/libtriton.pyd +0 -0
- triton/__init__.py +82 -0
- triton/_filecheck.py +97 -0
- triton/_internal_testing.py +255 -0
- triton/_utils.py +126 -0
- triton/backends/__init__.py +47 -0
- triton/backends/amd/__init__.py +0 -0
- triton/backends/amd/compiler.py +461 -0
- triton/backends/amd/driver.c +283 -0
- triton/backends/amd/driver.py +724 -0
- triton/backends/amd/lib/asanrtl.bc +0 -0
- triton/backends/amd/lib/ockl.bc +0 -0
- triton/backends/amd/lib/ocml.bc +0 -0
- triton/backends/compiler.py +90 -0
- triton/backends/driver.py +66 -0
- triton/backends/nvidia/__init__.py +0 -0
- triton/backends/nvidia/bin/ptxas.exe +0 -0
- triton/backends/nvidia/compiler.py +533 -0
- triton/backends/nvidia/driver.c +517 -0
- triton/backends/nvidia/driver.py +799 -0
- triton/backends/nvidia/include/cuda.h +26280 -0
- triton/backends/nvidia/lib/libdevice.10.bc +0 -0
- triton/backends/nvidia/lib/x64/cuda.lib +0 -0
- triton/compiler/__init__.py +7 -0
- triton/compiler/code_generator.py +1614 -0
- triton/compiler/compiler.py +509 -0
- triton/compiler/errors.py +51 -0
- triton/compiler/make_launcher.py +0 -0
- triton/errors.py +5 -0
- triton/experimental/__init__.py +0 -0
- triton/experimental/gluon/__init__.py +5 -0
- triton/experimental/gluon/_compiler.py +0 -0
- triton/experimental/gluon/_runtime.py +102 -0
- triton/experimental/gluon/language/__init__.py +119 -0
- triton/experimental/gluon/language/_core.py +490 -0
- triton/experimental/gluon/language/_layouts.py +583 -0
- triton/experimental/gluon/language/_math.py +20 -0
- triton/experimental/gluon/language/_semantic.py +380 -0
- triton/experimental/gluon/language/_standard.py +80 -0
- triton/experimental/gluon/language/amd/__init__.py +4 -0
- triton/experimental/gluon/language/amd/_layouts.py +96 -0
- triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
- triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
- triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
- triton/experimental/gluon/language/extra/__init__.py +3 -0
- triton/experimental/gluon/language/nvidia/__init__.py +4 -0
- triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
- triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
- triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
- triton/experimental/gluon/language/nvidia/blackwell/__init__.py +387 -0
- triton/experimental/gluon/language/nvidia/blackwell/tma.py +52 -0
- triton/experimental/gluon/language/nvidia/hopper/__init__.py +132 -0
- triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +34 -0
- triton/experimental/gluon/language/nvidia/hopper/tma.py +97 -0
- triton/experimental/gluon/nvidia/__init__.py +4 -0
- triton/experimental/gluon/nvidia/blackwell.py +3 -0
- triton/experimental/gluon/nvidia/hopper.py +45 -0
- triton/knobs.py +546 -0
- triton/language/__init__.py +342 -0
- triton/language/core.py +3405 -0
- triton/language/extra/__init__.py +26 -0
- triton/language/extra/cuda/__init__.py +16 -0
- triton/language/extra/cuda/gdc.py +42 -0
- triton/language/extra/cuda/libdevice.py +1629 -0
- triton/language/extra/cuda/utils.py +109 -0
- triton/language/extra/hip/__init__.py +5 -0
- triton/language/extra/hip/libdevice.py +491 -0
- triton/language/extra/hip/utils.py +35 -0
- triton/language/extra/libdevice.py +790 -0
- triton/language/math.py +249 -0
- triton/language/random.py +218 -0
- triton/language/semantic.py +1939 -0
- triton/language/standard.py +534 -0
- triton/language/target_info.py +54 -0
- triton/runtime/__init__.py +23 -0
- triton/runtime/_allocation.py +44 -0
- triton/runtime/_async_compile.py +55 -0
- triton/runtime/autotuner.py +476 -0
- triton/runtime/build.py +168 -0
- triton/runtime/cache.py +317 -0
- triton/runtime/driver.py +38 -0
- triton/runtime/errors.py +36 -0
- triton/runtime/interpreter.py +1414 -0
- triton/runtime/jit.py +1107 -0
- triton/runtime/tcc/include/_mingw.h +168 -0
- triton/runtime/tcc/include/assert.h +62 -0
- triton/runtime/tcc/include/conio.h +409 -0
- triton/runtime/tcc/include/ctype.h +281 -0
- triton/runtime/tcc/include/dir.h +31 -0
- triton/runtime/tcc/include/direct.h +68 -0
- triton/runtime/tcc/include/dirent.h +135 -0
- triton/runtime/tcc/include/dos.h +55 -0
- triton/runtime/tcc/include/errno.h +75 -0
- triton/runtime/tcc/include/excpt.h +123 -0
- triton/runtime/tcc/include/fcntl.h +52 -0
- triton/runtime/tcc/include/fenv.h +108 -0
- triton/runtime/tcc/include/float.h +75 -0
- triton/runtime/tcc/include/inttypes.h +297 -0
- triton/runtime/tcc/include/io.h +418 -0
- triton/runtime/tcc/include/iso646.h +36 -0
- triton/runtime/tcc/include/limits.h +116 -0
- triton/runtime/tcc/include/locale.h +91 -0
- triton/runtime/tcc/include/malloc.h +181 -0
- triton/runtime/tcc/include/math.h +497 -0
- triton/runtime/tcc/include/mem.h +13 -0
- triton/runtime/tcc/include/memory.h +40 -0
- triton/runtime/tcc/include/process.h +176 -0
- triton/runtime/tcc/include/sec_api/conio_s.h +42 -0
- triton/runtime/tcc/include/sec_api/crtdbg_s.h +19 -0
- triton/runtime/tcc/include/sec_api/io_s.h +33 -0
- triton/runtime/tcc/include/sec_api/mbstring_s.h +52 -0
- triton/runtime/tcc/include/sec_api/search_s.h +25 -0
- triton/runtime/tcc/include/sec_api/stdio_s.h +145 -0
- triton/runtime/tcc/include/sec_api/stdlib_s.h +67 -0
- triton/runtime/tcc/include/sec_api/stralign_s.h +30 -0
- triton/runtime/tcc/include/sec_api/string_s.h +41 -0
- triton/runtime/tcc/include/sec_api/sys/timeb_s.h +34 -0
- triton/runtime/tcc/include/sec_api/tchar_s.h +266 -0
- triton/runtime/tcc/include/sec_api/time_s.h +61 -0
- triton/runtime/tcc/include/sec_api/wchar_s.h +128 -0
- triton/runtime/tcc/include/setjmp.h +160 -0
- triton/runtime/tcc/include/share.h +28 -0
- triton/runtime/tcc/include/signal.h +63 -0
- triton/runtime/tcc/include/stdalign.h +16 -0
- triton/runtime/tcc/include/stdarg.h +14 -0
- triton/runtime/tcc/include/stdatomic.h +171 -0
- triton/runtime/tcc/include/stdbool.h +11 -0
- triton/runtime/tcc/include/stddef.h +42 -0
- triton/runtime/tcc/include/stdint.h +212 -0
- triton/runtime/tcc/include/stdio.h +429 -0
- triton/runtime/tcc/include/stdlib.h +591 -0
- triton/runtime/tcc/include/stdnoreturn.h +7 -0
- triton/runtime/tcc/include/string.h +164 -0
- triton/runtime/tcc/include/sys/fcntl.h +13 -0
- triton/runtime/tcc/include/sys/file.h +14 -0
- triton/runtime/tcc/include/sys/locking.h +30 -0
- triton/runtime/tcc/include/sys/stat.h +290 -0
- triton/runtime/tcc/include/sys/time.h +69 -0
- triton/runtime/tcc/include/sys/timeb.h +133 -0
- triton/runtime/tcc/include/sys/types.h +123 -0
- triton/runtime/tcc/include/sys/unistd.h +14 -0
- triton/runtime/tcc/include/sys/utime.h +146 -0
- triton/runtime/tcc/include/tcc/tcc_libm.h +618 -0
- triton/runtime/tcc/include/tccdefs.h +342 -0
- triton/runtime/tcc/include/tcclib.h +80 -0
- triton/runtime/tcc/include/tchar.h +1102 -0
- triton/runtime/tcc/include/tgmath.h +89 -0
- triton/runtime/tcc/include/time.h +287 -0
- triton/runtime/tcc/include/uchar.h +33 -0
- triton/runtime/tcc/include/unistd.h +1 -0
- triton/runtime/tcc/include/vadefs.h +11 -0
- triton/runtime/tcc/include/values.h +4 -0
- triton/runtime/tcc/include/varargs.h +12 -0
- triton/runtime/tcc/include/wchar.h +873 -0
- triton/runtime/tcc/include/wctype.h +172 -0
- triton/runtime/tcc/include/winapi/basetsd.h +149 -0
- triton/runtime/tcc/include/winapi/basetyps.h +85 -0
- triton/runtime/tcc/include/winapi/guiddef.h +156 -0
- triton/runtime/tcc/include/winapi/poppack.h +8 -0
- triton/runtime/tcc/include/winapi/pshpack1.h +8 -0
- triton/runtime/tcc/include/winapi/pshpack2.h +8 -0
- triton/runtime/tcc/include/winapi/pshpack4.h +8 -0
- triton/runtime/tcc/include/winapi/pshpack8.h +8 -0
- triton/runtime/tcc/include/winapi/qos.h +72 -0
- triton/runtime/tcc/include/winapi/shellapi.h +59 -0
- triton/runtime/tcc/include/winapi/winbase.h +2958 -0
- triton/runtime/tcc/include/winapi/wincon.h +309 -0
- triton/runtime/tcc/include/winapi/windef.h +293 -0
- triton/runtime/tcc/include/winapi/windows.h +127 -0
- triton/runtime/tcc/include/winapi/winerror.h +3166 -0
- triton/runtime/tcc/include/winapi/wingdi.h +4080 -0
- triton/runtime/tcc/include/winapi/winnls.h +778 -0
- triton/runtime/tcc/include/winapi/winnt.h +5837 -0
- triton/runtime/tcc/include/winapi/winreg.h +272 -0
- triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
- triton/runtime/tcc/include/winapi/winuser.h +5651 -0
- triton/runtime/tcc/include/winapi/winver.h +160 -0
- triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
- triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
- triton/runtime/tcc/lib/cuda.def +697 -0
- triton/runtime/tcc/lib/gdi32.def +337 -0
- triton/runtime/tcc/lib/kernel32.def +770 -0
- triton/runtime/tcc/lib/libtcc1.a +0 -0
- triton/runtime/tcc/lib/msvcrt.def +1399 -0
- triton/runtime/tcc/lib/python3.def +810 -0
- triton/runtime/tcc/lib/python310.def +1610 -0
- triton/runtime/tcc/lib/python311.def +1633 -0
- triton/runtime/tcc/lib/python312.def +1703 -0
- triton/runtime/tcc/lib/python313.def +1651 -0
- triton/runtime/tcc/lib/python313t.def +1656 -0
- triton/runtime/tcc/lib/python314.def +1800 -0
- triton/runtime/tcc/lib/python314t.def +1809 -0
- triton/runtime/tcc/lib/python39.def +1644 -0
- triton/runtime/tcc/lib/python3t.def +905 -0
- triton/runtime/tcc/lib/user32.def +658 -0
- triton/runtime/tcc/libtcc.dll +0 -0
- triton/runtime/tcc/tcc.exe +0 -0
- triton/testing.py +543 -0
- triton/tools/__init__.py +0 -0
- triton/tools/build_extern.py +365 -0
- triton/tools/compile.py +210 -0
- triton/tools/disasm.py +143 -0
- triton/tools/extra/cuda/compile.c +70 -0
- triton/tools/extra/cuda/compile.h +14 -0
- triton/tools/extra/hip/compile.cpp +66 -0
- triton/tools/extra/hip/compile.h +13 -0
- triton/tools/link.py +322 -0
- triton/tools/mxfp.py +301 -0
- triton/tools/ragged_tma.py +92 -0
- triton/tools/tensor_descriptor.py +34 -0
- triton/windows_utils.py +405 -0
- triton_windows-3.5.1.post21.dist-info/METADATA +46 -0
- triton_windows-3.5.1.post21.dist-info/RECORD +217 -0
- triton_windows-3.5.1.post21.dist-info/WHEEL +5 -0
- triton_windows-3.5.1.post21.dist-info/entry_points.txt +3 -0
- triton_windows-3.5.1.post21.dist-info/licenses/LICENSE +23 -0
- triton_windows-3.5.1.post21.dist-info/top_level.txt +1 -0
triton/experimental/gluon/language/__init__.py
@@ -0,0 +1,119 @@
+from ._core import (
+    base_value,
+    base_type,
+    block_type,
+    broadcast,
+    constexpr,
+    dtype,
+    void,
+    int1,
+    int8,
+    int16,
+    int32,
+    int64,
+    uint8,
+    uint16,
+    uint32,
+    uint64,
+    float8e5,
+    float8e5b16,
+    float8e4nv,
+    float8e4b8,
+    float8e4b15,
+    float16,
+    bfloat16,
+    float32,
+    float64,
+    pointer_type,
+    shared_memory_descriptor,
+    tensor,
+    tuple,
+    tuple_type,
+    _unwrap_if_constexpr,
+    # API Functions
+    allocate_shared_memory,
+    arange,
+    associative_scan,
+    atomic_add,
+    atomic_and,
+    atomic_cas,
+    atomic_max,
+    atomic_min,
+    atomic_or,
+    atomic_xchg,
+    atomic_xor,
+    convert_layout,
+    device_assert,
+    expand_dims,
+    full,
+    histogram,
+    inline_asm_elementwise,
+    join,
+    load,
+    map_elementwise,
+    max_constancy,
+    max_contiguous,
+    maximum,
+    minimum,
+    multiple_of,
+    num_programs,
+    permute,
+    program_id,
+    reduce,
+    reshape,
+    set_auto_layout,
+    split,
+    static_assert,
+    static_print,
+    static_range,
+    store,
+    thread_barrier,
+    to_tensor,
+    warp_specialize,
+    where,
+)
+from ._layouts import (
+    AutoLayout,
+    BlockedLayout,
+    SliceLayout,
+    DistributedLinearLayout,
+    DotOperandLayout,
+    NVMMADistributedLayout,
+    NVMMASharedLayout,
+    SwizzledSharedLayout,
+    PaddedSharedLayout,
+)
+from ._math import (
+    umulhi,
+    exp,
+    exp2,
+    fma,
+    log,
+    log2,
+    cos,
+    rsqrt,
+    sin,
+    sqrt,
+    sqrt_rn,
+    abs,
+    fdiv,
+    div_rn,
+    erf,
+    floor,
+    ceil,
+)
+from ._standard import (
+    cdiv,
+    full_like,
+    max,
+    min,
+    reduce_or,
+    sum,
+    xor_sum,
+    zeros,
+    zeros_like,
+)
+
+from . import nvidia
+from . import amd
+from . import extra
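
The `__init__.py` hunk above only re-exports names; the out-of-JIT guard behind them is defined in `_core.py` in the next hunk. As a minimal illustration (not part of the wheel contents, and assuming this wheel is installed), calling any of the re-exported builtins from plain Python raises the error defined there:

```python
# Sketch: gluon builtins require the `_semantic` object injected by the JIT.
from triton.experimental.gluon import language as gl

try:
    gl.arange(0, 16)  # called outside a JIT function, so no `_semantic` is passed
except ValueError as err:
    print(err)  # "Did you forget to add @triton.gluon.jit ? ..."
```
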
triton/experimental/gluon/language/_core.py
@@ -0,0 +1,490 @@
+from __future__ import annotations
+import math
+from typing import TypeVar, List, TYPE_CHECKING, Tuple
+from functools import wraps
+
+if TYPE_CHECKING:
+    from triton._C.libtriton.gluon_ir import GluonOpBuilder
+    from ._semantic import GluonSemantic
+
+from ._layouts import SharedLayout, DistributedLayout
+from triton._C.libtriton import ir
+import triton.language.core as tl_core
+from triton.language.core import (
+    constexpr,
+    base_value,
+    base_type,
+    dtype,
+    block_type,  # TODO: block type with layout info
+    pointer_type,
+    void,
+    int1,
+    int8,
+    int16,
+    int32,
+    int64,
+    uint8,
+    uint16,
+    uint32,
+    uint64,
+    float8e5,
+    float8e5b16,
+    float8e4nv,
+    float8e4b8,
+    float8e4b15,
+    float16,
+    bfloat16,
+    float32,
+    float64,
+    _unwrap_if_constexpr,
+    _unwrap_shape,
+    static_range,
+    tensor,
+    tuple,
+    tuple_type,
+)
+
+# We define __all__ only to appease the python linter, these are not used in
+# this file but we want to import them anyway so they are importable from here.
+__all__ = [
+    "constexpr",
+    "pointer_type",
+    "void",
+    "int1",
+    "int8",
+    "int16",
+    "int32",
+    "int64",
+    "uint8",
+    "uint16",
+    "uint32",
+    "uint64",
+    "float8e5",
+    "float8e5b16",
+    "float8e4nv",
+    "float8e4b8",
+    "float8e4b15",
+    "float16",
+    "bfloat16",
+    "float32",
+    "float64",
+    "static_range",
+    "tuple",
+    "tuple_type",
+]
+
+T = TypeVar("T")
+
+# TODO: split these
+GLUON_BUILTIN = "__triton_builtin__"
+
+
+def builtin(fn: T) -> T:
+    """Mark a function as a builtin."""
+    assert callable(fn)
+
+    @wraps(fn)
+    def wrapper(*args, **kwargs):
+        if "_semantic" not in kwargs or kwargs["_semantic"] is None:
+            raise ValueError("Did you forget to add @triton.gluon.jit ? "
+                             "(`_semantic` argument must be provided outside of JIT functions.)")
+        return fn(*args, **kwargs)
+
+    setattr(wrapper, GLUON_BUILTIN, True)
+
+    return wrapper
+
+
+# Explicitly import forwarded Triton language symbols so mypy sees them.
+associative_scan = builtin(tl_core.associative_scan)
+atomic_add = builtin(tl_core.atomic_add)
+atomic_and = builtin(tl_core.atomic_and)
+atomic_cas = builtin(tl_core.atomic_cas)
+atomic_max = builtin(tl_core.atomic_max)
+atomic_min = builtin(tl_core.atomic_min)
+atomic_or = builtin(tl_core.atomic_or)
+atomic_xchg = builtin(tl_core.atomic_xchg)
+atomic_xor = builtin(tl_core.atomic_xor)
+broadcast = builtin(tl_core.broadcast)
+device_assert = builtin(tl_core.device_assert)
+expand_dims = builtin(tl_core.expand_dims)
+inline_asm_elementwise = builtin(tl_core.inline_asm_elementwise)
+join = builtin(tl_core.join)
+load = builtin(tl_core.load)
+map_elementwise = builtin(tl_core.map_elementwise)
+max_constancy = builtin(tl_core.max_constancy)
+max_contiguous = builtin(tl_core.max_contiguous)
+maximum = builtin(tl_core.maximum)
+minimum = builtin(tl_core.minimum)
+multiple_of = builtin(tl_core.multiple_of)
+num_programs = builtin(tl_core.num_programs)
+permute = builtin(tl_core.permute)
+program_id = builtin(tl_core.program_id)
+reduce = builtin(tl_core.reduce)
+reshape = builtin(tl_core.reshape)
+split = builtin(tl_core.split)
+static_assert = builtin(tl_core.static_assert)
+static_print = builtin(tl_core.static_print)
+store = builtin(tl_core.store)
+to_tensor = builtin(tl_core.to_tensor)
+where = builtin(tl_core.where)
+
+
+class distributed_type(block_type):
+
+    def __init__(self, element_ty: dtype, shape: List[int], layout):
+        super().__init__(element_ty, shape)
+        self.layout = layout
+        self.name = f"<{self.shape}, {self.element_ty}, {self.layout}>"
+        assert isinstance(layout, DistributedLayout)
+
+    def to_ir(self, builder: ir.builder) -> ir.type:
+        elem_ty = self.element_ty.to_ir(builder)
+        layout = self.layout._to_ir(builder)
+        return builder.get_distributed_ty(elem_ty, self.shape, layout)
+
+    def mangle(self) -> str:
+        elt = self.scalar.mangle()
+        shape = "_".join(map(str, self.shape))
+        layout = self.layout.mangle()
+        return f"{elt}S{shape}SL{layout}L"
+
+    def with_element_ty(self, scalar_ty: dtype) -> block_type:
+        return distributed_type(scalar_ty, self.shape, self.layout)
+
+    def __eq__(self, other) -> bool:
+        if not isinstance(other, distributed_type):
+            return False
+        return super().__eq__(other) and self.layout == other.layout
+
+
+class shared_memory_descriptor_type(base_type):
+
+    def __init__(self, element_ty, shape, layout, alloc_shape):
+        self.element_ty = element_ty
+        self.shape = shape
+        self.layout = layout
+        self.alloc_shape = alloc_shape
+        assert isinstance(layout, SharedLayout)
+
+    def to_ir(self, builder: GluonOpBuilder) -> None:
+        return builder.get_shared_mem_desc_ty(
+            self.element_ty.to_ir(builder),
+            self.shape,
+            self.layout._to_ir(builder),
+            self.alloc_shape,
+        )
+
+    def _unflatten_ir(self, handles: List[ir.Value], cursor: int) -> Tuple[shared_memory_descriptor, int]:
+        value = shared_memory_descriptor(handles[cursor], self.element_ty, self.shape, self.layout, self.alloc_shape)
+        return value, cursor + 1
+
+    def _flatten_ir_types(self, builder: GluonOpBuilder, out: List[ir.type]) -> None:
+        out.append(self.to_ir(builder))
+
+    def __str__(self) -> str:
+        return f"shared_memory_descriptor<{self.element_ty}, {self.shape}, {self.layout}, {self.alloc_shape}>"
+
+    def __eq__(self, other) -> bool:
+        return (type(self) is type(other) and self.shape == other.shape and self.layout == other.layout
+                and self.alloc_shape == other.alloc_shape)
+
+    def __neq__(self, other) -> bool:
+        return not (self == other)
+
+    def mangle(self) -> str:
+        shape_str = "_".join([str(s) for s in self.shape])
+        return f"MD{self.element_ty.mangle()}S{shape_str}SL{self.layout.mangle()}LAS{self.alloc_shape}ASMD"
+
+
+class shared_memory_descriptor(base_value):
+    """
+    Represents a handle to a shared memory allocation in Gluon IR.
+    """
+
+    def __init__(self, handle, element_ty, shape, layout, alloc_shape):
+        self.handle = handle
+        self.type = shared_memory_descriptor_type(element_ty, shape, layout, alloc_shape)
+
+    def _flatten_ir(self, handles: List[ir.value]) -> None:
+        handles.append(self.handle)
+
+    @property
+    def dtype(self):
+        return self.type.element_ty
+
+    @property
+    def shape(self):
+        return self.type.shape
+
+    @property
+    def rank(self):
+        return len(self.shape)
+
+    @property
+    def numel(self) -> int:
+        return math.prod(self.shape)
+
+    @property
+    def layout(self):
+        return self.type.layout
+
+    def __str__(self) -> str:
+        return str(self.type)
+
+    @builtin
+    def load(self, layout, _semantic: GluonSemantic = None) -> tensor:
+        """
+        Load a tensor from shared memory.
+
+        Args:
+            layout (DistributedLayout): The destination layout of the tensor.
+
+        Returns:
+            tensor: A Gluon tensor containing the loaded data.
+        """
+        layout = _unwrap_if_constexpr(layout)
+        return _semantic.shared_load(self, layout)
+
+    @builtin
+    def store(self, value, _semantic: GluonSemantic = None) -> None:
+        """
+        Store a tensor into shared memory.
+
+        Args:
+            value (tensor): The tensor whose contents to store.
+        """
+        return _semantic.shared_store(self, value)
+
+    @builtin
+    def slice(self, start, length, dim=0, _semantic: GluonSemantic = None) -> shared_memory_descriptor:
+        """
+        Create a subview of shared memory by slicing along a given dimension.
+
+        Args:
+            start (int): The starting index of the slice.
+            length (int): The length of the slice.
+            dim (int): The dimension to slice (default: 0).
+
+        Returns:
+            shared_memory_descriptor: Descriptor for the sliced subview.
+        """
+        start = _unwrap_if_constexpr(start)
+        length = _unwrap_if_constexpr(length)
+        dim = _unwrap_if_constexpr(dim)
+        return _semantic.memdesc_slice(self, start, length, dim)
+
+    @builtin
+    def index(self, index, _semantic: GluonSemantic = None) -> shared_memory_descriptor:
+        """
+        Create a subview of shared memory by indexing along the first dimension.
+
+        Args:
+            index (int): The index at which to take the subview.
+
+        Returns:
+            shared_memory_descriptor: Descriptor for the indexed subview.
+        """
+        index = _unwrap_if_constexpr(index)
+        return _semantic.memdesc_index(self, index)
+
+    @builtin
+    def permute(self, order, _semantic: GluonSemantic = None) -> shared_memory_descriptor:
+        """
+        Permute the dimensions of the shared memory descriptor.
+
+        Args:
+            order (List[int]): The new ordering of dimensions.
+
+        Returns:
+            shared_memory_descriptor: Descriptor with permuted dimensions.
+        """
+        order = [_unwrap_if_constexpr(o) for o in order]
+        return _semantic.memdesc_trans(self, order)
+
+    @builtin
+    def reshape(self, shape, _semantic: GluonSemantic = None) -> shared_memory_descriptor:
+        """
+        Reshape the shared memory descriptor to a new shape and layout.
+
+        Args:
+            shape (List[int]): The target shape.
+
+        Returns:
+            shared_memory_descriptor: Descriptor with the new shape and layout.
+        """
+        shape = [_unwrap_if_constexpr(s) for s in shape]
+
+        return _semantic.memdesc_reshape(self, shape)
+
+    @builtin
+    def _reinterpret(self, dtype, shape, layout, _semantic: GluonSemantic = None) -> shared_memory_descriptor:
+        """
+        Reinterpret the shared memory descriptor as a different dtype, shape, or layout.
+
+        Args:
+            dtype (dtype): The new data type.
+            shape (List[int]): The new shape.
+            layout (SharedLayout): The new layout.
+
+        Returns:
+            shared_memory_descriptor: Descriptor with updated type and layout.
+        """
+        dtype = _unwrap_if_constexpr(dtype)
+        shape = [_unwrap_if_constexpr(s) for s in shape]
+        layout = _unwrap_if_constexpr(layout)
+
+        return _semantic.memdesc_reinterpret(self, dtype, shape, layout)
+
+    @builtin
+    def _keep_alive(self, _semantic: GluonSemantic = None) -> None:
+        """
+        Dummy use to keep the shared memory descriptor alive.
+        """
+        return _semantic.shared_dealloc(self)
+
+
+@builtin
+def arange(start, end, layout=None, _semantic=None):
+    """
+    Generate a sequence tensor with values in [start, end) using a specified layout.
+
+    Args:
+        start (int): Inclusive start of the sequence.
+        end (int): Exclusive end of the sequence.
+        layout (DistributedLayout): The layout of the output tensor. Defaults to AutoLayout.
+
+    Returns:
+        tensor: A 1D tensor containing sequential values.
+    """
+    start = _unwrap_if_constexpr(start)
+    end = _unwrap_if_constexpr(end)
+    layout = _unwrap_if_constexpr(layout)
+    return _semantic.arange(start, end, layout)
+
+
+@builtin
+def convert_layout(value, layout, assert_trivial=False, _semantic=None):
+    """
+    Convert a tensor to a different distributed layout.
+
+    Args:
+        value (tensor): The input tensor.
+        layout (DistributedLayout): The target layout.
+        assert_trivial (bool): If True, asserts that the conversion is trivial (no data movement).
+
+    Returns:
+        tensor: The tensor with the new layout.
+    """
+    layout = _unwrap_if_constexpr(layout)
+    return _semantic.convert_layout(value, layout, assert_trivial)
+
+
+@builtin
+def full(shape, value, dtype, layout=None, _semantic=None):
+    """
+    Create a tensor filled with a scalar value, with specified shape, dtype, and layout.
+
+    Args:
+        shape (Sequence[int]): The shape of the tensor.
+        value (int or float): The fill value.
+        dtype (dtype): The data type for the tensor.
+        layout (Optional[DistributedLayout]): The layout of the output tensor, defaults to AutoLayout().
+
+    Returns:
+        tensor: A tensor where every element equals value.
+    """
+    shape = _unwrap_shape(shape)
+    value = _unwrap_if_constexpr(value)
+    dtype = _unwrap_if_constexpr(dtype)
+    layout = _unwrap_if_constexpr(layout)
+    return _semantic.full(shape, value, dtype, layout)
+
+
+@builtin
+def histogram(input, num_bins, mask=None, layout=None, _semantic=None, _generator=None):
+    """
+    Compute a histogram of a 1D integer tensor.
+
+    Args:
+        input (tensor): 1D tensor of integer values.
+        num_bins (int): Number of bins. Bins have width 1 and start at 0.
+        mask (Optional[tensor]): Boolean mask to exclude elements when False.
+        layout (DistributedLayout): Destination layout of the output histogram.
+
+    Returns:
+        tensor: 1D int32 tensor of length `num_bins` with the requested layout.
+    """
+    num_bins = _unwrap_if_constexpr(num_bins)
+    layout = _unwrap_if_constexpr(layout)
+    if mask is not None:
+        mask = _semantic.to_tensor(mask)
+    return _semantic.histogram(input, num_bins, mask, layout)
+
+
+@builtin
+def allocate_shared_memory(element_ty, shape, layout, value=None, _semantic=None) -> shared_memory_descriptor:
+    """
+    Allocate shared memory for a tensor with the given element type, shape, and layout.
+
+    Args:
+        element_ty (dtype): The element data type.
+        shape (Sequence[int]): The dimensions of the shared memory.
+        layout (SharedLayout): The shared memory layout.
+        value (tensor, optional): Initial value to copy into shared memory.
+
+    Returns:
+        shared_memory_descriptor: Descriptor for the allocated memory.
+    """
+    element_ty = _unwrap_if_constexpr(element_ty)
+    shape = _unwrap_if_constexpr(shape)
+    shape = [_unwrap_if_constexpr(s) for s in shape]
+    layout = _unwrap_if_constexpr(layout)
+    return _semantic.allocate_shared(element_ty, shape, layout, value)
+
+
+@builtin
+def set_auto_layout(value, layout, _semantic=None):
+    """
+    Set a tensor with AutoLayout to a concrete layout.
+
+    Args:
+        value (tensor): The input tensor.
+        layout (DistributedLayout): The target layout.
+
+    Returns:
+        tensor: The tensor with the new layout.
+    """
+    layout = _unwrap_if_constexpr(layout)
+    return _semantic.set_auto_layout(value, layout)
+
+
+@builtin
+def warp_specialize(default_args, default_partition, worker_args, worker_partitions, worker_num_warps, worker_num_regs,
+                    _semantic=None, _generator=None):
+    """
+    Create a warp-specialized execution region, partitioning work across warps.
+
+    Args:
+        default_args (List[Any]): Arguments for the default region.
+        default_partition (callable): Function to build the default execution region.
+        worker_args (List[Any]): Arguments for each warp partition.
+        worker_partitions (List[callable]): Functions for each warp partition.
+        worker_num_warps (List[int]): Number of warps per partition.
+        worker_num_regs (List[int]): Number of registers per partition.
+
+    Returns:
+        Tuple[Any, ...]: Results from the default region.
+    """
+    worker_num_warps = [_unwrap_if_constexpr(w) for w in worker_num_warps]
+    worker_num_regs = [_unwrap_if_constexpr(r) for r in worker_num_regs]
+    return _semantic.warp_specialize(default_args, default_partition, worker_args, worker_partitions, worker_num_warps,
+                                     worker_num_regs, _generator)
+
+
+@builtin
+def thread_barrier(_semantic=None):
+    """
+    Insert a barrier to synchronize threads within a CTA.
+    """
+    return _semantic.debug_barrier()
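
The central mechanism in the `_core.py` hunk is the `builtin` decorator: it tags a function with the `GLUON_BUILTIN` attribute so the code generator can recognize it, and rejects calls that lack the `_semantic` object the compiler injects. Below is a minimal standalone sketch of that pattern; `FakeSemantic` is a hypothetical stand-in for the compiler-provided semantics object and is not part of the package.

```python
# Standalone sketch of the builtin/GLUON_BUILTIN pattern from _core.py above.
from functools import wraps

GLUON_BUILTIN = "__triton_builtin__"


def builtin(fn):
    @wraps(fn)
    def wrapper(*args, **kwargs):
        if kwargs.get("_semantic") is None:
            raise ValueError("`_semantic` must be provided outside of JIT functions.")
        return fn(*args, **kwargs)

    setattr(wrapper, GLUON_BUILTIN, True)  # marker the code generator looks for
    return wrapper


@builtin
def arange(start, end, layout=None, _semantic=None):
    return _semantic.arange(start, end, layout)


class FakeSemantic:
    # Hypothetical stand-in for GluonSemantic, used only for this demo.
    def arange(self, start, end, layout):
        return list(range(start, end))


print(getattr(arange, GLUON_BUILTIN, False))   # True
print(arange(0, 4, _semantic=FakeSemantic()))  # [0, 1, 2, 3]
```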