triton-windows 3.4.0.post20__cp310-cp310-win_amd64.whl → 3.5.0.post21__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- triton/_C/libtriton.pyd +0 -0
- triton/__init__.py +8 -2
- triton/_filecheck.py +24 -14
- triton/_internal_testing.py +70 -4
- triton/_utils.py +3 -1
- triton/backends/amd/compiler.py +68 -60
- triton/backends/amd/driver.c +113 -44
- triton/backends/amd/driver.py +133 -57
- triton/backends/driver.py +13 -0
- triton/backends/nvidia/compiler.py +80 -22
- triton/backends/nvidia/driver.c +88 -15
- triton/backends/nvidia/driver.py +130 -123
- triton/compiler/__init__.py +5 -2
- triton/compiler/code_generator.py +270 -163
- triton/compiler/compiler.py +45 -62
- triton/experimental/gluon/__init__.py +3 -2
- triton/experimental/gluon/_runtime.py +9 -6
- triton/experimental/gluon/language/__init__.py +117 -16
- triton/experimental/gluon/language/_core.py +246 -68
- triton/experimental/gluon/language/_layouts.py +398 -45
- triton/experimental/gluon/language/_math.py +17 -9
- triton/experimental/gluon/language/_semantic.py +130 -37
- triton/experimental/gluon/language/_standard.py +55 -22
- triton/experimental/gluon/language/amd/__init__.py +4 -0
- triton/experimental/gluon/language/amd/_layouts.py +96 -0
- triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
- triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
- triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
- triton/experimental/gluon/language/extra/__init__.py +3 -0
- triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
- triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
- triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
- triton/experimental/gluon/language/nvidia/blackwell/__init__.py +192 -7
- triton/experimental/gluon/language/nvidia/blackwell/tma.py +20 -0
- triton/experimental/gluon/language/nvidia/hopper/__init__.py +124 -3
- triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +20 -37
- triton/experimental/gluon/language/nvidia/hopper/tma.py +4 -3
- triton/experimental/gluon/nvidia/hopper.py +6 -1
- triton/knobs.py +132 -67
- triton/language/__init__.py +16 -10
- triton/language/core.py +163 -83
- triton/language/extra/cuda/gdc.py +6 -6
- triton/language/extra/hip/__init__.py +3 -1
- triton/language/extra/hip/libdevice.py +7 -0
- triton/language/extra/hip/utils.py +35 -0
- triton/language/extra/libdevice.py +4 -0
- triton/language/semantic.py +76 -23
- triton/language/standard.py +14 -14
- triton/language/target_info.py +54 -0
- triton/runtime/_allocation.py +15 -3
- triton/runtime/_async_compile.py +55 -0
- triton/runtime/autotuner.py +4 -5
- triton/runtime/build.py +11 -9
- triton/runtime/cache.py +44 -1
- triton/runtime/driver.py +16 -41
- triton/runtime/interpreter.py +31 -23
- triton/runtime/jit.py +318 -157
- triton/runtime/tcc/include/_mingw.h +8 -10
- triton/runtime/tcc/include/assert.h +5 -0
- triton/runtime/tcc/include/errno.h +1 -1
- triton/runtime/tcc/include/float.h +21 -3
- triton/runtime/tcc/include/iso646.h +36 -0
- triton/runtime/tcc/include/limits.h +5 -0
- triton/runtime/tcc/include/malloc.h +2 -2
- triton/runtime/tcc/include/math.h +21 -261
- triton/runtime/tcc/include/stdalign.h +16 -0
- triton/runtime/tcc/include/stdarg.h +5 -70
- triton/runtime/tcc/include/stdatomic.h +171 -0
- triton/runtime/tcc/include/stddef.h +7 -19
- triton/runtime/tcc/include/stdlib.h +15 -4
- triton/runtime/tcc/include/stdnoreturn.h +7 -0
- triton/runtime/tcc/include/sys/stat.h +2 -2
- triton/runtime/tcc/include/sys/types.h +5 -0
- triton/runtime/tcc/include/tcc/tcc_libm.h +444 -27
- triton/runtime/tcc/include/tccdefs.h +342 -0
- triton/runtime/tcc/include/tgmath.h +89 -0
- triton/runtime/tcc/include/uchar.h +33 -0
- triton/runtime/tcc/include/unistd.h +1 -0
- triton/runtime/tcc/include/winapi/qos.h +72 -0
- triton/runtime/tcc/include/winapi/shellapi.h +59 -0
- triton/runtime/tcc/include/winapi/winbase.h +9 -2
- triton/runtime/tcc/include/winapi/wincon.h +8 -0
- triton/runtime/tcc/include/winapi/windows.h +1 -1
- triton/runtime/tcc/include/winapi/winnls.h +778 -0
- triton/runtime/tcc/include/winapi/winnt.h +9 -7
- triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
- triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
- triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
- triton/runtime/tcc/lib/libtcc1.a +0 -0
- triton/runtime/tcc/lib/python314.def +1800 -0
- triton/runtime/tcc/lib/python314t.def +1809 -0
- triton/runtime/tcc/libtcc.dll +0 -0
- triton/runtime/tcc/tcc.exe +0 -0
- triton/tools/compile.py +62 -14
- triton/tools/extra/cuda/compile.c +1 -0
- triton/tools/extra/hip/compile.cpp +66 -0
- triton/tools/extra/hip/compile.h +13 -0
- triton/tools/ragged_tma.py +92 -0
- triton/tools/tensor_descriptor.py +7 -9
- triton/windows_utils.py +42 -79
- {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/METADATA +3 -4
- {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/RECORD +106 -75
- triton/runtime/tcc/lib/libtcc1-64.a +0 -0
- {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/WHEEL +0 -0
- {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/entry_points.txt +0 -0
- {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/licenses/LICENSE +0 -0
- {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/top_level.txt +0 -0
triton/language/__init__.py
CHANGED

@@ -55,9 +55,10 @@ from .core import (
     cat,
     cast,
     clamp,
+    condition,
     const,
     constexpr,
-    constexpr_function,
+    constexpr_type,
     debug_barrier,
     device_assert,
     device_print,
@@ -85,6 +86,7 @@ from .core import (
     join,
     load,
     make_block_ptr,
+    map_elementwise,
     max_constancy,
     max_contiguous,
     maximum,
@@ -130,6 +132,7 @@ from .random import (
     randn4x,
     uint_to_uniform_float,
 )
+from . import target_info
 
 __all__ = [
     "PropagateNan",
@@ -165,9 +168,10 @@ __all__ = [
     "cdiv",
     "ceil",
     "clamp",
+    "condition",
     "const",
     "constexpr",
-    "constexpr_function",
+    "constexpr_type",
     "cos",
     "cumprod",
     "cumsum",
@@ -210,6 +214,7 @@ __all__ = [
     "log",
     "log2",
     "make_block_ptr",
+    "map_elementwise",
     "math",
     "max",
     "max_constancy",
@@ -252,6 +257,7 @@ __all__ = [
     "store",
     "sum",
     "swizzle2d",
+    "target_info",
     "tensor",
     "topk",
     "trans",
@@ -271,12 +277,12 @@ __all__ = [
 ]
 
 
-def str_to_ty(name):
+def str_to_ty(name, c):
     from builtins import tuple
 
     if isinstance(name, tuple):
         fields = type(name).__dict__.get("_fields", None)
-        return tuple_type([str_to_ty(x) for x in name], fields)
+        return tuple_type([str_to_ty(x, c) for x in name], fields)
 
     if name[0] == "*":
         name = name[1:]
@@ -284,17 +290,17 @@ def str_to_ty(name):
         if name[0] == "k":
             name = name[1:]
             const = True
-        ty = str_to_ty(name)
+        ty = str_to_ty(name, c)
         return pointer_type(element_ty=ty, const=const)
 
     if name.startswith("tensordesc"):
         inner = name.split("<")[1].rstrip(">")
-        dtype, rest = inner.split("[", maxsplit=
-        block_shape, rest = rest.split("]", maxsplit=
+        dtype, rest = inner.split("[", maxsplit=1)
+        block_shape, rest = rest.split("]", maxsplit=1)
         block_shape = [int(s.strip()) for s in block_shape.rstrip("]").split(",")]
         layout = rest.lstrip(",")
         is_gluon = len(layout)
-        dtype = str_to_ty(dtype)
+        dtype = str_to_ty(dtype, None)
         ndim = len(block_shape)
         shape_type = tuple_type([int32] * ndim)
         # FIXME: Last dim stride should be constexpr(1)
@@ -308,8 +314,8 @@ def str_to_ty(name):
             return gluon_tensor_descriptor_type(block, shape_type, stride_type, layout)
         return tensor_descriptor_type(block, shape_type, stride_type)
 
-    if name == "constexpr":
-        return constexpr
+    if name.startswith("constexpr"):
+        return constexpr_type(c)
 
     tys = {
         "fp8e4nv": float8e4nv,
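For context on the `str_to_ty` change above: the new second argument threads a concrete value through signature parsing so that `constexpr` entries resolve to a `constexpr_type` carrying that value. A minimal sketch of the new behavior, inferred only from this hunk (this is an internal helper, not a documented API):

    from triton.language import str_to_ty, pointer_type, constexpr_type

    ty = str_to_ty("*fp32", None)    # pointer signatures ignore the value argument
    assert isinstance(ty, pointer_type)

    ty = str_to_ty("constexpr", 16)  # "constexpr" entries now capture the concrete value
    assert ty == constexpr_type(16)  # equality per the new constexpr_type.__eq__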
triton/language/core.py
CHANGED

@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import math
 from warnings import warn
 from contextlib import contextmanager
 from enum import Enum
@@ -9,7 +10,7 @@ from typing import Union, Callable, List, Sequence, TypeVar, Optional, Tuple
 from dataclasses import dataclass
 import builtins
 from .. import knobs
-from ..runtime.jit import jit, JITFunction
+from ..runtime.jit import JITCallable
 import inspect
 
 from .._C.libtriton import ir
@@ -86,7 +87,7 @@ def _tensor_member_fn(fn: T) -> T:
     if is_builtin(fn):
         setattr(wrapper, TRITON_BUILTIN, True)
 
-    setattr(tensor, fn.__name__, fn if isinstance(fn, JITFunction) else wrapper)
+    setattr(tensor, fn.__name__, fn if isinstance(fn, JITCallable) else wrapper)
     return fn
 
 
@@ -152,10 +153,10 @@ class base_value:
 
 class base_type:
 
-    def __eq__(self, other):
+    def __eq__(self, other) -> bool:
         raise NotImplementedError("Types must implement __eq__")
 
-    def __ne__(self, other):
+    def __ne__(self, other) -> bool:
         return not (self == other)
 
     def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[base_value, int]:
@@ -178,10 +179,13 @@ class constexpr_type(base_type):
         self.value = value
 
     def __eq__(self, other):
-        return self.value == other.value
+        return isinstance(other, constexpr_type) and self.value == other.value
 
     def __repr__(self) -> str:
-        return f"constexpr[{self.value}]"
+        return f"constexpr_type[{self.value}]"
+
+    def __hash__(self):
+        return hash(self.value)
 
     def mangle(self) -> str:
         return repr(self)
@@ -199,15 +203,17 @@ class constexpr(base_value):
     """
 
     def __init__(self, value):
-        if isinstance(value, constexpr):
-            self.value = value.value
-        else:
-            self.value = value
+        while isinstance(value, constexpr):
+            value = value.value
+        self.value = value
         self.type = constexpr_type(value)
 
     def __repr__(self) -> str:
         return f"constexpr[{self.value}]"
 
+    def __hash__(self):
+        return hash((self.value, self.type))
+
     def _flatten_ir(self, handles: List[ir.value]) -> None:
         return
 
@@ -334,32 +340,6 @@ class constexpr(base_value):
         return self.value.__getitem__(*args)
 
 
-def constexpr_function(f):
-    """
-    Wraps an arbitrary Python function so that it can be called at
-    compile-time on constexpr arguments in a Triton function and
-    returns a constexpr result.
-    """
-
-    @wraps(f)
-    def wrapper(*args, _semantic=None, **kwargs):
-        # de-constexpr arguments and discard the _semantic keyword argument:
-        args = [_unwrap_if_constexpr(x) for x in args]
-        kwargs = {k: _unwrap_if_constexpr(v) for (k, v) in kwargs.items()}
-
-        # call the raw Python function f:
-        res = f(*args, **kwargs)
-
-        # convert result back to a Triton constexpr:
-        return constexpr(res)
-
-    # disguise the function as a Triton builtin to avoid raising an error
-    # that we're calling a non-JIT function from within a Triton kernel:
-    wrapper.__triton_builtin__ = True
-    wrapper.__module__ = constexpr_function.__module__
-    return wrapper
-
-
 CONSTEXPR_0 = constexpr(0)
 
 
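Two behavioral changes above are worth noting: `constexpr` now flattens nested wrappers in its constructor and gains `__hash__` (as does `constexpr_type`), while the `constexpr_function` decorator is removed outright. A small plain-Python sketch of the new semantics, assuming only what the hunks show:

    import triton.language as tl

    c = tl.constexpr(tl.constexpr(8))
    assert c.value == 8                      # nested constexpr values are unwrapped by the while loop
    assert hash(c) == hash(tl.constexpr(8))  # hashable: hash((value, constexpr_type(value)))
    cache = {c: "specialized"}               # so constexpr values can now key dicts and sets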
@@ -572,7 +552,8 @@ class dtype(base_type):
     def is_const():
         return False
 
-    def __eq__(self, other):
+    def __eq__(self, other) -> bool:
+        other = _unwrap_if_constexpr(other)
         if not isinstance(other, dtype):
             return False
         return self.name == other.name
@@ -696,7 +677,8 @@ class pointer_type(dtype):
     def is_const(self):
         return self.const
 
-    def __eq__(self, other):
+    def __eq__(self, other) -> bool:
+        other = _unwrap_if_constexpr(other)
         if not isinstance(other, pointer_type):
             return False
         return self.element_ty == other.element_ty and self.address_space == other.address_space and self.const == other.const
@@ -753,6 +735,10 @@ class block_type(dtype):
     def scalar(self):
         return self.element_ty
 
+    @property
+    def nbytes(self):
+        return self.numel * (self.element_ty.primitive_bitwidth // 8)
+
     def mangle(self) -> str:
         elt = self.scalar.mangle()
         shape = '_'.join(map(str, self.shape))
@@ -879,10 +865,7 @@ class tensor(base_value):
         self.handle = handle
         # Block shape
         self.shape = type.shape if type.is_block() else ()
-        self.numel = 1
-        for s in self.shape:
-            self.numel *= s
-        self.numel = constexpr(self.numel)
+        self.numel = constexpr(math.prod(self.shape))
         self.type = type  # Tensor type (can be block_type)
         # Following the practice in pytorch, dtype is scalar type
         self.dtype = type.scalar
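The new `block_type.nbytes` property derives the block's byte size from its element count and element bit-width; a sketch of the arithmetic (the `block_type(element_ty, shape)` constructor and its `numel` attribute are assumed from Triton's existing API):

    import triton.language as tl

    ty = tl.block_type(tl.float32, [64, 32])
    assert ty.numel == 64 * 32       # element count, as before
    assert ty.nbytes == 64 * 32 * 4  # numel * (primitive_bitwidth // 8); fp32 is 4 bytes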
@@ -1268,19 +1251,20 @@ class tensor(base_value):
     ...
 
 
-class tuple(base_value):
+def _type_for_tuple_values(values, fields=None):
+    return tuple_type([constexpr_type(x) if isinstance(x, (int, float, dtype)) else x.type for x in values], fields)
 
-    def __init__(self, args: Sequence, type: tuple_type = None):
-        self.values = [i for i in args]
 
-        def get_type(x):
-            if isinstance(x, dtype):
-                return dtype
-            if isinstance(x, (int, float)):
-                return constexpr
-            return x.type
+class tuple(base_value):
 
-        self.type = type or tuple_type([get_type(x) for x in self.values])
+    def __init__(self, args: Sequence, type: Optional[tuple_type] = None):
+        self.values = [i for i in args]
+        if isinstance(type, tuple_type):
+            self.type = type
+        elif type is not None:  # make_template in ASTFunction.deserialize may pass us a list/tuple
+            self.type = tuple_type(type)
+        else:
+            self.type = _type_for_tuple_values(self.values)
 
     def __getitem__(self, idx: constexpr):
         if isinstance(idx, int):
@@ -1295,11 +1279,11 @@ class tuple(base_value):
         return self.values[self.type.fields.index(name)]
 
     # TODO: remove
-    def __setitem__(self, idx, value):
-        if isinstance(idx, int):
-            idx = constexpr(idx)
-        assert isinstance(idx, constexpr)
+    def _setitem(self, idx, value):
+        idx = _unwrap_if_constexpr(idx)
+        assert isinstance(idx, int)
         self.values[idx] = value
+        self.type = _type_for_tuple_values(self.values, self.type.fields)
 
     def __add__(self, other):
         other = _normalize_tuple(other)
@@ -1560,7 +1544,7 @@ def _aggregate(cls):
         def __new__(this_cls, *args, _semantic=None, _generator=None, **kwargs):
             # Call into the user-defined constructor.
             instance = this_cls._get_instance()
-            if isinstance(cls.__init__, JITFunction):
+            if isinstance(cls.__init__, JITCallable):
                 raise ValueError(f"{cls.__name__}.__init__ cannot be a @triton.jit function")
             extra_kwargs = {}
             if "_semantic" in inspect.signature(cls.__init__).parameters:
@@ -1594,7 +1578,7 @@ def _aggregate(cls):
            [(name, getattr(self, name).type) for name in cls.__annotations__.keys()])
 
    for (name, member) in inspect.getmembers(cls):
-        if inspect.isfunction(member) or inspect.ismethod(member) or isinstance(member, JITFunction):
+        if inspect.isfunction(member) or inspect.ismethod(member) or isinstance(member, JITCallable):
            if name != "__init__":
                setattr(aggregate_value, name, member)
 
@@ -1828,11 +1812,6 @@ def join(a, b, _semantic=None):
     return _semantic.join(a, b)
 
 
-@jit
-def _take_first(a, b):
-    return a
-
-
 def _unsplat(x, _semantic=None, _generator=None):
     """
     Convert a single-element tensor to a scalar.
@@ -1843,10 +1822,7 @@ def _unsplat(x, _semantic=None, _generator=None):
     for d in x.shape:
         numel *= d
     assert numel == 1, "can only unsplat single-element tensors"
-
-    x = _semantic.reshape(x, [1])
-    x = typing.cast(tensor, reduce(x, 0, _take_first, _semantic=_semantic, _generator=_generator))
-    return x
+    return _semantic.unsplat(x)
 
 
 @_tensor_member_fn
@@ -2252,6 +2228,7 @@ def make_tensor_descriptor(
     shape: List[tensor],
     strides: List[tensor],
     block_shape: List[constexpr],
+    padding_option="zero",
     _semantic=None,
 ) -> tensor_descriptor:
     """Make a tensor descriptor object
@@ -2301,7 +2278,9 @@ def make_tensor_descriptor(
         inplace_abs[grid](x, M, N, M_BLOCK, N_BLOCK)
 
     """
-    return _semantic.make_tensor_descriptor(base, shape, strides, block_shape)
+
+    padding_option = _unwrap_if_constexpr(padding_option)
+    return _semantic.make_tensor_descriptor(base, shape, strides, block_shape, padding_option)
 
 
 # -----------------------
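`make_tensor_descriptor` gains a `padding_option` keyword (default `"zero"`), unwrapped from constexpr and forwarded to the semantic layer; only the default value is visible in this hunk, so other accepted values are not assumed here. A usage sketch:

    import triton
    import triton.language as tl

    @triton.jit
    def copy_kernel(ptr, M, N, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr):
        desc = tl.make_tensor_descriptor(
            ptr,
            shape=[M, N],
            strides=[N, 1],
            block_shape=[M_BLOCK, N_BLOCK],
            padding_option="zero",  # new keyword; "zero" is the default shown in the hunk
        )
        tile = desc.load([0, 0])
        desc.store([0, 0], tile)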
@@ -2784,6 +2763,79 @@ def gather(src, index, axis, _semantic=None):
     return _semantic.gather(src, index, axis)
 
 
+@builtin
+def map_elementwise(
+    scalar_fn: Callable[..., Tuple[tensor, ...]],
+    *args: tensor,
+    pack=1,
+    _semantic=None,
+    _generator=None,
+):
+    '''
+    Map a scalar function over a tensor.
+
+    The input tensors :code:`args` are implicitly broadcasted to the same shape.
+
+    This may be useful in allowing control flow over single elements in a tensor,
+    for example a multi-branch function where one branch is more expensive. With
+    :code:`tl.where` you are forced to calculate both sides of the branch, but
+    with an if we only execute one side.
+
+    .. highlight:: python
+    .. code-block:: python
+
+        @triton.jit
+        def selu_scalar(x, alpha):
+            if x > 0:
+                return x
+            else:
+                return alpha * (tl.exp(x) - 1)
+
+        @triton.jit
+        def selu(x, alpha):
+            return tl.map_elementwise(selu_scalar, x, alpha)
+
+    :param scalar_fn: the function to map over.
+    :param pack: the number of elements to be processed by one function call.
+    :return: one tensor or a tuple of tensors, depending on the mapped function.
+    '''
+    # Build the block for the nested region first to discover the return types
+    assert pack >= 1
+    in_scalar_tys = [t.type.scalar for t in args]
+    builder = _semantic.builder
+    block = builder.new_block()
+    scalar_args = []
+    for i, ty in enumerate(in_scalar_tys):
+        for j in builtins.range(pack):
+            block.add_argument(ty.to_ir(builder))
+            scalar_args.append(tensor(block.arg(i * pack + j), ty))
+
+    with _insertion_guard(builder):
+        builder.set_insertion_point_to_start(block)
+        scalar_results = _generator.call_JitFunction(scalar_fn, scalar_args, kwargs={})
+
+        is_single = isinstance(scalar_results, tensor)
+        if is_single:
+            scalar_results = scalar_results,
+
+        handles = [r.handle for r in scalar_results]
+        builder.create_map_elementwise_ret(handles)
+
+    fn_result_types = [x.type for x in scalar_results]
+    scalar_result_types = fn_result_types
+    if pack > 1:
+        scalar_result_types = fn_result_types[::pack]
+        for offset in builtins.range(1, pack):
+            assert scalar_result_types == fn_result_types[offset::pack], "type mismatch in unpacked results"
+
+    def make_elementwise_region(elementwise_op):
+        region = elementwise_op.get_region(0)
+        region.push_back(block)
+
+    result = _semantic.map_elementwise(args, scalar_result_types, pack, make_elementwise_region)
+    return result[0] if is_single else result
+
+
 # -----------------------
 # Compiler Hint Ops
 # -----------------------
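Beyond the docstring's `selu` example, the `pack` parameter (validated above via the `fn_result_types[::pack]` check) lets one invocation of the scalar function consume and produce `pack` adjacent elements per tensor. A hedged sketch inferred from that packing logic:

    import triton
    import triton.language as tl

    @triton.jit
    def mul2_scalar(x0, x1, y0, y1):
        # with pack=2, each call receives two adjacent elements per input tensor
        # and returns two packed results that form one logical output
        return x0 * y0, x1 * y1

    @triton.jit
    def mul(x, y):
        return tl.map_elementwise(mul2_scalar, x, y, pack=2)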
@@ -2941,7 +2993,7 @@ def device_print(prefix, *args, hex=False, _semantic=None):
 
 
 @builtin
-def device_assert(cond, msg="", _semantic=None):
+def device_assert(cond, msg="", mask=None, _semantic=None):
     '''
     Assert the condition at runtime from the device. Requires that the environment variable :code:`TRITON_DEBUG`
     is set to a value besides :code:`0` in order for this to have any effect.
@@ -2960,7 +3012,10 @@ def device_assert(cond, msg="", mask=None, _semantic=None):
     :param msg: the message to print if the assertion fails. This is required to be a string literal.
     '''
     msg = _unwrap_if_constexpr(msg)
-    return _semantic.device_assert(_semantic.to_tensor(cond), msg)
+    mask = _unwrap_if_constexpr(mask)
+    if mask is not None:
+        mask = _semantic.to_tensor(mask)
+    return _semantic.device_assert(_semantic.to_tensor(cond), msg, mask)
 
 
 @builtin
@@ -3098,7 +3153,7 @@ def inline_asm_elementwise(asm: str, constraints: str, args: Sequence, dtype: Un
 # -----------------------
 
 
-class static_range:
+class static_range(base_value):
     """
     Iterator that counts upward forever.
 
@@ -3154,7 +3209,7 @@ class async_task:
         self.builder.unset_async_task_ids()
 
 
-class range:
+class range(base_value):
     """
     Iterator that counts upward forever.
 
@@ -3189,6 +3244,9 @@ class range:
         The compiler will attempt to partition memory, MMA, and vector
         operations in the loop into separate async partitions. This will
         increase the total number of warps required by the kernel.
+    :param disable_licm: Tells the compiler it shouldn't hoist loop-invariant
+        code outside the loop. This is often useful to avoid creating long live ranges
+        within a loop.
 
     Note that warp specialization is only supported on Blackwell GPUs and
     only works on simple matmul loops. Support for arbitrary loops will be
@@ -3196,7 +3254,7 @@ class range:
     """
 
     def __init__(self, arg1, arg2=None, step=None, num_stages=None, loop_unroll_factor=None,
-                 disallow_acc_multi_buffer=False, flatten=False, warp_specialize=False):
+                 disallow_acc_multi_buffer=False, flatten=False, warp_specialize=False, disable_licm=False):
         if step is None:
             self.step = constexpr(1)
         else:
@@ -3212,6 +3270,7 @@ class range:
         self.disallow_acc_multi_buffer = disallow_acc_multi_buffer
         self.flatten = flatten
         self.warp_specialize = warp_specialize
+        self.disable_licm = disable_licm
 
     def __iter__(self):
         raise RuntimeError("tl.range can only be used in @triton.jit'd functions")
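A sketch combining the two knobs introduced above: the optional `mask` restricts which lanes `device_assert` checks (it still only fires when `TRITON_DEBUG` is enabled, per the docstring), and `disable_licm` is recorded on the loop object as a compiler attribute:

    import triton
    import triton.language as tl

    @triton.jit
    def kernel(ptr, n, BLOCK: tl.constexpr):
        offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
        mask = offs < n
        x = tl.load(ptr + offs, mask=mask)
        tl.device_assert(x >= 0, "negative input", mask=mask)  # only checks in-bounds lanes
        acc = tl.zeros([BLOCK], dtype=tl.float32)
        for _ in tl.range(0, 8, disable_licm=True):  # ask the compiler not to hoist loop-invariant code
            acc += x
        tl.store(ptr + offs, acc, mask=mask)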
@@ -3220,13 +3279,36 @@ class range:
         raise RuntimeError("tl.range can only be used in @triton.jit'd functions")
 
 
+class condition(base_value):
+    """
+    While loop condition wrapper.
+
+    .. highlight:: python
+    .. code-block:: python
+
+        @triton.jit
+        def kernel(...):
+            while tl.condition(c, disable_licm):
+                ...
+    :note: This is a special wrapper used to annotate while loops in the context of
+        :code:`triton.jit` functions. It allows the user to pass extra attributes to the compiler.
+    :param disable_licm: Tells the compiler it shouldn't hoist loop-invariant
+        code outside the loop. This is often useful to avoid creating long live ranges
+        within a loop.
+    """
+
+    def __init__(self, arg1, disable_licm=False):
+        self.condition = arg1
+        self.disable_licm = disable_licm
+
+
 # -----------------------
 # Extern functions
 # -----------------------
 
 
-def dispatch(func, lib_name: str, lib_path: str, args: list, arg_type_symbol_dict: dict, ret_shape: tuple,
-             is_pure: bool, _semantic):
+def dispatch(func, lib_name: str, lib_path: str, args: list, arg_type_symbol_dict: dict, ret_type: dtype, is_pure: bool,
+             _semantic):
     '''
     Dispatch a function to a library
     :param func: the function to dispatch
@@ -3234,7 +3316,7 @@ def dispatch(func, lib_name: str, lib_path: str, args: list, arg_type_symbol_dic
     :param lib_path: the path of the library
     :param args: the arguments of the function
     :param arg_type_symbol_dict: the type of the arguments
-    :param ret_shape: the shape of the return value
+    :param ret_type: the type of the return value
     :return: the return value of the function
     '''
     if len(arg_type_symbol_dict) == 0:
@@ -3261,9 +3343,6 @@ def dispatch(func, lib_name: str, lib_path: str, args: list, arg_type_symbol_dic
                          f"Expect one of {arg_type_symbol_dict.keys()}, got {arg_types}")
     else:
         symbol = arg_type_symbol_dict[arg_types][0]
-        ret_type = arg_type_symbol_dict[arg_types][1]
-        if ret_shape:
-            ret_type = block_type(ret_type, ret_shape)
     builder = _semantic.builder
     return tensor(func(lib_name, lib_path, symbol, arg_list, ret_type.to_ir(builder), is_pure), ret_type)
 
@@ -3282,15 +3361,16 @@ def extern_elementwise(lib_name: str, lib_path: str, args: list, arg_type_symbol
     '''
     dispatch_args = args.copy()
     all_scalar = True
-    ret_shape = None
     arg_types = []
     for i in builtins.range(len(dispatch_args)):
         dispatch_args[i] = _semantic.to_tensor(dispatch_args[i])
         arg_types.append(dispatch_args[i].dtype)
         if dispatch_args[i].type.is_block():
             all_scalar = False
+
+    arg_types = tuple(arg_types)
+    ret_type = arg_type_symbol_dict[arg_types][1]
     if len(arg_types) > 0:
-        arg_types = tuple(arg_types)
         arithmetic_check = True
         # If there's a type tuple that is not supported by the library, we will do arithmetic check
         if arg_types in arg_type_symbol_dict:
@@ -3305,9 +3385,9 @@ def extern_elementwise(lib_name: str, lib_path: str, args: list, arg_type_symbol
             dispatch_args[i], _ = _semantic.binary_op_type_checking_impl(dispatch_args[i], broadcast_arg,
                                                                          arithmetic_check=arithmetic_check)
         if not all_scalar:
-            ret_shape = broadcast_arg.shape
+            ret_type = broadcast_arg.type.with_element_ty(ret_type)
     func = _semantic.builder.create_extern_elementwise
-    return dispatch(func, lib_name, lib_path, dispatch_args, arg_type_symbol_dict, ret_shape, is_pure, _semantic)
+    return dispatch(func, lib_name, lib_path, dispatch_args, arg_type_symbol_dict, ret_type, is_pure, _semantic)
 
 
 def binary_op_type_legalization(lhs, rhs, semantic):
triton/language/extra/cuda/gdc.py
CHANGED

@@ -10,22 +10,22 @@ from triton.language import core
 
 
 @core.extern
-def gdc_wait(_builder=None):
+def gdc_wait(_semantic=None):
     """
     GDC wait is a blocking instruction that waits for all instructions in a prior kernel to complete before continuing.
     This ensures all memory operations happening before the wait is visible to instructions after it,
     e.g. if the prior kernel writes to address "x" the new values will be visible in this kernel after the wait.
 
-    This instruction is also safe to execute when
+    This instruction is also safe to execute when programmatic dependent launch is disabled.
 
     See https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol for more details.
     """
     core.inline_asm_elementwise("griddepcontrol.wait; // dummy $0", "=r", [], dtype=core.int32, is_pure=False, pack=1,
-                                _builder=_builder)
+                                _semantic=_semantic)
 
 
 @core.extern
-def gdc_launch_dependents(_builder=None):
+def gdc_launch_dependents(_semantic=None):
     """
     This operation when launched with programmatic dependent launch signals that
     the next program may launch once all programs in the current kernel
@@ -34,9 +34,9 @@ def gdc_launch_dependents(_builder=None):
     Repeated calls to this function have no effect past the first call, and the first call should be
     treated by the programmer as a hint to the runtime system to launch the next kernel.
 
-    This instruction is also safe to execute when
+    This instruction is also safe to execute when programmatic dependent launch is disabled.
 
     See https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol for more details.
     """
     core.inline_asm_elementwise("griddepcontrol.launch_dependents; // dummy $0", "=r", [], dtype=core.int32,
-                                is_pure=False, pack=1, _builder=_builder)
+                                is_pure=False, pack=1, _semantic=_semantic)
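The two helpers now take `_semantic` instead of the old `_builder` argument, matching the rest of the extern API; kernel-side call sites are unchanged. A hedged sketch of typical use with programmatic dependent launch (the host-side launch flag is assumed and not part of this diff):

    import triton
    import triton.language as tl
    from triton.language.extra.cuda.gdc import gdc_wait, gdc_launch_dependents

    @triton.jit
    def consumer(ptr, BLOCK: tl.constexpr):
        gdc_wait()  # wait until the prior kernel's writes are visible
        offs = tl.arange(0, BLOCK)
        x = tl.load(ptr + offs)
        tl.store(ptr + offs, x * 2.0)
        gdc_launch_dependents()  # hint that the dependent kernel may launch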
triton/language/extra/hip/libdevice.py
CHANGED

@@ -73,6 +73,13 @@ def fast_expf(arg0, _semantic=None):
     }, is_pure=True, _semantic=_semantic)
 
 
+@core.extern
+def fast_tanhf(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__triton_hip_fast_tanhf", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+
+
 @core.extern
 def fast_dividef(arg0, arg1, _semantic=None):
     return core.extern_elementwise("", "", [arg0, arg1], {
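A short usage sketch for the new HIP `fast_tanhf`, which dispatches fp32 inputs to the `__triton_hip_fast_tanhf` symbol per the table above:

    import triton
    import triton.language as tl
    from triton.language.extra.hip import libdevice

    @triton.jit
    def tanh_kernel(x_ptr, y_ptr, BLOCK: tl.constexpr):
        offs = tl.arange(0, BLOCK)
        x = tl.load(x_ptr + offs)  # fp32 input, per the dtype table
        tl.store(y_ptr + offs, libdevice.fast_tanhf(x))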
triton/language/extra/hip/utils.py
ADDED

@@ -0,0 +1,35 @@
+from triton.language import core
+
+
+@core.extern
+def memrealtime(_semantic=None):
+    """
+    Returns a 64-bit real time-counter value
+    """
+    target_arch = _semantic.builder.options.arch
+    if 'gfx11' in target_arch or 'gfx12' in target_arch:
+        return core.inline_asm_elementwise(
+            """
+            s_sendmsg_rtn_b64 $0, sendmsg(MSG_RTN_GET_REALTIME)
+            s_waitcnt lgkmcnt(0)
+            """,
+            "=r",
+            [],
+            dtype=core.int64,
+            is_pure=False,
+            pack=1,
+            _semantic=_semantic,
+        )
+    else:
+        return core.inline_asm_elementwise(
+            """
+            s_memrealtime $0
+            s_waitcnt vmcnt(0)
+            """,
+            "=r",
+            [],
+            dtype=core.int64,
+            is_pure=False,
+            pack=1,
+            _semantic=_semantic,
+        )
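`memrealtime` selects its instruction sequence by target architecture (`s_sendmsg_rtn_b64` on gfx11/gfx12, `s_memrealtime` elsewhere) and returns an int64 counter. A hedged timing sketch:

    import triton
    import triton.language as tl
    from triton.language.extra.hip.utils import memrealtime

    @triton.jit
    def timed_kernel(ptr, out_ptr, BLOCK: tl.constexpr):
        start = memrealtime()  # 64-bit real-time counter, not a cycle counter
        offs = tl.arange(0, BLOCK)
        x = tl.load(ptr + offs)
        tl.store(ptr + offs, x + 1.0)
        tl.store(out_ptr, memrealtime() - start)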