triton-windows 3.5.1.post21__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (217)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +82 -0
  3. triton/_filecheck.py +97 -0
  4. triton/_internal_testing.py +255 -0
  5. triton/_utils.py +126 -0
  6. triton/backends/__init__.py +47 -0
  7. triton/backends/amd/__init__.py +0 -0
  8. triton/backends/amd/compiler.py +461 -0
  9. triton/backends/amd/driver.c +283 -0
  10. triton/backends/amd/driver.py +724 -0
  11. triton/backends/amd/lib/asanrtl.bc +0 -0
  12. triton/backends/amd/lib/ockl.bc +0 -0
  13. triton/backends/amd/lib/ocml.bc +0 -0
  14. triton/backends/compiler.py +90 -0
  15. triton/backends/driver.py +66 -0
  16. triton/backends/nvidia/__init__.py +0 -0
  17. triton/backends/nvidia/bin/ptxas.exe +0 -0
  18. triton/backends/nvidia/compiler.py +533 -0
  19. triton/backends/nvidia/driver.c +517 -0
  20. triton/backends/nvidia/driver.py +799 -0
  21. triton/backends/nvidia/include/cuda.h +26280 -0
  22. triton/backends/nvidia/lib/libdevice.10.bc +0 -0
  23. triton/backends/nvidia/lib/x64/cuda.lib +0 -0
  24. triton/compiler/__init__.py +7 -0
  25. triton/compiler/code_generator.py +1614 -0
  26. triton/compiler/compiler.py +509 -0
  27. triton/compiler/errors.py +51 -0
  28. triton/compiler/make_launcher.py +0 -0
  29. triton/errors.py +5 -0
  30. triton/experimental/__init__.py +0 -0
  31. triton/experimental/gluon/__init__.py +5 -0
  32. triton/experimental/gluon/_compiler.py +0 -0
  33. triton/experimental/gluon/_runtime.py +102 -0
  34. triton/experimental/gluon/language/__init__.py +119 -0
  35. triton/experimental/gluon/language/_core.py +490 -0
  36. triton/experimental/gluon/language/_layouts.py +583 -0
  37. triton/experimental/gluon/language/_math.py +20 -0
  38. triton/experimental/gluon/language/_semantic.py +380 -0
  39. triton/experimental/gluon/language/_standard.py +80 -0
  40. triton/experimental/gluon/language/amd/__init__.py +4 -0
  41. triton/experimental/gluon/language/amd/_layouts.py +96 -0
  42. triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
  43. triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
  44. triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
  45. triton/experimental/gluon/language/extra/__init__.py +3 -0
  46. triton/experimental/gluon/language/nvidia/__init__.py +4 -0
  47. triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
  48. triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
  49. triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
  50. triton/experimental/gluon/language/nvidia/blackwell/__init__.py +387 -0
  51. triton/experimental/gluon/language/nvidia/blackwell/tma.py +52 -0
  52. triton/experimental/gluon/language/nvidia/hopper/__init__.py +132 -0
  53. triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +34 -0
  54. triton/experimental/gluon/language/nvidia/hopper/tma.py +97 -0
  55. triton/experimental/gluon/nvidia/__init__.py +4 -0
  56. triton/experimental/gluon/nvidia/blackwell.py +3 -0
  57. triton/experimental/gluon/nvidia/hopper.py +45 -0
  58. triton/knobs.py +546 -0
  59. triton/language/__init__.py +342 -0
  60. triton/language/core.py +3405 -0
  61. triton/language/extra/__init__.py +26 -0
  62. triton/language/extra/cuda/__init__.py +16 -0
  63. triton/language/extra/cuda/gdc.py +42 -0
  64. triton/language/extra/cuda/libdevice.py +1629 -0
  65. triton/language/extra/cuda/utils.py +109 -0
  66. triton/language/extra/hip/__init__.py +5 -0
  67. triton/language/extra/hip/libdevice.py +491 -0
  68. triton/language/extra/hip/utils.py +35 -0
  69. triton/language/extra/libdevice.py +790 -0
  70. triton/language/math.py +249 -0
  71. triton/language/random.py +218 -0
  72. triton/language/semantic.py +1939 -0
  73. triton/language/standard.py +534 -0
  74. triton/language/target_info.py +54 -0
  75. triton/runtime/__init__.py +23 -0
  76. triton/runtime/_allocation.py +44 -0
  77. triton/runtime/_async_compile.py +55 -0
  78. triton/runtime/autotuner.py +476 -0
  79. triton/runtime/build.py +168 -0
  80. triton/runtime/cache.py +317 -0
  81. triton/runtime/driver.py +38 -0
  82. triton/runtime/errors.py +36 -0
  83. triton/runtime/interpreter.py +1414 -0
  84. triton/runtime/jit.py +1107 -0
  85. triton/runtime/tcc/include/_mingw.h +168 -0
  86. triton/runtime/tcc/include/assert.h +62 -0
  87. triton/runtime/tcc/include/conio.h +409 -0
  88. triton/runtime/tcc/include/ctype.h +281 -0
  89. triton/runtime/tcc/include/dir.h +31 -0
  90. triton/runtime/tcc/include/direct.h +68 -0
  91. triton/runtime/tcc/include/dirent.h +135 -0
  92. triton/runtime/tcc/include/dos.h +55 -0
  93. triton/runtime/tcc/include/errno.h +75 -0
  94. triton/runtime/tcc/include/excpt.h +123 -0
  95. triton/runtime/tcc/include/fcntl.h +52 -0
  96. triton/runtime/tcc/include/fenv.h +108 -0
  97. triton/runtime/tcc/include/float.h +75 -0
  98. triton/runtime/tcc/include/inttypes.h +297 -0
  99. triton/runtime/tcc/include/io.h +418 -0
  100. triton/runtime/tcc/include/iso646.h +36 -0
  101. triton/runtime/tcc/include/limits.h +116 -0
  102. triton/runtime/tcc/include/locale.h +91 -0
  103. triton/runtime/tcc/include/malloc.h +181 -0
  104. triton/runtime/tcc/include/math.h +497 -0
  105. triton/runtime/tcc/include/mem.h +13 -0
  106. triton/runtime/tcc/include/memory.h +40 -0
  107. triton/runtime/tcc/include/process.h +176 -0
  108. triton/runtime/tcc/include/sec_api/conio_s.h +42 -0
  109. triton/runtime/tcc/include/sec_api/crtdbg_s.h +19 -0
  110. triton/runtime/tcc/include/sec_api/io_s.h +33 -0
  111. triton/runtime/tcc/include/sec_api/mbstring_s.h +52 -0
  112. triton/runtime/tcc/include/sec_api/search_s.h +25 -0
  113. triton/runtime/tcc/include/sec_api/stdio_s.h +145 -0
  114. triton/runtime/tcc/include/sec_api/stdlib_s.h +67 -0
  115. triton/runtime/tcc/include/sec_api/stralign_s.h +30 -0
  116. triton/runtime/tcc/include/sec_api/string_s.h +41 -0
  117. triton/runtime/tcc/include/sec_api/sys/timeb_s.h +34 -0
  118. triton/runtime/tcc/include/sec_api/tchar_s.h +266 -0
  119. triton/runtime/tcc/include/sec_api/time_s.h +61 -0
  120. triton/runtime/tcc/include/sec_api/wchar_s.h +128 -0
  121. triton/runtime/tcc/include/setjmp.h +160 -0
  122. triton/runtime/tcc/include/share.h +28 -0
  123. triton/runtime/tcc/include/signal.h +63 -0
  124. triton/runtime/tcc/include/stdalign.h +16 -0
  125. triton/runtime/tcc/include/stdarg.h +14 -0
  126. triton/runtime/tcc/include/stdatomic.h +171 -0
  127. triton/runtime/tcc/include/stdbool.h +11 -0
  128. triton/runtime/tcc/include/stddef.h +42 -0
  129. triton/runtime/tcc/include/stdint.h +212 -0
  130. triton/runtime/tcc/include/stdio.h +429 -0
  131. triton/runtime/tcc/include/stdlib.h +591 -0
  132. triton/runtime/tcc/include/stdnoreturn.h +7 -0
  133. triton/runtime/tcc/include/string.h +164 -0
  134. triton/runtime/tcc/include/sys/fcntl.h +13 -0
  135. triton/runtime/tcc/include/sys/file.h +14 -0
  136. triton/runtime/tcc/include/sys/locking.h +30 -0
  137. triton/runtime/tcc/include/sys/stat.h +290 -0
  138. triton/runtime/tcc/include/sys/time.h +69 -0
  139. triton/runtime/tcc/include/sys/timeb.h +133 -0
  140. triton/runtime/tcc/include/sys/types.h +123 -0
  141. triton/runtime/tcc/include/sys/unistd.h +14 -0
  142. triton/runtime/tcc/include/sys/utime.h +146 -0
  143. triton/runtime/tcc/include/tcc/tcc_libm.h +618 -0
  144. triton/runtime/tcc/include/tccdefs.h +342 -0
  145. triton/runtime/tcc/include/tcclib.h +80 -0
  146. triton/runtime/tcc/include/tchar.h +1102 -0
  147. triton/runtime/tcc/include/tgmath.h +89 -0
  148. triton/runtime/tcc/include/time.h +287 -0
  149. triton/runtime/tcc/include/uchar.h +33 -0
  150. triton/runtime/tcc/include/unistd.h +1 -0
  151. triton/runtime/tcc/include/vadefs.h +11 -0
  152. triton/runtime/tcc/include/values.h +4 -0
  153. triton/runtime/tcc/include/varargs.h +12 -0
  154. triton/runtime/tcc/include/wchar.h +873 -0
  155. triton/runtime/tcc/include/wctype.h +172 -0
  156. triton/runtime/tcc/include/winapi/basetsd.h +149 -0
  157. triton/runtime/tcc/include/winapi/basetyps.h +85 -0
  158. triton/runtime/tcc/include/winapi/guiddef.h +156 -0
  159. triton/runtime/tcc/include/winapi/poppack.h +8 -0
  160. triton/runtime/tcc/include/winapi/pshpack1.h +8 -0
  161. triton/runtime/tcc/include/winapi/pshpack2.h +8 -0
  162. triton/runtime/tcc/include/winapi/pshpack4.h +8 -0
  163. triton/runtime/tcc/include/winapi/pshpack8.h +8 -0
  164. triton/runtime/tcc/include/winapi/qos.h +72 -0
  165. triton/runtime/tcc/include/winapi/shellapi.h +59 -0
  166. triton/runtime/tcc/include/winapi/winbase.h +2958 -0
  167. triton/runtime/tcc/include/winapi/wincon.h +309 -0
  168. triton/runtime/tcc/include/winapi/windef.h +293 -0
  169. triton/runtime/tcc/include/winapi/windows.h +127 -0
  170. triton/runtime/tcc/include/winapi/winerror.h +3166 -0
  171. triton/runtime/tcc/include/winapi/wingdi.h +4080 -0
  172. triton/runtime/tcc/include/winapi/winnls.h +778 -0
  173. triton/runtime/tcc/include/winapi/winnt.h +5837 -0
  174. triton/runtime/tcc/include/winapi/winreg.h +272 -0
  175. triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
  176. triton/runtime/tcc/include/winapi/winuser.h +5651 -0
  177. triton/runtime/tcc/include/winapi/winver.h +160 -0
  178. triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
  179. triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
  180. triton/runtime/tcc/lib/cuda.def +697 -0
  181. triton/runtime/tcc/lib/gdi32.def +337 -0
  182. triton/runtime/tcc/lib/kernel32.def +770 -0
  183. triton/runtime/tcc/lib/libtcc1.a +0 -0
  184. triton/runtime/tcc/lib/msvcrt.def +1399 -0
  185. triton/runtime/tcc/lib/python3.def +810 -0
  186. triton/runtime/tcc/lib/python310.def +1610 -0
  187. triton/runtime/tcc/lib/python311.def +1633 -0
  188. triton/runtime/tcc/lib/python312.def +1703 -0
  189. triton/runtime/tcc/lib/python313.def +1651 -0
  190. triton/runtime/tcc/lib/python313t.def +1656 -0
  191. triton/runtime/tcc/lib/python314.def +1800 -0
  192. triton/runtime/tcc/lib/python314t.def +1809 -0
  193. triton/runtime/tcc/lib/python39.def +1644 -0
  194. triton/runtime/tcc/lib/python3t.def +905 -0
  195. triton/runtime/tcc/lib/user32.def +658 -0
  196. triton/runtime/tcc/libtcc.dll +0 -0
  197. triton/runtime/tcc/tcc.exe +0 -0
  198. triton/testing.py +543 -0
  199. triton/tools/__init__.py +0 -0
  200. triton/tools/build_extern.py +365 -0
  201. triton/tools/compile.py +210 -0
  202. triton/tools/disasm.py +143 -0
  203. triton/tools/extra/cuda/compile.c +70 -0
  204. triton/tools/extra/cuda/compile.h +14 -0
  205. triton/tools/extra/hip/compile.cpp +66 -0
  206. triton/tools/extra/hip/compile.h +13 -0
  207. triton/tools/link.py +322 -0
  208. triton/tools/mxfp.py +301 -0
  209. triton/tools/ragged_tma.py +92 -0
  210. triton/tools/tensor_descriptor.py +34 -0
  211. triton/windows_utils.py +405 -0
  212. triton_windows-3.5.1.post21.dist-info/METADATA +46 -0
  213. triton_windows-3.5.1.post21.dist-info/RECORD +217 -0
  214. triton_windows-3.5.1.post21.dist-info/WHEEL +5 -0
  215. triton_windows-3.5.1.post21.dist-info/entry_points.txt +3 -0
  216. triton_windows-3.5.1.post21.dist-info/licenses/LICENSE +23 -0
  217. triton_windows-3.5.1.post21.dist-info/top_level.txt +1 -0
triton/language/standard.py
@@ -0,0 +1,534 @@
+from __future__ import annotations
+
+from ..runtime.jit import jit, constexpr_function
+from . import core
+from . import math
+
+# constexpr utilities
+
+
+@constexpr_function
+def _log2(i):
+    log2 = 0
+    n = i
+    while n > 1:
+        n >>= 1
+        log2 += 1
+    return log2
+
+
+@constexpr_function
+def _is_power_of_two(i):
+    return (i & (i - 1)) == 0 and i != 0
+
+
+# -----------------------
+# Standard library
+# -----------------------
+
+
+@core._tensor_member_fn
+@jit
+def cdiv(x, div):
+    """
+    Computes the ceiling division of :code:`x` by :code:`div`
+
+    :param x: the input number
+    :type x: Block
+    :param div: the divisor
+    :type div: Block
+    """
+    return (x + div - 1) // div
+
+
+@core._tensor_member_fn
+@jit
+@math._add_math_1arg_docstr("sigmoid")
+def sigmoid(x):
+    return 1 / (1 + math.exp(-x))
+
+
+@core._tensor_member_fn
+@jit
+@math._add_math_1arg_docstr("softmax")
+def softmax(x, dim=None, keep_dims=False, ieee_rounding=False):
+    if dim is None:
+        _dim: core.constexpr = 0
+    else:
+        _dim: core.constexpr = dim
+    z = x - max(x, _dim, keep_dims=keep_dims)
+    num = math.exp(z)
+    den = sum(num, _dim, keep_dims=keep_dims)
+    return math.fdiv(num, den, ieee_rounding)
+
+
+@core._tensor_member_fn
+@jit
+def ravel(x, can_reorder=False):
+    """
+    Returns a contiguous flattened view of :code:`x`.
+
+    :param x: the input tensor
+    :type x: Block
+    """
+    return core.reshape(x, [x.numel], can_reorder=can_reorder)
+
+
+@jit
+def swizzle2d(i, j, size_i, size_j, size_g):
+    """
+    Transforms the indices of a row-major `size_i * size_j` matrix into
+    the indices of a column-major matrix for each group of `size_g` rows.
+
+    For example, for :code:`size_i = size_j = 4` and :code:`size_g = 2`, it will
+    transform ::
+
+        [[0 , 1 , 2 , 3 ],
+         [4 , 5 , 6 , 7 ],
+         [8 , 9 , 10, 11],
+         [12, 13, 14, 15]]
+
+    into ::
+
+        [[0, 2, 4 , 6 ],
+         [1, 3, 5 , 7 ],
+         [8, 10, 12, 14],
+         [9, 11, 13, 15]]
+    """
+    # "unrolled index in array"
+    ij = i * size_j + j
+    # number of elements in `size_g` groups
+    # of `size_j` columns
+    size_gj = size_g * size_j
+    # index of the group in which (i,j) is
+    group_id = ij // size_gj
+    # row-index of the first element of this group
+    off_i = group_id * size_g
+    # last group may have fewer rows
+    size_g = core.minimum(size_i - off_i, size_g)
+    # linear index with respect to the first element in this group
+    ij = ij % size_gj
+    # new row and column indices
+    new_i = off_i + ij % size_g
+    new_j = ij // size_g
+    return new_i, new_j
+
+
+@jit
+def zeros(shape, dtype):
+    """
+    Returns a tensor filled with the scalar value 0 for the given :code:`shape` and :code:`dtype`.
+
+    :param shape: Shape of the new array, e.g., (8, 16) or (8, )
+    :type shape: tuple of ints
+    :param dtype: Data-type of the new array, e.g., :code:`tl.float16`
+    :type dtype: DType
+    """
+    return core.full(shape, 0, dtype)
+
+
+@jit
+def zeros_like(input):
+    """
+    Returns a tensor of zeros with the same shape and type as a given tensor.
+
+    :param input: input tensor
+    :type input: Tensor
+    """
+    return zeros(input.shape, input.dtype)
+
+
+# max and argmax
+
+
+@jit
+def _argmax_combine(value1, index1, value2, index2, tie_break_left):
+    if tie_break_left:
+        tie = value1 == value2 and index1 < index2
+    else:
+        tie = False
+    gt = value1 > value2 or tie
+    v_ret = core.where(gt, value1, value2)
+    i_ret = core.where(gt, index1, index2)
+    return v_ret, i_ret
+
+
+@jit
+def _argmax_combine_tie_break_left(value1, index1, value2, index2):
+    return _argmax_combine(value1, index1, value2, index2, True)
+
+
+@jit
+def _argmax_combine_tie_break_fast(value1, index1, value2, index2):
+    return _argmax_combine(value1, index1, value2, index2, False)
+
+
+@jit
+def _elementwise_max(a, b):
+    return core.maximum(a, b)
+
+
+@core._tensor_member_fn
+@jit
+@core._add_reduction_docstr("maximum", return_indices_arg="return_indices",
+                            tie_break_arg="return_indices_tie_break_left")
+def max(input, axis=None, return_indices=False, return_indices_tie_break_left=True, keep_dims=False):
+    input = core._promote_bfloat16_to_float32(input)
+    if return_indices:
+        if return_indices_tie_break_left:
+            return core._reduce_with_indices(input, axis, _argmax_combine_tie_break_left, keep_dims=keep_dims)
+        else:
+            return core._reduce_with_indices(input, axis, _argmax_combine_tie_break_fast, keep_dims=keep_dims)
+    else:
+        if core.constexpr(input.dtype.primitive_bitwidth) < core.constexpr(32):
+            if core.constexpr(input.dtype.is_floating()):
+                input = input.to(core.float32)
+            else:
+                assert input.dtype.is_int(), "Expecting input to be integer type"
+                input = input.to(core.int32)
+        return core.reduce(input, axis, _elementwise_max, keep_dims=keep_dims)
+
+
+@core._tensor_member_fn
+@jit
+@core._add_reduction_docstr("maximum index", tie_break_arg="tie_break_left")
+def argmax(input, axis, tie_break_left=True, keep_dims=False):
+    (_, ret) = max(input, axis, return_indices=True, return_indices_tie_break_left=tie_break_left, keep_dims=keep_dims)
+    return ret
+
+
+# min and argmin
+
+
+@jit
+def _argmin_combine(value1, index1, value2, index2, tie_break_left):
+    if tie_break_left:
+        tie = value1 == value2 and index1 < index2
+    else:
+        tie = False
+    lt = value1 < value2 or tie
+    value_ret = core.where(lt, value1, value2)
+    index_ret = core.where(lt, index1, index2)
+    return value_ret, index_ret
+
+
+@jit
+def _argmin_combine_tie_break_left(value1, index1, value2, index2):
+    return _argmin_combine(value1, index1, value2, index2, True)
+
+
+@jit
+def _argmin_combine_tie_break_fast(value1, index1, value2, index2):
+    return _argmin_combine(value1, index1, value2, index2, False)
+
+
+@jit
+def _elementwise_min(a, b):
+    return core.minimum(a, b)
+
+
+@core._tensor_member_fn
+@jit
+@core._add_reduction_docstr("minimum", return_indices_arg="return_indices",
+                            tie_break_arg="return_indices_tie_break_left")
+def min(input, axis=None, return_indices=False, return_indices_tie_break_left=True, keep_dims=False):
+    input = core._promote_bfloat16_to_float32(input)
+    if return_indices:
+        if return_indices_tie_break_left:
+            return core._reduce_with_indices(input, axis, _argmin_combine_tie_break_left, keep_dims=keep_dims)
+        else:
+            return core._reduce_with_indices(input, axis, _argmin_combine_tie_break_fast, keep_dims=keep_dims)
+    else:
+        if core.constexpr(input.dtype.primitive_bitwidth) < 32:
+            if core.constexpr(input.dtype.is_floating()):
+                input = input.to(core.float32)
+            else:
+                assert input.dtype.is_int(), "Expecting input to be integer type"
+                input = input.to(core.int32)
+        return core.reduce(input, axis, _elementwise_min, keep_dims=keep_dims)
+
+
+@core._tensor_member_fn
+@jit
+@core._add_reduction_docstr("minimum index", tie_break_arg="tie_break_left")
+def argmin(input, axis, tie_break_left=True, keep_dims=False):
+    _, ret = min(input, axis, return_indices=True, return_indices_tie_break_left=tie_break_left, keep_dims=keep_dims)
+    return ret
+
+
+@jit
+def _sum_combine(a, b):
+    return a + b
+
+
+# sum
+
+
+@constexpr_function
+def _pick_sum_dtype(in_dtype, dtype):
+    if dtype is not None:
+        return dtype
+
+    # For integer bitwidths less than 32, pick int32 with the same sign to
+    # avoid overflow.
+    out_dtype = None
+    if in_dtype.is_int_signed():
+        out_dtype = core.int32 if in_dtype.int_bitwidth < 32 else None
+    elif in_dtype.is_int_unsigned():
+        out_dtype = core.uint32 if in_dtype.int_bitwidth < 32 else None
+    return out_dtype
+
+
+@core._tensor_member_fn
+@jit
+@core._add_reduction_docstr("sum", dtype_arg="dtype")
+def sum(input, axis=None, keep_dims=False, dtype: core.constexpr = None):
+    # Pick a default dtype for the reduction if one was not specified.
+    out_dtype: core.constexpr = _pick_sum_dtype(input.dtype, dtype)
+
+    if out_dtype is not None:
+        input = input.to(out_dtype)
+    return core.reduce(input, axis, _sum_combine, keep_dims=keep_dims)
+
+
+@jit
+def _xor_combine(a, b):
+    return a ^ b
+
+
+# xor sum
+
+
+@core._tensor_member_fn
+@jit
+@core._add_reduction_docstr("xor sum")
+def xor_sum(input, axis=None, keep_dims=False):
+    core.static_assert(input.type.scalar.is_int(), "xor_sum only supported for integers")
+    return core.reduce(input, axis, _xor_combine, keep_dims=keep_dims)
+
+
+# or reduction
+
+
+@jit
+def _or_combine(x, y):
+    return x | y
+
+
+@core._tensor_member_fn
+@jit
+@core._add_reduction_docstr("reduce_or")
+def reduce_or(input, axis, keep_dims=False):
+    core.static_assert(input.type.scalar.is_int(), "reduce_or only supported for integers")
+    return core.reduce(input, axis, _or_combine, keep_dims=keep_dims)
+
+
+# cumsum
+
+
+@core._tensor_member_fn
+@jit
+@core._add_scan_docstr("cumsum", dtype_arg="dtype")
+def cumsum(input, axis=0, reverse=False, dtype: core.constexpr = None):
+    # todo rename this to a generic function name
+
+    input = core._promote_bfloat16_to_float32(input)
+    out_dtype: core.constexpr = _pick_sum_dtype(input.dtype, dtype)
+
+    if out_dtype is not None:
+        input = input.to(out_dtype)
+
+    return core.associative_scan(input, axis, _sum_combine, reverse)
+
+
+# cumprod
+
+
+@jit
+def _prod_combine(a, b):
+    return a * b
+
+
+@core._tensor_member_fn
+@jit
+@core._add_scan_docstr("cumprod")
+def cumprod(input, axis=0, reverse=False):
+    # todo rename this to a generic function name
+    input = core._promote_bfloat16_to_float32(input)
+    return core.associative_scan(input, axis, _prod_combine, reverse)
+
+
+# sort
+
+
+@jit
+def _indicator(n_dims: core.constexpr, j: core.constexpr):
+    ar = core.arange(0, 2)
+    ar = core.reshape(ar, [1] * (n_dims - j - 1) + [2] + [1] * j)
+    return ar
+
+
+@jit
+def _compare_and_swap(x, flip, i: core.constexpr):
+    # compare-and-swap on the ith *innermost* dimension
+    n_dims: core.constexpr = _log2(x.numel)
+
+    # flip along middle dimension (the bitwise XORs will be optimised away):
+    idtype = core.get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True)
+    ix = x.to(idtype, bitcast=True)
+    iy = ix ^ xor_sum(ix, n_dims - 1 - i, True)
+    y = iy.to(x.dtype, bitcast=True)
+
+    # determines whether we are in the right (rather than left) position along the axis:
+    is_right = _indicator(n_dims, i)
+
+    # conditional swap:
+    ret = core.where((x > y) != (flip ^ is_right), y, x)
+    return ret
+
+
+@jit
+def _bitonic_merge_hypercube(x, stage: core.constexpr, order: core.constexpr):
+    '''
+    order_type 0 == ascending
+    order_type 1 == descending
+    order_type 2 == alternating
+    '''
+    # flip denotes whether to re-arrange sub-sequences of elements in ascending or
+    # descending order.
+    # if flip = 00000000... then all elements will be re-arranged ascendingly at this stage
+    # if flip = 00110011... then all the elements will be re-arranged alternatingly (with
+    # a stride of 2) at this stage
+    if order == 2:
+        flip = _indicator(_log2(x.numel), stage)
+    else:
+        flip = order
+    # perform `stage` rounds of `compare-and-swap`
+    for i in core.static_range(stage):
+        x = _compare_and_swap(x, flip, stage - 1 - i)
+    return x
+
+
+@jit
+def _bitonic_merge(x, stage: core.constexpr, order: core.constexpr, n_dims: core.constexpr):
+    h = core.reshape(x, [2] * _log2(x.numel))
+    h = _bitonic_merge_hypercube(h, stage, order)
+    x = core.reshape(h, x.shape)
+    return x
+
+
+@jit
+def sort_impl(x, k: core.constexpr = None, dim: core.constexpr = None, descending: core.constexpr = core.CONSTEXPR_0):
+    """
+    Sorts a tensor along a specified dimension.
+
+    :param x: The input tensor to be sorted.
+    :type x: Tensor
+    :param dim: The dimension along which to sort the tensor. If None, the tensor is sorted along the last dimension. Currently, only sorting along the last dimension is supported.
+    :type dim: int, optional
+    :param k: the number of top elements to select. If none, assume k = x.shape[dim]
+    :type k: int, optional
+    :param descending: If set to True, the tensor is sorted in descending order. If set to False, the tensor is sorted in ascending order.
+    :type descending: bool, optional
+    """
+    # handle default dimension or check that it is the most minor dim
+    _dim: core.constexpr = len(x.shape) - 1 if dim is None else dim
+    core.static_assert(_dim == len(x.shape) - 1, "only minor dimension is currently supported")
+
+    log_n: core.constexpr = _log2(x.shape[_dim])
+    log_k: core.constexpr = log_n if k is None else _log2(k)
+
+    n_dims: core.constexpr = _log2(x.numel)
+
+    # reshape to hypercube:
+    h = core.reshape(x, [2] * n_dims)
+
+    # run first log_k bitonic sort iterations:
+    for i in core.static_range(1, log_k + 1):
+        h = _bitonic_merge_hypercube(h, i, 2 if i < log_n else descending)
+
+    # select top k elements using bitonic top-k
+    # https://www.doc.ic.ac.uk/~hlgr/pdfs/MassivelyParallelTopK.pdf
+    for i in core.static_range(log_k + 1, log_n + 1):
+        h = max(h, axis=(_log2(h.numel) - 1 - log_k)) if descending else min(h, axis=(_log2(h.numel) - 1 - log_k))
+        h = _bitonic_merge_hypercube(h, log_k, 2 if i < log_n else descending)
+
+    # reshape back:
+    x = core.reshape(h, x.shape[:-1] + [2**log_k])
+    return x
+
+
+@jit
+def sort(x, dim: core.constexpr = None, descending: core.constexpr = core.CONSTEXPR_0):
+    return sort_impl(x, dim=dim, descending=descending)
+
+
+@jit
+def topk(x, k: core.constexpr, dim: core.constexpr = None):
+    return sort_impl(x, k=k, dim=dim, descending=True)
+
+
+@jit
+def bitonic_merge(x, dim: core.constexpr = None, descending: core.constexpr = core.CONSTEXPR_0):
+    # handle default dimension or check that it is the most minor dim
+    _dim: core.constexpr = len(x.shape) - 1 if dim is None else dim
+    core.static_assert(_dim == len(x.shape) - 1, "only minor dimension is currently supported")
+    n_dims: core.constexpr = _log2(x.shape[-1])
+    return _bitonic_merge(x, n_dims, descending, n_dims)
+
+
+@constexpr_function
+def _get_flip_dim(dim, shape):
+    if dim is None:
+        dim = len(shape) - 1
+    if dim < 0:  # flip doesn't work if dim < 0 because the xor-swap for loop will start/end at the wrong index
+        dim += len(shape)
+    return dim
+
+
+@core._tensor_member_fn
+@jit
+def flip(x, dim=None):
+    """
+    Flips a tensor `x` along the dimension `dim`.
+
+    :param x: the first input tensor
+    :type x: Block
+    :param dim: the dimension to flip along
+    :type dim: int
+    """
+    core.static_assert(-len(x.shape) <= dim and dim < len(x.shape))
+    _dim: core.constexpr = _get_flip_dim(dim, x.shape)
+    core.static_assert(_is_power_of_two(x.shape[_dim]))
+    steps: core.constexpr = _log2(x.shape[_dim])
+
+    # reshape the swap dimension to (2, 2, ..., 2)
+    idtype = core.get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True)
+    y = core.reshape(x.to(idtype, bitcast=True), x.shape[:_dim] + [2] * steps + x.shape[_dim + 1:])
+    for i in core.static_range(steps):
+        y = y ^ xor_sum(y, _dim + i, True)
+    x = core.reshape(y, x.shape).to(x.dtype, bitcast=True)
+    return x
+
+
+@jit
+def interleave(a, b):
+    """
+    Interleaves the values of two tensors along their last dimension. The two tensors must have the same shape.
+    Equivalent to `tl.join(a, b).reshape(a.shape[:-1] + [2 * a.shape[-1]])`
+
+    :param a: The first input tensor.
+    :type a: Tensor
+    :param b: The second input tensor.
+    :type b: Tensor
+    """
+    c = core.join(a, b)
+
+    if len(c.shape) == 1:
+        # We must have interleaved two scalars.
+        return c
+    else:
+        # This `else` is necessary because Triton's AST parser doesn't
+        # understand that if we take the `if` above we definitely don't run this
+        # `else`.
+        return core.reshape(c, c.shape[:-2] + [2 * c.shape[-2]])
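The file above is Triton's device-side standard library: cdiv, sigmoid, softmax, the reductions and scans, and the bitonic sort/topk/flip helpers built on the hypercube reshape trick. A minimal sketch of how these helpers are typically called from a JIT kernel follows; it assumes softmax is re-exported through triton.language as in upstream Triton, and the kernel name, shapes, and launch parameters are illustrative rather than part of the wheel.

# Hypothetical row-wise softmax kernel exercising the standard library above.
import torch
import triton
import triton.language as tl


@triton.jit
def row_softmax_kernel(x_ptr, y_ptr, n_cols, BLOCK: tl.constexpr):
    row = tl.program_id(0)
    cols = tl.arange(0, BLOCK)
    mask = cols < n_cols
    # Masked-out lanes load -inf so they contribute exp(-inf) = 0 to the sum.
    x = tl.load(x_ptr + row * n_cols + cols, mask=mask, other=-float("inf"))
    y = tl.softmax(x)  # max-subtraction, exp, and sum come from standard.py
    tl.store(y_ptr + row * n_cols + cols, y, mask=mask)


x = torch.randn(64, 1000, device="cuda")
y = torch.empty_like(x)
BLOCK = triton.next_power_of_2(x.shape[1])  # one block must cover a full row
row_softmax_kernel[(x.shape[0],)](x, y, x.shape[1], BLOCK=BLOCK)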
triton/language/target_info.py
@@ -0,0 +1,54 @@
+from triton.runtime import driver
+from triton.runtime.jit import constexpr_function
+
+__all__ = ["current_target"]
+
+
+def current_target():
+    try:
+        active_driver = driver.active
+    except RuntimeError:
+        # If there is no active driver, return None
+        return None
+    return active_driver.get_current_target()
+
+
+current_target.__triton_builtin__ = True
+
+
+@constexpr_function
+def is_cuda():
+    target = current_target()
+    return target is not None and target.backend == "cuda"
+
+
+@constexpr_function
+def cuda_capability_geq(major, minor=0):
+    """
+    Determines whether we have compute capability >= (major, minor) and
+    returns this as a constexpr boolean. This can be used for guarding
+    inline asm implementations that require a certain compute capability.
+    """
+    target = current_target()
+    if target is None or target.backend != "cuda":
+        return False
+    assert isinstance(target.arch, int)
+    return target.arch >= major * 10 + minor
+
+
+@constexpr_function
+def is_hip():
+    target = current_target()
+    return target is not None and target.backend == "hip"
+
+
+@constexpr_function
+def is_hip_cdna3():
+    target = current_target()
+    return target is not None and target.arch == "gfx942"
+
+
+@constexpr_function
+def is_hip_cdna4():
+    target = current_target()
+    return target is not None and target.arch == "gfx950"
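These constexpr helpers let kernel code specialize on the active backend and architecture while the kernel is being traced, so the untaken branch never reaches the device. The sketch below assumes the helpers can be imported from triton.language.target_info and branched on inside a @triton.jit function; the multipliers stand in for real architecture-specific code paths.

# Compile-time specialization sketch (placeholder math, not a real fast path).
import triton
import triton.language as tl
from triton.language.target_info import cuda_capability_geq


@triton.jit
def scale_kernel(x_ptr, y_ptr, n, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    x = tl.load(x_ptr + offs, mask=mask)
    if cuda_capability_geq(9, 0):
        # The condition is a compile-time constexpr, so only one of these
        # branches is actually generated for the current target.
        x = x * 2.0
    else:
        x = x * 3.0
    tl.store(y_ptr + offs, x, mask=mask)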
triton/runtime/__init__.py
@@ -0,0 +1,23 @@
+from .autotuner import (Autotuner, Config, Heuristics, autotune, heuristics)
+from .cache import RedisRemoteCacheBackend, RemoteCacheBackend
+from .driver import driver
+from .jit import JITFunction, KernelInterface, MockTensor, TensorWrapper, reinterpret
+from .errors import OutOfResources, InterpreterError
+
+__all__ = [
+    "autotune",
+    "Autotuner",
+    "Config",
+    "driver",
+    "Heuristics",
+    "heuristics",
+    "InterpreterError",
+    "JITFunction",
+    "KernelInterface",
+    "MockTensor",
+    "OutOfResources",
+    "RedisRemoteCacheBackend",
+    "reinterpret",
+    "RemoteCacheBackend",
+    "TensorWrapper",
+]
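runtime/__init__.py only re-exports the public runtime API; the autotuner entry points (autotune, Config, Heuristics) are the pieces most users touch directly. A hedged example of the usual decorator stack follows; the block sizes, warp counts, and tuning key are arbitrary illustrative choices.

# Illustrative autotuned vector add using the re-exported runtime API.
import torch
import triton
import triton.language as tl


@triton.autotune(
    configs=[
        triton.Config({"BLOCK": 128}, num_warps=4),
        triton.Config({"BLOCK": 1024}, num_warps=8),
    ],
    key=["n"],  # re-benchmark whenever the problem size changes
)
@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    tl.store(out_ptr + offs,
             tl.load(x_ptr + offs, mask=mask) + tl.load(y_ptr + offs, mask=mask),
             mask=mask)


x = torch.randn(10_000, device="cuda")
y = torch.randn_like(x)
out = torch.empty_like(x)
# BLOCK is supplied by whichever Config wins, so the grid reads it from meta.
add_kernel[lambda meta: (triton.cdiv(x.numel(), meta["BLOCK"]),)](x, y, out, x.numel())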
triton/runtime/_allocation.py
@@ -0,0 +1,44 @@
+from typing import Optional, Protocol
+from contextvars import ContextVar
+
+
+class Buffer(Protocol):
+
+    def data_ptr(self) -> int:
+        ...
+
+
+class Allocator(Protocol):
+
+    def __call__(self, size: int, alignment: int, stream: Optional[int]) -> Buffer:
+        ...
+
+
+class NullAllocator:
+
+    def __call__(self, size: int, alignment: int, stream: Optional[int]) -> Buffer:
+        raise RuntimeError("Kernel requires a runtime memory allocation, but no allocator was set. " +
+                           "Use triton.set_allocator to specify an allocator.")
+
+
+_allocator: ContextVar[Allocator] = ContextVar("_allocator", default=NullAllocator())
+
+
+def set_allocator(allocator: Allocator):
+    """
+    The allocator function is called during kernel launch for kernels that
+    require additional global memory workspace.
+    """
+    _allocator.set(allocator)
+
+
+_profile_allocator: Allocator = ContextVar("_allocator", default=NullAllocator())
+
+
+def set_profile_allocator(allocator: Optional[Allocator]):
+    """
+    The profile allocator function is called before kernel launch for kernels
+    that require additional global memory workspace.
+    """
+    global _profile_allocator
+    _profile_allocator.set(allocator)
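The Allocator protocol above is what triton.set_allocator (the function named in NullAllocator's error message) expects: a callable taking size, alignment, and stream and returning any object with data_ptr(). A minimal torch-backed sketch follows; relying on the CUDA caching allocator's default alignment is an assumption here, not something this module guarantees.

# Minimal allocator satisfying the Buffer/Allocator protocols above.
from typing import Optional

import torch
import triton


def torch_workspace_allocator(size: int, alignment: int, stream: Optional[int]) -> torch.Tensor:
    # torch.Tensor exposes data_ptr(), so it satisfies the Buffer protocol.
    # The caching allocator's default alignment is assumed to be sufficient;
    # a stricter implementation would over-allocate and offset the pointer.
    return torch.empty(size, dtype=torch.int8, device="cuda")


triton.set_allocator(torch_workspace_allocator)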
triton/runtime/_async_compile.py
@@ -0,0 +1,55 @@
+from __future__ import annotations
+from typing import Callable, Optional
+from concurrent.futures import Executor, as_completed, Future
+from contextvars import ContextVar
+
+active_mode: ContextVar[Optional[AsyncCompileMode]] = ContextVar("async_compile_active_mode", default=None)
+
+
+class FutureKernel:
+
+    def __init__(self, finalize_compile: Callable, future: Future):
+        self.finalize_compile = finalize_compile
+        self.kernel = None
+        self.future = future
+
+    def result(self):
+        if self.kernel is not None:
+            return self.kernel
+
+        kernel = self.future.result()
+        self.finalize_compile(kernel)
+        self.kernel = kernel
+        return kernel
+
+
+class AsyncCompileMode:
+
+    def __init__(self, executor: Executor):
+        self.executor = executor
+        self.raw_futures = []
+        self.future_kernels = {}
+
+    def submit(self, key, compile_fn, finalize_fn):
+        future = self.future_kernels.get(key)
+        if future is not None:
+            return future
+
+        future = self.executor.submit(compile_fn)
+        future._key = key
+        self.raw_futures.append(future)
+        future_kernel = FutureKernel(finalize_fn, future)
+        self.future_kernels[key] = future_kernel
+        return future_kernel
+
+    def __enter__(self):
+        if active_mode.get() is not None:
+            raise RuntimeError("Another AsyncCompileMode is already active")
+        active_mode.set(self)
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        # Finalize any outstanding compiles
+        for future in as_completed(self.raw_futures):
+            self.future_kernels[future._key].result()
+        active_mode.set(None)
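AsyncCompileMode is private machinery: inside the context, compilations submitted under a key are farmed out to the given executor, deduplicated, and drained on exit. The sketch below drives it directly with stand-in lambdas instead of a real compile function, purely to show the submit/finalize/result flow; how the JIT runtime routes real compilations through active_mode is not shown here.

# Driving the internal AsyncCompileMode with placeholder compile/finalize steps.
from concurrent.futures import ThreadPoolExecutor

from triton.runtime._async_compile import AsyncCompileMode

with ThreadPoolExecutor(max_workers=4) as pool:
    with AsyncCompileMode(pool) as mode:
        # Each unique key is compiled at most once; a duplicate submit for the
        # same key returns the already-registered FutureKernel.
        fk = mode.submit("demo-key",
                         compile_fn=lambda: "fake-kernel",
                         finalize_fn=lambda kernel: None)
    # __exit__ has already drained every future, so result() is immediate.
    print(fk.result())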