triton-windows 3.2.0.post11__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of triton-windows might be problematic.

Files changed (154)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +85 -0
  3. triton/_internal_testing.py +123 -0
  4. triton/backends/__init__.py +50 -0
  5. triton/backends/amd/compiler.py +368 -0
  6. triton/backends/amd/driver.c +211 -0
  7. triton/backends/amd/driver.py +512 -0
  8. triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h +358 -0
  9. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +1031 -0
  10. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +1612 -0
  11. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +1337 -0
  12. triton/backends/amd/include/hip/amd_detail/amd_hip_bfloat16.h +293 -0
  13. triton/backends/amd/include/hip/amd_detail/amd_hip_common.h +32 -0
  14. triton/backends/amd/include/hip/amd_detail/amd_hip_complex.h +174 -0
  15. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +829 -0
  16. triton/backends/amd/include/hip/amd_detail/amd_hip_fp16.h +1809 -0
  17. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +108 -0
  18. triton/backends/amd/include/hip/amd_detail/amd_hip_math_constants.h +124 -0
  19. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h +405 -0
  20. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +196 -0
  21. triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +565 -0
  22. triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h +2226 -0
  23. triton/backends/amd/include/hip/amd_detail/amd_math_functions.h +104 -0
  24. triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h +244 -0
  25. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +494 -0
  26. triton/backends/amd/include/hip/amd_detail/concepts.hpp +30 -0
  27. triton/backends/amd/include/hip/amd_detail/device_library_decls.h +133 -0
  28. triton/backends/amd/include/hip/amd_detail/functional_grid_launch.hpp +218 -0
  29. triton/backends/amd/include/hip/amd_detail/grid_launch.h +67 -0
  30. triton/backends/amd/include/hip/amd_detail/grid_launch.hpp +50 -0
  31. triton/backends/amd/include/hip/amd_detail/grid_launch_GGL.hpp +26 -0
  32. triton/backends/amd/include/hip/amd_detail/helpers.hpp +137 -0
  33. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +1350 -0
  34. triton/backends/amd/include/hip/amd_detail/hip_assert.h +101 -0
  35. triton/backends/amd/include/hip/amd_detail/hip_cooperative_groups_helper.h +242 -0
  36. triton/backends/amd/include/hip/amd_detail/hip_fp16_gcc.h +254 -0
  37. triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h +96 -0
  38. triton/backends/amd/include/hip/amd_detail/hip_ldg.h +100 -0
  39. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +10169 -0
  40. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +77 -0
  41. triton/backends/amd/include/hip/amd_detail/host_defines.h +180 -0
  42. triton/backends/amd/include/hip/amd_detail/hsa_helpers.hpp +102 -0
  43. triton/backends/amd/include/hip/amd_detail/macro_based_grid_launch.hpp +798 -0
  44. triton/backends/amd/include/hip/amd_detail/math_fwd.h +698 -0
  45. triton/backends/amd/include/hip/amd_detail/ockl_image.h +177 -0
  46. triton/backends/amd/include/hip/amd_detail/program_state.hpp +107 -0
  47. triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h +491 -0
  48. triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h +478 -0
  49. triton/backends/amd/include/hip/channel_descriptor.h +39 -0
  50. triton/backends/amd/include/hip/device_functions.h +38 -0
  51. triton/backends/amd/include/hip/driver_types.h +468 -0
  52. triton/backends/amd/include/hip/hip_bf16.h +36 -0
  53. triton/backends/amd/include/hip/hip_bfloat16.h +44 -0
  54. triton/backends/amd/include/hip/hip_common.h +100 -0
  55. triton/backends/amd/include/hip/hip_complex.h +38 -0
  56. triton/backends/amd/include/hip/hip_cooperative_groups.h +46 -0
  57. triton/backends/amd/include/hip/hip_deprecated.h +95 -0
  58. triton/backends/amd/include/hip/hip_ext.h +159 -0
  59. triton/backends/amd/include/hip/hip_fp16.h +36 -0
  60. triton/backends/amd/include/hip/hip_gl_interop.h +32 -0
  61. triton/backends/amd/include/hip/hip_hcc.h +24 -0
  62. triton/backends/amd/include/hip/hip_math_constants.h +36 -0
  63. triton/backends/amd/include/hip/hip_profile.h +27 -0
  64. triton/backends/amd/include/hip/hip_runtime.h +75 -0
  65. triton/backends/amd/include/hip/hip_runtime_api.h +8919 -0
  66. triton/backends/amd/include/hip/hip_texture_types.h +29 -0
  67. triton/backends/amd/include/hip/hip_vector_types.h +41 -0
  68. triton/backends/amd/include/hip/hip_version.h +17 -0
  69. triton/backends/amd/include/hip/hiprtc.h +421 -0
  70. triton/backends/amd/include/hip/library_types.h +78 -0
  71. triton/backends/amd/include/hip/math_functions.h +42 -0
  72. triton/backends/amd/include/hip/surface_types.h +63 -0
  73. triton/backends/amd/include/hip/texture_types.h +194 -0
  74. triton/backends/amd/include/hsa/Brig.h +1131 -0
  75. triton/backends/amd/include/hsa/amd_hsa_common.h +91 -0
  76. triton/backends/amd/include/hsa/amd_hsa_elf.h +436 -0
  77. triton/backends/amd/include/hsa/amd_hsa_kernel_code.h +269 -0
  78. triton/backends/amd/include/hsa/amd_hsa_queue.h +109 -0
  79. triton/backends/amd/include/hsa/amd_hsa_signal.h +80 -0
  80. triton/backends/amd/include/hsa/hsa.h +5729 -0
  81. triton/backends/amd/include/hsa/hsa_amd_tool.h +91 -0
  82. triton/backends/amd/include/hsa/hsa_api_trace.h +566 -0
  83. triton/backends/amd/include/hsa/hsa_ext_amd.h +3090 -0
  84. triton/backends/amd/include/hsa/hsa_ext_finalize.h +531 -0
  85. triton/backends/amd/include/hsa/hsa_ext_image.h +1454 -0
  86. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +488 -0
  87. triton/backends/amd/include/hsa/hsa_ven_amd_loader.h +667 -0
  88. triton/backends/amd/include/roctracer/ext/prof_protocol.h +107 -0
  89. triton/backends/amd/include/roctracer/hip_ostream_ops.h +4435 -0
  90. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +1467 -0
  91. triton/backends/amd/include/roctracer/hsa_prof_str.h +3027 -0
  92. triton/backends/amd/include/roctracer/roctracer.h +779 -0
  93. triton/backends/amd/include/roctracer/roctracer_ext.h +81 -0
  94. triton/backends/amd/include/roctracer/roctracer_hcc.h +24 -0
  95. triton/backends/amd/include/roctracer/roctracer_hip.h +37 -0
  96. triton/backends/amd/include/roctracer/roctracer_hsa.h +112 -0
  97. triton/backends/amd/include/roctracer/roctracer_plugin.h +137 -0
  98. triton/backends/amd/include/roctracer/roctracer_roctx.h +67 -0
  99. triton/backends/amd/include/roctracer/roctx.h +229 -0
  100. triton/backends/amd/lib/ockl.bc +0 -0
  101. triton/backends/amd/lib/ocml.bc +0 -0
  102. triton/backends/compiler.py +304 -0
  103. triton/backends/driver.py +48 -0
  104. triton/backends/nvidia/__init__.py +0 -0
  105. triton/backends/nvidia/bin/ptxas.exe +0 -0
  106. triton/backends/nvidia/compiler.py +410 -0
  107. triton/backends/nvidia/driver.c +451 -0
  108. triton/backends/nvidia/driver.py +524 -0
  109. triton/backends/nvidia/include/cuda.h +24359 -0
  110. triton/backends/nvidia/lib/libdevice.10.bc +0 -0
  111. triton/backends/nvidia/lib/x64/cuda.lib +0 -0
  112. triton/compiler/__init__.py +4 -0
  113. triton/compiler/code_generator.py +1303 -0
  114. triton/compiler/compiler.py +430 -0
  115. triton/compiler/errors.py +51 -0
  116. triton/compiler/make_launcher.py +0 -0
  117. triton/errors.py +5 -0
  118. triton/language/__init__.py +294 -0
  119. triton/language/_utils.py +21 -0
  120. triton/language/core.py +2694 -0
  121. triton/language/extra/__init__.py +26 -0
  122. triton/language/extra/cuda/__init__.py +13 -0
  123. triton/language/extra/cuda/_experimental_tma.py +108 -0
  124. triton/language/extra/cuda/libdevice.py +1629 -0
  125. triton/language/extra/cuda/utils.py +109 -0
  126. triton/language/extra/hip/__init__.py +3 -0
  127. triton/language/extra/hip/libdevice.py +475 -0
  128. triton/language/extra/libdevice.py +786 -0
  129. triton/language/math.py +250 -0
  130. triton/language/random.py +207 -0
  131. triton/language/semantic.py +1796 -0
  132. triton/language/standard.py +452 -0
  133. triton/runtime/__init__.py +23 -0
  134. triton/runtime/autotuner.py +408 -0
  135. triton/runtime/build.py +111 -0
  136. triton/runtime/cache.py +295 -0
  137. triton/runtime/driver.py +60 -0
  138. triton/runtime/errors.py +26 -0
  139. triton/runtime/interpreter.py +1235 -0
  140. triton/runtime/jit.py +951 -0
  141. triton/testing.py +511 -0
  142. triton/tools/__init__.py +0 -0
  143. triton/tools/build_extern.py +365 -0
  144. triton/tools/compile.c +67 -0
  145. triton/tools/compile.h +14 -0
  146. triton/tools/compile.py +155 -0
  147. triton/tools/disasm.py +144 -0
  148. triton/tools/experimental_descriptor.py +32 -0
  149. triton/tools/link.py +322 -0
  150. triton/windows_utils.py +375 -0
  151. triton_windows-3.2.0.post11.dist-info/METADATA +39 -0
  152. triton_windows-3.2.0.post11.dist-info/RECORD +154 -0
  153. triton_windows-3.2.0.post11.dist-info/WHEEL +5 -0
  154. triton_windows-3.2.0.post11.dist-info/top_level.txt +12 -0
triton/language/standard.py
@@ -0,0 +1,452 @@
+ from __future__ import annotations
+
+ from ..runtime.jit import jit
+ from . import core
+ from . import math
+
+ # constexpr utilities
+
+
+ def _log2(i: core.constexpr):
+     log2 = 0
+     n = i.value
+     while n > 1:
+         n >>= 1
+         log2 += 1
+     return core.constexpr(log2)
+
+
+ def _is_power_of_two(i: core.constexpr):
+     n = i.value
+     return core.constexpr((n & (n - 1)) == 0 and n != 0)
+
+
+ # -----------------------
+ # Standard library
+ # -----------------------
+
+
+ @core._tensor_member_fn
+ @jit
+ def cdiv(x, div):
+     """
+     Computes the ceiling division of :code:`x` by :code:`div`
+
+     :param x: the input number
+     :type x: Block
+     :param div: the divisor
+     :type div: Block
+     """
+     return (x + div - 1) // div
+
+
+ @core._tensor_member_fn
+ @jit
+ @math._add_math_1arg_docstr("sigmoid")
+ def sigmoid(x):
+     return 1 / (1 + math.exp(-x))
+
+
+ @core._tensor_member_fn
+ @jit
+ @math._add_math_1arg_docstr("softmax")
+ def softmax(x, ieee_rounding=False):
+     z = x - max(x, 0)
+     num = math.exp(z)
+     den = sum(num, 0)
+     return math.fdiv(num, den, ieee_rounding)
+
+
+ @core._tensor_member_fn
+ @jit
+ def ravel(x):
+     """
+     Returns a contiguous flattened view of :code:`x`.
+
+     :param x: the input tensor
+     :type x: Block
+     """
+     return core.reshape(x, [x.numel], can_reorder=True)
+
+
+ @jit
+ def swizzle2d(i, j, size_i, size_j, size_g):
+     """
+     Transforms the indices of a row-major `size_i * size_j` matrix into
+     the indices of a column-major matrix for each group of `size_g` rows.
+
+     For example, for :code:`size_i = size_j = 4` and :code:`size_g = 2`, it will
+     transform ::
+
+         [[0 , 1 , 2 , 3 ],
+          [4 , 5 , 6 , 7 ],
+          [8 , 9 , 10, 11],
+          [12, 13, 14, 15]]
+
+     into ::
+
+         [[0, 2,  4 , 6 ],
+          [1, 3,  5 , 7 ],
+          [8, 10, 12, 14],
+          [9, 11, 13, 15]]
+     """
+     # "unrolled index in array"
+     ij = i * size_j + j
+     # number of elements in `size_g` groups
+     # of `size_j` columns
+     size_gj = size_g * size_j
+     # index of the group in which (i,j) is
+     group_id = ij // size_gj
+     # row-index of the first element of this group
+     off_i = group_id * size_g
+     # last group may have fewer rows
+     size_g = core.minimum(size_i - off_i, size_g)
+     # linear index with respect to the first element in this group
+     ij = ij % size_gj
+     # new row and column indices
+     new_i = off_i + ij % size_g
+     new_j = ij // size_g
+     return new_i, new_j
+
+
+ @jit
+ def zeros(shape, dtype):
+     """
+     Returns a tensor filled with the scalar value 0 for the given :code:`shape` and :code:`dtype`.
+
+     :param shape: Shape of the new array, e.g., (8, 16) or (8, )
+     :type shape: tuple of ints
+     :param dtype: Data-type of the new array, e.g., :code:`tl.float16`
+     :type dtype: DType
+     """
+     return core.full(shape, 0, dtype)
+
+
+ @jit
+ def zeros_like(input):
+     """
+     Returns a tensor of zeros with the same shape and type as a given tensor.
+
+     :param input: input tensor
+     :type input: Tensor
+     """
+     return zeros(input.shape, input.dtype)
+
+
+ # max and argmax
+
+
+ @jit
+ def _argmax_combine(value1, index1, value2, index2, tie_break_left):
+     if tie_break_left:
+         tie = value1 == value2 and index1 < index2
+     else:
+         tie = False
+     gt = value1 > value2 or tie
+     v_ret = core.where(gt, value1, value2)
+     i_ret = core.where(gt, index1, index2)
+     return v_ret, i_ret
+
+
+ @jit
+ def _argmax_combine_tie_break_left(value1, index1, value2, index2):
+     return _argmax_combine(value1, index1, value2, index2, True)
+
+
+ @jit
+ def _argmax_combine_tie_break_fast(value1, index1, value2, index2):
+     return _argmax_combine(value1, index1, value2, index2, False)
+
+
+ @jit
+ def _elementwise_max(a, b):
+     return core.maximum(a, b)
+
+
+ @core._tensor_member_fn
+ @jit
+ @core._add_reduction_docstr("maximum", return_indices_arg="return_indices",
+                             tie_break_arg="return_indices_tie_break_left")
+ def max(input, axis=None, return_indices=False, return_indices_tie_break_left=True, keep_dims=False):
+     input = core._promote_bfloat16_to_float32(input)
+     if return_indices:
+         if return_indices_tie_break_left:
+             return core._reduce_with_indices(input, axis, _argmax_combine_tie_break_left, keep_dims=keep_dims)
+         else:
+             return core._reduce_with_indices(input, axis, _argmax_combine_tie_break_fast, keep_dims=keep_dims)
+     else:
+         if core.constexpr(input.dtype.primitive_bitwidth) < core.constexpr(32):
+             if core.constexpr(input.dtype.is_floating()):
+                 input = input.to(core.float32)
+             else:
+                 assert input.dtype.is_int(), "Expecting input to be integer type"
+                 input = input.to(core.int32)
+         return core.reduce(input, axis, _elementwise_max, keep_dims=keep_dims)
+
+
+ @core._tensor_member_fn
+ @jit
+ @core._add_reduction_docstr("maximum index", tie_break_arg="tie_break_left")
+ def argmax(input, axis, tie_break_left=True, keep_dims=False):
+     (_, ret) = max(input, axis, return_indices=True, return_indices_tie_break_left=tie_break_left, keep_dims=keep_dims)
+     return ret
+
+
+ # min and argmin
+
+
+ @jit
+ def _argmin_combine(value1, index1, value2, index2, tie_break_left):
+     if tie_break_left:
+         tie = value1 == value2 and index1 < index2
+     else:
+         tie = False
+     lt = value1 < value2 or tie
+     value_ret = core.where(lt, value1, value2)
+     index_ret = core.where(lt, index1, index2)
+     return value_ret, index_ret
+
+
+ @jit
+ def _argmin_combine_tie_break_left(value1, index1, value2, index2):
+     return _argmin_combine(value1, index1, value2, index2, True)
+
+
+ @jit
+ def _argmin_combine_tie_break_fast(value1, index1, value2, index2):
+     return _argmin_combine(value1, index1, value2, index2, False)
+
+
+ @jit
+ def _elementwise_min(a, b):
+     return core.minimum(a, b)
+
+
+ @core._tensor_member_fn
+ @jit
+ @core._add_reduction_docstr("minimum", return_indices_arg="return_indices",
+                             tie_break_arg="return_indices_tie_break_left")
+ def min(input, axis=None, return_indices=False, return_indices_tie_break_left=True, keep_dims=False):
+     input = core._promote_bfloat16_to_float32(input)
+     if return_indices:
+         if return_indices_tie_break_left:
+             return core._reduce_with_indices(input, axis, _argmin_combine_tie_break_left, keep_dims=keep_dims)
+         else:
+             return core._reduce_with_indices(input, axis, _argmin_combine_tie_break_fast, keep_dims=keep_dims)
+     else:
+         if core.constexpr(input.dtype.primitive_bitwidth) < 32:
+             if core.constexpr(input.dtype.is_floating()):
+                 input = input.to(core.float32)
+             else:
+                 assert input.dtype.is_int(), "Expecting input to be integer type"
+                 input = input.to(core.int32)
+         return core.reduce(input, axis, _elementwise_min, keep_dims=keep_dims)
+
+
+ @core._tensor_member_fn
+ @jit
+ @core._add_reduction_docstr("minimum index", tie_break_arg="tie_break_left")
+ def argmin(input, axis, tie_break_left=True, keep_dims=False):
+     _, ret = min(input, axis, return_indices=True, return_indices_tie_break_left=tie_break_left, keep_dims=keep_dims)
+     return ret
+
+
+ @jit
+ def _sum_combine(a, b):
+     return a + b
+
+
+ # sum
+
+
+ @core._tensor_member_fn
+ @jit
+ @core._add_reduction_docstr("sum")
+ def sum(input, axis=None, keep_dims=False):
+     input = core._promote_bfloat16_to_float32(input)
+     return core.reduce(input, axis, _sum_combine, keep_dims=keep_dims)
+
+
+ @jit
+ def _xor_combine(a, b):
+     return a ^ b
+
+
+ # xor sum
+
+
+ @core._tensor_member_fn
+ @core.builtin
+ @core._add_reduction_docstr("xor sum")
+ def xor_sum(input, axis=None, keep_dims=False, _builder=None, _generator=None):
+     scalar_ty = input.type.scalar
+     if not scalar_ty.is_int():
+         raise ValueError("xor_sum only supported for integers")
+
+     input = core._promote_bfloat16_to_float32(input, _builder=_builder)
+     return core.reduce(input, axis, _xor_combine, keep_dims=keep_dims, _builder=_builder, _generator=_generator)
+
+
+ # cumsum
+
+
+ @core._tensor_member_fn
+ @jit
+ @core._add_scan_docstr("cumsum")
+ def cumsum(input, axis=0, reverse=False):
+     # todo rename this to a generic function name
+     input = core._promote_bfloat16_to_float32(input)
+     return core.associative_scan(input, axis, _sum_combine, reverse)
+
+
+ # cumprod
+
+
+ @jit
+ def _prod_combine(a, b):
+     return a * b
+
+
+ @core._tensor_member_fn
+ @jit
+ @core._add_scan_docstr("cumprod")
+ def cumprod(input, axis=0, reverse=False):
+     # todo rename this to a generic function name
+     input = core._promote_bfloat16_to_float32(input)
+     return core.associative_scan(input, axis, _prod_combine, reverse)
+
+
+ # sort
+
+
+ @jit
+ def _compare_and_swap(x, flip, i: core.constexpr, n_dims: core.constexpr):
+     n_outer: core.constexpr = x.numel >> n_dims
+     shape: core.constexpr = [n_outer * 2**i, 2, 2**(n_dims - i - 1)]
+     y = core.reshape(x, shape)
+     # slice left/right with 'stride' 2**(n_dims - i - 1)
+     mask = core.arange(0, 2)[None, :, None]
+     left = core.broadcast_to(sum(y * (1 - mask), 1)[:, None, :], shape).to(y.dtype)
+     right = core.broadcast_to(sum(y * mask, 1)[:, None, :], shape).to(y.dtype)
+     left = core.reshape(left, x.shape)
+     right = core.reshape(right, x.shape)
+     # actual compare-and-swap
+     idtype = core.get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True)
+     ileft = left.to(idtype, bitcast=True)
+     iright = right.to(idtype, bitcast=True)
+     ix = x.to(idtype, bitcast=True)
+     ret = ix ^ core.where((left > right) != flip, ileft ^ iright, zeros_like(ix))
+     return ret.to(x.dtype, bitcast=True)
+
+
+ @jit
+ def _bitonic_merge(x, stage: core.constexpr, order: core.constexpr, n_dims: core.constexpr):
+     '''
+     order_type 0 == ascending
+     order_type 1 == descending
+     order_type 2 == alternating
+     '''
+     n_outer: core.constexpr = x.numel >> n_dims
+     core.static_assert(stage <= n_dims)
+     # flip denotes whether to re-arrange sub-sequences of elements in ascending or
+     # descending order.
+     # if flip = 00000000... then all elements will be re-arranged ascendingly at this stage
+     # if flip = 00110011... then all the elements will be re-arranged alternatingly (with
+     # a stride of 2) at this stage
+     if order == 2:
+         shape: core.constexpr = [n_outer * 2**(n_dims - 1 - stage), 2, 2**stage]
+         flip = core.reshape(core.broadcast_to(core.arange(0, 2)[None, :, None], shape), x.shape)
+     else:
+         flip = order
+     # perform `stage` rounds of `compare-and-swap`
+     for i in core.static_range(stage):
+         x = _compare_and_swap(x, flip, i + (n_dims - stage), n_dims)
+     return x
+
+
+ @core._tensor_member_fn
+ @jit
+ def sort(x, dim: core.constexpr = None, descending: core.constexpr = core.CONSTEXPR_0):
+     """
+     Sorts a tensor along a specified dimension.
+
+     :param x: The input tensor to be sorted.
+     :type x: Tensor
+     :param dim: The dimension along which to sort the tensor. If None, the tensor is sorted along the last dimension. Currently, only sorting along the last dimension is supported.
+     :type dim: int, optional
+     :param descending: If set to True, the tensor is sorted in descending order. If set to False, the tensor is sorted in ascending order.
+     :type descending: bool, optional
+     """
+     # handle default dimension or check that it is the most minor dim
+     _dim: core.constexpr = len(x.shape) - 1 if dim is None else dim
+     core.static_assert(_dim == len(x.shape) - 1, "only minor dimension is currently supported")
+     # iteratively run bitonic merge-sort steps
+     n_dims: core.constexpr = _log2(x.shape[_dim])
+     for i in core.static_range(1, n_dims + 1):
+         x = _bitonic_merge(x, i, 2 if i < n_dims else descending, n_dims)
+     return x
+
+
+ # flip
+
+
+ def _get_flip_dim(dim, shape):
+     dim = core._unwrap_if_constexpr(dim)
+     shape = core._unwrap_if_constexpr(shape)
+     if dim is None:
+         dim = len(shape) - 1
+     assert dim == len(shape) - 1, "Currently only support flipping the last dimension"
+     return core.constexpr(dim)
+
+
+ @core._tensor_member_fn
+ @jit
+ def flip(x, dim=None):
+     """
+     Flips a tensor `x` along the dimension `dim`.
+
+     :param x: the first input tensor
+     :type x: Block
+     :param dim: the dimension to flip along (currently only final dimension supported)
+     :type dim: int
+     """
+     core.static_assert(_is_power_of_two(x.shape[_get_flip_dim(dim, x.shape)]))
+     core.static_assert(_is_power_of_two(x.numel))
+     # # reshape the tensor to have all dimensions be 2.
+     # # TODO: We shouldn't have to change the dimensions not sorted.
+     steps: core.constexpr = _log2(x.numel)
+     start: core.constexpr = _log2(x.numel) - _log2(x.shape[_get_flip_dim(dim, x.shape)])
+     y = core.reshape(x, [2] * steps)
+     y = core.expand_dims(y, start)
+     flip = (core.arange(0, 2)[:, None] == 1 - core.arange(0, 2))
+     for i in core.static_range(start, steps):
+         flip2 = flip
+         for j in core.static_range(0, steps + 1):
+             if j != i and j != i + 1:
+                 flip2 = core.expand_dims(flip2, j)
+         y = sum(y * flip2, i + 1, keep_dims=True)
+     x = core.reshape(y, x.shape)
+     return x
+
+
+ @jit
+ def interleave(a, b):
+     """
+     Interleaves the values of two tensors along their last dimension. The two tensors must have the same shape.
+     Equivalent to `tl.join(a, b).reshape(a.shape[:-1] + [2 * a.shape[-1]])`
+
+     :param a: The first input tensor.
+     :type a: Tensor
+     :param b: The second input tensor.
+     :type b: Tensor
+     """
+     c = core.join(a, b)
+
+     if len(c.shape) == 1:
+         # We must have interleaved two scalars.
+         return c
+     else:
+         # This `else` is necessary because Triton's AST parser doesn't
+         # understand that if we take the `if` above we definitely don't run this
+         # `else`.
+         return core.reshape(c, c.shape[:-2] + [2 * c.shape[-2]])
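
For context, the helpers in this hunk are re-exported through `triton.language` (conventionally imported as `tl`). Below is a minimal usage sketch, not code shipped in the wheel: the kernel name, block size, and tensor sizes are invented for illustration and assume this wheel installed alongside a CUDA-enabled PyTorch.

import torch
import triton
import triton.language as tl


@triton.jit
def sigmoid_kernel(x_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # one program instance handles one block of elements
    pid = tl.program_id(0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    # `tl.sigmoid` is the `sigmoid` helper defined in the hunk above
    tl.store(out_ptr + offsets, tl.sigmoid(x), mask=mask)


x = torch.randn(10_000, device="cuda")
out = torch.empty_like(x)
# `triton.cdiv` mirrors the `cdiv` helper above: round the grid size up
grid = (triton.cdiv(x.numel(), 1024), )
sigmoid_kernel[grid](x, out, x.numel(), BLOCK_SIZE=1024)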

triton/runtime/__init__.py
@@ -0,0 +1,23 @@
+ from .autotuner import (Autotuner, Config, Heuristics, autotune, heuristics)
+ from .cache import RedisRemoteCacheBackend, RemoteCacheBackend
+ from .driver import driver
+ from .jit import JITFunction, KernelInterface, MockTensor, TensorWrapper, reinterpret
+ from .errors import OutOfResources, InterpreterError
+
+ __all__ = [
+     "autotune",
+     "Autotuner",
+     "Config",
+     "driver",
+     "Heuristics",
+     "heuristics",
+     "InterpreterError",
+     "JITFunction",
+     "KernelInterface",
+     "MockTensor",
+     "OutOfResources",
+     "RedisRemoteCacheBackend",
+     "reinterpret",
+     "RemoteCacheBackend",
+     "TensorWrapper",
+ ]
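
This module re-exports the autotuning entry points (`autotune`, `Config`, `heuristics`) alongside the JIT and cache machinery. A minimal sketch of how those exports are commonly wired together follows; the kernel, configuration values, and key are illustrative assumptions, not code from this package.

import triton
import triton.language as tl


# Illustrative only: the two candidate configs are benchmarked the first time
# the kernel runs for a new value of `n_elements`; the fastest one is cached.
@triton.autotune(
    configs=[
        triton.Config({"BLOCK_SIZE": 256}, num_warps=4),
        triton.Config({"BLOCK_SIZE": 1024}, num_warps=8),
    ],
    key=["n_elements"],
)
@triton.jit
def scale_kernel(x_ptr, out_ptr, alpha, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    tl.store(out_ptr + offsets, alpha * x, mask=mask)


# BLOCK_SIZE is injected by the autotuner, so the launch omits it; the grid
# callable receives the chosen meta-parameters, e.g.:
# grid = lambda meta: (triton.cdiv(n, meta["BLOCK_SIZE"]), )
# scale_kernel[grid](x, out, 2.0, n)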