triton-windows 3.3.1.post19__cp310-cp310-win_amd64.whl → 3.5.0.post21__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of triton-windows might be problematic.
- triton/_C/libtriton.pyd +0 -0
- triton/__init__.py +11 -2
- triton/_filecheck.py +97 -0
- triton/_internal_testing.py +95 -18
- triton/_utils.py +112 -21
- triton/backends/__init__.py +20 -23
- triton/backends/amd/__init__.py +0 -0
- triton/backends/amd/compiler.py +161 -119
- triton/backends/amd/driver.c +118 -46
- triton/backends/amd/driver.py +274 -96
- triton/backends/compiler.py +7 -21
- triton/backends/driver.py +13 -0
- triton/backends/nvidia/bin/ptxas.exe +0 -0
- triton/backends/nvidia/compiler.py +163 -106
- triton/backends/nvidia/driver.c +166 -101
- triton/backends/nvidia/driver.py +384 -202
- triton/compiler/__init__.py +5 -2
- triton/compiler/code_generator.py +439 -231
- triton/compiler/compiler.py +152 -84
- triton/experimental/__init__.py +0 -0
- triton/experimental/gluon/__init__.py +5 -0
- triton/experimental/gluon/_compiler.py +0 -0
- triton/experimental/gluon/_runtime.py +102 -0
- triton/experimental/gluon/language/__init__.py +119 -0
- triton/experimental/gluon/language/_core.py +490 -0
- triton/experimental/gluon/language/_layouts.py +583 -0
- triton/experimental/gluon/language/_math.py +20 -0
- triton/experimental/gluon/language/_semantic.py +380 -0
- triton/experimental/gluon/language/_standard.py +80 -0
- triton/experimental/gluon/language/amd/__init__.py +4 -0
- triton/experimental/gluon/language/amd/_layouts.py +96 -0
- triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
- triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
- triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
- triton/experimental/gluon/language/extra/__init__.py +3 -0
- triton/experimental/gluon/language/nvidia/__init__.py +4 -0
- triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
- triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
- triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
- triton/experimental/gluon/language/nvidia/blackwell/__init__.py +387 -0
- triton/experimental/gluon/language/nvidia/blackwell/tma.py +52 -0
- triton/experimental/gluon/language/nvidia/hopper/__init__.py +132 -0
- triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +34 -0
- triton/experimental/gluon/language/nvidia/hopper/tma.py +97 -0
- triton/experimental/gluon/nvidia/__init__.py +4 -0
- triton/experimental/gluon/nvidia/blackwell.py +3 -0
- triton/experimental/gluon/nvidia/hopper.py +45 -0
- triton/knobs.py +546 -0
- triton/language/__init__.py +50 -19
- triton/language/core.py +909 -572
- triton/language/extra/cuda/__init__.py +10 -7
- triton/language/extra/cuda/gdc.py +42 -0
- triton/language/extra/cuda/libdevice.py +394 -394
- triton/language/extra/cuda/utils.py +21 -21
- triton/language/extra/hip/__init__.py +3 -1
- triton/language/extra/hip/libdevice.py +120 -104
- triton/language/extra/hip/utils.py +35 -0
- triton/language/extra/libdevice.py +4 -0
- triton/language/math.py +65 -66
- triton/language/random.py +12 -2
- triton/language/semantic.py +1757 -1768
- triton/language/standard.py +127 -62
- triton/language/target_info.py +54 -0
- triton/runtime/_allocation.py +15 -3
- triton/runtime/_async_compile.py +55 -0
- triton/runtime/autotuner.py +117 -60
- triton/runtime/build.py +83 -17
- triton/runtime/cache.py +61 -47
- triton/runtime/driver.py +25 -47
- triton/runtime/interpreter.py +95 -50
- triton/runtime/jit.py +445 -248
- triton/runtime/tcc/include/_mingw.h +8 -10
- triton/runtime/tcc/include/assert.h +5 -0
- triton/runtime/tcc/include/errno.h +1 -1
- triton/runtime/tcc/include/float.h +21 -3
- triton/runtime/tcc/include/iso646.h +36 -0
- triton/runtime/tcc/include/limits.h +5 -0
- triton/runtime/tcc/include/malloc.h +2 -2
- triton/runtime/tcc/include/math.h +21 -261
- triton/runtime/tcc/include/stdalign.h +16 -0
- triton/runtime/tcc/include/stdarg.h +5 -70
- triton/runtime/tcc/include/stdatomic.h +171 -0
- triton/runtime/tcc/include/stddef.h +7 -19
- triton/runtime/tcc/include/stdlib.h +15 -4
- triton/runtime/tcc/include/stdnoreturn.h +7 -0
- triton/runtime/tcc/include/sys/stat.h +2 -2
- triton/runtime/tcc/include/sys/types.h +5 -0
- triton/runtime/tcc/include/tcc/tcc_libm.h +444 -27
- triton/runtime/tcc/include/tccdefs.h +342 -0
- triton/runtime/tcc/include/tgmath.h +89 -0
- triton/runtime/tcc/include/uchar.h +33 -0
- triton/runtime/tcc/include/unistd.h +1 -0
- triton/runtime/tcc/include/winapi/qos.h +72 -0
- triton/runtime/tcc/include/winapi/shellapi.h +59 -0
- triton/runtime/tcc/include/winapi/winbase.h +9 -2
- triton/runtime/tcc/include/winapi/wincon.h +8 -0
- triton/runtime/tcc/include/winapi/windows.h +1 -1
- triton/runtime/tcc/include/winapi/winnls.h +778 -0
- triton/runtime/tcc/include/winapi/winnt.h +9 -7
- triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
- triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
- triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
- triton/runtime/tcc/lib/libtcc1.a +0 -0
- triton/runtime/tcc/lib/python314.def +1800 -0
- triton/runtime/tcc/lib/python314t.def +1809 -0
- triton/runtime/tcc/libtcc.dll +0 -0
- triton/runtime/tcc/tcc.exe +0 -0
- triton/testing.py +16 -12
- triton/tools/compile.py +62 -14
- triton/tools/disasm.py +3 -4
- triton/tools/extra/cuda/compile.c +1 -0
- triton/tools/extra/hip/compile.cpp +66 -0
- triton/tools/extra/hip/compile.h +13 -0
- triton/tools/ragged_tma.py +92 -0
- triton/tools/tensor_descriptor.py +34 -0
- triton/windows_utils.py +52 -81
- {triton_windows-3.3.1.post19.dist-info → triton_windows-3.5.0.post21.dist-info}/METADATA +8 -4
- triton_windows-3.5.0.post21.dist-info/RECORD +217 -0
- triton_windows-3.5.0.post21.dist-info/entry_points.txt +3 -0
- triton_windows-3.5.0.post21.dist-info/licenses/LICENSE +23 -0
- triton_windows-3.5.0.post21.dist-info/top_level.txt +1 -0
- triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h +0 -358
- triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +0 -1010
- triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +0 -1638
- triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +0 -1814
- triton/backends/amd/include/hip/amd_detail/amd_hip_bfloat16.h +0 -293
- triton/backends/amd/include/hip/amd_detail/amd_hip_common.h +0 -32
- triton/backends/amd/include/hip/amd_detail/amd_hip_complex.h +0 -174
- triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +0 -835
- triton/backends/amd/include/hip/amd_detail/amd_hip_fp16.h +0 -1809
- triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +0 -1391
- triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +0 -108
- triton/backends/amd/include/hip/amd_detail/amd_hip_math_constants.h +0 -124
- triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h +0 -405
- triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +0 -196
- triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +0 -565
- triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h +0 -2226
- triton/backends/amd/include/hip/amd_detail/amd_math_functions.h +0 -104
- triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h +0 -244
- triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +0 -538
- triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +0 -288
- triton/backends/amd/include/hip/amd_detail/concepts.hpp +0 -30
- triton/backends/amd/include/hip/amd_detail/device_library_decls.h +0 -133
- triton/backends/amd/include/hip/amd_detail/functional_grid_launch.hpp +0 -218
- triton/backends/amd/include/hip/amd_detail/grid_launch.h +0 -67
- triton/backends/amd/include/hip/amd_detail/grid_launch.hpp +0 -50
- triton/backends/amd/include/hip/amd_detail/grid_launch_GGL.hpp +0 -26
- triton/backends/amd/include/hip/amd_detail/helpers.hpp +0 -137
- triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +0 -1446
- triton/backends/amd/include/hip/amd_detail/hip_assert.h +0 -101
- triton/backends/amd/include/hip/amd_detail/hip_cooperative_groups_helper.h +0 -242
- triton/backends/amd/include/hip/amd_detail/hip_fp16_gcc.h +0 -254
- triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h +0 -96
- triton/backends/amd/include/hip/amd_detail/hip_ldg.h +0 -100
- triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +0 -10570
- triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +0 -78
- triton/backends/amd/include/hip/amd_detail/host_defines.h +0 -184
- triton/backends/amd/include/hip/amd_detail/hsa_helpers.hpp +0 -102
- triton/backends/amd/include/hip/amd_detail/macro_based_grid_launch.hpp +0 -798
- triton/backends/amd/include/hip/amd_detail/math_fwd.h +0 -698
- triton/backends/amd/include/hip/amd_detail/ockl_image.h +0 -177
- triton/backends/amd/include/hip/amd_detail/program_state.hpp +0 -107
- triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h +0 -491
- triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h +0 -478
- triton/backends/amd/include/hip/channel_descriptor.h +0 -39
- triton/backends/amd/include/hip/device_functions.h +0 -38
- triton/backends/amd/include/hip/driver_types.h +0 -468
- triton/backends/amd/include/hip/hip_bf16.h +0 -36
- triton/backends/amd/include/hip/hip_bfloat16.h +0 -44
- triton/backends/amd/include/hip/hip_common.h +0 -100
- triton/backends/amd/include/hip/hip_complex.h +0 -38
- triton/backends/amd/include/hip/hip_cooperative_groups.h +0 -46
- triton/backends/amd/include/hip/hip_deprecated.h +0 -95
- triton/backends/amd/include/hip/hip_ext.h +0 -161
- triton/backends/amd/include/hip/hip_fp16.h +0 -36
- triton/backends/amd/include/hip/hip_fp8.h +0 -33
- triton/backends/amd/include/hip/hip_gl_interop.h +0 -32
- triton/backends/amd/include/hip/hip_hcc.h +0 -24
- triton/backends/amd/include/hip/hip_math_constants.h +0 -36
- triton/backends/amd/include/hip/hip_profile.h +0 -27
- triton/backends/amd/include/hip/hip_runtime.h +0 -75
- triton/backends/amd/include/hip/hip_runtime_api.h +0 -9261
- triton/backends/amd/include/hip/hip_texture_types.h +0 -29
- triton/backends/amd/include/hip/hip_vector_types.h +0 -41
- triton/backends/amd/include/hip/hip_version.h +0 -17
- triton/backends/amd/include/hip/hiprtc.h +0 -421
- triton/backends/amd/include/hip/library_types.h +0 -78
- triton/backends/amd/include/hip/math_functions.h +0 -42
- triton/backends/amd/include/hip/surface_types.h +0 -63
- triton/backends/amd/include/hip/texture_types.h +0 -194
- triton/backends/amd/include/hsa/Brig.h +0 -1131
- triton/backends/amd/include/hsa/amd_hsa_common.h +0 -91
- triton/backends/amd/include/hsa/amd_hsa_elf.h +0 -462
- triton/backends/amd/include/hsa/amd_hsa_kernel_code.h +0 -269
- triton/backends/amd/include/hsa/amd_hsa_queue.h +0 -109
- triton/backends/amd/include/hsa/amd_hsa_signal.h +0 -80
- triton/backends/amd/include/hsa/hsa.h +0 -5738
- triton/backends/amd/include/hsa/hsa_amd_tool.h +0 -91
- triton/backends/amd/include/hsa/hsa_api_trace.h +0 -579
- triton/backends/amd/include/hsa/hsa_api_trace_version.h +0 -68
- triton/backends/amd/include/hsa/hsa_ext_amd.h +0 -3146
- triton/backends/amd/include/hsa/hsa_ext_finalize.h +0 -531
- triton/backends/amd/include/hsa/hsa_ext_image.h +0 -1454
- triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +0 -488
- triton/backends/amd/include/hsa/hsa_ven_amd_loader.h +0 -667
- triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +0 -416
- triton/backends/amd/include/roctracer/ext/prof_protocol.h +0 -107
- triton/backends/amd/include/roctracer/hip_ostream_ops.h +0 -4515
- triton/backends/amd/include/roctracer/hsa_ostream_ops.h +0 -1727
- triton/backends/amd/include/roctracer/hsa_prof_str.h +0 -3059
- triton/backends/amd/include/roctracer/roctracer.h +0 -779
- triton/backends/amd/include/roctracer/roctracer_ext.h +0 -81
- triton/backends/amd/include/roctracer/roctracer_hcc.h +0 -24
- triton/backends/amd/include/roctracer/roctracer_hip.h +0 -37
- triton/backends/amd/include/roctracer/roctracer_hsa.h +0 -112
- triton/backends/amd/include/roctracer/roctracer_plugin.h +0 -137
- triton/backends/amd/include/roctracer/roctracer_roctx.h +0 -67
- triton/backends/amd/include/roctracer/roctx.h +0 -229
- triton/language/_utils.py +0 -21
- triton/language/extra/cuda/_experimental_tma.py +0 -106
- triton/runtime/tcc/lib/libtcc1-64.a +0 -0
- triton/tools/experimental_descriptor.py +0 -32
- triton_windows-3.3.1.post19.dist-info/RECORD +0 -260
- triton_windows-3.3.1.post19.dist-info/top_level.txt +0 -14
- {triton_windows-3.3.1.post19.dist-info → triton_windows-3.5.0.post21.dist-info}/WHEEL +0 -0
triton/language/standard.py
CHANGED
@@ -1,24 +1,25 @@
 from __future__ import annotations
 
-from ..runtime.jit import jit
+from ..runtime.jit import jit, constexpr_function
 from . import core
 from . import math
 
 # constexpr utilities
 
 
-
+@constexpr_function
+def _log2(i):
     log2 = 0
-    n = i
+    n = i
     while n > 1:
         n >>= 1
         log2 += 1
-    return
+    return log2
 
 
-
-
-    return
+@constexpr_function
+def _is_power_of_two(i):
+    return (i & (i - 1)) == 0 and i != 0
 
 
 # -----------------------
@@ -50,10 +51,14 @@ def sigmoid(x):
 @core._tensor_member_fn
 @jit
 @math._add_math_1arg_docstr("softmax")
-def softmax(x, ieee_rounding=False):
-
+def softmax(x, dim=None, keep_dims=False, ieee_rounding=False):
+    if dim is None:
+        _dim: core.constexpr = 0
+    else:
+        _dim: core.constexpr = dim
+    z = x - max(x, _dim, keep_dims=keep_dims)
     num = math.exp(z)
-    den = sum(num,
+    den = sum(num, _dim, keep_dims=keep_dims)
     return math.fdiv(num, den, ieee_rounding)
 
 
@@ -259,8 +264,8 @@ def _sum_combine(a, b):
 # sum
 
 
-
-
+@constexpr_function
+def _pick_sum_dtype(in_dtype, dtype):
     if dtype is not None:
         return dtype
 
@@ -302,15 +307,37 @@ def xor_sum(input, axis=None, keep_dims=False):
     return core.reduce(input, axis, _xor_combine, keep_dims=keep_dims)
 
 
+# or reduction
+
+
+@jit
+def _or_combine(x, y):
+    return x | y
+
+
+@core._tensor_member_fn
+@jit
+@core._add_reduction_docstr("reduce_or")
+def reduce_or(input, axis, keep_dims=False):
+    core.static_assert(input.type.scalar.is_int(), "reduce_or only supported for integers")
+    return core.reduce(input, axis, _or_combine, keep_dims=keep_dims)
+
+
 # cumsum
 
 
 @core._tensor_member_fn
 @jit
-@core._add_scan_docstr("cumsum")
-def cumsum(input, axis=0, reverse=False):
+@core._add_scan_docstr("cumsum", dtype_arg="dtype")
+def cumsum(input, axis=0, reverse=False, dtype: core.constexpr = None):
     # todo rename this to a generic function name
+
     input = core._promote_bfloat16_to_float32(input)
+    out_dtype: core.constexpr = _pick_sum_dtype(input.dtype, dtype)
+
+    if out_dtype is not None:
+        input = input.to(out_dtype)
+
     return core.associative_scan(input, axis, _sum_combine, reverse)
 
 
@@ -335,53 +362,63 @@ def cumprod(input, axis=0, reverse=False):
 
 
 @jit
-def
-
-
-
-
-
-
-
-
-
-
+def _indicator(n_dims: core.constexpr, j: core.constexpr):
+    ar = core.arange(0, 2)
+    ar = core.reshape(ar, [1] * (n_dims - j - 1) + [2] + [1] * j)
+    return ar
+
+
+@jit
+def _compare_and_swap(x, flip, i: core.constexpr):
+    # compare-and-swap on the ith *innermost* dimension
+    n_dims: core.constexpr = _log2(x.numel)
+
+    # flip along middle dimension (the bitwise XORs will be optimised away):
     idtype = core.get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True)
-    ileft = left.to(idtype, bitcast=True)
-    iright = right.to(idtype, bitcast=True)
     ix = x.to(idtype, bitcast=True)
-
-
+    iy = ix ^ xor_sum(ix, n_dims - 1 - i, True)
+    y = iy.to(x.dtype, bitcast=True)
+
+    # determines whether we are in the right (rather than left) position along the axis:
+    is_right = _indicator(n_dims, i)
+
+    # conditional swap:
+    ret = core.where((x > y) != (flip ^ is_right), y, x)
+    return ret
 
 
 @jit
-def
+def _bitonic_merge_hypercube(x, stage: core.constexpr, order: core.constexpr):
     '''
     order_type 0 == ascending
     order_type 1 == descending
     order_type 2 == alternating
     '''
-    n_outer: core.constexpr = x.numel >> n_dims
-    core.static_assert(stage <= n_dims)
     # flip denotes whether to re-arrange sub-sequences of elements in ascending or
     # descending order.
     # if flip = 00000000... then all elements will be re-arranged ascendingly at this stage
     # if flip = 00110011... then all the elements will be re-arranged alternatingly (with
     # a stride of 2) at this stage
     if order == 2:
-
-        flip = core.reshape(core.broadcast_to(core.arange(0, 2)[None, :, None], shape), x.shape)
+        flip = _indicator(_log2(x.numel), stage)
     else:
         flip = order
     # perform `stage` rounds of `compare-and-swap`
     for i in core.static_range(stage):
-        x = _compare_and_swap(x, flip,
+        x = _compare_and_swap(x, flip, stage - 1 - i)
     return x
 
 
-@core._tensor_member_fn
 @jit
-def
+def _bitonic_merge(x, stage: core.constexpr, order: core.constexpr, n_dims: core.constexpr):
+    h = core.reshape(x, [2] * _log2(x.numel))
+    h = _bitonic_merge_hypercube(h, stage, order)
+    x = core.reshape(h, x.shape)
+    return x
+
+
+@jit
+def sort_impl(x, k: core.constexpr = None, dim: core.constexpr = None, descending: core.constexpr = core.CONSTEXPR_0):
     """
     Sorts a tensor along a specified dimension.
 
@@ -389,29 +426,64 @@ def sort(x, dim: core.constexpr = None, descending: core.constexpr = core.CONSTE
     :type x: Tensor
     :param dim: The dimension along which to sort the tensor. If None, the tensor is sorted along the last dimension. Currently, only sorting along the last dimension is supported.
     :type dim: int, optional
+    :param k: the number of top elements to select. If none, assume k = x.shape[dim]
+    :type k: int, optional
    :param descending: If set to True, the tensor is sorted in descending order. If set to False, the tensor is sorted in ascending order.
     :type descending: bool, optional
     """
     # handle default dimension or check that it is the most minor dim
     _dim: core.constexpr = len(x.shape) - 1 if dim is None else dim
     core.static_assert(_dim == len(x.shape) - 1, "only minor dimension is currently supported")
-
-
-
-
+
+    log_n: core.constexpr = _log2(x.shape[_dim])
+    log_k: core.constexpr = log_n if k is None else _log2(k)
+
+    n_dims: core.constexpr = _log2(x.numel)
+
+    # reshape to hypercube:
+    h = core.reshape(x, [2] * n_dims)
+
+    # run first log_k bitonic sort iterations:
+    for i in core.static_range(1, log_k + 1):
+        h = _bitonic_merge_hypercube(h, i, 2 if i < log_n else descending)
+
+    # select top k elements using bitonic top-k
+    # https://www.doc.ic.ac.uk/~hlgr/pdfs/MassivelyParallelTopK.pdf
+    for i in core.static_range(log_k + 1, log_n + 1):
+        h = max(h, axis=(_log2(h.numel) - 1 - log_k)) if descending else min(h, axis=(_log2(h.numel) - 1 - log_k))
+        h = _bitonic_merge_hypercube(h, log_k, 2 if i < log_n else descending)
+
+    # reshape back:
+    x = core.reshape(h, x.shape[:-1] + [2**log_k])
     return x
 
 
-
+@jit
+def sort(x, dim: core.constexpr = None, descending: core.constexpr = core.CONSTEXPR_0):
+    return sort_impl(x, dim=dim, descending=descending)
+
+
+@jit
+def topk(x, k: core.constexpr, dim: core.constexpr = None):
+    return sort_impl(x, k=k, dim=dim, descending=True)
+
+
+@jit
+def bitonic_merge(x, dim: core.constexpr = None, descending: core.constexpr = core.CONSTEXPR_0):
+    # handle default dimension or check that it is the most minor dim
+    _dim: core.constexpr = len(x.shape) - 1 if dim is None else dim
+    core.static_assert(_dim == len(x.shape) - 1, "only minor dimension is currently supported")
+    n_dims: core.constexpr = _log2(x.shape[-1])
+    return _bitonic_merge(x, n_dims, descending, n_dims)
 
 
+@constexpr_function
 def _get_flip_dim(dim, shape):
-    dim = core._unwrap_if_constexpr(dim)
-    shape = core._unwrap_if_constexpr(shape)
     if dim is None:
         dim = len(shape) - 1
-
-
+    if dim < 0:  # flip doesn't work if dim < 0 because the xor-swap for loop will start/end at the wrong index
+        dim += len(shape)
+    return dim
 
 
 @core._tensor_member_fn
@@ -422,26 +494,19 @@ def flip(x, dim=None):
 
     :param x: the first input tensor
     :type x: Block
-    :param dim: the dimension to flip along
+    :param dim: the dimension to flip along
     :type dim: int
     """
-    core.static_assert(
-    core.
-
-
-    steps: core.constexpr = _log2(x.numel)
-    start: core.constexpr = _log2(x.numel) - _log2(x.shape[_get_flip_dim(dim, x.shape)])
+    core.static_assert(-len(x.shape) <= dim and dim < len(x.shape))
+    _dim: core.constexpr = _get_flip_dim(dim, x.shape)
+    core.static_assert(_is_power_of_two(x.shape[_dim]))
+    steps: core.constexpr = _log2(x.shape[_dim])
 
+    # reshape the swap dimension to (2, 2, ..., 2)
     idtype = core.get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True)
-    y = core.reshape(x.to(idtype, bitcast=True), [2] * steps)
-
-
-    for i in core.static_range(start, steps):
-        flip2 = flip
-        for j in core.static_range(0, steps + 1):
-            if j != i and j != i + 1:
-                flip2 = core.expand_dims(flip2, j)
-        y = sum(y * flip2, i + 1, keep_dims=True, dtype=y.dtype)
+    y = core.reshape(x.to(idtype, bitcast=True), x.shape[:_dim] + [2] * steps + x.shape[_dim + 1:])
+    for i in core.static_range(steps):
+        y = y ^ xor_sum(y, _dim + i, True)
     x = core.reshape(y, x.shape).to(x.dtype, bitcast=True)
     return x
 
triton/language/target_info.py
ADDED
@@ -0,0 +1,54 @@
+from triton.runtime import driver
+from triton.runtime.jit import constexpr_function
+
+__all__ = ["current_target"]
+
+
+def current_target():
+    try:
+        active_driver = driver.active
+    except RuntimeError:
+        # If there is no active driver, return None
+        return None
+    return active_driver.get_current_target()
+
+
+current_target.__triton_builtin__ = True
+
+
+@constexpr_function
+def is_cuda():
+    target = current_target()
+    return target is not None and target.backend == "cuda"
+
+
+@constexpr_function
+def cuda_capability_geq(major, minor=0):
+    """
+    Determines whether we have compute capability >= (major, minor) and
+    returns this as a constexpr boolean. This can be used for guarding
+    inline asm implementations that require a certain compute capability.
+    """
+    target = current_target()
+    if target is None or target.backend != "cuda":
+        return False
+    assert isinstance(target.arch, int)
+    return target.arch >= major * 10 + minor
+
+
+@constexpr_function
+def is_hip():
+    target = current_target()
+    return target is not None and target.backend == "hip"
+
+
+@constexpr_function
+def is_hip_cdna3():
+    target = current_target()
+    return target is not None and target.arch == "gfx942"
+
+
+@constexpr_function
+def is_hip_cdna4():
+    target = current_target()
+    return target is not None and target.arch == "gfx950"
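The target_info helpers above are constexpr_functions, so they fold to compile-time constants inside a @triton.jit kernel. A minimal sketch of guarding an architecture-specific path with them; the kernel and its element-wise body are illustrative, not taken from the diff:

import triton
import triton.language as tl
from triton.language import target_info


@triton.jit
def scale_kernel(x_ptr, y_ptr, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    x = tl.load(x_ptr + offs)
    # cuda_capability_geq() evaluates at compile time, so this branch is
    # folded away when compiling for pre-Hopper targets
    if target_info.cuda_capability_geq(9, 0):
        x = x * 2
    tl.store(y_ptr + offs, x)

The sort/topk additions in triton/language/standard.py above can be exercised along the following lines; this assumes the helpers are re-exported as tl.sort and tl.topk in 3.5.0, and the row length N and selection size K (both powers of two) are illustrative:

@triton.jit
def topk_rows_kernel(x_ptr, out_ptr, N: tl.constexpr, K: tl.constexpr):
    # one program per row; N and K must be powers of two for the bitonic network
    row = tl.program_id(0)
    x = tl.load(x_ptr + row * N + tl.arange(0, N))
    top = tl.topk(x, K)  # the K largest values of the row
    tl.store(out_ptr + row * K + tl.arange(0, K), top)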
triton/runtime/_allocation.py
CHANGED
@@ -1,4 +1,5 @@
 from typing import Optional, Protocol
+from contextvars import ContextVar
 
 
 class Buffer(Protocol):
@@ -20,7 +21,7 @@ class NullAllocator:
 "Use triton.set_allocator to specify an allocator.")
 
 
-_allocator: Allocator = NullAllocator()
+_allocator: ContextVar[Allocator] = ContextVar("_allocator", default=NullAllocator())
 
 
 def set_allocator(allocator: Allocator):
@@ -28,5 +29,16 @@ def set_allocator(allocator: Allocator):
     The allocator function is called during kernel launch for kernels that
     require additional global memory workspace.
     """
-
-
+    _allocator.set(allocator)
+
+
+_profile_allocator: Allocator = ContextVar("_allocator", default=NullAllocator())
+
+
+def set_profile_allocator(allocator: Optional[Allocator]):
+    """
+    The profile allocator function is called before kernel launch for kernels
+    that require additional global memory workspace.
+    """
+    global _profile_allocator
+    _profile_allocator.set(allocator)
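With the allocator now held in a ContextVar, registration still goes through the public triton.set_allocator entry point. A minimal sketch using a PyTorch tensor as the workspace buffer; the (size, alignment, stream) signature follows the Allocator protocol defined earlier in this file (not shown in the hunks), torch tensors expose data_ptr() and so satisfy the Buffer protocol, and the CUDA device is an assumption:

import torch
import triton


def workspace_alloc(size: int, alignment: int, stream):
    # the returned object only needs to expose data_ptr() (Buffer protocol)
    return torch.empty(size, dtype=torch.int8, device="cuda")


triton.set_allocator(workspace_alloc)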
triton/runtime/_async_compile.py
ADDED
@@ -0,0 +1,55 @@
+from __future__ import annotations
+from typing import Callable, Optional
+from concurrent.futures import Executor, as_completed, Future
+from contextvars import ContextVar
+
+active_mode: ContextVar[Optional[AsyncCompileMode]] = ContextVar("async_compile_active_mode", default=None)
+
+
+class FutureKernel:
+
+    def __init__(self, finalize_compile: Callable, future: Future):
+        self.finalize_compile = finalize_compile
+        self.kernel = None
+        self.future = future
+
+    def result(self):
+        if self.kernel is not None:
+            return self.kernel
+
+        kernel = self.future.result()
+        self.finalize_compile(kernel)
+        self.kernel = kernel
+        return kernel
+
+
+class AsyncCompileMode:
+
+    def __init__(self, executor: Executor):
+        self.executor = executor
+        self.raw_futures = []
+        self.future_kernels = {}
+
+    def submit(self, key, compile_fn, finalize_fn):
+        future = self.future_kernels.get(key)
+        if future is not None:
+            return future
+
+        future = self.executor.submit(compile_fn)
+        future._key = key
+        self.raw_futures.append(future)
+        future_kernel = FutureKernel(finalize_fn, future)
+        self.future_kernels[key] = future_kernel
+        return future_kernel
+
+    def __enter__(self):
+        if active_mode.get() is not None:
+            raise RuntimeError("Another AsyncCompileMode is already active")
+        active_mode.set(self)
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        # Finalize any outstanding compiles
+        for future in as_completed(self.raw_futures):
+            self.future_kernels[future._key].result()
+        active_mode.set(None)
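AsyncCompileMode records itself in a context variable and hands compile jobs to the supplied Executor, finalizing any outstanding futures on exit. A hedged sketch of driving it from a thread pool; the module is private, and whether kernel launches inside the block actually defer their compilation depends on the runtime/jit.py changes listed above rather than on this file alone:

from concurrent.futures import ThreadPoolExecutor

from triton.runtime._async_compile import AsyncCompileMode

with ThreadPoolExecutor(max_workers=4) as pool:
    with AsyncCompileMode(pool):
        # kernels launched here may have their compilation submitted to `pool`;
        # __exit__ waits for every outstanding compile and finalizes it
        ...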