triton-windows 3.4.0.post20__cp312-cp312-win_amd64.whl → 3.5.0.post21__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of triton-windows might be problematic.

Files changed (107)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +8 -2
  3. triton/_filecheck.py +24 -14
  4. triton/_internal_testing.py +70 -4
  5. triton/_utils.py +3 -1
  6. triton/backends/amd/compiler.py +68 -60
  7. triton/backends/amd/driver.c +113 -44
  8. triton/backends/amd/driver.py +133 -57
  9. triton/backends/driver.py +13 -0
  10. triton/backends/nvidia/compiler.py +80 -22
  11. triton/backends/nvidia/driver.c +88 -15
  12. triton/backends/nvidia/driver.py +130 -123
  13. triton/compiler/__init__.py +5 -2
  14. triton/compiler/code_generator.py +270 -163
  15. triton/compiler/compiler.py +45 -62
  16. triton/experimental/gluon/__init__.py +3 -2
  17. triton/experimental/gluon/_runtime.py +9 -6
  18. triton/experimental/gluon/language/__init__.py +117 -16
  19. triton/experimental/gluon/language/_core.py +246 -68
  20. triton/experimental/gluon/language/_layouts.py +398 -45
  21. triton/experimental/gluon/language/_math.py +17 -9
  22. triton/experimental/gluon/language/_semantic.py +130 -37
  23. triton/experimental/gluon/language/_standard.py +55 -22
  24. triton/experimental/gluon/language/amd/__init__.py +4 -0
  25. triton/experimental/gluon/language/amd/_layouts.py +96 -0
  26. triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
  27. triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
  28. triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
  29. triton/experimental/gluon/language/extra/__init__.py +3 -0
  30. triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
  31. triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
  32. triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
  33. triton/experimental/gluon/language/nvidia/blackwell/__init__.py +192 -7
  34. triton/experimental/gluon/language/nvidia/blackwell/tma.py +20 -0
  35. triton/experimental/gluon/language/nvidia/hopper/__init__.py +124 -3
  36. triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +20 -37
  37. triton/experimental/gluon/language/nvidia/hopper/tma.py +4 -3
  38. triton/experimental/gluon/nvidia/hopper.py +6 -1
  39. triton/knobs.py +132 -67
  40. triton/language/__init__.py +16 -10
  41. triton/language/core.py +163 -83
  42. triton/language/extra/cuda/gdc.py +6 -6
  43. triton/language/extra/hip/__init__.py +3 -1
  44. triton/language/extra/hip/libdevice.py +7 -0
  45. triton/language/extra/hip/utils.py +35 -0
  46. triton/language/extra/libdevice.py +4 -0
  47. triton/language/semantic.py +76 -23
  48. triton/language/standard.py +14 -14
  49. triton/language/target_info.py +54 -0
  50. triton/runtime/_allocation.py +15 -3
  51. triton/runtime/_async_compile.py +55 -0
  52. triton/runtime/autotuner.py +4 -5
  53. triton/runtime/build.py +11 -9
  54. triton/runtime/cache.py +44 -1
  55. triton/runtime/driver.py +16 -41
  56. triton/runtime/interpreter.py +31 -23
  57. triton/runtime/jit.py +318 -157
  58. triton/runtime/tcc/include/_mingw.h +8 -10
  59. triton/runtime/tcc/include/assert.h +5 -0
  60. triton/runtime/tcc/include/errno.h +1 -1
  61. triton/runtime/tcc/include/float.h +21 -3
  62. triton/runtime/tcc/include/iso646.h +36 -0
  63. triton/runtime/tcc/include/limits.h +5 -0
  64. triton/runtime/tcc/include/malloc.h +2 -2
  65. triton/runtime/tcc/include/math.h +21 -261
  66. triton/runtime/tcc/include/stdalign.h +16 -0
  67. triton/runtime/tcc/include/stdarg.h +5 -70
  68. triton/runtime/tcc/include/stdatomic.h +171 -0
  69. triton/runtime/tcc/include/stddef.h +7 -19
  70. triton/runtime/tcc/include/stdlib.h +15 -4
  71. triton/runtime/tcc/include/stdnoreturn.h +7 -0
  72. triton/runtime/tcc/include/sys/stat.h +2 -2
  73. triton/runtime/tcc/include/sys/types.h +5 -0
  74. triton/runtime/tcc/include/tcc/tcc_libm.h +444 -27
  75. triton/runtime/tcc/include/tccdefs.h +342 -0
  76. triton/runtime/tcc/include/tgmath.h +89 -0
  77. triton/runtime/tcc/include/uchar.h +33 -0
  78. triton/runtime/tcc/include/unistd.h +1 -0
  79. triton/runtime/tcc/include/winapi/qos.h +72 -0
  80. triton/runtime/tcc/include/winapi/shellapi.h +59 -0
  81. triton/runtime/tcc/include/winapi/winbase.h +9 -2
  82. triton/runtime/tcc/include/winapi/wincon.h +8 -0
  83. triton/runtime/tcc/include/winapi/windows.h +1 -1
  84. triton/runtime/tcc/include/winapi/winnls.h +778 -0
  85. triton/runtime/tcc/include/winapi/winnt.h +9 -7
  86. triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
  87. triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
  88. triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
  89. triton/runtime/tcc/lib/libtcc1.a +0 -0
  90. triton/runtime/tcc/lib/python314.def +1800 -0
  91. triton/runtime/tcc/lib/python314t.def +1809 -0
  92. triton/runtime/tcc/libtcc.dll +0 -0
  93. triton/runtime/tcc/tcc.exe +0 -0
  94. triton/tools/compile.py +62 -14
  95. triton/tools/extra/cuda/compile.c +1 -0
  96. triton/tools/extra/hip/compile.cpp +66 -0
  97. triton/tools/extra/hip/compile.h +13 -0
  98. triton/tools/ragged_tma.py +92 -0
  99. triton/tools/tensor_descriptor.py +7 -9
  100. triton/windows_utils.py +42 -79
  101. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/METADATA +3 -4
  102. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/RECORD +106 -75
  103. triton/runtime/tcc/lib/libtcc1-64.a +0 -0
  104. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/WHEEL +0 -0
  105. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/entry_points.txt +0 -0
  106. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/licenses/LICENSE +0 -0
  107. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/top_level.txt +0 -0
@@ -466,6 +466,10 @@ def fast_expf(arg0):
     ...
 
 
+def fast_tanhf(arg0):
+    ...
+
+
 def fast_tanf(arg0):
     ...
 
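Note: the hunk above adds a fast_tanhf stub next to the existing fast-math wrappers. A minimal usage sketch, assuming an AMD/HIP build where triton.language.extra.hip.libdevice resolves to these wrappers (the kernel and its parameters are illustrative, not taken from the package):

    import triton
    import triton.language as tl
    from triton.language.extra.hip import libdevice

    @triton.jit
    def tanh_kernel(x_ptr, y_ptr, n, BLOCK: tl.constexpr):
        offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
        mask = offs < n
        x = tl.load(x_ptr + offs, mask=mask)
        # fast_tanhf lowers to the device's fast tanh approximation
        tl.store(y_ptr + offs, libdevice.fast_tanhf(x), mask=mask)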
@@ -219,7 +219,7 @@ class TritonSemantic(Generic[TensorTy]):
         min_value = self.scalar_constant(min_value, tl.int64)
         cond = self.and_(self.less_equal(ret, max_value), self.greater_equal(ret, min_value))
         msg = f"int{lhs_sca_ty.int_bitwidth} overflow detected for operation {binary_op.__name__}"
-        self.device_assert(cond, msg)
+        self.device_assert(cond, msg, None)
 
     def add(self, input: TensorTy | numbers.Number, other: TensorTy | numbers.Number,
             sanitize_overflow: bool) -> TensorTy:
@@ -619,6 +619,9 @@ class TritonSemantic(Generic[TensorTy]):
         ret_ty = tl.block_type(value.dtype, shape)
         return self.tensor(self.builder.create_splat(ret_ty.to_ir(self.builder), value.handle), ret_ty)
 
+    def unsplat(self, value: TensorTy) -> TensorTy:
+        return self.tensor(self.builder.create_unsplat(value.handle), value.dtype)
+
     def reshape(self, input: TensorTy, dst_shape: List[int], can_reorder: bool) -> TensorTy:
         numel = 1
         for s in dst_shape:
@@ -1034,9 +1037,9 @@ class TritonSemantic(Generic[TensorTy]):
         # Make `mask` and `other` into the same shape as `ptr`
         if ptr.type.is_block():
             if mask is not None:
-                mask = self.broadcast_impl_shape(mask, ptr.type.get_block_shapes())
+                ptr, mask = self.broadcast_impl_value(ptr, mask)
             if other is not None:
-                other = self.broadcast_impl_shape(other, ptr.type.get_block_shapes())
+                ptr, other = self.broadcast_impl_value(ptr, other)
 
         # Get `pointer_type<elt_ty>` and `elt_ty`
         ptr_ty = ptr.type.scalar
@@ -1104,6 +1107,8 @@ class TritonSemantic(Generic[TensorTy]):
 
     def descriptor_store(self, desc: tl.tensor_descriptor_base, value: TensorTy, offsets) -> TensorTy:
         self.validate_store_like(desc, value, offsets)
+        # implicitly cast to the descriptor's type
+        value = self.cast(value, desc.dtype)
         offsets = self._convert_to_ir_values(offsets, require_i64=False)
         return self.tensor(self.builder.create_descriptor_store(desc.handle, value.handle, offsets), tl.void)
 
@@ -1472,10 +1477,10 @@ class TritonSemantic(Generic[TensorTy]):
             # All combinations of supported fp8 x fp8 are permitted
             pass
         else:
-            assert lhs.dtype in (tl.int8, tl.uint8, tl.float16, tl.bfloat16,
-                                 tl.float32), f"Unsupported lhs dtype {lhs.dtype}"
-            assert rhs.dtype in (tl.int8, tl.uint8, tl.float16, tl.bfloat16,
-                                 tl.float32), f"Unsupported rhs dtype {rhs.dtype}"
+            assert lhs.dtype in (tl.int8, tl.uint8, tl.float16, tl.bfloat16, tl.float32,
+                                 tl.float64), f"Unsupported lhs dtype {lhs.dtype}"
+            assert rhs.dtype in (tl.int8, tl.uint8, tl.float16, tl.bfloat16, tl.float32,
+                                 tl.float64), f"Unsupported rhs dtype {rhs.dtype}"
             assert lhs.dtype == rhs.dtype, f"Both operands must be same dtype. Got {lhs.dtype} and {rhs.dtype}"
 
         if lhs.dtype.is_fp8e4b15() or rhs.dtype.is_fp8e4b15():
@@ -1487,6 +1492,18 @@ class TritonSemantic(Generic[TensorTy]):
             lhs = self.cast(lhs, tl.float16)
             rhs = self.cast(rhs, tl.float16)
 
+        uses_fp8e4b8 = lhs.dtype.is_fp8e4b8() or rhs.dtype.is_fp8e4b8()
+        uses_fp8e5b16 = lhs.dtype.is_fp8e5b16() or rhs.dtype.is_fp8e5b16()
+        if uses_fp8e4b8 or uses_fp8e5b16:
+            type_name = "fp8e4b8" if uses_fp8e4b8 else "fp8e5b16"
+            if type_name in self.builder.options.deprecated_fp8_dot_operand_dtypes:
+                arch = self.builder.options.arch
+                warnings.warn(
+                    f"{type_name} is AMD gfx942 specific and not supported on {arch} so it's upcasted to fp16 and can cause significant slow down. "
+                    f"Please use OCP fp8 variants on {arch} for performance")
+                lhs = self.cast(lhs, tl.float16)
+                rhs = self.cast(rhs, tl.float16)
+
         if input_precision is None:
             input_precision = self.builder.options.default_dot_input_precision
 
@@ -1514,6 +1531,9 @@ class TritonSemantic(Generic[TensorTy]):
         elif lhs.type.scalar.is_fp32() or lhs.type.scalar.is_bf16():
             _0 = self.builder.get_fp32(0)
             ret_scalar_ty = tl.float32
+        elif lhs.type.scalar.is_fp64():
+            _0 = self.builder.get_fp64(0)
+            ret_scalar_ty = tl.float64
         else:
             _0 = self.builder.get_fp16(0) if out_dtype.is_fp16() else self.builder.get_fp32(0)
             ret_scalar_ty = out_dtype
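Note: the hunks above let tl.dot accept float64 operands and give them a float64 accumulator. A hedged sketch of a kernel that the relaxed asserts now admit (tile addressing is illustrative; BLOCK must still satisfy tl.dot's usual minimum-size constraints):

    import triton
    import triton.language as tl

    @triton.jit
    def dot_fp64(a_ptr, b_ptr, c_ptr, BLOCK: tl.constexpr):
        offs = tl.arange(0, BLOCK)
        a = tl.load(a_ptr + offs[:, None] * BLOCK + offs[None, :])  # float64 tile
        b = tl.load(b_ptr + offs[:, None] * BLOCK + offs[None, :])  # float64 tile
        c = tl.dot(a, b)  # accumulates in float64 via the new is_fp64() branch
        tl.store(c_ptr + offs[:, None] * BLOCK + offs[None, :], c)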
@@ -1527,7 +1547,7 @@ class TritonSemantic(Generic[TensorTy]):
             acc_handle = self.builder.create_splat(ret_ty.to_ir(self.builder), _0)
         else:
             acc_handle = acc.handle
-            assert acc.type == ret_ty
+            assert acc.type.shape == ret_ty.shape and acc.type.element_ty == out_dtype
 
         # max_num_imprecise_acc only applies to fp8 -> fp32 dot on sm_90
         if max_num_imprecise_acc is None:
@@ -1607,7 +1627,7 @@ class TritonSemantic(Generic[TensorTy]):
             acc_handle = self.builder.create_splat(ret_ty.to_ir(self.builder), _0)
         else:
             acc_handle = acc.handle
-            assert acc.type == ret_ty
+            assert acc.type.shape == ret_ty.shape and acc.type.element_ty == out_dtype
         rhs_scale_handle = None if rhs_scale_is_none else rhs_scale.handle
         lhs_scale_handle = None if lhs_scale_is_none else lhs_scale.handle
         return self.tensor(
@@ -1709,6 +1729,36 @@ class TritonSemantic(Generic[TensorTy]):
         gather = self.builder.create_gather(src.handle, index.handle, axis)
         return self.wrap_tensor(gather, src.type.scalar, index.type.shape)
 
+    # ===----------------------------------------------------------------------===
+    # Map Elementwise
+    # ===----------------------------------------------------------------------===
+
+    def broadcast_tensors(self, *inputs):
+        if not inputs:
+            return ()
+        head, *tail = inputs
+        for i in range(len(tail)):
+            head, tail[i] = self.broadcast_impl_value(head, tail[i])
+        for i in range(len(tail)):
+            head, tail[i] = self.broadcast_impl_value(head, tail[i])
+        return (head, *tail)
+
+    def map_elementwise(self, inputs: Sequence[tl.tensor], result_types: Sequence[tl.dtype], pack: int,
+                        region_builder_fn) -> Tuple[tl.tensor, ...]:
+        inputs = self.broadcast_tensors(*inputs)
+
+        assert len(inputs) > 0, "map_elementwise must have at least 1 input tensor"
+        result_types = [inputs[0].type.with_element_ty(ty.scalar) for ty in result_types]
+        elementwise_op = self.builder.create_map_elementwise(
+            [t.handle for t in inputs],
+            [ty.to_ir(self.builder) for ty in result_types],
+            pack,
+        )
+        region_builder_fn(elementwise_op)
+        # assert elementwise_op.verify()
+
+        return tuple(self.tensor(elementwise_op.get_result(i), ty) for i, ty in enumerate(result_types))
+
 
     # ===----------------------------------------------------------------------===
     # Histogram
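Note: broadcast_tensors above brings every input to a common block shape in two passes: the first pass settles the final shape of head, the second re-broadcasts each tail element against it. A rough NumPy analogue, for intuition only (lists of arrays, not the real tensor types):

    import numpy as np

    def broadcast_tensors(*inputs):
        if not inputs:
            return ()
        # one library call that converges to what the two passes above achieve
        return tuple(np.broadcast_arrays(*inputs))

    a, b, c = np.ones((4, 1)), np.ones((1, 8)), np.ones(8)
    print([x.shape for x in broadcast_tensors(a, b, c)])  # [(4, 8), (4, 8), (4, 8)]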
@@ -1760,9 +1810,11 @@ class TritonSemantic(Generic[TensorTy]):
         is_signed = [arg.dtype.is_int_signed() for arg in args]
         return self.tensor(self.builder.create_print(prefix, hex, new_args, is_signed), tl.void)
 
-    def device_assert(self, cond: TensorTy, msg: str) -> TensorTy:
+    def device_assert(self, cond: TensorTy, msg: str, mask: Optional[TensorTy]) -> TensorTy:
         if not self.builder.options.debug:
             return
+        if mask is not None:
+            cond = self.or_(cond, self.not_(mask))
         return self.tensor(self.builder.create_assert(cond.handle, msg), tl.void)
 
     def assume(self, cond) -> TensorTy:
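Note: device_assert now takes an optional mask and exempts masked-off lanes by OR-ing cond with NOT mask, so a load guard can double as an assert guard. A plain-Python sketch of that predication (list stand-ins, not the real tensor types):

    def masked_assert(cond, msg, mask=None):
        if mask is not None:
            # lanes where mask is False always pass the check
            cond = [c or not m for c, m in zip(cond, mask)]
        assert all(cond), msg

    masked_assert([True, False, True], "overflow", mask=[True, False, True])  # passes: lane 1 is masked off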
@@ -1788,7 +1840,7 @@ class TritonSemantic(Generic[TensorTy]):
             if elem.dtype != tl.int64 and require_i64:
                 return self.builder.create_int_cast(elem.handle, self.builder.get_int64_ty(),
                                                     elem.dtype.is_int_signed())
-            elif elem.dtype != tl.int32 and not require_i64:
+            elif elem.dtype == tl.int64 and not require_i64:
                 assert False, "Block pointers only support 32 bit `offsets/block_shape`, " \
                     "add a `.to(tl.int32)` or use regular indexing for 64 bit support"
             return elem.handle
@@ -1844,13 +1896,8 @@ class TritonSemantic(Generic[TensorTy]):
         # Advanced block pointer type is the same as before
         return self.tensor(self.builder.create_advance(base.handle, offsets), base.type)
 
-    def make_tensor_descriptor(
-        self,
-        base: TensorTy,
-        shape: List[TensorTy],
-        strides: List[TensorTy],
-        block_shape: List[tl.constexpr],
-    ) -> tl.tensor_descriptor:
+    def make_tensor_descriptor(self, base: TensorTy, shape: List[TensorTy], strides: List[TensorTy],
+                               block_shape: List[tl.constexpr], padding_option: str = "zero") -> tl.tensor_descriptor:
         ndim = len(shape)
         if not (1 <= ndim <= 5):
             raise ValueError(f"Expected 1 <= ndim <= 5 but got {ndim} dimensions")
@@ -1866,12 +1913,12 @@ class TritonSemantic(Generic[TensorTy]):
                 f"Descriptor block shape must have at least 16 bytes in the last dimension, but got {contig_dim_size} * {elem_size} = {contig_dim_size * elem_size} bytes"
             )
 
-        strides[-1] = tl._unwrap_if_constexpr(strides[-1])
-        if strides[-1] != 1:
-            raise ValueError(f"Tensor descriptor last dim must be 1 but got {strides[-1]}")
+        last_stride = tl._unwrap_if_constexpr(strides[-1])
+        if last_stride != 1:
+            raise ValueError(f"Tensor descriptor last dim must be 1 but got {last_stride}")
 
         shape = [self.make_scalar(x, tl.int32) for x in shape]
-        strides = [self.make_scalar(x, tl.int64) for x in strides]
+        strides = [self.make_scalar(tl._unwrap_if_constexpr(x), tl.int64) for x in strides]
 
         # Check whether `block_shape` is static
         block_shape = tl._unwrap_shape(block_shape)
@@ -1881,6 +1928,12 @@ class TritonSemantic(Generic[TensorTy]):
         base_handle = base.handle
         is_signed_int = base.type.element_ty.is_int_signed()
 
+        padding = self._str_to_padding_option(padding_option)
+
+        if base.type.element_ty.is_int() and padding == ir.PADDING_OPTION.PAD_NAN:
+            raise ValueError("Padding option `nan` is not supported for integer blocks")
+
         handle = self.builder.create_make_tensor_descriptor(base_handle, [s.handle for s in shape],
-                                                            [s.handle for s in strides], block_shape, is_signed_int)
+                                                            [s.handle for s in strides], block_shape, is_signed_int,
+                                                            padding)
         return tl.tensor_descriptor(handle, shape, strides, type)
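Note: make_tensor_descriptor gains a padding_option argument ("zero" by default; "nan" is rejected for integer element types). A hedged sketch of how a kernel might pass the new option through the public tl.make_tensor_descriptor wrapper, assuming that wrapper forwards the keyword (shapes and strides here are illustrative):

    import triton
    import triton.language as tl

    @triton.jit
    def first_tile_sum(ptr, out_ptr, M, N, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
        desc = tl.make_tensor_descriptor(
            ptr, shape=[M, N], strides=[N, 1], block_shape=[BLOCK_M, BLOCK_N],
            padding_option="nan",  # out-of-bounds elements read back as NaN instead of zero
        )
        tile = desc.load([0, 0])
        tl.store(out_ptr, tl.sum(tile))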
@@ -1,24 +1,25 @@
 from __future__ import annotations
 
-from ..runtime.jit import jit
+from ..runtime.jit import jit, constexpr_function
 from . import core
 from . import math
 
 # constexpr utilities
 
 
-def _log2(i: core.constexpr):
+@constexpr_function
+def _log2(i):
     log2 = 0
-    n = core.constexpr(i).value
+    n = i
     while n > 1:
         n >>= 1
         log2 += 1
-    return core.constexpr(log2)
+    return log2
 
 
-def _is_power_of_two(i: core.constexpr):
-    n = i.value
-    return core.constexpr((n & (n - 1)) == 0 and n != 0)
+@constexpr_function
+def _is_power_of_two(i):
+    return (i & (i - 1)) == 0 and i != 0
 
 
 # -----------------------
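Note: these helpers are now plain Python functions under @constexpr_function instead of hand-unwrapping core.constexpr values; they run while the kernel is being specialized and receive and return ordinary Python objects. A hedged sketch of a user-defined helper in the same style (the helper is ours, not part of the package):

    import triton
    import triton.language as tl
    from triton.runtime.jit import constexpr_function

    @constexpr_function
    def _next_pow2(n):
        # evaluated at compile/specialization time on plain Python ints
        p = 1
        while p < n:
            p *= 2
        return p

    @triton.jit
    def fill_iota(x_ptr, N: tl.constexpr):
        # _next_pow2(N) folds to a power-of-two constant, as tl.arange requires
        offs = tl.arange(0, _next_pow2(N))
        tl.store(x_ptr + offs, offs, mask=offs < N)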
@@ -263,8 +264,8 @@ def _sum_combine(a, b):
 # sum
 
 
-def _pick_sum_dtype(in_dtype: core.constexpr, dtype: core.constexpr):
-    dtype = core._unwrap_if_constexpr(dtype)
+@constexpr_function
+def _pick_sum_dtype(in_dtype, dtype):
     if dtype is not None:
         return dtype
 
@@ -316,9 +317,9 @@ def _or_combine(x, y):
 
 @core._tensor_member_fn
 @jit
-@core._add_reduction_docstr("reduce_of")
+@core._add_reduction_docstr("reduce_or")
 def reduce_or(input, axis, keep_dims=False):
-    core.static_assert(input.type.scalar.is_int(), "reduce_of only supported for integers")
+    core.static_assert(input.type.scalar.is_int(), "reduce_or only supported for integers")
     return core.reduce(input, axis, _or_combine, keep_dims=keep_dims)
 
 
@@ -476,14 +477,13 @@ def bitonic_merge(x, dim: core.constexpr = None, descending: core.constexpr = co
     return _bitonic_merge(x, n_dims, descending, n_dims)
 
 
+@constexpr_function
 def _get_flip_dim(dim, shape):
-    dim = core._unwrap_if_constexpr(dim)
-    shape = core._unwrap_if_constexpr(shape)
     if dim is None:
         dim = len(shape) - 1
     if dim < 0:  # flip doesn't work if dim < 0 because the xor-swap for loop will start/end at the wrong index
         dim += len(shape)
-    return core.constexpr(dim)
+    return dim
 
 
 @core._tensor_member_fn
@@ -0,0 +1,54 @@
+from triton.runtime import driver
+from triton.runtime.jit import constexpr_function
+
+__all__ = ["current_target"]
+
+
+def current_target():
+    try:
+        active_driver = driver.active
+    except RuntimeError:
+        # If there is no active driver, return None
+        return None
+    return active_driver.get_current_target()
+
+
+current_target.__triton_builtin__ = True
+
+
+@constexpr_function
+def is_cuda():
+    target = current_target()
+    return target is not None and target.backend == "cuda"
+
+
+@constexpr_function
+def cuda_capability_geq(major, minor=0):
+    """
+    Determines whether we have compute capability >= (major, minor) and
+    returns this as a constexpr boolean. This can be used for guarding
+    inline asm implementations that require a certain compute capability.
+    """
+    target = current_target()
+    if target is None or target.backend != "cuda":
+        return False
+    assert isinstance(target.arch, int)
+    return target.arch >= major * 10 + minor
+
+
+@constexpr_function
+def is_hip():
+    target = current_target()
+    return target is not None and target.backend == "hip"
+
+
+@constexpr_function
+def is_hip_cdna3():
+    target = current_target()
+    return target is not None and target.arch == "gfx942"
+
+
+@constexpr_function
+def is_hip_cdna4():
+    target = current_target()
+    return target is not None and target.arch == "gfx950"
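Note: this new module (triton/language/target_info.py in the file list above) exposes constexpr predicates about the active backend, so library code can pick code paths at compile time. A hedged usage sketch; the kernel body is illustrative:

    import triton
    import triton.language as tl
    from triton.language.target_info import cuda_capability_geq, is_cuda

    @triton.jit
    def scale(x_ptr, n, BLOCK: tl.constexpr):
        offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
        mask = offs < n
        x = tl.load(x_ptr + offs, mask=mask)
        if is_cuda() and cuda_capability_geq(9, 0):
            x = x * 2  # stand-in for a Hopper-or-newer specific path
        else:
            x = x + 1  # generic fallback
        tl.store(x_ptr + offs, x, mask=mask)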
@@ -1,4 +1,5 @@
 from typing import Optional, Protocol
+from contextvars import ContextVar
 
 
 class Buffer(Protocol):
@@ -20,7 +21,7 @@ class NullAllocator:
                            "Use triton.set_allocator to specify an allocator.")
 
 
-_allocator: Allocator = NullAllocator()
+_allocator: ContextVar[Allocator] = ContextVar("_allocator", default=NullAllocator())
 
 
 def set_allocator(allocator: Allocator):
@@ -28,5 +29,16 @@ def set_allocator(allocator: Allocator):
     The allocator function is called during kernel launch for kernels that
     require additional global memory workspace.
     """
-    global _allocator
-    _allocator = allocator
+    _allocator.set(allocator)
+
+
+_profile_allocator: Allocator = ContextVar("_allocator", default=NullAllocator())
+
+
+def set_profile_allocator(allocator: Optional[Allocator]):
+    """
+    The profile allocator function is called before kernel launch for kernels
+    that require additional global memory workspace.
+    """
+    global _profile_allocator
+    _profile_allocator.set(allocator)
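Note: the allocator registry now lives in ContextVars, and a separate profile allocator is added alongside it. Registration still goes through triton.set_allocator; a hedged sketch using a torch-backed allocator (the (size, alignment, stream) signature follows the pattern used for TMA workspace allocation and is assumed here):

    import torch
    import triton

    def alloc_fn(size: int, alignment: int, stream):
        # any object exposing data_ptr() works; a raw byte tensor is the common choice
        return torch.empty(size, dtype=torch.int8, device="cuda")

    triton.set_allocator(alloc_fn)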
@@ -0,0 +1,55 @@
+from __future__ import annotations
+from typing import Callable, Optional
+from concurrent.futures import Executor, as_completed, Future
+from contextvars import ContextVar
+
+active_mode: ContextVar[Optional[AsyncCompileMode]] = ContextVar("async_compile_active_mode", default=None)
+
+
+class FutureKernel:
+
+    def __init__(self, finalize_compile: Callable, future: Future):
+        self.finalize_compile = finalize_compile
+        self.kernel = None
+        self.future = future
+
+    def result(self):
+        if self.kernel is not None:
+            return self.kernel
+
+        kernel = self.future.result()
+        self.finalize_compile(kernel)
+        self.kernel = kernel
+        return kernel
+
+
+class AsyncCompileMode:
+
+    def __init__(self, executor: Executor):
+        self.executor = executor
+        self.raw_futures = []
+        self.future_kernels = {}
+
+    def submit(self, key, compile_fn, finalize_fn):
+        future = self.future_kernels.get(key)
+        if future is not None:
+            return future
+
+        future = self.executor.submit(compile_fn)
+        future._key = key
+        self.raw_futures.append(future)
+        future_kernel = FutureKernel(finalize_fn, future)
+        self.future_kernels[key] = future_kernel
+        return future_kernel
+
+    def __enter__(self):
+        if active_mode.get() is not None:
+            raise RuntimeError("Another AsyncCompileMode is already active")
+        active_mode.set(self)
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        # Finalize any outstanding compiles
+        for future in as_completed(self.raw_futures):
+            self.future_kernels[future._key].result()
+        active_mode.set(None)
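Note: AsyncCompileMode deduplicates compile jobs by key, runs them on an Executor, and finalizes every outstanding future when the context exits. A hedged sketch of driving it directly with a thread pool (the compile/finalize callables are stand-ins, not the hooks jit.py actually passes):

    from concurrent.futures import ThreadPoolExecutor
    from triton.runtime._async_compile import AsyncCompileMode

    def compile_fn():
        return "compiled-kernel"      # stand-in for the real compilation work

    def finalize_fn(kernel):
        print("finalized", kernel)    # stand-in for post-compile registration

    with ThreadPoolExecutor(max_workers=4) as pool:
        with AsyncCompileMode(pool) as mode:
            fut = mode.submit("kernel-key", compile_fn, finalize_fn)
        # __exit__ has already waited on and finalized every submitted future
        print(fut.result())           # "compiled-kernel"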
@@ -9,9 +9,11 @@ from functools import cached_property
 from typing import Dict, Tuple, List, Optional
 
 from .. import knobs
-from .jit import KernelInterface
+from .jit import KernelInterface, JITFunction
 from .errors import OutOfResources, PTXASError
 from .driver import driver
+from .cache import get_cache_manager, triton_key
+from triton._C.libtriton import get_cache_invalidating_env_vars
 
 
 class Autotuner(KernelInterface):
@@ -169,10 +171,7 @@ class Autotuner(KernelInterface):
             bench_fn()
             return False
 
-        from triton._C.libtriton import get_cache_invalidating_env_vars
-        from triton.compiler.compiler import make_backend, triton_key
-        from triton.runtime.cache import get_cache_manager
-        from triton.runtime.jit import JITFunction
+        from triton.compiler.compiler import make_backend
 
         fn = self.fn
         while not isinstance(fn, JITFunction):
triton/runtime/build.py CHANGED
@@ -56,10 +56,11 @@ def is_clang(cc):
     return cc == "clang" or cc == "clang.exe"
 
 
-def _cc_cmd(cc, src, out, include_dirs, library_dirs, libraries):
+def _cc_cmd(cc: str, src: str, out: str, include_dirs: list[str], library_dirs: list[str], libraries: list[str],
+            ccflags: list[str]) -> list[str]:
     if is_msvc(cc):
         out_base = os.path.splitext(out)[0]
-        cc_cmd = [cc, src, "/nologo", "/O2", "/LD", "/wd4819"]
+        cc_cmd = [cc, src, "/nologo", "/O2", "/LD", "/std:c11", "/wd4819"]
         cc_cmd += [f"/I{dir}" for dir in include_dirs if dir is not None]
         cc_cmd += [f"/Fo{out_base + '.obj'}"]
         cc_cmd += ["/link"]
@@ -79,16 +80,16 @@ def _cc_cmd(cc, src, out, include_dirs, library_dirs, libraries):
         cc_cmd += [f'-l{lib}' for lib in libraries]
         cc_cmd += [f"-L{dir}" for dir in library_dirs]
         cc_cmd += [f"-I{dir}" for dir in include_dirs if dir is not None]
+        cc_cmd += ccflags
     return cc_cmd
 
 
-def _build(name: str, src: str, srcdir: str, library_dirs: list[str], include_dirs: list[str],
-           libraries: list[str]) -> str:
+def _build(name: str, src: str, srcdir: str, library_dirs: list[str], include_dirs: list[str], libraries: list[str],
+           ccflags: list[str]) -> str:
     if impl := knobs.build.impl:
         return impl(name, src, srcdir, library_dirs, include_dirs, libraries)
     suffix = sysconfig.get_config_var('EXT_SUFFIX')
     so = os.path.join(srcdir, '{name}{suffix}'.format(name=name, suffix=suffix))
-    # try to avoid setuptools if possible
     cc = get_cc()
     # This function was renamed and made public in Python 3.10
     if hasattr(sysconfig, 'get_default_scheme'):
@@ -113,10 +114,10 @@ def _build(name: str, src: str, srcdir: str, library_dirs: list[str], include_di
     _, msvc_winsdk_inc_dirs, msvc_winsdk_lib_dirs = find_msvc_winsdk()
     include_dirs = include_dirs + msvc_winsdk_inc_dirs
     library_dirs = library_dirs + msvc_winsdk_lib_dirs
-    cc_cmd = _cc_cmd(cc, src, so, include_dirs, library_dirs, libraries)
+    cc_cmd = _cc_cmd(cc, src, so, include_dirs, library_dirs, libraries, ccflags)
 
     try:
-        ret = subprocess.check_call(cc_cmd)
+        subprocess.check_call(cc_cmd)
     except Exception as e:
         print("Failed to compile. cc_cmd:", cc_cmd)
         raise e
@@ -142,7 +143,8 @@ def _load_module_from_path(name: str, path: str) -> ModuleType:
 
 
 def compile_module_from_src(src: str, name: str, library_dirs: list[str] | None = None,
-                            include_dirs: list[str] | None = None, libraries: list[str] | None = None) -> ModuleType:
+                            include_dirs: list[str] | None = None, libraries: list[str] | None = None,
+                            ccflags: list[str] | None = None) -> ModuleType:
     key = hashlib.sha256((src + platform_key()).encode("utf-8")).hexdigest()
     cache = get_cache_manager(key)
     suffix = sysconfig.get_config_var("EXT_SUFFIX")
@@ -159,7 +161,7 @@ def compile_module_from_src(src: str, name: str, library_dirs: list[str] | None
         src_path = os.path.join(tmpdir, name + ".c")
         with open(src_path, "w") as f:
             f.write(src)
-        so = _build(name, src_path, tmpdir, library_dirs or [], include_dirs or [], libraries or [])
+        so = _build(name, src_path, tmpdir, library_dirs or [], include_dirs or [], libraries or [], ccflags or [])
         with open(so, "rb") as f:
            cache_path = cache.put(f.read(), f"{name}{suffix}", binary=True)
 
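Note: compile_module_from_src and _build now thread a ccflags list through to the compiler invocation (and MSVC builds always get /std:c11). A hedged sketch of passing extra flags when building a small CPython extension through this helper; the flag shown is a POSIX-style example and the embedded source is illustrative:

    from triton.runtime.build import compile_module_from_src

    src = r'''
    #define PY_SSIZE_T_CLEAN
    #include <Python.h>
    static struct PyModuleDef mod = {PyModuleDef_HEAD_INIT, "hello", NULL, -1, NULL};
    PyMODINIT_FUNC PyInit_hello(void) { return PyModule_Create(&mod); }
    '''

    hello = compile_module_from_src(src, "hello", ccflags=["-O2"])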
triton/runtime/cache.py CHANGED
@@ -5,8 +5,10 @@ from abc import ABC, abstractmethod
 from typing import Dict, List, Optional
 import base64
 import hashlib
+import functools
+import sysconfig
 
-from .. import knobs
+from triton import __version__, knobs
 
 
 class CacheManager(ABC):
@@ -272,3 +274,44 @@ def make_so_cache_key(version_hash, signature, constants, ids, **kwargs):
         key = f"{key}-{kwargs.get(kw)}"
     key = hashlib.sha256(key.encode("utf-8")).hexdigest()
     return _base32(key)
+
+
+@functools.lru_cache()
+def triton_key():
+    import pkgutil
+    TRITON_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    contents = []
+    # frontend
+    with open(__file__, "rb") as f:
+        contents += [hashlib.sha256(f.read()).hexdigest()]
+    # compiler
+    path_prefixes = [
+        (os.path.join(TRITON_PATH, "compiler"), "triton.compiler."),
+        (os.path.join(TRITON_PATH, "backends"), "triton.backends."),
+    ]
+    for path, prefix in path_prefixes:
+        for lib in pkgutil.walk_packages([path], prefix=prefix):
+            with open(lib.module_finder.find_spec(lib.name).origin, "rb") as f:
+                contents += [hashlib.sha256(f.read()).hexdigest()]
+
+    # backend
+    libtriton_hash = hashlib.sha256()
+    ext = sysconfig.get_config_var("EXT_SUFFIX").split(".")[-1]
+    with open(os.path.join(TRITON_PATH, "_C", f"libtriton.{ext}"), "rb") as f:
+        while True:
+            chunk = f.read(1024**2)
+            if not chunk:
+                break
+            libtriton_hash.update(chunk)
+    contents.append(libtriton_hash.hexdigest())
+    # language
+    language_path = os.path.join(TRITON_PATH, 'language')
+    for lib in pkgutil.walk_packages([language_path], prefix="triton.language."):
+        with open(lib.module_finder.find_spec(lib.name).origin, "rb") as f:
+            contents += [hashlib.sha256(f.read()).hexdigest()]
+    return f'{__version__}' + '-'.join(contents)
+
+
+def get_cache_key(src, backend, backend_options, env_vars):
+    key = f"{triton_key()}-{src.hash()}-{backend.hash()}-{backend_options.hash()}-{str(sorted(env_vars.items()))}"
+    return key
triton/runtime/driver.py CHANGED
@@ -2,8 +2,6 @@ from __future__ import annotations
 
 from ..backends import backends, DriverBase
 
-from typing import Any, Callable, Generic, TypeVar, Union
-
 
 def _create_driver() -> DriverBase:
     active_drivers = [x.driver for x in backends.values() if x.driver.is_active()]
@@ -12,52 +10,29 @@ def _create_driver() -> DriverBase:
     return active_drivers[0]()
 
 
-T = TypeVar("T")
-
-
-class LazyProxy(Generic[T]):
-
-    def __init__(self, init_fn: Callable[[], T]) -> None:
-        self._init_fn = init_fn
-        self._obj: Union[T, None] = None
-
-    def _initialize_obj(self) -> T:
-        if self._obj is None:
-            self._obj = self._init_fn()
-        return self._obj
-
-    def __getattr__(self, name) -> Any:
-        return getattr(self._initialize_obj(), name)
-
-    def __setattr__(self, name: str, value: Any) -> None:
-        if name in ["_init_fn", "_obj"]:
-            super().__setattr__(name, value)
-        else:
-            setattr(self._initialize_obj(), name, value)
-
-    def __delattr__(self, name: str) -> None:
-        delattr(self._initialize_obj(), name)
-
-    def __repr__(self) -> str:
-        if self._obj is None:
-            return f"<{self.__class__.__name__} for {self._init_fn} not yet initialized>"
-        return repr(self._obj)
-
-    def __str__(self) -> str:
-        return str(self._initialize_obj())
-
-
 class DriverConfig:
 
     def __init__(self) -> None:
-        self.default: LazyProxy[DriverBase] = LazyProxy(_create_driver)
-        self.active: Union[LazyProxy[DriverBase], DriverBase] = self.default
+        self._default: DriverBase | None = None
+        self._active: DriverBase | None = None
+
+    @property
+    def default(self) -> DriverBase:
+        if self._default is None:
+            self._default = _create_driver()
+        return self._default
+
+    @property
+    def active(self) -> DriverBase:
+        if self._active is None:
+            self._active = self.default
+        return self._active
 
     def set_active(self, driver: DriverBase) -> None:
-        self.active = driver
+        self._active = driver
 
     def reset_active(self) -> None:
-        self.active = self.default
+        self._active = self.default
 
 
 driver = DriverConfig()
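Note: DriverConfig drops the LazyProxy indirection; driver.default and driver.active are now plain properties that instantiate the backend driver on first access. A short usage sketch, grounded in how target_info.py above consumes it:

    from triton.runtime.driver import driver

    # First attribute access runs _create_driver(); later accesses reuse the instance.
    target = driver.active.get_current_target()
    print(target.backend, target.arch)

    # Overriding and restoring the active driver still uses the same two entry points:
    # driver.set_active(my_driver); driver.reset_active()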