triton-windows 3.4.0.post20-cp312-cp312-win_amd64.whl → 3.5.0.post21-cp312-cp312-win_amd64.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of triton-windows is flagged in the registry.

Files changed (107)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +8 -2
  3. triton/_filecheck.py +24 -14
  4. triton/_internal_testing.py +70 -4
  5. triton/_utils.py +3 -1
  6. triton/backends/amd/compiler.py +68 -60
  7. triton/backends/amd/driver.c +113 -44
  8. triton/backends/amd/driver.py +133 -57
  9. triton/backends/driver.py +13 -0
  10. triton/backends/nvidia/compiler.py +80 -22
  11. triton/backends/nvidia/driver.c +88 -15
  12. triton/backends/nvidia/driver.py +130 -123
  13. triton/compiler/__init__.py +5 -2
  14. triton/compiler/code_generator.py +270 -163
  15. triton/compiler/compiler.py +45 -62
  16. triton/experimental/gluon/__init__.py +3 -2
  17. triton/experimental/gluon/_runtime.py +9 -6
  18. triton/experimental/gluon/language/__init__.py +117 -16
  19. triton/experimental/gluon/language/_core.py +246 -68
  20. triton/experimental/gluon/language/_layouts.py +398 -45
  21. triton/experimental/gluon/language/_math.py +17 -9
  22. triton/experimental/gluon/language/_semantic.py +130 -37
  23. triton/experimental/gluon/language/_standard.py +55 -22
  24. triton/experimental/gluon/language/amd/__init__.py +4 -0
  25. triton/experimental/gluon/language/amd/_layouts.py +96 -0
  26. triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
  27. triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
  28. triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
  29. triton/experimental/gluon/language/extra/__init__.py +3 -0
  30. triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
  31. triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
  32. triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
  33. triton/experimental/gluon/language/nvidia/blackwell/__init__.py +192 -7
  34. triton/experimental/gluon/language/nvidia/blackwell/tma.py +20 -0
  35. triton/experimental/gluon/language/nvidia/hopper/__init__.py +124 -3
  36. triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +20 -37
  37. triton/experimental/gluon/language/nvidia/hopper/tma.py +4 -3
  38. triton/experimental/gluon/nvidia/hopper.py +6 -1
  39. triton/knobs.py +132 -67
  40. triton/language/__init__.py +16 -10
  41. triton/language/core.py +163 -83
  42. triton/language/extra/cuda/gdc.py +6 -6
  43. triton/language/extra/hip/__init__.py +3 -1
  44. triton/language/extra/hip/libdevice.py +7 -0
  45. triton/language/extra/hip/utils.py +35 -0
  46. triton/language/extra/libdevice.py +4 -0
  47. triton/language/semantic.py +76 -23
  48. triton/language/standard.py +14 -14
  49. triton/language/target_info.py +54 -0
  50. triton/runtime/_allocation.py +15 -3
  51. triton/runtime/_async_compile.py +55 -0
  52. triton/runtime/autotuner.py +4 -5
  53. triton/runtime/build.py +11 -9
  54. triton/runtime/cache.py +44 -1
  55. triton/runtime/driver.py +16 -41
  56. triton/runtime/interpreter.py +31 -23
  57. triton/runtime/jit.py +318 -157
  58. triton/runtime/tcc/include/_mingw.h +8 -10
  59. triton/runtime/tcc/include/assert.h +5 -0
  60. triton/runtime/tcc/include/errno.h +1 -1
  61. triton/runtime/tcc/include/float.h +21 -3
  62. triton/runtime/tcc/include/iso646.h +36 -0
  63. triton/runtime/tcc/include/limits.h +5 -0
  64. triton/runtime/tcc/include/malloc.h +2 -2
  65. triton/runtime/tcc/include/math.h +21 -261
  66. triton/runtime/tcc/include/stdalign.h +16 -0
  67. triton/runtime/tcc/include/stdarg.h +5 -70
  68. triton/runtime/tcc/include/stdatomic.h +171 -0
  69. triton/runtime/tcc/include/stddef.h +7 -19
  70. triton/runtime/tcc/include/stdlib.h +15 -4
  71. triton/runtime/tcc/include/stdnoreturn.h +7 -0
  72. triton/runtime/tcc/include/sys/stat.h +2 -2
  73. triton/runtime/tcc/include/sys/types.h +5 -0
  74. triton/runtime/tcc/include/tcc/tcc_libm.h +444 -27
  75. triton/runtime/tcc/include/tccdefs.h +342 -0
  76. triton/runtime/tcc/include/tgmath.h +89 -0
  77. triton/runtime/tcc/include/uchar.h +33 -0
  78. triton/runtime/tcc/include/unistd.h +1 -0
  79. triton/runtime/tcc/include/winapi/qos.h +72 -0
  80. triton/runtime/tcc/include/winapi/shellapi.h +59 -0
  81. triton/runtime/tcc/include/winapi/winbase.h +9 -2
  82. triton/runtime/tcc/include/winapi/wincon.h +8 -0
  83. triton/runtime/tcc/include/winapi/windows.h +1 -1
  84. triton/runtime/tcc/include/winapi/winnls.h +778 -0
  85. triton/runtime/tcc/include/winapi/winnt.h +9 -7
  86. triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
  87. triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
  88. triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
  89. triton/runtime/tcc/lib/libtcc1.a +0 -0
  90. triton/runtime/tcc/lib/python314.def +1800 -0
  91. triton/runtime/tcc/lib/python314t.def +1809 -0
  92. triton/runtime/tcc/libtcc.dll +0 -0
  93. triton/runtime/tcc/tcc.exe +0 -0
  94. triton/tools/compile.py +62 -14
  95. triton/tools/extra/cuda/compile.c +1 -0
  96. triton/tools/extra/hip/compile.cpp +66 -0
  97. triton/tools/extra/hip/compile.h +13 -0
  98. triton/tools/ragged_tma.py +92 -0
  99. triton/tools/tensor_descriptor.py +7 -9
  100. triton/windows_utils.py +42 -79
  101. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/METADATA +3 -4
  102. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/RECORD +106 -75
  103. triton/runtime/tcc/lib/libtcc1-64.a +0 -0
  104. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/WHEEL +0 -0
  105. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/entry_points.txt +0 -0
  106. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/licenses/LICENSE +0 -0
  107. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/top_level.txt +0 -0
triton/compiler/compiler.py

@@ -7,15 +7,15 @@ from ..backends.compiler import Language
 from ..backends.compiler import BaseBackend, GPUTarget
 from .. import __version__, knobs
 from ..runtime.autotuner import OutOfResources
-from ..runtime.cache import get_cache_manager, get_dump_manager, get_override_manager
+from ..runtime.cache import get_cache_manager, get_dump_manager, get_override_manager, get_cache_key
 from ..runtime.driver import driver
 from ..tools.disasm import get_sass
 from pathlib import Path
 import re
 import functools
 import os
-import sysconfig
 import time
+import copy

 # - ^\s*tt\.func\s+ : match the start of the string, any leading whitespace, the keyword func,
 # and any following whitespace
@@ -64,12 +64,9 @@ class ASTSource:
             assert isinstance(k, tuple)
             self.constants[k] = v
         self.attrs = attrs or dict()
-        if isinstance(self.signature, str):
-            self.signature = {k: v.strip() for k, v in enumerate(self.signature.split(","))}
-        else:
-            for k in self.signature.keys():
-                if not isinstance(k, str):
-                    raise TypeError("Signature keys must be string")
+        for k in self.signature.keys():
+            if not isinstance(k, str):
+                raise TypeError("Signature keys must be string")

     def hash(self):
         sorted_sig = [v for k, v in sorted(self.signature.items())]
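
Note: the dropped branch above removes the comma-separated string shorthand for `signature`; 3.5 accepts only a dict with string keys. A minimal migration sketch (the kernel is illustrative, and the `constexprs` keyword is assumed from the surrounding constructor code, so exact argument names may differ):

    import triton
    import triton.language as tl
    from triton.compiler.compiler import ASTSource

    @triton.jit
    def add_kernel(x_ptr, y_ptr, n, BLOCK: tl.constexpr):
        pid = tl.program_id(0)
        offs = pid * BLOCK + tl.arange(0, BLOCK)
        mask = offs < n
        tl.store(y_ptr + offs, tl.load(x_ptr + offs, mask=mask), mask=mask)

    # 3.4 also accepted signature="*fp32, *fp32, i32" and numbered the keys
    # itself; 3.5 raises TypeError unless the keys are strings.
    src = ASTSource(
        fn=add_kernel,
        signature={"x_ptr": "*fp32", "y_ptr": "*fp32", "n": "i32"},
        constexprs={"BLOCK": 1024},
    )
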
@@ -78,7 +75,7 @@ class ASTSource:
         key = f"{self.fn.cache_key}-{str(self.attrs)}-{sorted_sig}-{constants_key}"
         return hashlib.sha256(key.encode("utf-8")).hexdigest()

-    def make_ir(self, options, codegen_fns, module_map, context):
+    def make_ir(self, target: GPUTarget, options, codegen_fns, module_map, context):
         from .code_generator import ast_to_ttir
         return ast_to_ttir(self.fn, self, context=context, options=options, codegen_fns=codegen_fns,
                            module_map=module_map)
@@ -117,7 +114,7 @@ class IRSource:
     def hash(self):
         return hashlib.sha256(self.src.encode("utf-8")).hexdigest()

-    def make_ir(self, options, codegen_fns, module_map, context):
+    def make_ir(self, target: GPUTarget, options, codegen_fns, module_map, context):
         self.module.context = context
         return self.module

@@ -129,42 +126,6 @@ class IRSource:
         return dict()


-@functools.lru_cache()
-def triton_key():
-    import pkgutil
-    TRITON_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-    contents = []
-    # frontend
-    with open(__file__, "rb") as f:
-        contents += [hashlib.sha256(f.read()).hexdigest()]
-    # compiler
-    path_prefixes = [
-        (os.path.join(TRITON_PATH, "compiler"), "triton.compiler."),
-        (os.path.join(TRITON_PATH, "backends"), "triton.backends."),
-    ]
-    for path, prefix in path_prefixes:
-        for lib in pkgutil.walk_packages([path], prefix=prefix):
-            with open(lib.module_finder.find_spec(lib.name).origin, "rb") as f:
-                contents += [hashlib.sha256(f.read()).hexdigest()]
-
-    # backend
-    libtriton_hash = hashlib.sha256()
-    ext = sysconfig.get_config_var("EXT_SUFFIX").split(".")[-1]
-    with open(os.path.join(TRITON_PATH, "_C", f"libtriton.{ext}"), "rb") as f:
-        while True:
-            chunk = f.read(1024**2)
-            if not chunk:
-                break
-            libtriton_hash.update(chunk)
-    contents.append(libtriton_hash.hexdigest())
-    # language
-    language_path = os.path.join(TRITON_PATH, 'language')
-    for lib in pkgutil.walk_packages([language_path], prefix="triton.language."):
-        with open(lib.module_finder.find_spec(lib.name).origin, "rb") as f:
-            contents += [hashlib.sha256(f.read()).hexdigest()]
-    return f'{__version__}' + '-'.join(contents)
-
-
 @functools.lru_cache()
 def max_shared_mem(device):
     return driver.active.utils.get_device_properties(device)["max_shared_mem"]
@@ -258,7 +219,7 @@ class CompileTimer:
     )


-def compile(src, target=None, options=None):
+def compile(src, target=None, options=None, _env_vars=None):
    compilation_listener = knobs.compilation.listener
    if compilation_listener:
        timer = CompileTimer()
@@ -277,8 +238,8 @@ def compile(src, target=None, options=None):
     extra_options = src.parse_options()
     options = backend.parse_options(dict(options or dict(), **extra_options))
     # create cache manager
-    env_vars = get_cache_invalidating_env_vars()
-    key = f"{triton_key()}-{src.hash()}-{backend.hash()}-{options.hash()}-{str(sorted(env_vars.items()))}"
+    env_vars = get_cache_invalidating_env_vars() if _env_vars is None else _env_vars
+    key = get_cache_key(src, backend, options, env_vars=env_vars)
     hash = hashlib.sha256(key.encode("utf-8")).hexdigest()
     fn_cache_manager = get_cache_manager(hash)
     # For dumping/overriding only hash the source as we want it to be independent of triton
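
Note: two things change here. compile() grows a private `_env_vars` parameter so a caller that has already snapshotted the cache-relevant environment (plausibly the new triton/runtime/_async_compile.py) can pass it in, and the cache-key assembly moves behind `get_cache_key` in triton/runtime/cache.py, which also absorbs the `triton_key()` deleted above. A hedged sketch of computing the same key outside compile(), assuming `get_cache_key` folds together the same four hashes plus the env snapshot the removed f-string did:

    import hashlib
    from triton.runtime.cache import get_cache_key, get_cache_manager

    def cache_manager_for(src, backend, options, env_vars):
        # src, backend, options are the objects compile() builds earlier;
        # env_vars is the dict from get_cache_invalidating_env_vars().
        key = get_cache_key(src, backend, options, env_vars=env_vars)
        return get_cache_manager(hashlib.sha256(key.encode("utf-8")).hexdigest())
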
@@ -336,7 +297,7 @@ def compile(src, target=None, options=None):
     codegen_fns = backend.get_codegen_implementation(options)
     module_map = backend.get_module_map()
     try:
-        module = src.make_ir(options, codegen_fns, module_map, context)
+        module = src.make_ir(target, options, codegen_fns, module_map, context)
     except Exception as e:
         filter_traceback(e)
         raise
@@ -371,6 +332,9 @@
         metadata_group[ir_filename] = fn_cache_manager.put(next_module, ir_filename)
         if fn_dump_manager is not None:
             fn_dump_manager.put(next_module, ir_filename)
+            if ext == "cubin":
+                sass = get_sass(next_module)
+                fn_dump_manager.put(sass, file_name + ".sass")
         # use an env variable to parse ir from file
         if use_ir_loc == ext:
             ir_full_name = fn_cache_manager.get_file(ir_filename)
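
Note: with a dump manager active, the CUDA path now also disassembles the final cubin via get_sass and writes a .sass artifact next to the other IR stages. A hedged way to observe it, assuming the pre-existing TRITON_KERNEL_DUMP / TRITON_DUMP_DIR knobs and a CUDA device:

    import os
    # Set before importing triton so the dump knobs are picked up.
    os.environ["TRITON_KERNEL_DUMP"] = "1"
    os.environ["TRITON_DUMP_DIR"] = "./triton_dump"

    import torch
    import triton
    import triton.language as tl

    @triton.jit
    def copy_one(x_ptr, y_ptr):
        tl.store(y_ptr, tl.load(x_ptr))

    x = torch.ones(1, device="cuda")
    y = torch.empty_like(x)
    copy_one[(1,)](x, y)
    # ./triton_dump/<kernel hash>/ should now contain a copy_one.sass file
    # alongside the ttir/ttgir/ptx/cubin stages (exact layout assumed).
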
@@ -440,6 +404,10 @@ class AsmDict(dict):
         return value


+def _raise_error(err, *args, **kwargs):
+    raise copy.deepcopy(err)
+
+
 class CompiledKernel:

     def __init__(self, src, metadata_group, hash):
@@ -464,51 +432,66 @@ class CompiledKernel:
             file.suffix[1:]: file.read_bytes() if file.suffix[1:] == binary_ext else file.read_text()
             for file in asm_files
         })
+        self.metadata_group = metadata_group
         self.kernel = self.asm[binary_ext]
         # binaries are lazily initialized
         # because it involves doing runtime things
         # (e.g., checking amount of shared memory on current device)
         self.module = None
         self.function = None
+        self._run = None

     def _init_handles(self):
         if self.module is not None:
             return
+
+        def raise_(err):
+            # clone the exception object so that the one saved in the closure
+            # of the partial function below doesn't get assigned a stack trace
+            # after the subsequent raise. otherwise, the CompiledKernel instance
+            # saved in the (global) kernel cache will keep references to all the
+            # locals in the traceback via the exception instance in the closure.
+            cloned_err = copy.deepcopy(err)
+            self._run = functools.partial(_raise_error, cloned_err)
+            raise err
+
         device = driver.active.get_current_device()
         # create launcher
-        self.run = driver.active.launcher_cls(self.src, self.metadata)
+        self._run = driver.active.launcher_cls(self.src, self.metadata)
         # not enough shared memory to run the kernel
         max_shared = max_shared_mem(device)
         if self.metadata.shared > max_shared:
-            raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+            raise_(OutOfResources(self.metadata.shared, max_shared, "shared memory"))
         if hasattr(self.metadata, "tmem_size") and self.metadata.tmem_size is not None:
             # Use blackwell max tmem size for now, this should be moved in device properties
             max_tmem_size = 512  # tmem size in number of columns
             if self.metadata.tmem_size > max_tmem_size:
-                raise OutOfResources(self.metadata.tmem_size, max_tmem_size, "tensor memory")
+                raise_(OutOfResources(self.metadata.tmem_size, max_tmem_size, "tensor memory"))
+        if knobs.runtime.kernel_load_start_hook is not None:
+            knobs.runtime.kernel_load_start_hook(self.module, self.function, self.name, self.metadata_group, self.hash)
         # TODO: n_regs, n_spills should be metadata generated when calling `ptxas`
         self.module, self.function, self.n_regs, self.n_spills, self.n_max_threads = driver.active.utils.load_binary(
             self.name, self.kernel, self.metadata.shared, device)
         warp_size = driver.active.get_current_target().warp_size
         if self.metadata.num_warps * warp_size > self.n_max_threads:
-            raise OutOfResources(self.metadata.num_warps * warp_size, self.n_max_threads, "threads")
+            raise_(OutOfResources(self.metadata.num_warps * warp_size, self.n_max_threads, "threads"))
+        if knobs.runtime.kernel_load_end_hook is not None:
+            knobs.runtime.kernel_load_end_hook(self.module, self.function, self.name, self.metadata_group, self.hash)

-    def __getattribute__(self, name):
-        if name == 'run':
+    @property
+    def run(self):
+        if self._run is None:
             self._init_handles()
-        return super().__getattribute__(name)
+        return self._run

     def launch_metadata(self, grid, stream, *args):
         if knobs.runtime.launch_enter_hook is None:
             return None
+        self._init_handles()
         ret = LazyDict({"name": self.name, "function": self.function, "stream": stream})
         if not isinstance(self.src, ASTSource) or self.src.fn.launch_metadata is None:
             return ret
-        arg_dict = {}
-        arg_idx = 0
-        for i, arg_name in enumerate(self.src.fn.arg_names):
-            arg_dict[arg_name] = args[arg_idx]
-            arg_idx += 1
+        arg_dict = {name: arg for name, arg in zip(self.src.fn.arg_names, args)}
         ret.add(self.src.fn.launch_metadata, (grid, self.metadata, arg_dict))
         return ret
triton/experimental/gluon/__init__.py

@@ -1,4 +1,5 @@
 from . import nvidia
-from ._runtime import jit
+from ._runtime import constexpr_function, jit
+from triton.language.core import must_use_result

-__all__ = ["jit", "nvidia"]
+__all__ = ["constexpr_function", "jit", "must_use_result", "nvidia"]
triton/experimental/gluon/_runtime.py

@@ -1,13 +1,14 @@
 from __future__ import annotations
-import triton
 from triton.compiler.compiler import ASTSource
 from triton.backends.compiler import Language
-from triton.runtime.jit import JITFunction
+from triton.runtime.jit import JITFunction, constexpr_function
 from typing import TypeVar, Optional, Callable, Iterable, Union
 from triton._C.libtriton import ir

 T = TypeVar("T")

+__all__ = ["constexpr_function", "jit"]
+

 class GluonASTSource(ASTSource):

@@ -16,7 +17,7 @@ class GluonASTSource(ASTSource):
         self.language = Language.GLUON
         self.ext = "ttgir"

-    def make_ir(self, options, codegen_fns, module_map, context):
+    def make_ir(self, target, options, codegen_fns, module_map, context):
         from triton.compiler.compiler import make_backend
         from triton.compiler.code_generator import ast_to_ttir

@@ -24,14 +25,16 @@ class GluonASTSource(ASTSource):
         module = builder.create_module()

         # Assign module attributes eagerly, as they are needed to verify layouts
-        target = triton.runtime.driver.active.get_current_target()
         backend = make_backend(target)
         target = backend.get_target_name(options)
+
         module.set_attr("ttg.target", builder.get_string_attr(target))
         module.set_attr("ttg.num-warps", builder.get_int32_attr(options.num_warps))
         module.set_attr("ttg.num-ctas", builder.get_int32_attr(options.num_ctas))
-        module.set_attr("ttg.threads-per-warp", builder.get_int32_attr(32))
-        if options.maxnreg is not None:
+        module.set_attr("ttg.threads-per-warp", builder.get_int32_attr(options.warp_size))
+
+        is_cuda = options.backend_name == "cuda"
+        if is_cuda and options.maxnreg is not None:
             module.set_attr("ttg.maxnreg", builder.get_int32_attr(options.maxnreg))

         module = ast_to_ttir(self.fn, self, context=context, options=options, codegen_fns=codegen_fns,
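
Note: three fixes land in this hunk. The target now arrives through make_ir instead of a query of the active driver (so Gluon compilation no longer depends on ambient state), ttg.threads-per-warp honors options.warp_size rather than a hard-coded 32, and ttg.maxnreg is emitted only on CUDA. Illustrative only, with target values assumed:

    from triton.backends.compiler import GPUTarget

    cuda = GPUTarget("cuda", 90, 32)        # sm_90, 32 threads per warp
    cdna3 = GPUTarget("hip", "gfx942", 64)  # CDNA3, wave64

    # Under 3.4 both targets got ttg.threads-per-warp = 32 and both honored
    # maxnreg; under 3.5 the attribute follows warp_size (32 vs. 64) and
    # ttg.maxnreg is set only when options.backend_name == "cuda".
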
triton/experimental/gluon/language/__init__.py

@@ -1,18 +1,119 @@
-from ._core import *  # NOQA: F403
-from ._core import __all__ as __core_all
-from ._layouts import *  # NOQA: F403
-from ._layouts import __all__ as __layouts_all
-from ._math import *  # NOQA: F403
-from ._math import __all__ as __math_all
-from ._standard import *  # NOQA: F403
-from ._standard import __all__ as __standard_all
+from ._core import (
+    base_value,
+    base_type,
+    block_type,
+    broadcast,
+    constexpr,
+    dtype,
+    void,
+    int1,
+    int8,
+    int16,
+    int32,
+    int64,
+    uint8,
+    uint16,
+    uint32,
+    uint64,
+    float8e5,
+    float8e5b16,
+    float8e4nv,
+    float8e4b8,
+    float8e4b15,
+    float16,
+    bfloat16,
+    float32,
+    float64,
+    pointer_type,
+    shared_memory_descriptor,
+    tensor,
+    tuple,
+    tuple_type,
+    _unwrap_if_constexpr,
+    # API Functions
+    allocate_shared_memory,
+    arange,
+    associative_scan,
+    atomic_add,
+    atomic_and,
+    atomic_cas,
+    atomic_max,
+    atomic_min,
+    atomic_or,
+    atomic_xchg,
+    atomic_xor,
+    convert_layout,
+    device_assert,
+    expand_dims,
+    full,
+    histogram,
+    inline_asm_elementwise,
+    join,
+    load,
+    map_elementwise,
+    max_constancy,
+    max_contiguous,
+    maximum,
+    minimum,
+    multiple_of,
+    num_programs,
+    permute,
+    program_id,
+    reduce,
+    reshape,
+    set_auto_layout,
+    split,
+    static_assert,
+    static_print,
+    static_range,
+    store,
+    thread_barrier,
+    to_tensor,
+    warp_specialize,
+    where,
+)
+from ._layouts import (
+    AutoLayout,
+    BlockedLayout,
+    SliceLayout,
+    DistributedLinearLayout,
+    DotOperandLayout,
+    NVMMADistributedLayout,
+    NVMMASharedLayout,
+    SwizzledSharedLayout,
+    PaddedSharedLayout,
+)
+from ._math import (
+    umulhi,
+    exp,
+    exp2,
+    fma,
+    log,
+    log2,
+    cos,
+    rsqrt,
+    sin,
+    sqrt,
+    sqrt_rn,
+    abs,
+    fdiv,
+    div_rn,
+    erf,
+    floor,
+    ceil,
+)
+from ._standard import (
+    cdiv,
+    full_like,
+    max,
+    min,
+    reduce_or,
+    sum,
+    xor_sum,
+    zeros,
+    zeros_like,
+)

 from . import nvidia
-
-__all__ = [
-    *__core_all,
-    *__layouts_all,
-    *__math_all,
-    *__standard_all,
-    "nvidia",
-]
+from . import amd
+from . import extra
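
Note: replacing the star-imports with an explicit list makes the gluon.language surface visible to linters and IDEs, and the new amd and extra namespaces are exported alongside nvidia. A quick smoke check of the re-exported names (assumes triton-windows 3.5 is installed):

    from triton.experimental.gluon import language as gl

    # Each of these resolves to a real attribute after the rewrite above:
    print(gl.BlockedLayout, gl.AutoLayout)  # layouts from _layouts
    print(gl.float16, gl.constexpr)         # core types from _core
    print(gl.cdiv, gl.zeros_like)           # helpers from _standard
    print(gl.amd, gl.nvidia, gl.extra)      # backend namespaces; amd/extra are new in 3.5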