PyPI - triton-windows - Versions diffs - 3.3.1.post19__cp312-cp312-win_amd64.whl → 3.5.0.post21__cp312-cp312-win_amd64.whl - Mend

triton-windows 3.3.1.post19__cp312-cp312-win_amd64.whl → 3.5.0.post21__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of triton-windows might be problematic. Click here for more details.

Files changed (225) hide show

triton/_C/libtriton.pyd +0 -0
triton/__init__.py +11 -2
triton/_filecheck.py +97 -0
triton/_internal_testing.py +95 -18
triton/_utils.py +112 -21
triton/backends/__init__.py +20 -23
triton/backends/amd/__init__.py +0 -0
triton/backends/amd/compiler.py +161 -119
triton/backends/amd/driver.c +118 -46
triton/backends/amd/driver.py +274 -96
triton/backends/compiler.py +7 -21
triton/backends/driver.py +13 -0
triton/backends/nvidia/bin/ptxas.exe +0 -0
triton/backends/nvidia/compiler.py +163 -106
triton/backends/nvidia/driver.c +166 -101
triton/backends/nvidia/driver.py +384 -202
triton/compiler/__init__.py +5 -2
triton/compiler/code_generator.py +439 -231
triton/compiler/compiler.py +152 -84
triton/experimental/__init__.py +0 -0
triton/experimental/gluon/__init__.py +5 -0
triton/experimental/gluon/_compiler.py +0 -0
triton/experimental/gluon/_runtime.py +102 -0
triton/experimental/gluon/language/__init__.py +119 -0
triton/experimental/gluon/language/_core.py +490 -0
triton/experimental/gluon/language/_layouts.py +583 -0
triton/experimental/gluon/language/_math.py +20 -0
triton/experimental/gluon/language/_semantic.py +380 -0
triton/experimental/gluon/language/_standard.py +80 -0
triton/experimental/gluon/language/amd/__init__.py +4 -0
triton/experimental/gluon/language/amd/_layouts.py +96 -0
triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
triton/experimental/gluon/language/extra/__init__.py +3 -0
triton/experimental/gluon/language/nvidia/__init__.py +4 -0
triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
triton/experimental/gluon/language/nvidia/blackwell/__init__.py +387 -0
triton/experimental/gluon/language/nvidia/blackwell/tma.py +52 -0
triton/experimental/gluon/language/nvidia/hopper/__init__.py +132 -0
triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +34 -0
triton/experimental/gluon/language/nvidia/hopper/tma.py +97 -0
triton/experimental/gluon/nvidia/__init__.py +4 -0
triton/experimental/gluon/nvidia/blackwell.py +3 -0
triton/experimental/gluon/nvidia/hopper.py +45 -0
triton/knobs.py +546 -0
triton/language/__init__.py +50 -19
triton/language/core.py +909 -572
triton/language/extra/cuda/__init__.py +10 -7
triton/language/extra/cuda/gdc.py +42 -0
triton/language/extra/cuda/libdevice.py +394 -394
triton/language/extra/cuda/utils.py +21 -21
triton/language/extra/hip/__init__.py +3 -1
triton/language/extra/hip/libdevice.py +120 -104
triton/language/extra/hip/utils.py +35 -0
triton/language/extra/libdevice.py +4 -0
triton/language/math.py +65 -66
triton/language/random.py +12 -2
triton/language/semantic.py +1757 -1768
triton/language/standard.py +127 -62
triton/language/target_info.py +54 -0
triton/runtime/_allocation.py +15 -3
triton/runtime/_async_compile.py +55 -0
triton/runtime/autotuner.py +117 -60
triton/runtime/build.py +83 -17
triton/runtime/cache.py +61 -47
triton/runtime/driver.py +25 -47
triton/runtime/interpreter.py +95 -50
triton/runtime/jit.py +445 -248
triton/runtime/tcc/include/_mingw.h +8 -10
triton/runtime/tcc/include/assert.h +5 -0
triton/runtime/tcc/include/errno.h +1 -1
triton/runtime/tcc/include/float.h +21 -3
triton/runtime/tcc/include/iso646.h +36 -0
triton/runtime/tcc/include/limits.h +5 -0
triton/runtime/tcc/include/malloc.h +2 -2
triton/runtime/tcc/include/math.h +21 -261
triton/runtime/tcc/include/stdalign.h +16 -0
triton/runtime/tcc/include/stdarg.h +5 -70
triton/runtime/tcc/include/stdatomic.h +171 -0
triton/runtime/tcc/include/stddef.h +7 -19
triton/runtime/tcc/include/stdlib.h +15 -4
triton/runtime/tcc/include/stdnoreturn.h +7 -0
triton/runtime/tcc/include/sys/stat.h +2 -2
triton/runtime/tcc/include/sys/types.h +5 -0
triton/runtime/tcc/include/tcc/tcc_libm.h +444 -27
triton/runtime/tcc/include/tccdefs.h +342 -0
triton/runtime/tcc/include/tgmath.h +89 -0
triton/runtime/tcc/include/uchar.h +33 -0
triton/runtime/tcc/include/unistd.h +1 -0
triton/runtime/tcc/include/winapi/qos.h +72 -0
triton/runtime/tcc/include/winapi/shellapi.h +59 -0
triton/runtime/tcc/include/winapi/winbase.h +9 -2
triton/runtime/tcc/include/winapi/wincon.h +8 -0
triton/runtime/tcc/include/winapi/windows.h +1 -1
triton/runtime/tcc/include/winapi/winnls.h +778 -0
triton/runtime/tcc/include/winapi/winnt.h +9 -7
triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
triton/runtime/tcc/lib/libtcc1.a +0 -0
triton/runtime/tcc/lib/python314.def +1800 -0
triton/runtime/tcc/lib/python314t.def +1809 -0
triton/runtime/tcc/libtcc.dll +0 -0
triton/runtime/tcc/tcc.exe +0 -0
triton/testing.py +16 -12
triton/tools/compile.py +62 -14
triton/tools/disasm.py +3 -4
triton/tools/extra/cuda/compile.c +1 -0
triton/tools/extra/hip/compile.cpp +66 -0
triton/tools/extra/hip/compile.h +13 -0
triton/tools/ragged_tma.py +92 -0
triton/tools/tensor_descriptor.py +34 -0
triton/windows_utils.py +52 -81
{triton_windows-3.3.1.post19.dist-info → triton_windows-3.5.0.post21.dist-info}/METADATA +8 -4
triton_windows-3.5.0.post21.dist-info/RECORD +217 -0
triton_windows-3.5.0.post21.dist-info/entry_points.txt +3 -0
triton_windows-3.5.0.post21.dist-info/licenses/LICENSE +23 -0
triton_windows-3.5.0.post21.dist-info/top_level.txt +1 -0
triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h +0 -358
triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +0 -1010
triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +0 -1638
triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +0 -1814
triton/backends/amd/include/hip/amd_detail/amd_hip_bfloat16.h +0 -293
triton/backends/amd/include/hip/amd_detail/amd_hip_common.h +0 -32
triton/backends/amd/include/hip/amd_detail/amd_hip_complex.h +0 -174
triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +0 -835
triton/backends/amd/include/hip/amd_detail/amd_hip_fp16.h +0 -1809
triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +0 -1391
triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +0 -108
triton/backends/amd/include/hip/amd_detail/amd_hip_math_constants.h +0 -124
triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h +0 -405
triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +0 -196
triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +0 -565
triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h +0 -2226
triton/backends/amd/include/hip/amd_detail/amd_math_functions.h +0 -104
triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h +0 -244
triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +0 -538
triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +0 -288
triton/backends/amd/include/hip/amd_detail/concepts.hpp +0 -30
triton/backends/amd/include/hip/amd_detail/device_library_decls.h +0 -133
triton/backends/amd/include/hip/amd_detail/functional_grid_launch.hpp +0 -218
triton/backends/amd/include/hip/amd_detail/grid_launch.h +0 -67
triton/backends/amd/include/hip/amd_detail/grid_launch.hpp +0 -50
triton/backends/amd/include/hip/amd_detail/grid_launch_GGL.hpp +0 -26
triton/backends/amd/include/hip/amd_detail/helpers.hpp +0 -137
triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +0 -1446
triton/backends/amd/include/hip/amd_detail/hip_assert.h +0 -101
triton/backends/amd/include/hip/amd_detail/hip_cooperative_groups_helper.h +0 -242
triton/backends/amd/include/hip/amd_detail/hip_fp16_gcc.h +0 -254
triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h +0 -96
triton/backends/amd/include/hip/amd_detail/hip_ldg.h +0 -100
triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +0 -10570
triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +0 -78
triton/backends/amd/include/hip/amd_detail/host_defines.h +0 -184
triton/backends/amd/include/hip/amd_detail/hsa_helpers.hpp +0 -102
triton/backends/amd/include/hip/amd_detail/macro_based_grid_launch.hpp +0 -798
triton/backends/amd/include/hip/amd_detail/math_fwd.h +0 -698
triton/backends/amd/include/hip/amd_detail/ockl_image.h +0 -177
triton/backends/amd/include/hip/amd_detail/program_state.hpp +0 -107
triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h +0 -491
triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h +0 -478
triton/backends/amd/include/hip/channel_descriptor.h +0 -39
triton/backends/amd/include/hip/device_functions.h +0 -38
triton/backends/amd/include/hip/driver_types.h +0 -468
triton/backends/amd/include/hip/hip_bf16.h +0 -36
triton/backends/amd/include/hip/hip_bfloat16.h +0 -44
triton/backends/amd/include/hip/hip_common.h +0 -100
triton/backends/amd/include/hip/hip_complex.h +0 -38
triton/backends/amd/include/hip/hip_cooperative_groups.h +0 -46
triton/backends/amd/include/hip/hip_deprecated.h +0 -95
triton/backends/amd/include/hip/hip_ext.h +0 -161
triton/backends/amd/include/hip/hip_fp16.h +0 -36
triton/backends/amd/include/hip/hip_fp8.h +0 -33
triton/backends/amd/include/hip/hip_gl_interop.h +0 -32
triton/backends/amd/include/hip/hip_hcc.h +0 -24
triton/backends/amd/include/hip/hip_math_constants.h +0 -36
triton/backends/amd/include/hip/hip_profile.h +0 -27
triton/backends/amd/include/hip/hip_runtime.h +0 -75
triton/backends/amd/include/hip/hip_runtime_api.h +0 -9261
triton/backends/amd/include/hip/hip_texture_types.h +0 -29
triton/backends/amd/include/hip/hip_vector_types.h +0 -41
triton/backends/amd/include/hip/hip_version.h +0 -17
triton/backends/amd/include/hip/hiprtc.h +0 -421
triton/backends/amd/include/hip/library_types.h +0 -78
triton/backends/amd/include/hip/math_functions.h +0 -42
triton/backends/amd/include/hip/surface_types.h +0 -63
triton/backends/amd/include/hip/texture_types.h +0 -194
triton/backends/amd/include/hsa/Brig.h +0 -1131
triton/backends/amd/include/hsa/amd_hsa_common.h +0 -91
triton/backends/amd/include/hsa/amd_hsa_elf.h +0 -462
triton/backends/amd/include/hsa/amd_hsa_kernel_code.h +0 -269
triton/backends/amd/include/hsa/amd_hsa_queue.h +0 -109
triton/backends/amd/include/hsa/amd_hsa_signal.h +0 -80
triton/backends/amd/include/hsa/hsa.h +0 -5738
triton/backends/amd/include/hsa/hsa_amd_tool.h +0 -91
triton/backends/amd/include/hsa/hsa_api_trace.h +0 -579
triton/backends/amd/include/hsa/hsa_api_trace_version.h +0 -68
triton/backends/amd/include/hsa/hsa_ext_amd.h +0 -3146
triton/backends/amd/include/hsa/hsa_ext_finalize.h +0 -531
triton/backends/amd/include/hsa/hsa_ext_image.h +0 -1454
triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +0 -488
triton/backends/amd/include/hsa/hsa_ven_amd_loader.h +0 -667
triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +0 -416
triton/backends/amd/include/roctracer/ext/prof_protocol.h +0 -107
triton/backends/amd/include/roctracer/hip_ostream_ops.h +0 -4515
triton/backends/amd/include/roctracer/hsa_ostream_ops.h +0 -1727
triton/backends/amd/include/roctracer/hsa_prof_str.h +0 -3059
triton/backends/amd/include/roctracer/roctracer.h +0 -779
triton/backends/amd/include/roctracer/roctracer_ext.h +0 -81
triton/backends/amd/include/roctracer/roctracer_hcc.h +0 -24
triton/backends/amd/include/roctracer/roctracer_hip.h +0 -37
triton/backends/amd/include/roctracer/roctracer_hsa.h +0 -112
triton/backends/amd/include/roctracer/roctracer_plugin.h +0 -137
triton/backends/amd/include/roctracer/roctracer_roctx.h +0 -67
triton/backends/amd/include/roctracer/roctx.h +0 -229
triton/language/_utils.py +0 -21
triton/language/extra/cuda/_experimental_tma.py +0 -106
triton/runtime/tcc/lib/libtcc1-64.a +0 -0
triton/tools/experimental_descriptor.py +0 -32
triton_windows-3.3.1.post19.dist-info/RECORD +0 -260
triton_windows-3.3.1.post19.dist-info/top_level.txt +0 -14
{triton_windows-3.3.1.post19.dist-info → triton_windows-3.5.0.post21.dist-info}/WHEEL +0 -0

triton/compiler/compiler.py CHANGED Viewed

@@ -3,19 +3,19 @@ import hashlib
 import json
 from .._C.libtriton import get_cache_invalidating_env_vars, ir
 from ..backends import backends
-from ..backends.compiler import GPUTarget
-from .. import __version__
+from ..backends.compiler import Language
+from ..backends.compiler import BaseBackend, GPUTarget
+from .. import __version__, knobs
 from ..runtime.autotuner import OutOfResources
-from ..runtime.cache import get_cache_manager, get_dump_manager, get_override_manager
+from ..runtime.cache import get_cache_manager, get_dump_manager, get_override_manager, get_cache_key
 from ..runtime.driver import driver
 from ..tools.disasm import get_sass
-# TODO: this shouldn't be here
-from .code_generator import ast_to_ttir
 from pathlib import Path
 import re
 import functools
 import os
-import sysconfig
+import time
+import copy
 # - ^\s*tt\.func\s+ : match the start of the string, any leading whitespace, the keyword func,
 #    and any following whitespace
@@ -53,6 +53,7 @@ class ASTSource:
     def __init__(self, fn, signature, constexprs=None, attrs=None) -> None:
         self.fn = fn
+        self.language = Language.TRITON
         self.ext = "ttir"
         self.name = fn.__name__
         self.signature = signature
@@ -63,12 +64,9 @@ class ASTSource:
                 assert isinstance(k, tuple)
                 self.constants[k] = v
         self.attrs = attrs or dict()
-        if isinstance(self.signature, str):
-            self.signature = {k: v.strip() for k, v in enumerate(self.signature.split(","))}
-        else:
-            for k in self.signature.keys():
-                if not isinstance(k, str):
-                    raise TypeError("Signature keys must be string")
+        for k in self.signature.keys():
+            if not isinstance(k, str):
+                raise TypeError("Signature keys must be string")
     def hash(self):
         sorted_sig = [v for k, v in sorted(self.signature.items())]
@@ -77,7 +75,8 @@ class ASTSource:
         key = f"{self.fn.cache_key}-{str(self.attrs)}-{sorted_sig}-{constants_key}"
         return hashlib.sha256(key.encode("utf-8")).hexdigest()
-    def make_ir(self, options, codegen_fns, module_map, context):
+    def make_ir(self, target: GPUTarget, options, codegen_fns, module_map, context):
+        from .code_generator import ast_to_ttir
         return ast_to_ttir(self.fn, self, context=context, options=options, codegen_fns=codegen_fns,
                            module_map=module_map)
@@ -91,6 +90,7 @@ class IRSource:
         self.path = path
         path = Path(path)
         self.ext = path.suffix[1:]
+        self.language = Language.TRITON
         self.src = path.read_text()
         ir.load_dialects(context)
         backend.load_dialects(context)
@@ -114,7 +114,7 @@ class IRSource:
     def hash(self):
         return hashlib.sha256(self.src.encode("utf-8")).hexdigest()
-    def make_ir(self, options, codegen_fns, module_map, context):
+    def make_ir(self, target: GPUTarget, options, codegen_fns, module_map, context):
         self.module.context = context
         return self.module
@@ -127,39 +127,8 @@ class IRSource:
 @functools.lru_cache()
-def triton_key():
-    import pkgutil
-    TRITON_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-    contents = []
-    # frontend
-    with open(__file__, "rb") as f:
-        contents += [hashlib.sha256(f.read()).hexdigest()]
-    # compiler
-    path_prefixes = [
-        (os.path.join(TRITON_PATH, "compiler"), "triton.compiler."),
-        (os.path.join(TRITON_PATH, "backends"), "triton.backends."),
-    ]
-    for path, prefix in path_prefixes:
-        for lib in pkgutil.walk_packages([path], prefix=prefix):
-            with open(lib.module_finder.find_spec(lib.name).origin, "rb") as f:
-                contents += [hashlib.sha256(f.read()).hexdigest()]
-    # backend
-    libtriton_hash = hashlib.sha256()
-    ext = sysconfig.get_config_var("EXT_SUFFIX").split(".")[-1]
-    with open(os.path.join(TRITON_PATH, "_C", f"libtriton.{ext}"), "rb") as f:
-        while True:
-            chunk = f.read(1024**2)
-            if not chunk:
-                break
-            libtriton_hash.update(chunk)
-    contents.append(libtriton_hash.hexdigest())
-    # language
-    language_path = os.path.join(TRITON_PATH, 'language')
-    for lib in pkgutil.walk_packages([language_path], prefix="triton.language."):
-        with open(lib.module_finder.find_spec(lib.name).origin, "rb") as f:
-            contents += [hashlib.sha256(f.read()).hexdigest()]
-    return f'{__version__}' + '-'.join(contents)
+def max_shared_mem(device):
+    return driver.active.utils.get_device_properties(device)["max_shared_mem"]
 def parse(full_name, ext, context):
@@ -179,7 +148,7 @@ def filter_traceback(e: BaseException):
     These are uninteresting to the user -- "just show me *my* code!"
     """
-    if os.getenv("TRITON_FRONT_END_DEBUGGING", "0") == "1":
+    if knobs.compilation.front_end_debugging:
         return
     if e.__cause__ is not None:
@@ -211,7 +180,50 @@ def filter_traceback(e: BaseException):
         e.__traceback__ = frames[0]
-def compile(src, target=None, options=None):
+class CompileTimer:
+    def __init__(self) -> None:
+        self.start: float = time.perf_counter()
+        self.ir_initialization_end: float | None = None
+        self.lowering_stage_ends: list[tuple[str, float]] = []
+        self.store_results_end: float | None = None
+    def finished_ir_initialization(self) -> None:
+        self.ir_initialization_end = time.perf_counter()
+    def stage_finished(self, stage_name: str) -> None:
+        self.lowering_stage_ends.append((stage_name, time.perf_counter()))
+    def end(self) -> knobs.CompileTimes:
+        timestamp = time.perf_counter()
+        if self.ir_initialization_end is None:
+            self.ir_initialization_end = timestamp
+        else:
+            self.store_results_end = timestamp
+        def delta(start: float, end: float | None) -> int:
+            if end is None:
+                return 0
+            return int((end - start) * 1000000)
+        lowering_stage_durations = []
+        stage_start = self.ir_initialization_end
+        for stage_name, stage_end in self.lowering_stage_ends:
+            lowering_stage_durations.append((stage_name, delta(stage_start, stage_end)))
+            stage_start = stage_end
+        return knobs.CompileTimes(
+            ir_initialization=delta(self.start, self.ir_initialization_end),
+            lowering_stages=lowering_stage_durations,
+            store_results=delta(stage_start, self.store_results_end),
+        )
+def compile(src, target=None, options=None, _env_vars=None):
+    compilation_listener = knobs.compilation.listener
+    if compilation_listener:
+        timer = CompileTimer()
     if target is None:
         target = driver.active.get_current_target()
     assert isinstance(target, GPUTarget), "target must be of GPUTarget type"
@@ -226,15 +238,15 @@ def compile(src, target=None, options=None):
     extra_options = src.parse_options()
     options = backend.parse_options(dict(options or dict(), **extra_options))
     # create cache manager
-    env_vars = get_cache_invalidating_env_vars()
-    key = f"{triton_key()}-{src.hash()}-{backend.hash()}-{options.hash()}-{str(sorted(env_vars.items()))}"
+    env_vars = get_cache_invalidating_env_vars() if _env_vars is None else _env_vars
+    key = get_cache_key(src, backend, options, env_vars=env_vars)
     hash = hashlib.sha256(key.encode("utf-8")).hexdigest()
     fn_cache_manager = get_cache_manager(hash)
     # For dumping/overriding only hash the source as we want it to be independent of triton
     # core changes to make it easier to track kernels by hash.
-    enable_override = os.environ.get("TRITON_KERNEL_OVERRIDE", "0") == "1"
-    enable_ir_dump = os.environ.get("TRITON_KERNEL_DUMP", "0") == "1"
-    store_only_binary = os.environ.get("TRITON_STORE_BINARY_ONLY", "0") == "1"
+    enable_override = knobs.compilation.override
+    enable_ir_dump = knobs.compilation.dump_ir
+    store_only_binary = knobs.compilation.store_binary_only
     fn_override_manager = get_override_manager(src.hash()) if enable_override else None
     fn_dump_manager = get_dump_manager(src.hash()) if enable_ir_dump else None
     # Pre-truncate the file name here to avoid hitting the 255 character limit on common platforms.
@@ -245,10 +257,20 @@ def compile(src, target=None, options=None):
     metadata_filename = f"{file_name}.json"
     metadata_group = fn_cache_manager.get_group(metadata_filename) or {}
     metadata_path = metadata_group.get(metadata_filename)
-    always_compile = os.environ.get("TRITON_ALWAYS_COMPILE", "0") == "1"
+    always_compile = knobs.compilation.always_compile
     if not always_compile and metadata_path is not None:
         # cache hit!
-        return CompiledKernel(src, metadata_group, hash)
+        res = CompiledKernel(src, metadata_group, hash)
+        if compilation_listener:
+            compilation_listener(
+                src=src,
+                metadata=res.metadata._asdict(),
+                metadata_group=metadata_group,
+                times=timer.end(),
+                cache_hit=True,
+            )
+        return res
     # initialize metadata
     metadata = {
         "hash": hash,
@@ -259,7 +281,7 @@ def compile(src, target=None, options=None):
     metadata["triton_version"] = __version__
     # run compilation pipeline  and populate metadata
     stages = dict()
-    backend.add_stages(stages, options)
+    backend.add_stages(stages, options, src.language)
     first_stage = list(stages.keys()).index(src.ext)
     # when the source is an IR file, don't apply the passes related to this stage. This makes it easier to write IR level tests.
     if ir_source:
@@ -275,15 +297,34 @@ def compile(src, target=None, options=None):
     codegen_fns = backend.get_codegen_implementation(options)
     module_map = backend.get_module_map()
     try:
-        module = src.make_ir(options, codegen_fns, module_map, context)
+        module = src.make_ir(target, options, codegen_fns, module_map, context)
     except Exception as e:
         filter_traceback(e)
         raise
-    use_ir_loc = os.environ.get("USE_IR_LOC", None)
+    if ir_source:
+        ir_filename = f"{file_name}.{src.ext}"
+        metadata_group[ir_filename] = fn_cache_manager.put(module, ir_filename)
+    else:
+        ir_filename = f"{file_name}.source"
+        metadata_group[ir_filename] = fn_cache_manager.put(module, ir_filename)
+    use_ir_loc = knobs.compilation.use_ir_loc
+    if ir_source and use_ir_loc:
+        module.create_location_snapshot(src.path)
+        print(f"Creating new locations for {src.path}")
+    if compilation_listener:
+        timer.finished_ir_initialization()
     for ext, compile_ir in list(stages.items())[first_stage:]:
         next_module = compile_ir(module, metadata)
         ir_filename = f"{file_name}.{ext}"
-        if (fn_override_manager is not None and (full_name := fn_override_manager.get_file(ir_filename)) is not None):
+        if fn_override_manager is None:
+            # Users can override kernels at scale by setting `ir_override` in autotune config
+            # without TRITON_KERNEL_OVERRIDE
+            if (ir_override := metadata.get("ir_override", None)) and ir_override.endswith(f".{ext}"):
+                next_module = parse(ir_override, ext, context)
+        elif full_name := fn_override_manager.get_file(ir_filename):
             print(f"\nOverriding kernel with file {full_name}")
             next_module = parse(full_name, ext, context)
         # If TRITON_STORE_BINARY_ONLY is 1, only store cubin/hsaco/json
@@ -291,12 +332,17 @@ def compile(src, target=None, options=None):
             metadata_group[ir_filename] = fn_cache_manager.put(next_module, ir_filename)
         if fn_dump_manager is not None:
             fn_dump_manager.put(next_module, ir_filename)
+            if ext == "cubin":
+                sass = get_sass(next_module)
+                fn_dump_manager.put(sass, file_name + ".sass")
         # use an env variable to parse ir from file
         if use_ir_loc == ext:
             ir_full_name = fn_cache_manager.get_file(ir_filename)
             next_module.create_location_snapshot(ir_full_name)
             print(f"Creating new locations for {ir_full_name}")
         module = next_module
+        if compilation_listener:
+            timer.stage_finished(ext)
     # write-back metadata
     metadata_group[metadata_filename] = fn_cache_manager.put(json.dumps(metadata, default=vars), metadata_filename,
                                                              binary=False)
@@ -310,13 +356,18 @@ def compile(src, target=None, options=None):
     # this is likely due to the llvm-symbolizer forking a process
     # TODO: Reconcile the difference here between the ASAN and non-ASAN path with enabling
     # multithreading in the MLIR context
-    if not os.environ.get("TRITON_ENABLE_ASAN", "0") == "1":
+    if not knobs.compilation.enable_asan:
         context.disable_multithreading()
+    # notify any listener
+    if compilation_listener:
+        compilation_listener(src=src, metadata=metadata, metadata_group=metadata_group, times=timer.end(),
+                             cache_hit=False)
     # return handle to compiled kernel
     return CompiledKernel(src, metadata_group, hash)
-def make_backend(target):
+def make_backend(target: GPUTarget) -> BaseBackend:
     actives = [x.compiler for x in backends.values() if x.compiler.supports_target(target)]
     if len(actives) != 1:
         raise RuntimeError(
@@ -330,7 +381,7 @@ class LazyDict:
         self.data = data
         self.extras = []
-    def get(self) -> None:
+    def get(self):
         for func, args in self.extras:
             self.data = self.data | func(*args)
         self.extras.clear()
@@ -353,12 +404,11 @@ class AsmDict(dict):
         return value
-class CompiledKernel:
+def _raise_error(err, *args, **kwargs):
+    raise copy.deepcopy(err)
-    # Hooks for external tools to monitor the execution of triton kernels
-    # TODO: move out of this namespace since it's a runtime thing
-    launch_enter_hook = None
-    launch_exit_hook = None
+class CompiledKernel:
     def __init__(self, src, metadata_group, hash):
         from collections import namedtuple
@@ -382,48 +432,66 @@ class CompiledKernel:
             file.suffix[1:]: file.read_bytes() if file.suffix[1:] == binary_ext else file.read_text()
             for file in asm_files
         })
+        self.metadata_group = metadata_group
         self.kernel = self.asm[binary_ext]
         # binaries are lazily initialized
         # because it involves doing runtime things
         # (e.g., checking amount of shared memory on current device)
         self.module = None
         self.function = None
+        self._run = None
     def _init_handles(self):
         if self.module is not None:
             return
+        def raise_(err):
+            # clone the exception object so that the one saved in the closure
+            # of the partial function below doesn't get assigned a stack trace
+            # after the subsequent raise. otherwise, the CompiledKernel instance
+            # saved in the (global) kernel cache will keep references to all the
+            # locals in the traceback via the exception instance in the closure.
+            cloned_err = copy.deepcopy(err)
+            self._run = functools.partial(_raise_error, cloned_err)
+            raise err
         device = driver.active.get_current_device()
         # create launcher
-        self.run = driver.active.launcher_cls(self.src, self.metadata)
+        self._run = driver.active.launcher_cls(self.src, self.metadata)
         # not enough shared memory to run the kernel
-        max_shared = driver.active.utils.get_device_properties(device)["max_shared_mem"]
+        max_shared = max_shared_mem(device)
         if self.metadata.shared > max_shared:
-            raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+            raise_(OutOfResources(self.metadata.shared, max_shared, "shared memory"))
         if hasattr(self.metadata, "tmem_size") and self.metadata.tmem_size is not None:
             # Use blackwell max tmem size for now, this should be moved in device properties
             max_tmem_size = 512  # tmem size in number of columns
             if self.metadata.tmem_size > max_tmem_size:
-                raise OutOfResources(self.metadata.tmem_size, max_tmem_size, "tensor memory")
+                raise_(OutOfResources(self.metadata.tmem_size, max_tmem_size, "tensor memory"))
+        if knobs.runtime.kernel_load_start_hook is not None:
+            knobs.runtime.kernel_load_start_hook(self.module, self.function, self.name, self.metadata_group, self.hash)
         # TODO: n_regs, n_spills should be metadata generated when calling `ptxas`
-        self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+        self.module, self.function, self.n_regs, self.n_spills, self.n_max_threads = driver.active.utils.load_binary(
             self.name, self.kernel, self.metadata.shared, device)
-    def __getattribute__(self, name):
-        if name == 'run':
+        warp_size = driver.active.get_current_target().warp_size
+        if self.metadata.num_warps * warp_size > self.n_max_threads:
+            raise_(OutOfResources(self.metadata.num_warps * warp_size, self.n_max_threads, "threads"))
+        if knobs.runtime.kernel_load_end_hook is not None:
+            knobs.runtime.kernel_load_end_hook(self.module, self.function, self.name, self.metadata_group, self.hash)
+    @property
+    def run(self):
+        if self._run is None:
             self._init_handles()
-        return super().__getattribute__(name)
+        return self._run
     def launch_metadata(self, grid, stream, *args):
-        if CompiledKernel.launch_enter_hook is None:
+        if knobs.runtime.launch_enter_hook is None:
             return None
+        self._init_handles()
         ret = LazyDict({"name": self.name, "function": self.function, "stream": stream})
         if not isinstance(self.src, ASTSource) or self.src.fn.launch_metadata is None:
             return ret
-        arg_dict = {}
-        arg_idx = 0
-        for i, arg_name in enumerate(self.src.fn.arg_names):
-            arg_dict[arg_name] = args[arg_idx]
-            arg_idx += 1
+        arg_dict = {name: arg for name, arg in zip(self.src.fn.arg_names, args)}
         ret.add(self.src.fn.launch_metadata, (grid, self.metadata, arg_dict))
         return ret
@@ -436,6 +504,6 @@ class CompiledKernel:
                 stream = driver.active.get_current_stream(device)
             launch_metadata = self.launch_metadata(grid, stream, *args)
             self.run(grid[0], grid[1], grid[2], stream, self.function, self.packed_metadata, launch_metadata,
-                     CompiledKernel.launch_enter_hook, CompiledKernel.launch_exit_hook, *args)
+                     knobs.runtime.launch_enter_hook, knobs.runtime.launch_exit_hook, *args)
         return runner

triton/experimental/__init__.py ADDED Viewed

File without changes

triton/experimental/gluon/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from . import nvidia
+from ._runtime import constexpr_function, jit
+from triton.language.core import must_use_result
+__all__ = ["constexpr_function", "jit", "must_use_result", "nvidia"]

triton/experimental/gluon/_compiler.py ADDED Viewed

File without changes

triton/experimental/gluon/_runtime.py ADDED Viewed

@@ -0,0 +1,102 @@
+from __future__ import annotations
+from triton.compiler.compiler import ASTSource
+from triton.backends.compiler import Language
+from triton.runtime.jit import JITFunction, constexpr_function
+from typing import TypeVar, Optional, Callable, Iterable, Union
+from triton._C.libtriton import ir
+T = TypeVar("T")
+__all__ = ["constexpr_function", "jit"]
+class GluonASTSource(ASTSource):
+    def __init__(self, fn, signature, constexprs=None, attrs=None) -> None:
+        super().__init__(fn, signature, constexprs, attrs)
+        self.language = Language.GLUON
+        self.ext = "ttgir"
+    def make_ir(self, target, options, codegen_fns, module_map, context):
+        from triton.compiler.compiler import make_backend
+        from triton.compiler.code_generator import ast_to_ttir
+        builder = ir.builder(context)
+        module = builder.create_module()
+        # Assign module attributes eagerly, as they are needed to verify layouts
+        backend = make_backend(target)
+        target = backend.get_target_name(options)
+        module.set_attr("ttg.target", builder.get_string_attr(target))
+        module.set_attr("ttg.num-warps", builder.get_int32_attr(options.num_warps))
+        module.set_attr("ttg.num-ctas", builder.get_int32_attr(options.num_ctas))
+        module.set_attr("ttg.threads-per-warp", builder.get_int32_attr(options.warp_size))
+        is_cuda = options.backend_name == "cuda"
+        if is_cuda and options.maxnreg is not None:
+            module.set_attr("ttg.maxnreg", builder.get_int32_attr(options.maxnreg))
+        module = ast_to_ttir(self.fn, self, context=context, options=options, codegen_fns=codegen_fns,
+                             module_map=module_map, module=module)
+        return module
+class GluonJITFunction(JITFunction[T]):
+    def create_binder(self):
+        result = super().create_binder()
+        self.ASTSource = GluonASTSource
+        return result
+    def is_gluon(self):
+        return True
+def jit(
+    fn: Optional[T] = None,
+    *,
+    version=None,
+    repr: Optional[Callable] = None,
+    launch_metadata: Optional[Callable] = None,
+    do_not_specialize: Optional[Iterable[int | str]] = None,
+    do_not_specialize_on_alignment: Optional[Iterable[int | str]] = None,
+    debug: Optional[bool] = None,
+    noinline: Optional[bool] = None,
+) -> Union[GluonJITFunction[T], Callable[[T], JITFunction[T]]]:
+    """
+    Decorator for JIT-compiling a function using the Triton compiler.
+    :note: When a jit'd function is called, arguments are
+        implicitly converted to pointers if they have a :code:`.data_ptr()` method
+        and a `.dtype` attribute.
+    :note: This function will be compiled and run on the GPU. It will only have access to:
+           * python primitives,
+           * builtins within the triton package,
+           * arguments to this function,
+           * other jit'd functions
+    :param fn: the function to be jit-compiled
+    :type fn: Callable
+    """
+    def decorator(fn: T) -> JITFunction[T]:
+        assert callable(fn)
+        return GluonJITFunction(
+            fn,
+            version=version,
+            do_not_specialize=do_not_specialize,
+            do_not_specialize_on_alignment=do_not_specialize_on_alignment,
+            debug=debug,
+            noinline=noinline,
+            repr=repr,
+            launch_metadata=launch_metadata,
+        )
+    if fn is not None:
+        return decorator(fn)
+    else:
+        return decorator

triton/experimental/gluon/language/__init__.py ADDED Viewed

@@ -0,0 +1,119 @@
+from ._core import (
+    base_value,
+    base_type,
+    block_type,
+    broadcast,
+    constexpr,
+    dtype,
+    void,
+    int1,
+    int8,
+    int16,
+    int32,
+    int64,
+    uint8,
+    uint16,
+    uint32,
+    uint64,
+    float8e5,
+    float8e5b16,
+    float8e4nv,
+    float8e4b8,
+    float8e4b15,
+    float16,
+    bfloat16,
+    float32,
+    float64,
+    pointer_type,
+    shared_memory_descriptor,
+    tensor,
+    tuple,
+    tuple_type,
+    _unwrap_if_constexpr,
+    # API Functions
+    allocate_shared_memory,
+    arange,
+    associative_scan,
+    atomic_add,
+    atomic_and,
+    atomic_cas,
+    atomic_max,
+    atomic_min,
+    atomic_or,
+    atomic_xchg,
+    atomic_xor,
+    convert_layout,
+    device_assert,
+    expand_dims,
+    full,
+    histogram,
+    inline_asm_elementwise,
+    join,
+    load,
+    map_elementwise,
+    max_constancy,
+    max_contiguous,
+    maximum,
+    minimum,
+    multiple_of,
+    num_programs,
+    permute,
+    program_id,
+    reduce,
+    reshape,
+    set_auto_layout,
+    split,
+    static_assert,
+    static_print,
+    static_range,
+    store,
+    thread_barrier,
+    to_tensor,
+    warp_specialize,
+    where,
+)
+from ._layouts import (
+    AutoLayout,
+    BlockedLayout,
+    SliceLayout,
+    DistributedLinearLayout,
+    DotOperandLayout,
+    NVMMADistributedLayout,
+    NVMMASharedLayout,
+    SwizzledSharedLayout,
+    PaddedSharedLayout,
+)
+from ._math import (
+    umulhi,
+    exp,
+    exp2,
+    fma,
+    log,
+    log2,
+    cos,
+    rsqrt,
+    sin,
+    sqrt,
+    sqrt_rn,
+    abs,
+    fdiv,
+    div_rn,
+    erf,
+    floor,
+    ceil,
+)
+from ._standard import (
+    cdiv,
+    full_like,
+    max,
+    min,
+    reduce_or,
+    sum,
+    xor_sum,
+    zeros,
+    zeros_like,
+)
+from . import nvidia
+from . import amd
+from . import extra