PyPI - triton-windows - Versions diffs - 3.5.0.post21__cp314-cp314-win_amd64.whl - Mend

triton-windows 3.5.0.post21__cp314-cp314-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of triton-windows might be problematic. Click here for more details.

Files changed (217)

triton/_C/libtriton.pyd +0 -0
triton/__init__.py +82 -0
triton/_filecheck.py +97 -0
triton/_internal_testing.py +255 -0
triton/_utils.py +126 -0
triton/backends/__init__.py +47 -0
triton/backends/amd/__init__.py +0 -0
triton/backends/amd/compiler.py +461 -0
triton/backends/amd/driver.c +283 -0
triton/backends/amd/driver.py +724 -0
triton/backends/amd/lib/asanrtl.bc +0 -0
triton/backends/amd/lib/ockl.bc +0 -0
triton/backends/amd/lib/ocml.bc +0 -0
triton/backends/compiler.py +90 -0
triton/backends/driver.py +66 -0
triton/backends/nvidia/__init__.py +0 -0
triton/backends/nvidia/bin/ptxas.exe +0 -0
triton/backends/nvidia/compiler.py +533 -0
triton/backends/nvidia/driver.c +517 -0
triton/backends/nvidia/driver.py +799 -0
triton/backends/nvidia/include/cuda.h +26280 -0
triton/backends/nvidia/lib/libdevice.10.bc +0 -0
triton/backends/nvidia/lib/x64/cuda.lib +0 -0
triton/compiler/__init__.py +7 -0
triton/compiler/code_generator.py +1614 -0
triton/compiler/compiler.py +509 -0
triton/compiler/errors.py +51 -0
triton/compiler/make_launcher.py +0 -0
triton/errors.py +5 -0
triton/experimental/__init__.py +0 -0
triton/experimental/gluon/__init__.py +5 -0
triton/experimental/gluon/_compiler.py +0 -0
triton/experimental/gluon/_runtime.py +102 -0
triton/experimental/gluon/language/__init__.py +119 -0
triton/experimental/gluon/language/_core.py +490 -0
triton/experimental/gluon/language/_layouts.py +583 -0
triton/experimental/gluon/language/_math.py +20 -0
triton/experimental/gluon/language/_semantic.py +380 -0
triton/experimental/gluon/language/_standard.py +80 -0
triton/experimental/gluon/language/amd/__init__.py +4 -0
triton/experimental/gluon/language/amd/_layouts.py +96 -0
triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
triton/experimental/gluon/language/extra/__init__.py +3 -0
triton/experimental/gluon/language/nvidia/__init__.py +4 -0
triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
triton/experimental/gluon/language/nvidia/blackwell/__init__.py +387 -0
triton/experimental/gluon/language/nvidia/blackwell/tma.py +52 -0
triton/experimental/gluon/language/nvidia/hopper/__init__.py +132 -0
triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +34 -0
triton/experimental/gluon/language/nvidia/hopper/tma.py +97 -0
triton/experimental/gluon/nvidia/__init__.py +4 -0
triton/experimental/gluon/nvidia/blackwell.py +3 -0
triton/experimental/gluon/nvidia/hopper.py +45 -0
triton/knobs.py +546 -0
triton/language/__init__.py +342 -0
triton/language/core.py +3405 -0
triton/language/extra/__init__.py +26 -0
triton/language/extra/cuda/__init__.py +16 -0
triton/language/extra/cuda/gdc.py +42 -0
triton/language/extra/cuda/libdevice.py +1629 -0
triton/language/extra/cuda/utils.py +109 -0
triton/language/extra/hip/__init__.py +5 -0
triton/language/extra/hip/libdevice.py +491 -0
triton/language/extra/hip/utils.py +35 -0
triton/language/extra/libdevice.py +790 -0
triton/language/math.py +249 -0
triton/language/random.py +218 -0
triton/language/semantic.py +1939 -0
triton/language/standard.py +534 -0
triton/language/target_info.py +54 -0
triton/runtime/__init__.py +23 -0
triton/runtime/_allocation.py +44 -0
triton/runtime/_async_compile.py +55 -0
triton/runtime/autotuner.py +476 -0
triton/runtime/build.py +168 -0
triton/runtime/cache.py +317 -0
triton/runtime/driver.py +38 -0
triton/runtime/errors.py +36 -0
triton/runtime/interpreter.py +1414 -0
triton/runtime/jit.py +1107 -0
triton/runtime/tcc/include/_mingw.h +168 -0
triton/runtime/tcc/include/assert.h +62 -0
triton/runtime/tcc/include/conio.h +409 -0
triton/runtime/tcc/include/ctype.h +281 -0
triton/runtime/tcc/include/dir.h +31 -0
triton/runtime/tcc/include/direct.h +68 -0
triton/runtime/tcc/include/dirent.h +135 -0
triton/runtime/tcc/include/dos.h +55 -0
triton/runtime/tcc/include/errno.h +75 -0
triton/runtime/tcc/include/excpt.h +123 -0
triton/runtime/tcc/include/fcntl.h +52 -0
triton/runtime/tcc/include/fenv.h +108 -0
triton/runtime/tcc/include/float.h +75 -0
triton/runtime/tcc/include/inttypes.h +297 -0
triton/runtime/tcc/include/io.h +418 -0
triton/runtime/tcc/include/iso646.h +36 -0
triton/runtime/tcc/include/limits.h +116 -0
triton/runtime/tcc/include/locale.h +91 -0
triton/runtime/tcc/include/malloc.h +181 -0
triton/runtime/tcc/include/math.h +497 -0
triton/runtime/tcc/include/mem.h +13 -0
triton/runtime/tcc/include/memory.h +40 -0
triton/runtime/tcc/include/process.h +176 -0
triton/runtime/tcc/include/sec_api/conio_s.h +42 -0
triton/runtime/tcc/include/sec_api/crtdbg_s.h +19 -0
triton/runtime/tcc/include/sec_api/io_s.h +33 -0
triton/runtime/tcc/include/sec_api/mbstring_s.h +52 -0
triton/runtime/tcc/include/sec_api/search_s.h +25 -0
triton/runtime/tcc/include/sec_api/stdio_s.h +145 -0
triton/runtime/tcc/include/sec_api/stdlib_s.h +67 -0
triton/runtime/tcc/include/sec_api/stralign_s.h +30 -0
triton/runtime/tcc/include/sec_api/string_s.h +41 -0
triton/runtime/tcc/include/sec_api/sys/timeb_s.h +34 -0
triton/runtime/tcc/include/sec_api/tchar_s.h +266 -0
triton/runtime/tcc/include/sec_api/time_s.h +61 -0
triton/runtime/tcc/include/sec_api/wchar_s.h +128 -0
triton/runtime/tcc/include/setjmp.h +160 -0
triton/runtime/tcc/include/share.h +28 -0
triton/runtime/tcc/include/signal.h +63 -0
triton/runtime/tcc/include/stdalign.h +16 -0
triton/runtime/tcc/include/stdarg.h +14 -0
triton/runtime/tcc/include/stdatomic.h +171 -0
triton/runtime/tcc/include/stdbool.h +11 -0
triton/runtime/tcc/include/stddef.h +42 -0
triton/runtime/tcc/include/stdint.h +212 -0
triton/runtime/tcc/include/stdio.h +429 -0
triton/runtime/tcc/include/stdlib.h +591 -0
triton/runtime/tcc/include/stdnoreturn.h +7 -0
triton/runtime/tcc/include/string.h +164 -0
triton/runtime/tcc/include/sys/fcntl.h +13 -0
triton/runtime/tcc/include/sys/file.h +14 -0
triton/runtime/tcc/include/sys/locking.h +30 -0
triton/runtime/tcc/include/sys/stat.h +290 -0
triton/runtime/tcc/include/sys/time.h +69 -0
triton/runtime/tcc/include/sys/timeb.h +133 -0
triton/runtime/tcc/include/sys/types.h +123 -0
triton/runtime/tcc/include/sys/unistd.h +14 -0
triton/runtime/tcc/include/sys/utime.h +146 -0
triton/runtime/tcc/include/tcc/tcc_libm.h +618 -0
triton/runtime/tcc/include/tccdefs.h +342 -0
triton/runtime/tcc/include/tcclib.h +80 -0
triton/runtime/tcc/include/tchar.h +1102 -0
triton/runtime/tcc/include/tgmath.h +89 -0
triton/runtime/tcc/include/time.h +287 -0
triton/runtime/tcc/include/uchar.h +33 -0
triton/runtime/tcc/include/unistd.h +1 -0
triton/runtime/tcc/include/vadefs.h +11 -0
triton/runtime/tcc/include/values.h +4 -0
triton/runtime/tcc/include/varargs.h +12 -0
triton/runtime/tcc/include/wchar.h +873 -0
triton/runtime/tcc/include/wctype.h +172 -0
triton/runtime/tcc/include/winapi/basetsd.h +149 -0
triton/runtime/tcc/include/winapi/basetyps.h +85 -0
triton/runtime/tcc/include/winapi/guiddef.h +156 -0
triton/runtime/tcc/include/winapi/poppack.h +8 -0
triton/runtime/tcc/include/winapi/pshpack1.h +8 -0
triton/runtime/tcc/include/winapi/pshpack2.h +8 -0
triton/runtime/tcc/include/winapi/pshpack4.h +8 -0
triton/runtime/tcc/include/winapi/pshpack8.h +8 -0
triton/runtime/tcc/include/winapi/qos.h +72 -0
triton/runtime/tcc/include/winapi/shellapi.h +59 -0
triton/runtime/tcc/include/winapi/winbase.h +2958 -0
triton/runtime/tcc/include/winapi/wincon.h +309 -0
triton/runtime/tcc/include/winapi/windef.h +293 -0
triton/runtime/tcc/include/winapi/windows.h +127 -0
triton/runtime/tcc/include/winapi/winerror.h +3166 -0
triton/runtime/tcc/include/winapi/wingdi.h +4080 -0
triton/runtime/tcc/include/winapi/winnls.h +778 -0
triton/runtime/tcc/include/winapi/winnt.h +5837 -0
triton/runtime/tcc/include/winapi/winreg.h +272 -0
triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
triton/runtime/tcc/include/winapi/winuser.h +5651 -0
triton/runtime/tcc/include/winapi/winver.h +160 -0
triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
triton/runtime/tcc/lib/cuda.def +697 -0
triton/runtime/tcc/lib/gdi32.def +337 -0
triton/runtime/tcc/lib/kernel32.def +770 -0
triton/runtime/tcc/lib/libtcc1.a +0 -0
triton/runtime/tcc/lib/msvcrt.def +1399 -0
triton/runtime/tcc/lib/python3.def +810 -0
triton/runtime/tcc/lib/python310.def +1610 -0
triton/runtime/tcc/lib/python311.def +1633 -0
triton/runtime/tcc/lib/python312.def +1703 -0
triton/runtime/tcc/lib/python313.def +1651 -0
triton/runtime/tcc/lib/python313t.def +1656 -0
triton/runtime/tcc/lib/python314.def +1800 -0
triton/runtime/tcc/lib/python314t.def +1809 -0
triton/runtime/tcc/lib/python39.def +1644 -0
triton/runtime/tcc/lib/python3t.def +905 -0
triton/runtime/tcc/lib/user32.def +658 -0
triton/runtime/tcc/libtcc.dll +0 -0
triton/runtime/tcc/tcc.exe +0 -0
triton/testing.py +543 -0
triton/tools/__init__.py +0 -0
triton/tools/build_extern.py +365 -0
triton/tools/compile.py +210 -0
triton/tools/disasm.py +143 -0
triton/tools/extra/cuda/compile.c +70 -0
triton/tools/extra/cuda/compile.h +14 -0
triton/tools/extra/hip/compile.cpp +66 -0
triton/tools/extra/hip/compile.h +13 -0
triton/tools/link.py +322 -0
triton/tools/mxfp.py +301 -0
triton/tools/ragged_tma.py +92 -0
triton/tools/tensor_descriptor.py +34 -0
triton/windows_utils.py +405 -0
triton_windows-3.5.0.post21.dist-info/METADATA +46 -0
triton_windows-3.5.0.post21.dist-info/RECORD +217 -0
triton_windows-3.5.0.post21.dist-info/WHEEL +5 -0
triton_windows-3.5.0.post21.dist-info/entry_points.txt +3 -0
triton_windows-3.5.0.post21.dist-info/licenses/LICENSE +23 -0
triton_windows-3.5.0.post21.dist-info/top_level.txt +1 -0

triton/tools/disasm.py ADDED Viewed

@@ -0,0 +1,143 @@
+# MIT License
+# Copyright (c) 2020 Da Yan @ HKUST
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import functools
+import os
+import re
+import subprocess
+import tempfile
+# Instruction line: /*<4 word chars>*/ <asm>; /* 0x<16 word chars> */
+# Group 1 is the assembly text up to and including ';', group 2 the encoding word.
+FLINE_RE = re.compile(r'\s*/\*\w{4}\*/\s*([^;]*;)\s*/\* 0x(\w{16}) \*/\s*')
+# Line holding only a /* 0x<16 word chars> */ encoding word (group 1).
+SLINE_RE = re.compile(r'\s*/\* 0x(\w{16}) \*/\s*')
+# "Function : <name>" header line; group 1 is the function name.
+FNAME_RE = re.compile(r'\s*Function : (\w+)\s*')
+# BRA / BRA.U instruction: group 1 is the text before the target, group 2 the hex target.
+BRA_RE = re.compile(r'(.*BRA(?:\.U)? )(0x\w+);')
+def parseCtrl(sline):
+    """Format the control bit-fields found in an encoding-only SASS line.
+    ``sline`` must match ``SLINE_RE``; its hex word is read as an integer and
+    five bit-fields are extracted at fixed positions (shift 41/4 bits,
+    45/1 bit, 46/3 bits, 49/3 bits, 52/6 bits).
+    Returns ``"<52..57>:<49..51>:<46..48>:<45>:<41..44>"`` where an all-ones
+    3-bit field prints as ``-``, a zero 6-bit field prints as ``--``, a zero
+    bit 45 prints as ``Y`` and the 4-bit field prints in hexadecimal.
+    """
+    word = int(SLINE_RE.match(sline).group(1), 16)
+    def field(shift, mask):
+        return (word >> shift) & mask
+    stall_cnt = field(41, 0xf)
+    yield_bit = field(45, 0x1)
+    write_bar = field(46, 0x7)
+    read_bar = field(49, 0x7)
+    wait_mask = field(52, 0x3f)
+    parts = [
+        f'{wait_mask:02d}' if wait_mask != 0 else '--',
+        str(read_bar) if read_bar != 7 else '-',
+        str(write_bar) if write_bar != 7 else '-',
+        '-' if yield_bit != 0 else 'Y',
+        f'{stall_cnt:x}',
+    ]
+    return ':'.join(parts)
+def processSassLines(fline, sline, labels):
+    """Turn one instruction (two SASS lines) into a ``(ctrl, asm)`` pair.
+    ``fline`` must match ``FLINE_RE`` and ``sline`` must match ``SLINE_RE``.
+    ``labels`` maps branch target address -> label index and is updated in
+    place: a BRA target not seen before receives the next free index.
+    """
+    asm = FLINE_RE.match(fline).group(1)
+    # Remove a space directly in front of the terminating ';'
+    if asm.endswith(" ;"):
+        asm = f"{asm[:-2]};"
+    ctrl = parseCtrl(sline)
+    # Register the branch target address, keeping an existing index untouched
+    branch = BRA_RE.match(asm)
+    if branch is not None:
+        labels.setdefault(int(branch.group(2), 16), len(labels))
+    return (ctrl, asm)
+@functools.lru_cache()
+def get_sass(cubin_asm, fun=None):
+    """Return the formatted SASS listing for the bytes in ``cubin_asm``.
+    The bytes are written to a temporary file, ``extract`` is run on that file
+    with ``fun`` passed through, and the file is removed afterwards even when
+    ``extract`` raises. Results are memoized by ``functools.lru_cache``.
+    """
+    handle, tmp_path = tempfile.mkstemp()
+    try:
+        # open() takes ownership of the descriptor and closes it on exit
+        with open(handle, 'wb') as out:
+            out.write(cubin_asm)
+        return extract(tmp_path, fun)
+    finally:
+        os.remove(tmp_path)
+def path_to_cuobjdump():
+    """Return ``knobs.nvidia.cuobjdump.path`` from the ``triton`` package."""
+    # The import stays inside the function, so it only runs when this is called
+    from triton import knobs
+    cuobjdump = knobs.nvidia.cuobjdump
+    return cuobjdump.path
+# Run cuobjdump on `file_path` and return a formatted SASS listing.
+#
+# Arguments:
+#   file_path: path handed to `cuobjdump -sass`.
+#   fun: name forwarded through `-fun`, or None to omit that option.
+#
+# Returns a string: "Function:<name>\n", then one "<ctrl>\t<asm>" line per
+# instruction, with "LBB<i>:" lines in front of branch targets and branch
+# operands rewritten to those labels. Returns None when no function header is
+# found (including empty output). subprocess.check_output raises
+# CalledProcessError when cuobjdump exits with a non-zero status.
+#
+# NOTE(review): `return ret` sits inside the outer while loop, so only the
+# first function in the output is formatted -- confirm this is intended.
+# NOTE(review): the unguarded sass_lines[line_idx] reads below raise IndexError
+# if the output ends right after a header or right after the last instruction.
+def extract(file_path, fun):
+    cuobjdump = path_to_cuobjdump()
+    if fun is None:
+        sass_str = subprocess.check_output([cuobjdump, "-sass", file_path])
+    else:
+        sass_str = subprocess.check_output([cuobjdump, "-fun", fun, "-sass", file_path])
+    # check_output returns bytes; each line is decoded individually below
+    sass_lines = sass_str.splitlines()
+    line_idx = 0
+    while line_idx < len(sass_lines):
+        line = sass_lines[line_idx].decode()
+        # format:
+        # function : <function_name>
+        # .headerflags: ...
+        # /*0000*/ asmstr /*0x...*/
+        #                 /*0x...*/
+        # Looking for new function header (function: <name>)
+        while FNAME_RE.match(line) is None:
+            line_idx += 1
+            if line_idx < len(sass_lines):
+                line = sass_lines[line_idx].decode()
+            else:
+                # ran off the end without finding a header: implicit None
+                return
+        fname = FNAME_RE.match(line).group(1)
+        ret = ''
+        ret += f'Function:{fname}\n'
+        line_idx += 2  # bypass .headerflags
+        line = sass_lines[line_idx].decode()
+        # Remapping address to label
+        labels = {}  # address -> label_idx
+        # store sass asm in a buffer and then print it (labels are only known
+        # once every instruction has been scanned)
+        # (ctrl, asm)
+        asm_buffer = []
+        while FLINE_RE.match(line) is not None:
+            # First line (Offset ASM Encoding)
+            fline = sass_lines[line_idx].decode()
+            line_idx += 1
+            # Second line (Encoding)
+            sline = sass_lines[line_idx].decode()
+            line_idx += 1
+            asm_buffer.append(processSassLines(fline, sline, labels))
+            # peek the next line
+            line = sass_lines[line_idx].decode()
+        # Print sass
+        # label naming convention: LBB#i
+        for idx, (ctrl, asm) in enumerate(asm_buffer):
+            # Print label if this is BRA target
+            # assumes every buffered instruction occupies 16 bytes of address space
+            offset = idx * 16
+            if offset in labels:
+                label_name = f'LBB{labels[offset]}'
+                ret += f'{label_name}:\n'
+            ret += ctrl + '\t'
+            # if this is BRA, remap offset to label
+            if BRA_RE.match(asm):
+                target = int(BRA_RE.match(asm).group(2), 16)
+                target_name = f'LBB{labels[target]}'
+                asm = BRA_RE.sub(rf'\1{target_name};', asm)
+            ret += asm + '\n'
+        ret += '\n'
+        return ret

triton/tools/extra/cuda/compile.c ADDED Viewed

@@ -0,0 +1,70 @@
+/* clang-format off */
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <cuda.h>
+// helpers to check for cuda errors
+// The doubled braces in this file are literal braces: it is a format template
+// (presumably filled in by Python str.format, which leaves backslashes alone).
+// The last line of the macro must not end in a backslash, otherwise the line
+// that follows it is spliced into the macro body.
+#define CUDA_CHECK(ans) {{\
+    gpuAssert((ans), __FILE__, __LINE__);\
+  }}
+// Print a description of `code` and terminate the process with `code` as exit
+// status when it is not CUDA_SUCCESS. `file` and `line` are accepted for the
+// call site but are not printed.
+static inline void gpuAssert(CUresult code, const char *file, int line) {{
+  if (code != CUDA_SUCCESS) {{
+    const char *str = NULL;
+    // cuGetErrorString fails and nulls the pointer for unrecognized codes
+    if (cuGetErrorString(code, &str) != CUDA_SUCCESS || str == NULL)
+      str = "unknown error";
+    // format straight into the stream: no fixed-size scratch buffer to overflow
+    printf("Triton Error [CUDA]: %s\n", str);
+    exit(code);
+  }}
+}}
+// globals
+// Module and function handles stay NULL until load_ has run; the cubin bytes
+// are embedded in the array below.
+#define CUBIN_NAME {kernel_name}_cubin
+CUmodule {kernel_name}_mod = NULL;
+CUfunction {kernel_name}_func = NULL;
+unsigned char CUBIN_NAME[{bin_size}] = {{ {bin_data} }};
+// Unload the module if it is loaded; calling this before load_ is a no-op.
+void unload_{kernel_name}(void) {{
+    if ({kernel_name}_mod == NULL)
+        return;
+    CUDA_CHECK(cuModuleUnload({kernel_name}_mod));
+    // drop the stale handles so the launcher reloads the module on next use
+    {kernel_name}_mod = NULL;
+    {kernel_name}_func = NULL;
+}}
+// TODO: some code duplication with `runtime/backend/cuda.c`
+// Load the embedded cubin, look up the kernel function and, when the kernel
+// needs more than 49152 bytes of shared memory, raise the function's dynamic
+// shared memory limit to the opt-in maximum reported for the device.
+// Declared with (void) to match the prototype in the companion header.
+void load_{kernel_name}(void) {{
+    // NOTE(review): 0 is passed as the device handle -- confirm that only the
+    // first device is meant to be queried.
+    int dev = 0;
+    void *bin = (void *)&CUBIN_NAME;
+    int shared = {shared};
+    CUDA_CHECK(cuModuleLoadData(&{kernel_name}_mod, bin));
+    CUDA_CHECK(cuModuleGetFunction(&{kernel_name}_func, {kernel_name}_mod, "{triton_kernel_name}"));
+    // set dynamic shared memory if necessary
+    int shared_optin;
+    CUDA_CHECK(cuDeviceGetAttribute(&shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev));
+    if (shared > 49152 && shared_optin > 49152) {{
+      CUDA_CHECK(cuFuncSetCacheConfig({kernel_name}_func, CU_FUNC_CACHE_PREFER_SHARED));
+      CUDA_CHECK(cuFuncSetAttribute({kernel_name}_func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_optin));
+    }}
+}}
+/*
+{kernel_docstring}
+*/
+// Launch the kernel on `stream`. The module is loaded on first use. Returns
+// the cuLaunchKernel result, or CUDA_SUCCESS without launching when any grid
+// dimension is zero.
+CUresult {kernel_name}(CUstream stream, {signature}) {{
+    if ({kernel_name}_func == NULL)
+       load_{kernel_name}();
+    unsigned int gX = {gridX};
+    unsigned int gY = {gridY};
+    unsigned int gZ = {gridZ};
+    CUdeviceptr global_scratch = 0;
+    CUdeviceptr profile_scratch = 0;
+    void *args[{num_args}] = {{ {arg_pointers} }};
+    // TODO: shared memory
+    // Test each dimension on its own: the product gX * gY * gZ can wrap around
+    // to 0 in unsigned arithmetic and wrongly skip a non-empty grid.
+    if (gX > 0 && gY > 0 && gZ > 0)
+      return cuLaunchKernel({kernel_name}_func, gX, gY, gZ, {num_warps} * 32, 1, 1, {shared}, stream, args, NULL);
+    // empty grid: nothing to launch (same value the old pointer cast produced)
+    return CUDA_SUCCESS;
+}}

triton/tools/extra/cuda/compile.h ADDED Viewed

@@ -0,0 +1,14 @@
+#ifndef TT_KERNEL_INCLUDES
+#define TT_KERNEL_INCLUDES
+#include <cuda.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+#endif
+// Unloads the kernel's module; defined in the matching source template.
+void unload_{kernel_name}(void);
+// Loads the embedded binary and resolves the kernel function.
+void load_{kernel_name}(void);
+// The next line is machine-read: link.py's HeaderParser matches it to recover
+// the kernel name, the full C signature and the algo info. Keep its format.
+// tt-linker: {kernel_name}:{full_signature}:{algo_info}
+// Launch entry point; the _placeholder field after the return type is filled
+// in by whatever formats this template.
+CUresult{_placeholder} {kernel_name}(CUstream stream, {signature});

triton/tools/extra/hip/compile.cpp ADDED Viewed

@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+/* clang-format off */
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <hip/hip_runtime.h>
+// helpers to check for hip errors
+// The doubled braces in this file are literal braces: it is a format template
+// (presumably filled in by Python str.format, which leaves backslashes alone).
+// The last line of the macro must not end in a backslash, otherwise the line
+// that follows it is spliced into the macro body.
+#define HIP_CHECK(ans) {{\
+    gpuAssert((ans), __FILE__, __LINE__);\
+  }}
+// Print a description of `code` and terminate the process with `code` as exit
+// status when it is not hipSuccess. `file` and `line` are accepted for the
+// call site but are not printed.
+static inline void gpuAssert(hipError_t code, const char *file, int line) {{
+  if (code != hipSuccess) {{
+    const char *str = nullptr;
+    // fall back to a fixed text when the runtime has no string for this code
+    if (hipDrvGetErrorString(code, &str) != hipSuccess || str == nullptr)
+      str = "unknown error";
+    // format straight into the stream: no fixed-size scratch buffer to overflow
+    printf("Triton Error [HIP]: %s\n", str);
+    exit(code);
+  }}
+}}
+// globals
+// Module and function handles stay null until load_ has run; the code object
+// bytes are embedded in the array below.
+#define HSACO_NAME {kernel_name}_hsaco
+hipModule_t {kernel_name}_mod = nullptr;
+hipFunction_t {kernel_name}_func = nullptr;
+unsigned char HSACO_NAME[{bin_size}] = {{ {bin_data} }};
+// Unload the module if it is loaded; calling this before load_ is a no-op.
+void unload_{kernel_name}(void) {{
+    if ({kernel_name}_mod == nullptr)
+        return;
+    HIP_CHECK(hipModuleUnload({kernel_name}_mod));
+    // drop the stale handles so the launcher reloads the module on next use
+    {kernel_name}_mod = nullptr;
+    {kernel_name}_func = nullptr;
+}}
+// Load the embedded code object and resolve the kernel function handle.
+// The former `dev` and `shared` locals were never read and are gone.
+void load_{kernel_name}(void) {{
+    void *bin = (void *)&HSACO_NAME;
+    HIP_CHECK(hipModuleLoadData(&{kernel_name}_mod, bin));
+    HIP_CHECK(hipModuleGetFunction(&{kernel_name}_func, {kernel_name}_mod, "{triton_kernel_name}"));
+}}
+/*
+{kernel_docstring}
+*/
+// Launch the kernel on `stream`. The module is loaded on first use. Returns
+// the hipModuleLaunchKernel result, or hipErrorInvalidValue without launching
+// when any grid dimension is zero.
+hipError_t {kernel_name}(hipStream_t stream, {signature}) {{
+    if ({kernel_name}_func == nullptr)
+       load_{kernel_name}();
+    unsigned int gX = {gridX};
+    unsigned int gY = {gridY};
+    unsigned int gZ = {gridZ};
+    hipDeviceptr_t global_scratch = 0;
+    hipDeviceptr_t profile_scratch = 0;
+    void *args[{num_args}] = {{ {arg_pointers} }};
+    // TODO: shared memory
+    // Test each dimension on its own: the product gX * gY * gZ can wrap around
+    // to 0 in unsigned arithmetic and wrongly reject a non-empty grid.
+    if (gX > 0 && gY > 0 && gZ > 0)
+      return hipModuleLaunchKernel({kernel_name}_func, gX, gY, gZ, {num_warps} * warpSize, 1, 1, {shared}, stream, args, nullptr);
+    else
+      return hipErrorInvalidValue;
+}}

triton/tools/extra/hip/compile.h ADDED Viewed

@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <hip/hip_runtime.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+// Unloads the kernel's module; defined in the matching source template.
+void unload_{kernel_name}(void);
+// Loads the embedded code object and resolves the kernel function.
+void load_{kernel_name}(void);
+// Launch entry point; the _placeholder field after the return type is filled
+// in by whatever formats this template.
+hipError_t{_placeholder} {kernel_name}(hipStream_t stream, {signature});

triton/tools/link.py ADDED Viewed

@@ -0,0 +1,322 @@
+from collections import defaultdict
+from pathlib import Path
+from typing import Sequence, Union
+from dataclasses import dataclass
+def _exists(x):
+    """Return True when ``x`` is anything other than ``None``."""
+    return not (x is None)
+class LinkerError(Exception):
+    """Raised when header linker metadata is malformed or inconsistent."""
+# One record per linker directive parsed by HeaderParser.extract_linker_meta.
+@dataclass
+class KernelLinkerMeta:
+    # first group of the kernel-name pattern <name>_<hash>_<suffix>
+    orig_kernel_name: str
+    # argument names, in the order they appear in the C signature
+    arg_names: Sequence[str]
+    # argument C types, parallel to arg_names
+    arg_ctypes: Sequence[str]
+    # per-argument hint taken from the suffix: 16 for "d", 1 for "c", else None
+    sizes: Sequence[Union[int, None]]
+    # second group of the kernel-name pattern
+    sig_hash: str
+    # third group of the kernel-name pattern; extract_linker_meta stores the
+    # same value in both triton_suffix and suffix
+    triton_suffix: str
+    suffix: str
+    num_specs: int
+    """ number of specialized arguments """
+class HeaderParser:
+    """Collect kernel metadata from ``tt-linker`` directives in header text.
+    Parsed kernels accumulate in ``self.kernels``, a dict that maps
+    ``"<name>_<algo_info>"`` to the list of ``KernelLinkerMeta`` seen for it.
+    """
+    def __init__(self) -> None:
+        import re
+        # [kernel_name, c signature, algo info]
+        self.linker_directives = re.compile(r"//[\s]*tt-linker:[\s]*([\w]+):(.+):(.+)")
+        # [name, hash, suffix]
+        self.kernel_name = re.compile(r"^([\w]+)_([\w]+)_([\w]+)$")
+        # [(type, name)]
+        self.c_sig = re.compile(r"[\s]*(\w+)\s(\w+)[,]?")
+        # [d|c] -- a character class needs no separator: "[c,d]" also matched
+        # a literal comma, which then failed the size lookup with KeyError
+        self.arg_suffix = re.compile(r"[cd]")
+        self.kernels = defaultdict(list)
+    def extract_linker_meta(self, header: str):
+        """Register every ``tt-linker`` directive found in ``header``.
+        Only lines starting with ``//`` are examined; lines that do not match
+        the directive pattern are ignored. Raises ``LinkerError`` when a
+        directive's kernel name, signature or suffix cannot be parsed, or when
+        its argument types disagree with a kernel registered under the same key.
+        """
+        for ln in header.splitlines():
+            if ln.startswith("//"):
+                m = self.linker_directives.match(ln)
+                if _exists(m):
+                    ker_name, c_sig, algo_info = m.group(1), m.group(2), m.group(3)
+                    name, sig_hash, suffix = self._match_name(ker_name)
+                    c_types, arg_names = self._match_c_sig(c_sig)
+                    num_specs, sizes = self._match_suffix(suffix, c_sig)
+                    self._add_kernel(
+                        "_".join([name, algo_info]),
+                        KernelLinkerMeta(
+                            orig_kernel_name=name,
+                            arg_names=arg_names,
+                            arg_ctypes=c_types,
+                            sizes=sizes,
+                            sig_hash=sig_hash,
+                            triton_suffix=suffix,
+                            suffix=suffix,
+                            num_specs=num_specs,
+                        ),
+                    )
+    def _match_name(self, ker_name: str):
+        """Split ``ker_name`` into ``(name, sig_hash, suffix)`` or raise ``LinkerError``."""
+        m = self.kernel_name.match(ker_name)
+        if _exists(m):
+            name, sig_hash, suffix = m.group(1), m.group(2), m.group(3)
+            return name, sig_hash, suffix
+        raise LinkerError(f"{ker_name} is not a valid kernel name")
+    def _match_c_sig(self, c_sig: str):
+        """Return ``(types, names)`` for the ``<type> <name>`` pairs in ``c_sig``.
+        Raises ``LinkerError`` when no pair is found.
+        """
+        m = self.c_sig.findall(c_sig)
+        if len(m):
+            tys, args = [], []
+            for ty, arg_name in m:
+                tys.append(ty)
+                args.append(arg_name)
+            return tys, args
+        raise LinkerError(f"{c_sig} is not a valid argument signature")
+    def _match_suffix(self, suffix: str, c_sig: str):
+        """Return ``(num_specs, sizes)`` decoded from a kernel ``suffix``.
+        The suffix is scanned for each argument index in turn; an index that
+        is directly followed by ``d`` or ``c`` marks that argument as
+        specialized with size 16 or 1. ``sizes`` has one entry per
+        comma-separated argument of ``c_sig``, ``None`` where unspecialized.
+        Raises ``LinkerError`` when an index is missing from the suffix.
+        """
+        args = c_sig.split(",")
+        s2i = {"c": 1, "d": 16}
+        num_specs = 0
+        sizes = []
+        # scan through suffix, first find the index,
+        # then see if it is followed by d or c
+        for i in range(len(args)):
+            pos = suffix.find(str(i))
+            if pos == -1:
+                raise LinkerError(f"{suffix} is not a valid kernel suffix")
+            pos += len(str(i))
+            if self.arg_suffix.match(suffix, pos):
+                num_specs += 1
+                # pad the arguments skipped since the last specialized one
+                sizes.extend([None] * (i - len(sizes)))
+                sizes.append(s2i[suffix[pos]])
+                pos += 1
+            if i < len(args) - 1:
+                # consume what was matched so the next index is searched after it
+                suffix = suffix[pos:]
+            else:
+                sizes.extend([None] * (len(args) - len(sizes)))
+        return num_specs, sizes
+    def _add_kernel(self, name: str, ker: KernelLinkerMeta):
+        """Append ``ker`` under ``name`` after checking its argument types.
+        The types are compared pairwise with the most recently added entry for
+        ``name``; the first difference raises ``LinkerError``.
+        """
+        if name in self.kernels:
+            last: KernelLinkerMeta = self.kernels[name][-1]
+            # NOTE(review): zip stops at the shorter list, so a pure length
+            # mismatch is not reported -- confirm whether it should be.
+            for cur, new_ in zip(last.arg_ctypes, ker.arg_ctypes):
+                if cur != new_:
+                    raise LinkerError(
+                        f"Mismatched signature for kernel {name}: \n\texisting sig is: {','.join(last.arg_ctypes)}\n\tcurrent is: {','.join(ker.arg_ctypes)}"
+                    )
+        self.kernels[name].append(ker)
+def gen_signature_with_full_args(m):
+    """Return ``"<type> <name>, ..."`` covering every argument of ``m``."""
+    params = []
+    for ctype, arg_name in zip(m.arg_ctypes, m.arg_names):
+        params.append(f"{ctype} {arg_name}")
+    return ", ".join(params)
+def gen_signature(m):
+    """Return ``"<type> <name>, ..."`` for the arguments whose size hint is not 1."""
+    kept = [
+        (ctype, arg_name)
+        for ctype, arg_name, hint in zip(m.arg_ctypes, m.arg_names, m.sizes)
+        if hint != 1
+    ]
+    return ", ".join(f"{ctype} {arg_name}" for ctype, arg_name in kept)
+# generate declarations of kernels with meta-parameter and constant values
+def make_algo_decls(name: str, metas: Sequence[KernelLinkerMeta]) -> str:
+    """Return C declarations of the launcher, load and unload functions for ``name``.
+    The launcher signature is built from the last entry of ``metas``.
+    """
+    full_sig = gen_signature_with_full_args(metas[-1])
+    decl_lines = [
+        "",
+        f"CUresult {name}(CUstream stream, {full_sig});",
+        f"void load_{name}();",
+        f"void unload_{name}();",
+        "    ",
+    ]
+    return "\n".join(decl_lines)
+# generate declarations of the kernel-wide entry points (default, by algo id, load, unload)
+def make_global_decl(meta: KernelLinkerMeta) -> str:
+    """Return C declarations of the ``_default`` launcher, the ``algo_id``
+    launcher and the load/unload functions named after ``meta.orig_kernel_name``.
+    """
+    kernel = meta.orig_kernel_name
+    full_sig = gen_signature_with_full_args(meta)
+    decl_lines = [
+        "",
+        f"CUresult {kernel}_default(CUstream stream, {full_sig});",
+        f"CUresult {kernel}(CUstream stream, {full_sig}, int algo_id);",
+        f"void load_{kernel}();",
+        f"void unload_{kernel}();",
+        "    ",
+    ]
+    return "\n".join(decl_lines)
+# generate the `_default` entry point, which forwards to the algo_id dispatcher with id 0
+def make_default_algo_kernel(meta: KernelLinkerMeta) -> str:
+    """Return the C definition of ``<kernel>_default``.
+    The generated function calls ``<kernel>`` with every argument forwarded
+    and ``0`` as the trailing algo id.
+    """
+    kernel = meta.orig_kernel_name
+    full_sig = gen_signature_with_full_args(meta)
+    forwarded = ", ".join(meta.arg_names)
+    body_lines = [
+        f"CUresult {kernel}_default(CUstream stream, {full_sig}){{",
+        f"  return {kernel}(stream, {forwarded}, 0);",
+        "}",
+    ]
+    return "\n".join(body_lines) + "\n"
+# generate dispatcher function for kernels with different integer value hints
+def make_kernel_hints_dispatcher(name: str, metas: Sequence[KernelLinkerMeta]) -> str:
+    """Return C source that dispatches ``name`` to its specialized kernels.
+    The output holds, in order: declarations of every specialized kernel, a
+    launcher ``name`` that tries the kernels from most to fewest specialized
+    arguments and returns ``CUDA_ERROR_INVALID_VALUE`` when none applies, and
+    ``load_<name>`` / ``unload_<name>`` functions that call the matching
+    function of every specialized kernel.
+    """
+    def hint_condition(val, hint):
+        # 16 -> divisibility test, 1 -> equality test, anything else -> None
+        if hint == 16:
+            return f"({val} % {hint} == 0)"
+        if hint == 1:
+            return f"({val} == {hint})"
+        return None
+    def specialized(meta):
+        return f"{meta.orig_kernel_name}_{meta.sig_hash}_{meta.suffix}"
+    # sorted() is stable and the key never changes, so one sort serves every loop
+    ordered = sorted(metas, key=lambda m: -m.num_specs)
+    chunks = [f"// launcher for: {name}\n"]
+    for meta in ordered:
+        chunks.append(f"CUresult {specialized(meta)}(CUstream stream, {gen_signature(meta)});\n")
+    chunks.append("\n")
+    chunks.append(f"CUresult {name}(CUstream stream, {gen_signature_with_full_args(metas[-1])}){{")
+    chunks.append("\n")
+    for meta in ordered:
+        checks = [
+            hint_condition(val, hint)
+            for val, hint in zip(meta.arg_names, meta.sizes)
+            if hint is not None
+        ]
+        conds = " && ".join(checks)
+        if any(meta.sizes):
+            chunks.append(f"  if ({conds})\n")
+        else:
+            # Edge case where no specializations hence no dispatching required
+            chunks.append("if (1)\n")
+        # arguments with hint 1 are not part of the specialized kernel's signature
+        passed = [arg for arg, hint in zip(meta.arg_names, meta.sizes) if hint != 1]
+        chunks.append(f"    return {specialized(meta)}(stream, {', '.join(passed)});\n")
+    chunks.append("\n")
+    chunks.append("  return CUDA_ERROR_INVALID_VALUE;\n")
+    chunks.append("}\n")
+    for mode in ("load", "unload"):
+        chunks.append(f"\n// {mode} for: {name}\n")
+        for meta in ordered:
+            chunks.append(f"void {mode}_{specialized(meta)}();\n")
+        chunks.append(f"void {mode}_{name}() {{")
+        chunks.append("\n")
+        for meta in ordered:
+            chunks.append(f"  {mode}_{specialized(meta)}();\n")
+        chunks.append("}\n")
+    return "".join(chunks)
+# generate dispatcher function for kernels with different meta-parameter and constant values
+def make_kernel_meta_const_dispatcher(meta: KernelLinkerMeta) -> str:
+    """Return the C definition of ``<kernel>(stream, ..., int algo_id)``.
+    The generated function asserts that ``algo_id`` is a valid index into the
+    ``<kernel>_kernels`` table and calls the selected entry with every
+    argument forwarded.
+    """
+    kernels = f"{meta.orig_kernel_name}_kernels"
+    src = f"CUresult {meta.orig_kernel_name}(CUstream stream, {gen_signature_with_full_args(meta)}, int algo_id){{\n"
+    # sizeof(array) is a byte count: divide by the element size to get the
+    # number of entries, and reject negative ids as well.
+    src += f"  assert (algo_id >= 0 && algo_id < (int)(sizeof({kernels}) / sizeof({kernels}[0])));\n"
+    src += f"  return {kernels}[algo_id](stream, {', '.join(meta.arg_names)});\n"
+    src += "}\n"
+    return src
+# generate definition of function pointers of kernel dispatchers based on meta-parameter and constant values
+def make_func_pointers(names: str, meta: KernelLinkerMeta) -> str:
+    """Return the C ``kernel_func_t`` typedef plus the ``<kernel>_kernels`` table.
+    ``names`` is iterated and each item becomes one table entry, in order.
+    """
+    # the table of hint dispatchers
+    full_sig = gen_signature_with_full_args(meta)
+    chunks = [
+        f"typedef CUresult (*kernel_func_t)(CUstream stream, {full_sig});\n",
+        f"kernel_func_t {meta.orig_kernel_name}_kernels[] = {{\n",
+    ]
+    chunks.extend(f"  {name},\n" for name in names)
+    chunks.append("};\n")
+    return "".join(chunks)
+# generate definition for load/unload functions for kernels with different meta-parameter and constant values
+def make_kernel_load_def(names: str, meta: KernelLinkerMeta) -> str:
+    """Return C definitions of ``load_<kernel>`` and ``unload_<kernel>``.
+    Each generated function calls the same-mode function of every item in
+    ``names``, in iteration order.
+    """
+    chunks = []
+    for mode in ("load", "unload"):
+        chunks.append(f"void {mode}_{meta.orig_kernel_name}(void){{\n")
+        for name in names:
+            chunks.append(f"  {mode}_{name}();\n")
+        chunks.append("}\n\n")
+    return "".join(chunks)
+def make_get_num_algos_decl(meta: KernelLinkerMeta) -> str:
+    """Return the C declaration of ``<kernel>_get_num_algos``."""
+    return "int " + meta.orig_kernel_name + "_get_num_algos(void);"
+def make_get_num_algos_def(meta: KernelLinkerMeta) -> str:
+    """Return the C definition of ``<kernel>_get_num_algos``.
+    The generated function returns the entry count of ``<kernel>_kernels``.
+    """
+    kernel = meta.orig_kernel_name
+    body_lines = [
+        f"int {kernel}_get_num_algos(void){{",
+        f"  return (int)(sizeof({kernel}_kernels) / sizeof({kernel}_kernels[0]));",
+        "}",
+    ]
+    return "\n".join(body_lines) + "\n"
+desc = """
+Triton ahead-of-time linker:
+This program takes in header files generated by compile.py, and generates a
+single entry-point responsible for dispatching the user's input to the right
+kernel given the specializations that were compiled.
+Example usage:
+python link.py /path/to/headers/*.h -o kernel_name
+"""
+# Script entry point: parse the given headers, then write <out>.h with the
+# declarations and <out>.c with the dispatchers.
+if __name__ == "__main__":
+    from argparse import ArgumentParser
+    arg_parser = ArgumentParser(description=desc)
+    arg_parser.add_argument(
+        "headers",
+        nargs="+",
+        help="Paths to header files to link. Must include linker directive annotations (autogenerated by ttc)",
+    )
+    # the output path is dereferenced unconditionally below, so it is mandatory
+    arg_parser.add_argument("--out", "-o", type=Path, required=True, help="Out filename")
+    # NOTE(review): --prefix is accepted but never read in this script
+    arg_parser.add_argument(
+        "--prefix",
+        type=str,
+        default="",
+        help="String to prefix kernel dispatcher names",
+    )
+    args = arg_parser.parse_args()
+    # metadata
+    header_parser = HeaderParser()
+    for header in args.headers:
+        header_parser.extract_linker_meta(Path(header).read_text())
+    if not header_parser.kernels:
+        # fail with a clear message instead of an IndexError further down
+        raise LinkerError("no tt-linker directives found in the given headers")
+    # generate headers
+    algo_decls = [make_algo_decls(name, metas) for name, metas in header_parser.kernels.items()]
+    # the first parsed kernel supplies the name and signature of the global
+    # entry points -- presumably all headers describe the same kernel
+    meta = next(iter(header_parser.kernels.values()))[0]
+    get_num_algos_decl = make_get_num_algos_decl(meta)
+    global_decl = make_global_decl(meta)
+    with args.out.with_suffix(".h").open("w") as fp:
+        out = "#include <cuda.h>\n"
+        out += "\n".join(algo_decls)
+        out += "\n"
+        out += get_num_algos_decl
+        out += "\n"
+        out += global_decl
+        fp.write(out)
+    # generate source
+    defs = [make_kernel_hints_dispatcher(name, metas) for name, metas in header_parser.kernels.items()]
+    names = list(header_parser.kernels)
+    func_pointers_def = make_func_pointers(names, meta)
+    meta_const_def = make_kernel_meta_const_dispatcher(meta)
+    load_unload_def = make_kernel_load_def(names, meta)
+    get_num_algos_def = make_get_num_algos_def(meta)
+    default_algo_kernel = make_default_algo_kernel(meta)
+    with args.out.with_suffix(".c").open("w") as fp:
+        out = ""
+        out += "#include <cuda.h>\n"
+        out += "#include <stdint.h>\n"
+        out += "#include <assert.h>\n"
+        out += "\n"
+        out += "\n".join(defs)
+        out += "\n"
+        out += func_pointers_def
+        out += "\n"
+        out += get_num_algos_def
+        out += "\n"
+        out += meta_const_def
+        out += "\n"
+        out += load_unload_def
+        out += "\n"
+        out += default_algo_kernel
+        fp.write(out)