PyPI - triton-windows - Versions diffs - 3.4.0.post20__cp311-cp311-win_amd64.whl → 3.5.0.post21__cp311-cp311-win_amd64.whl - Mend

triton-windows 3.4.0.post20__cp311-cp311-win_amd64.whl → 3.5.0.post21__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of triton-windows might be problematic. Click here for more details.

Files changed (107) hide show

triton/_C/libtriton.pyd +0 -0
triton/__init__.py +8 -2
triton/_filecheck.py +24 -14
triton/_internal_testing.py +70 -4
triton/_utils.py +3 -1
triton/backends/amd/compiler.py +68 -60
triton/backends/amd/driver.c +113 -44
triton/backends/amd/driver.py +133 -57
triton/backends/driver.py +13 -0
triton/backends/nvidia/compiler.py +80 -22
triton/backends/nvidia/driver.c +88 -15
triton/backends/nvidia/driver.py +130 -123
triton/compiler/__init__.py +5 -2
triton/compiler/code_generator.py +270 -163
triton/compiler/compiler.py +45 -62
triton/experimental/gluon/__init__.py +3 -2
triton/experimental/gluon/_runtime.py +9 -6
triton/experimental/gluon/language/__init__.py +117 -16
triton/experimental/gluon/language/_core.py +246 -68
triton/experimental/gluon/language/_layouts.py +398 -45
triton/experimental/gluon/language/_math.py +17 -9
triton/experimental/gluon/language/_semantic.py +130 -37
triton/experimental/gluon/language/_standard.py +55 -22
triton/experimental/gluon/language/amd/__init__.py +4 -0
triton/experimental/gluon/language/amd/_layouts.py +96 -0
triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
triton/experimental/gluon/language/extra/__init__.py +3 -0
triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
triton/experimental/gluon/language/nvidia/blackwell/__init__.py +192 -7
triton/experimental/gluon/language/nvidia/blackwell/tma.py +20 -0
triton/experimental/gluon/language/nvidia/hopper/__init__.py +124 -3
triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +20 -37
triton/experimental/gluon/language/nvidia/hopper/tma.py +4 -3
triton/experimental/gluon/nvidia/hopper.py +6 -1
triton/knobs.py +132 -67
triton/language/__init__.py +16 -10
triton/language/core.py +163 -83
triton/language/extra/cuda/gdc.py +6 -6
triton/language/extra/hip/__init__.py +3 -1
triton/language/extra/hip/libdevice.py +7 -0
triton/language/extra/hip/utils.py +35 -0
triton/language/extra/libdevice.py +4 -0
triton/language/semantic.py +76 -23
triton/language/standard.py +14 -14
triton/language/target_info.py +54 -0
triton/runtime/_allocation.py +15 -3
triton/runtime/_async_compile.py +55 -0
triton/runtime/autotuner.py +4 -5
triton/runtime/build.py +11 -9
triton/runtime/cache.py +44 -1
triton/runtime/driver.py +16 -41
triton/runtime/interpreter.py +31 -23
triton/runtime/jit.py +318 -157
triton/runtime/tcc/include/_mingw.h +8 -10
triton/runtime/tcc/include/assert.h +5 -0
triton/runtime/tcc/include/errno.h +1 -1
triton/runtime/tcc/include/float.h +21 -3
triton/runtime/tcc/include/iso646.h +36 -0
triton/runtime/tcc/include/limits.h +5 -0
triton/runtime/tcc/include/malloc.h +2 -2
triton/runtime/tcc/include/math.h +21 -261
triton/runtime/tcc/include/stdalign.h +16 -0
triton/runtime/tcc/include/stdarg.h +5 -70
triton/runtime/tcc/include/stdatomic.h +171 -0
triton/runtime/tcc/include/stddef.h +7 -19
triton/runtime/tcc/include/stdlib.h +15 -4
triton/runtime/tcc/include/stdnoreturn.h +7 -0
triton/runtime/tcc/include/sys/stat.h +2 -2
triton/runtime/tcc/include/sys/types.h +5 -0
triton/runtime/tcc/include/tcc/tcc_libm.h +444 -27
triton/runtime/tcc/include/tccdefs.h +342 -0
triton/runtime/tcc/include/tgmath.h +89 -0
triton/runtime/tcc/include/uchar.h +33 -0
triton/runtime/tcc/include/unistd.h +1 -0
triton/runtime/tcc/include/winapi/qos.h +72 -0
triton/runtime/tcc/include/winapi/shellapi.h +59 -0
triton/runtime/tcc/include/winapi/winbase.h +9 -2
triton/runtime/tcc/include/winapi/wincon.h +8 -0
triton/runtime/tcc/include/winapi/windows.h +1 -1
triton/runtime/tcc/include/winapi/winnls.h +778 -0
triton/runtime/tcc/include/winapi/winnt.h +9 -7
triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
triton/runtime/tcc/lib/libtcc1.a +0 -0
triton/runtime/tcc/lib/python314.def +1800 -0
triton/runtime/tcc/lib/python314t.def +1809 -0
triton/runtime/tcc/libtcc.dll +0 -0
triton/runtime/tcc/tcc.exe +0 -0
triton/tools/compile.py +62 -14
triton/tools/extra/cuda/compile.c +1 -0
triton/tools/extra/hip/compile.cpp +66 -0
triton/tools/extra/hip/compile.h +13 -0
triton/tools/ragged_tma.py +92 -0
triton/tools/tensor_descriptor.py +7 -9
triton/windows_utils.py +42 -79
{triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/METADATA +3 -4
{triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/RECORD +106 -75
triton/runtime/tcc/lib/libtcc1-64.a +0 -0
{triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/WHEEL +0 -0
{triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/entry_points.txt +0 -0
{triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/licenses/LICENSE +0 -0
{triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/top_level.txt +0 -0

triton/runtime/tcc/libtcc.dll CHANGED Viewed

Binary file

triton/runtime/tcc/tcc.exe CHANGED Viewed

Binary file

triton/tools/compile.py CHANGED Viewed

@@ -3,12 +3,29 @@ import hashlib
 import importlib.util
 import sys
 from argparse import ArgumentParser
+from dataclasses import dataclass
 from pathlib import Path
 from typing import List
 import triton
 import triton.backends
-from triton.backends.nvidia.driver import ty_to_cpp
+@dataclass
+class CompileArgs:
+    '''
+    A class to contain arguments from command-line parser.
+    '''
+    path: str = ''
+    kernel_name: str = ''
+    signature: str = ''
+    grid: str = ''
+    target: str | None = None
+    num_warps: int = 1
+    num_stages: int = 3
+    out_name: str | None = None
+    out_path: Path | None = None
 desc = """
 Triton ahead-of-time compiler:
@@ -36,14 +53,18 @@ NOTE: when resolving the scope of /path/to/kernel.py, the file will be executed
 used to run this `compile.py` script
 """
-if __name__ == "__main__":
+def main():
     # command-line arguments
     parser = ArgumentParser(description=desc)
     parser.add_argument("path",
                         help="Path to Python source containing desired kernel in its scope. File will be executed.")
     parser.add_argument("--kernel-name", "-n", type=str, default="", help="Name of the kernel to compile",
                         required=True)
+    parser.add_argument(
+        "--target", "-t", type=str, default=None,
+        help="The target to compile towards, in format of '<backend>:<arch>:<warp-size>'; "
+        "e.g., 'cuda:80:32', 'hip:gfx942:64'. Default to None, which means using current machine's GPU target")
     parser.add_argument("--num-warps", "-w", type=int, default=1, help="Number of warps to launch the kernel")
     parser.add_argument("--num-stages", "-ns", type=int, default=3,
                         help="Number of stages (meta-parameter of the kernel)")
@@ -51,8 +72,12 @@ if __name__ == "__main__":
     parser.add_argument("--out-path", "-o", type=Path, default=None, help="Out filename")
     parser.add_argument("--signature", "-s", type=str, help="Signature of the kernel", required=True)
     parser.add_argument("--grid", "-g", type=str, help="Launch grid of the kernel", required=True)
-    args = parser.parse_args()
+    cli_args = parser.parse_args()
+    args = CompileArgs(**vars(cli_args))  # A sanity check to ensure class CompileArgs is updated as well.
+    compile_kernel(args)
+def compile_kernel(args: CompileArgs):
     out_name = args.out_name if args.out_name else args.kernel_name
     out_path = args.out_path if args.out_path else Path(out_name)
@@ -108,10 +133,18 @@ if __name__ == "__main__":
         assert h in [1, 16], f"Only 1 and 16 are valid hints, got {h}"
     attrs = {k: [["tt.divisibility", 16]] for k, v in hints.items() if v == 16}
     src = triton.compiler.ASTSource(fn=kernel, constexprs=constants, signature=signature, attrs=attrs)
-    opts = {"num_warps": args.num_warps, "num_stages": args.num_stages}
-    ccinfo = triton.compile(src, options=opts)
-    if ccinfo.metadata.global_scratch_size > 0:
+    target = triton.backends.compiler.GPUTarget(*args.target.split(":")) \
+        if args.target else triton.runtime.driver.active.get_current_target()
+    backend = triton.compiler.make_backend(target)
+    kwargs = {"num_warps": args.num_warps, "num_stages": args.num_stages}
+    options = backend.parse_options(kwargs)
+    ccinfo = triton.compile(src, target=target, options=options.__dict__)
+    if getattr(ccinfo.metadata, "global_scratch_size", 0) > 0:
         raise RuntimeError("AOT compiling kernels with global scratch requirements is not yet implemented")
+    if ccinfo.metadata.profile_scratch_size > 0:
+        raise RuntimeError("AOT compiling kernels with profile scratch requirements is not yet implemented")
     arg_names = []
     arg_types = []
@@ -136,8 +169,12 @@ if __name__ == "__main__":
         if hints.get((i, ), None) == 16:
             suffix += 'd'
     func_name = '_'.join([out_name, sig_hash, suffix])
-    asm = ccinfo.asm["cubin"]  # store binary data once
+    asm = ccinfo.asm[backend.binary_ext]  # store binary data once
     hex_ = str(binascii.hexlify(asm))[2:-1]
+    ty_to_cpp = triton.runtime.driver.active.map_python_to_cpp_type
     params = {
         "kernel_name": func_name,
         "triton_kernel_name": args.kernel_name,
@@ -145,18 +182,29 @@ if __name__ == "__main__":
         "bin_data": ", ".join([f"0x{x}{y}" for x, y in zip(hex_[::2], hex_[1::2])]),
         "signature": ", ".join([f"{ty_to_cpp(ty)} {name}" for name, ty in zip(arg_names_not_1, arg_types_not_1)]),
         "full_signature": ", ".join([f"{ty_to_cpp(ty)} {name}" for name, ty in zip(arg_names, arg_types)]),
-        "arg_pointers": ", ".join([f"&{arg}" for arg in arg_names_not_1] + ["&global_scratch"]),
-        "num_args": len(arg_names_not_1) + 1,
+        "arg_pointers": ", ".join([f"&{arg}" for arg in arg_names_not_1] + ["&global_scratch"] + ["&profile_scratch"]),
+        "num_args": len(arg_names_not_1) + 2,  # +2 for global and profile scratch
         "kernel_docstring": doc_string,
         "shared": ccinfo.metadata.shared,
         "num_warps": args.num_warps,
-        "algo_info": '_'.join([const_sig, meta_sig]),
+        "algo_info": "_".join([const_sig, meta_sig]),
         "gridX": grid[0],
         "gridY": grid[1],
         "gridZ": grid[2],
         "_placeholder": "",
     }
-    for ext in ['h', 'c']:
-        template_path = Path(__file__).parent / "extra" / "cuda" / f"compile.{ext}"
-        with out_path.with_suffix(f".{sig_hash}_{suffix}.{ext}").open("w") as fp:
-            fp.write(Path(template_path).read_text().format(**params))
+    output_files = []
+    backend_name = target.backend
+    template_dir = Path(__file__).parent / "extra" / backend_name
+    for template_path in template_dir.glob('compile.*'):
+        ext = template_path.suffix
+        output_file = out_path.with_suffix(f".{sig_hash}_{suffix}{ext}")
+        with output_file.open("w") as fp:
+            fp.write(template_path.read_text().format(**params))
+        output_files.append(output_file)
+    return func_name, output_files
+if __name__ == "__main__":
+    main()

triton/tools/extra/cuda/compile.c CHANGED Viewed

@@ -61,6 +61,7 @@ CUresult {kernel_name}(CUstream stream, {signature}) {{
     unsigned int gY = {gridY};
     unsigned int gZ = {gridZ};
     CUdeviceptr global_scratch = 0;
+    CUdeviceptr profile_scratch = 0;
     void *args[{num_args}] = {{ {arg_pointers} }};
     // TODO: shared memory
     if(gX * gY * gZ > 0)

triton/tools/extra/hip/compile.cpp ADDED Viewed

@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+/* clang-format off */
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <hip/hip_runtime.h>
+// helpers to check for hip errors
+#define HIP_CHECK(ans) {{\
+    gpuAssert((ans), __FILE__, __LINE__);\
+  }}\
+static inline void gpuAssert(hipError_t code, const char *file, int line) {{
+  if (code != hipSuccess) {{
+    const char *prefix = "Triton Error [HIP]: ";
+    const char *str;
+    hipDrvGetErrorString(code, &str);
+    char err[1024] = {{0}};
+    strcat(err, prefix);
+    strcat(err, str);
+    printf("%s\\n", err);
+    exit(code);
+  }}
+}}
+// globals
+#define HSACO_NAME {kernel_name}_hsaco
+hipModule_t {kernel_name}_mod = nullptr;
+hipFunction_t {kernel_name}_func = nullptr;
+unsigned char HSACO_NAME[{bin_size}] = {{ {bin_data} }};
+void unload_{kernel_name}(void) {{
+    HIP_CHECK(hipModuleUnload({kernel_name}_mod));
+}}
+void load_{kernel_name}() {{
+    int dev = 0;
+    void *bin = (void *)&HSACO_NAME;
+    int shared = {shared};
+    HIP_CHECK(hipModuleLoadData(&{kernel_name}_mod, bin));
+    HIP_CHECK(hipModuleGetFunction(&{kernel_name}_func, {kernel_name}_mod, "{triton_kernel_name}"));
+}}
+/*
+{kernel_docstring}
+*/
+hipError_t {kernel_name}(hipStream_t stream, {signature}) {{
+    if ({kernel_name}_func == nullptr)
+       load_{kernel_name}();
+    unsigned int gX = {gridX};
+    unsigned int gY = {gridY};
+    unsigned int gZ = {gridZ};
+    hipDeviceptr_t global_scratch = 0;
+    hipDeviceptr_t profile_scratch = 0;
+    void *args[{num_args}] = {{ {arg_pointers} }};
+    // TODO: shared memory
+    if(gX * gY * gZ > 0)
+      return hipModuleLaunchKernel({kernel_name}_func, gX, gY, gZ, {num_warps} * warpSize, 1, 1, {shared}, stream, args, nullptr);
+    else
+      return hipErrorInvalidValue;
+}}

triton/tools/extra/hip/compile.h ADDED Viewed

@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <hip/hip_runtime.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+void unload_{kernel_name}(void);
+void load_{kernel_name}(void);
+hipError_t{_placeholder} {kernel_name}(hipStream_t stream, {signature});

triton/tools/ragged_tma.py ADDED Viewed

@@ -0,0 +1,92 @@
+import triton
+import triton.language as tl
+from triton.tools.tensor_descriptor import TensorDescriptor
+# fmt: off
+def create_ragged_descriptor(T, block_shape, ragged_dim=0):
+    """
+    Given a 2- or 3-dimensional tensor T, this creates a 'ragged descriptor'
+    which behaves like a concatenation (along the first axis) of subarrays
+    of potentially unequal size.
+    The load_ragged and store_ragged device functions can be used to read
+    and write from subarrays T[batch_offset : batch_offset + batch_size]
+    with hardware bounds-checking preventing any sort of leakage outside
+    the subarray.
+    """
+    block_shape = list(block_shape)
+    tensor_shape = list(T.shape)
+    rank = len(tensor_shape)
+    if ragged_dim < 0:
+        ragged_dim += rank
+    assert 0 <= ragged_dim < rank - 1, "last dimension cannot be ragged"
+    assert rank <= 3, "read-write ragged descriptors must have at most 3 dimensions"
+    assert len(block_shape) == rank, "block shape must have same length as tensor shape"
+    max_int = 0x7fff0000
+    billion = 0x40000000  # == 2**30
+    assert tensor_shape[ragged_dim] <= billion, "number of rows may not exceed 2**30"
+    tensor_shape[ragged_dim] = billion
+    ragged_stride = T.stride(ragged_dim)
+    # we prepend an extra two dimensions and rely on the fact that pointers
+    # have 64-bit wraparound semantics:
+    tma_stride = [2**34 - ragged_stride, ragged_stride] + [T.stride(i) for i in range(rank)]
+    tma_shape  = [max_int, max_int] + tensor_shape
+    box_shape  = [1, 1] + block_shape
+    return TensorDescriptor(T, tma_shape, tma_stride, box_shape)
+@triton.jit
+def to_ragged_indices(batch_offset, batch_size, row):
+    """
+    Helper function for load_ragged and store_ragged.
+    """
+    billion = 0x40000000  # == 2**30
+    x = billion - batch_size + row
+    y = batch_offset + batch_size
+    return billion, y, x
+@triton.jit
+def load_ragged(TMA, batch_offset, batch_size, coords, ragged_dim: tl.constexpr = 0):
+    """
+    Read from a subarray T[batch_offset : batch_offset + batch_size] with
+    hardware bounds-checking, where reading outside the subarray gives zeros.
+    Coords should be an appropriately-sized list of integers, just like in
+    TMA.load().
+    """
+    tl.static_assert(len(TMA.shape) == len(coords) + 2, "TMA must be a read-write ragged descriptor")
+    c0, c1, c2 = to_ragged_indices(batch_offset, batch_size, coords[ragged_dim])
+    data = TMA.load([c0, c1] + coords[:ragged_dim] + [c2] + coords[ragged_dim + 1:])
+    data = tl.reshape(data, data.shape[2:])
+    return data
+@triton.jit
+def store_ragged(TMA, batch_offset, batch_size, coords, data, ragged_dim: tl.constexpr = 0):
+    """
+    Write to a subarray T[batch_offset : batch_offset + batch_size] with
+    hardware bounds-checking, where writes outside the subarray are masked
+    correctly.
+    Coords should be an appropriately-sized list of integers, just like in
+    TMA.store().
+    """
+    c0, c1, c2 = to_ragged_indices(batch_offset, batch_size, coords[ragged_dim])
+    data = tl.reshape(data, [1, 1] + data.shape)
+    TMA.store([c0, c1] + coords[:ragged_dim] + [c2] + coords[ragged_dim + 1:], data)

triton/tools/tensor_descriptor.py CHANGED Viewed

@@ -9,6 +9,7 @@ class TensorDescriptor:
     shape: List[int]
     strides: List[int]
     block_shape: List[int]
+    padding: str = "zero"
     def __post_init__(self):
         rank = len(self.shape)
@@ -17,20 +18,17 @@ class TensorDescriptor:
         assert rank > 0, "rank must not be zero"
         assert rank <= 5, "rank cannot be more than 5"
         ty = type(self.base)
-        type_name = f"{ty.__module__}.{ty.__name__}"
-        if type_name not in ("torch.FakeTensor", "torch.FunctionalTensor"):
+        if ty.__name__ not in ("FakeTensor", "FunctionalTensor"):
             assert self.base.data_ptr() % 16 == 0, "base must be 16-byte aligned"
         validate_block_shape(self.block_shape)
         elem_bytes = self.base.dtype.itemsize
         for stride in self.strides[:-1]:
             assert (stride * elem_bytes) % 16 == 0, "strides must be 16-byte aligned"
         assert self.strides[-1] == 1, "Last dimension must be contiguous"
+        assert self.padding == "zero" or self.padding == "nan", "Illegal value for padding"
+        if self.padding == "nan":
+            assert self.base.dtype.is_floating_point, "Padding option `nan` is only supported for floating point tensors"
     @staticmethod
-    def from_tensor(tensor: Any, block_shape: List[int]):
-        return TensorDescriptor(
-            tensor,
-            tensor.shape,
-            tensor.stride(),
-            block_shape,
-        )
+    def from_tensor(tensor: Any, block_shape: List[int], padding="zero"):
+        return TensorDescriptor(tensor, tensor.shape, tensor.stride(), block_shape, padding)

triton/windows_utils.py CHANGED Viewed

@@ -54,14 +54,11 @@ def max_version(
 def check_msvc(msvc_base_path: Path, version: str) -> bool:
-    return all(
-        x.exists()
-        for x in [
-            msvc_base_path / version / "bin" / "Hostx64" / "x64" / "cl.exe",
-            msvc_base_path / version / "include" / "vcruntime.h",
-            msvc_base_path / version / "lib" / "x64" / "vcruntime.lib",
-        ]
-    )
+    return all(x.exists() for x in [
+        msvc_base_path / version / "bin" / "Hostx64" / "x64" / "cl.exe",
+        msvc_base_path / version / "include" / "vcruntime.h",
+        msvc_base_path / version / "lib" / "x64" / "vcruntime.lib",
+    ])
 def find_msvc_env() -> tuple[Optional[Path], Optional[str]]:
@@ -72,20 +69,16 @@ def find_msvc_env() -> tuple[Optional[Path], Optional[str]]:
     version = os.getenv("VCToolsVersion")
     if not check_msvc(msvc_base_path, version):
-        warnings.warn(
-            f"Environment variables VCINSTALLDIR = {os.getenv('VCINSTALLDIR')}, "
-            f"VCToolsVersion = {os.getenv('VCToolsVersion')} are set, "
-            "but this MSVC installation is incomplete."
-        )
+        warnings.warn(f"Environment variables VCINSTALLDIR = {os.getenv('VCINSTALLDIR')}, "
+                      f"VCToolsVersion = {os.getenv('VCToolsVersion')} are set, "
+                      "but this MSVC installation is incomplete.")
         return None, None
     return msvc_base_path, version
 def find_msvc_vswhere() -> tuple[Optional[Path], Optional[str]]:
-    vswhere_path = find_in_program_files(
-        r"Microsoft Visual Studio\Installer\vswhere.exe"
-    )
+    vswhere_path = find_in_program_files(r"Microsoft Visual Studio\Installer\vswhere.exe")
     if vswhere_path is None:
         return None, None
@@ -111,9 +104,7 @@ def find_msvc_vswhere() -> tuple[Optional[Path], Optional[str]]:
     if not msvc_base_path.exists():
         return None, None
-    version = max_version(
-        os.listdir(msvc_base_path), check=partial(check_msvc, msvc_base_path)
-    )
+    version = max_version(os.listdir(msvc_base_path), check=partial(check_msvc, msvc_base_path))
     if version is None:
         return None, None
@@ -132,9 +123,7 @@ def find_msvc_envpath() -> tuple[Optional[Path], Optional[str]]:
         if not msvc_base_path.exists():
             continue
-        version = max_version(
-            os.listdir(msvc_base_path), check=partial(check_msvc, msvc_base_path)
-        )
+        version = max_version(os.listdir(msvc_base_path), check=partial(check_msvc, msvc_base_path))
         if version is None:
             continue
@@ -153,9 +142,7 @@ def find_msvc_hardcoded() -> tuple[Optional[Path], Optional[str]]:
     paths = sorted(paths)[::-1]
     for msvc_base_path in paths:
         msvc_base_path = Path(msvc_base_path)
-        version = max_version(
-            os.listdir(msvc_base_path), check=partial(check_msvc, msvc_base_path)
-        )
+        version = max_version(os.listdir(msvc_base_path), check=partial(check_msvc, msvc_base_path))
         if version is None:
             continue
         return msvc_base_path, version
@@ -188,13 +175,10 @@ def find_msvc(env_only: bool) -> tuple[Optional[str], list[str], list[str]]:
 def check_winsdk(winsdk_base_path: Path, version: str) -> bool:
-    return all(
-        x.exists()
-        for x in [
-            winsdk_base_path / "Include" / version / "ucrt" / "stdlib.h",
-            winsdk_base_path / "Lib" / version / "ucrt" / "x64" / "ucrt.lib",
-        ]
-    )
+    return all(x.exists() for x in [
+        winsdk_base_path / "Include" / version / "ucrt" / "stdlib.h",
+        winsdk_base_path / "Lib" / version / "ucrt" / "x64" / "ucrt.lib",
+    ])
 def find_winsdk_env() -> tuple[Optional[Path], Optional[str]]:
@@ -205,18 +189,16 @@ def find_winsdk_env() -> tuple[Optional[Path], Optional[str]]:
     version = os.getenv("WindowsSDKVersion")
     if version is None:
-        warnings.warn(
-            f"Environment variable WindowsSdkDir = {os.getenv('WindowsSdkDir')}, "
-            "but WindowsSDKVersion is not set."
-        )
+        version = os.getenv("WindowsSDKVer")
+    if version is None:
+        warnings.warn(f"Environment variable WindowsSdkDir = {winsdk_base_path}, "
+                      "but WindowsSDKVersion (or WindowsSDKVer) is not set.")
         return None, None
     version = version.rstrip("\\")
     if not check_winsdk(winsdk_base_path, version):
-        warnings.warn(
-            f"Environment variables WindowsSdkDir = {os.getenv('WindowsSdkDir')}, "
-            f"WindowsSDKVersion = {os.getenv('WindowsSDKVersion')} are set, "
-            "but this Windows SDK installation is incomplete."
-        )
+        warnings.warn(f"Environment variables WindowsSdkDir = {winsdk_base_path}, "
+                      f"WindowsSDKVersion (or WindowsSDKVer) = {version} are set, "
+                      "but this Windows SDK installation is incomplete.")
         return None, None
     return winsdk_base_path, version
@@ -225,9 +207,7 @@ def find_winsdk_env() -> tuple[Optional[Path], Optional[str]]:
 def find_winsdk_registry() -> tuple[Optional[Path], Optional[str]]:
     try:
         reg = winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE)
-        key = winreg.OpenKeyEx(
-            reg, r"SOFTWARE\WOW6432Node\Microsoft\Microsoft SDKs\Windows\v10.0"
-        )
+        key = winreg.OpenKeyEx(reg, r"SOFTWARE\WOW6432Node\Microsoft\Microsoft SDKs\Windows\v10.0")
         folder = winreg.QueryValueEx(key, "InstallationFolder")[0]
         winreg.CloseKey(key)
     except OSError:
@@ -294,9 +274,7 @@ def find_winsdk(env_only: bool) -> tuple[list[str], list[str]]:
 @functools.lru_cache
-def find_msvc_winsdk(
-    env_only: bool = False,
-) -> tuple[Optional[str], list[str], list[str]]:
+def find_msvc_winsdk(env_only: bool = False, ) -> tuple[Optional[str], list[str], list[str]]:
     msvc_bin_path, msvc_inc_dirs, msvc_lib_dirs = find_msvc(env_only)
     winsdk_inc_dirs, winsdk_lib_dirs = find_winsdk(env_only)
     return (
@@ -312,9 +290,9 @@ def find_python() -> list[str]:
     if sysconfig.get_config_var("Py_GIL_DISABLED"):
         version += "t"
     for python_base_path in [
-        sys.exec_prefix,
-        sys.base_exec_prefix,
-        os.path.dirname(sys.executable),
+            sys.exec_prefix,
+            sys.base_exec_prefix,
+            os.path.dirname(sys.executable),
     ]:
         python_lib_dir = Path(python_base_path) / "libs"
         if (python_lib_dir / f"python{version}.lib").exists():
@@ -326,14 +304,11 @@ def find_python() -> list[str]:
 def check_and_find_cuda(base_path: Path) -> tuple[Optional[str], list[str], list[str]]:
     # pip
-    if all(
-        x.exists()
-        for x in [
+    if all(x.exists() for x in [
             base_path / "cuda_nvcc" / "bin" / "ptxas.exe",
             base_path / "cuda_runtime" / "include" / "cuda.h",
             base_path / "cuda_runtime" / "lib" / "x64" / "cuda.lib",
-        ]
-    ):
+    ]):
         return (
             str(base_path / "cuda_nvcc" / "bin"),
             [str(base_path / "cuda_runtime" / "include")],
@@ -341,14 +316,11 @@ def check_and_find_cuda(base_path: Path) -> tuple[Optional[str], list[str], list
         )
     # conda
-    if all(
-        x.exists()
-        for x in [
+    if all(x.exists() for x in [
             base_path / "bin" / "ptxas.exe",
             base_path / "include" / "cuda.h",
             base_path / "lib" / "cuda.lib",
-        ]
-    ):
+    ]):
         return (
             str(base_path / "bin"),
             [str(base_path / "include")],
@@ -356,14 +328,11 @@ def check_and_find_cuda(base_path: Path) -> tuple[Optional[str], list[str], list
         )
     # bundled or system-wide
-    if all(
-        x.exists()
-        for x in [
+    if all(x.exists() for x in [
             base_path / "bin" / "ptxas.exe",
             base_path / "include" / "cuda.h",
             base_path / "lib" / "x64" / "cuda.lib",
-        ]
-    ):
+    ]):
         return (
             str(base_path / "bin"),
             [str(base_path / "include")],
@@ -380,9 +349,7 @@ def find_cuda_env() -> tuple[Optional[str], list[str], list[str]]:
             continue
         cuda_base_path = Path(cuda_base_path)
-        cuda_bin_path, cuda_inc_dirs, cuda_lib_dirs = check_and_find_cuda(
-            cuda_base_path
-        )
+        cuda_bin_path, cuda_inc_dirs, cuda_lib_dirs = check_and_find_cuda(cuda_base_path)
         if cuda_bin_path:
             return cuda_bin_path, cuda_inc_dirs, cuda_lib_dirs
@@ -390,9 +357,7 @@ def find_cuda_env() -> tuple[Optional[str], list[str], list[str]]:
 def find_cuda_bundled() -> tuple[Optional[str], list[str], list[str]]:
-    cuda_base_path = (
-        Path(sysconfig.get_paths()["platlib"]) / "triton" / "backends" / "nvidia"
-    )
+    cuda_base_path = (Path(sysconfig.get_paths()["platlib"]) / "triton" / "backends" / "nvidia")
     return check_and_find_cuda(cuda_base_path)
@@ -416,9 +381,7 @@ def find_cuda_hardcoded() -> tuple[Optional[str], list[str], list[str]]:
     paths = sorted(paths)[::-1]
     for cuda_base_path in paths:
         cuda_base_path = Path(cuda_base_path)
-        cuda_bin_path, cuda_inc_dirs, cuda_lib_dirs = check_and_find_cuda(
-            cuda_base_path
-        )
+        cuda_bin_path, cuda_inc_dirs, cuda_lib_dirs = check_and_find_cuda(cuda_base_path)
         if cuda_bin_path:
             return cuda_bin_path, cuda_inc_dirs, cuda_lib_dirs
@@ -428,11 +391,11 @@ def find_cuda_hardcoded() -> tuple[Optional[str], list[str], list[str]]:
 @functools.lru_cache
 def find_cuda() -> tuple[Optional[str], list[str], list[str]]:
     for f in [
-        find_cuda_env,
-        find_cuda_bundled,
-        find_cuda_pip,
-        find_cuda_conda,
-        find_cuda_hardcoded,
+            find_cuda_env,
+            find_cuda_bundled,
+            find_cuda_pip,
+            find_cuda_conda,
+            find_cuda_hardcoded,
     ]:
         cuda_bin_path, cuda_inc_dirs, cuda_lib_dirs = f()
         if cuda_bin_path:

{triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: triton-windows
-Version: 3.4.0.post20
+Version: 3.5.0.post21
 Summary: A language and compiler for custom Deep Learning operations
 Home-page: https://github.com/woct0rdho/triton-windows
 Author: Philippe Tillet, Dian Wu
@@ -10,14 +10,13 @@ Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: Topic :: Software Development :: Build Tools
 Classifier: License :: OSI Approved :: MIT License
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
-Requires-Python: >=3.9,<3.14
+Classifier: Programming Language :: Python :: 3.14
+Requires-Python: >=3.10,<3.15
 License-File: LICENSE
-Requires-Dist: setuptools>=40.8.0
 Requires-Dist: importlib-metadata; python_version < "3.10"
 Provides-Extra: build
 Requires-Dist: cmake<4.0,>=3.20; extra == "build"