triton-windows 3.2.0.post11__cp312-cp312-win_amd64.whl → 3.3.0a0.post11__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of triton-windows might be problematic.

Files changed (68)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +3 -3
  3. triton/_internal_testing.py +59 -4
  4. triton/_utils.py +35 -0
  5. triton/backends/amd/compiler.py +121 -74
  6. triton/backends/amd/driver.py +77 -43
  7. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +28 -49
  8. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +35 -9
  9. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +761 -284
  10. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +9 -3
  11. triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +1391 -0
  12. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +3 -3
  13. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +44 -0
  14. triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +288 -0
  15. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +110 -14
  16. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +504 -103
  17. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +2 -1
  18. triton/backends/amd/include/hip/amd_detail/host_defines.h +4 -0
  19. triton/backends/amd/include/hip/hip_ext.h +4 -2
  20. triton/backends/amd/include/hip/hip_fp8.h +33 -0
  21. triton/backends/amd/include/hip/hip_runtime_api.h +375 -33
  22. triton/backends/amd/include/hip/hip_version.h +3 -3
  23. triton/backends/amd/include/hip/hiprtc.h +25 -25
  24. triton/backends/amd/include/hsa/amd_hsa_elf.h +40 -14
  25. triton/backends/amd/include/hsa/hsa.h +11 -2
  26. triton/backends/amd/include/hsa/hsa_api_trace.h +30 -17
  27. triton/backends/amd/include/hsa/hsa_api_trace_version.h +68 -0
  28. triton/backends/amd/include/hsa/hsa_ext_amd.h +83 -27
  29. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +46 -46
  30. triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +416 -0
  31. triton/backends/amd/include/roctracer/hip_ostream_ops.h +84 -4
  32. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +260 -0
  33. triton/backends/amd/include/roctracer/hsa_prof_str.h +51 -19
  34. triton/backends/amd/lib/asanrtl.bc +0 -0
  35. triton/backends/compiler.py +25 -225
  36. triton/backends/driver.py +7 -2
  37. triton/backends/nvidia/bin/ptxas.exe +0 -0
  38. triton/backends/nvidia/compiler.py +135 -90
  39. triton/backends/nvidia/driver.c +0 -1
  40. triton/backends/nvidia/driver.py +135 -49
  41. triton/backends/nvidia/include/cuda.h +2162 -241
  42. triton/backends/nvidia/lib/x64/cuda.lib +0 -0
  43. triton/compiler/__init__.py +2 -2
  44. triton/compiler/code_generator.py +334 -231
  45. triton/compiler/compiler.py +77 -66
  46. triton/language/__init__.py +22 -5
  47. triton/language/core.py +448 -74
  48. triton/language/extra/cuda/_experimental_tma.py +3 -5
  49. triton/language/math.py +1 -1
  50. triton/language/random.py +2 -1
  51. triton/language/semantic.py +206 -52
  52. triton/language/standard.py +35 -18
  53. triton/runtime/_allocation.py +32 -0
  54. triton/runtime/autotuner.py +27 -32
  55. triton/runtime/build.py +1 -48
  56. triton/runtime/cache.py +6 -6
  57. triton/runtime/errors.py +10 -0
  58. triton/runtime/interpreter.py +179 -45
  59. triton/runtime/jit.py +149 -190
  60. triton/testing.py +39 -11
  61. triton/tools/compile.py +27 -20
  62. triton/tools/{compile.c → extra/cuda/compile.c} +1 -0
  63. triton/tools/mxfp.py +301 -0
  64. {triton_windows-3.2.0.post11.dist-info → triton_windows-3.3.0a0.post11.dist-info}/METADATA +5 -2
  65. {triton_windows-3.2.0.post11.dist-info → triton_windows-3.3.0a0.post11.dist-info}/RECORD +68 -59
  66. {triton_windows-3.2.0.post11.dist-info → triton_windows-3.3.0a0.post11.dist-info}/top_level.txt +2 -0
  67. /triton/tools/{compile.h → extra/cuda/compile.h} +0 -0
  68. {triton_windows-3.2.0.post11.dist-info → triton_windows-3.3.0a0.post11.dist-info}/WHEEL +0 -0
@@ -3,7 +3,7 @@ import hashlib
 import json
 from .._C.libtriton import get_cache_invalidating_env_vars, ir
 from ..backends import backends
-from ..backends.compiler import GPUTarget, AttrsDescriptor
+from ..backends.compiler import GPUTarget
 from .. import __version__
 from ..runtime.autotuner import OutOfResources
 from ..runtime.cache import get_cache_manager, get_dump_manager, get_override_manager
@@ -15,6 +15,7 @@ from pathlib import Path
 import re
 import functools
 import os
+import sysconfig

 # - ^\s*tt\.func\s+ : match the start of the string, any leading whitespace, the keyword func,
 # and any following whitespace
@@ -24,19 +25,13 @@ import os
 # - (\((?:%\w+: \S+(?: \{\S+ = \S+ : \S+\})?(?:, )?)*\)) : match a pair of parentheses enclosing
 # zero or more arguments separated by commas, and capture it as group 2 (the argument list)
 # - (attributes \{[\S\s]+\})? : optionally match attributes enclosed in braces and capture it as group 3
-mlir_prototype_pattern = r"^\s*tt\.func\s+(?:public\s+)?(@\w+)(\((?:%\w+: [\S\s]+(?: \{\S+ = \S+ : \S+\})?(?:, )?)*\))\s*(attributes \{[\S\s]+\})?\s+\{\s*$"
 ptx_prototype_pattern = r"\.(?:visible|extern)\s+\.(?:entry|func)\s+(\w+)\s*\(([^)]*)\)"
 prototype_pattern = {
-    "ttir": mlir_prototype_pattern,
-    "ttgir": mlir_prototype_pattern,
     "ptx": ptx_prototype_pattern,
 }

-mlir_arg_type_pattern = r'%\w+: ((?:[^,\s<)]+|<[^>]+>)+(?: {[^}]+})?),?'
 ptx_arg_type_pattern = r"\.param\s+\.(\w+)"
 arg_type_pattern = {
-    "ttir": mlir_arg_type_pattern,
-    "ttgir": mlir_arg_type_pattern,
     "ptx": ptx_arg_type_pattern,
 }

@@ -54,46 +49,32 @@ def convert_type_repr(x):
     return x


-def _get_num_warps_from_ir_str(src: str):
-    ttgir_num_warps_pattern = r'"triton_gpu.num-warps"\s?=\s?(\d+)\s?:'
-    # TODO(jlebar): Using a regex to get num-warps is a hack, and will break if
-    # e.g. someone has an instruction (not module) attribute named "num-warps".
-    num_warps_matches = re.findall(ttgir_num_warps_pattern, src)
-    assert len(num_warps_matches) == 1, "Expected exactly one match for num_warps"
-    num_warps = int(num_warps_matches[0])
-    return num_warps
-
-
 class ASTSource:

-    def __init__(self, fn, signature, constants=None, attrs=None) -> None:
+    def __init__(self, fn, signature, constexprs=None, attrs=None) -> None:
         self.fn = fn
         self.ext = "ttir"
         self.name = fn.__name__
         self.signature = signature
-        self.constants = constants
-        self.attrs = attrs
+        self.constants = dict()
+        if constexprs is not None:
+            for k, v in constexprs.items():
+                k = (fn.arg_names.index(k), ) if isinstance(k, str) else k
+                assert isinstance(k, tuple)
+                self.constants[k] = v
+        self.attrs = attrs or dict()
         if isinstance(self.signature, str):
             self.signature = {k: v.strip() for k, v in enumerate(self.signature.split(","))}
-        else:
-            for k in self.signature.keys():
-                if not isinstance(k, str):
-                    raise TypeError("Signature keys must be string")
-        if self.constants is None:
-            self.constants = {}
-        else:
-            for k in self.constants.keys():
-                if not isinstance(k, str):
-                    raise TypeError("Constants keys must be string")
-        if self.attrs is None:
-            self.attrs = AttrsDescriptor()
+        # else:
+        #     for k in self.signature.keys():
+        #         if not isinstance(k, str):
+        #             raise TypeError("Signature keys must be string")

     def hash(self):
         sorted_sig = [v for k, v in sorted(self.signature.items())]
-        # Note - we stringify the keys here to allow sorting to work for cases
-        # where constants have mixed int/str keys.
-        sorted_constants = sorted((str(k), v) for k, v in self.constants.items())
-        key = f"{self.fn.cache_key}-{self.attrs.hash()}-{sorted_sig}-{sorted_constants}"
+        get_key = lambda x: x.cache_key if hasattr(x, 'cache_key') else str(x)
+        constants_key = '-'.join([get_key(v) for k, v in sorted(self.constants.items())])
+        key = f"{self.fn.cache_key}-{str(self.attrs)}-{sorted_sig}-{constants_key}"
         return hashlib.sha256(key.encode("utf-8")).hexdigest()

     def make_ir(self, options, codegen_fns, module_map, context):
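
Note on the ASTSource hunk above: the constructor now takes `constexprs` (keyed by argument name or by index tuple) instead of `constants`, and `attrs` defaults to a plain dict. A minimal sketch of how the new signature might be used directly; the kernel, the signature strings, and the values below are illustrative, not taken from this package:

import triton
import triton.language as tl
from triton.compiler import ASTSource  # assumed to remain re-exported here

@triton.jit
def add_one(x_ptr, n, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    tl.store(x_ptr + offs, tl.load(x_ptr + offs, mask=mask) + 1, mask=mask)

# String keys in `constexprs` are rewritten to (arg_index,) tuples by __init__.
src = ASTSource(
    fn=add_one,
    signature={"x_ptr": "*fp32", "n": "i32", "BLOCK": "constexpr"},
    constexprs={"BLOCK": 128},
)
assert src.constants == {(2,): 128}   # "BLOCK" is argument index 2
# src can then be handed to triton.compile(src) as before.
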
@@ -106,28 +87,42 @@ class ASTSource:

 class IRSource:

-    def __init__(self, path):
+    def __init__(self, path, context, backend):
         self.path = path
         path = Path(path)
         self.ext = path.suffix[1:]
         self.src = path.read_text()
-        match = re.search(prototype_pattern[self.ext], self.src, re.MULTILINE)
-        self.name = match.group(1)
-        signature = match.group(2)
-        types = re.findall(arg_type_pattern[self.ext], signature)
-        self.signature = {k: convert_type_repr(ty) for k, ty in enumerate(types)}
+        ir.load_dialects(context)
+        backend.load_dialects(context)
+
+        # We don't have a easy-to-use PTX parser that we can use, so keep that regex for now.
+        # TODO - replace with a proper parser
+        if self.ext == "ptx":
+            match = re.search(prototype_pattern[self.ext], self.src, re.MULTILINE)
+            self.name = match.group(1)
+            signature = match.group(2)
+            types = re.findall(arg_type_pattern[self.ext], signature)
+            self.signature = {k: convert_type_repr(ty) for k, ty in enumerate(types)}
+        else:
+            self.module = ir.parse_mlir_module(self.path, context)
+            fn_name = self.module.get_entry_func_name()
+            self.name = "@" + fn_name
+            funcOp = self.module.get_function(fn_name)
+            func_ty = self.module.get_function_signature(funcOp)
+            self.signature = {k: ty for k, ty in enumerate(func_ty)}

     def hash(self):
         return hashlib.sha256(self.src.encode("utf-8")).hexdigest()

     def make_ir(self, options, codegen_fns, module_map, context):
-        module = ir.parse_mlir_module(self.path, context)
-        module.context = context
-        return module
+        self.module.context = context
+        return self.module

     def parse_options(self):
         if self.ext == "ttgir":
-            return {'num_warps': _get_num_warps_from_ir_str(self.src)}
+            num_warps = self.module.get_int_attr("ttg.num-warps")
+            assert num_warps is not None, "Unable to parse ttg.num-warps attribute"
+            return {'num_warps': num_warps}
         return dict()


@@ -151,11 +146,8 @@ def triton_key():

     # backend
     libtriton_hash = hashlib.sha256()
-    if os.name == "nt":
-        so_name = "libtriton.pyd"
-    else:
-        so_name = "libtriton.so"
-    with open(os.path.join(TRITON_PATH, f"_C/{so_name}"), "rb") as f:
+    ext = sysconfig.get_config_var("EXT_SUFFIX").split(".")[-1]
+    with open(os.path.join(TRITON_PATH, "_C", f"libtriton.{ext}"), "rb") as f:
         while True:
             chunk = f.read(1024**2)
             if not chunk:
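
The hunk above replaces the hard-coded libtriton.pyd / libtriton.so choice with the interpreter's own extension-module suffix. A stdlib-only illustration of why this works on both platforms:

import sysconfig

# EXT_SUFFIX is e.g. ".cp312-win_amd64.pyd" on this wheel's CPython 3.12 Windows build
# and ".cpython-312-x86_64-linux-gnu.so" on a typical Linux build, so the last
# dot-separated field is "pyd" or "so" respectively.
ext = sysconfig.get_config_var("EXT_SUFFIX").split(".")[-1]
print(f"libtriton.{ext}")
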
@@ -175,9 +167,9 @@ def parse(full_name, ext, context):
         module = ir.parse_mlir_module(full_name, context)
         module.context = context
         return module
-    if ext == "llir" or ext == "ptx":
+    if ext == "llir" or ext == "ptx" or ext == "amdgcn":
         return Path(full_name).read_text()
-    if ext == "cubin":
+    if ext == "cubin" or ext == "hsaco":
         return Path(full_name).read_bytes()


@@ -200,6 +192,7 @@ def filter_traceback(e: BaseException):
         "/triton/compiler/code_generator.py",
         "/ast.py",
     ]
+    BAD_FILES = [bad_file.replace("/", os.sep) for bad_file in BAD_FILES]

     tb = e.__traceback__
     frames = []
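
The added line above is a Windows-specific fix: traceback frame filenames use the native separator, so the hard-coded POSIX paths in BAD_FILES would never match on Windows. A stdlib-only illustration:

import os

bad_file = "/triton/compiler/code_generator.py"
print(bad_file.replace("/", os.sep))
# POSIX:   /triton/compiler/code_generator.py  (unchanged)
# Windows: \triton\compiler\code_generator.py  (now matches Windows-style frame paths)
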
@@ -227,7 +220,9 @@ def compile(src, target=None, options=None):
     # create backend
     if ir_source:
         assert isinstance(src, str), "source must be either AST or a filepath"
-        src = IRSource(src)
+        context = ir.context()
+        src = IRSource(src, context, backend)
+
     extra_options = src.parse_options()
     options = backend.parse_options(dict(options or dict(), **extra_options))
     # create cache manager
@@ -239,6 +234,7 @@ def compile(src, target=None, options=None):
     # core changes to make it easier to track kernels by hash.
     enable_override = os.environ.get("TRITON_KERNEL_OVERRIDE", "0") == "1"
     enable_ir_dump = os.environ.get("TRITON_KERNEL_DUMP", "0") == "1"
+    store_only_binary = os.environ.get("TRITON_STORE_BINARY_ONLY", "0") == "1"
     fn_override_manager = get_override_manager(src.hash()) if enable_override else None
     fn_dump_manager = get_dump_manager(src.hash()) if enable_ir_dump else None
     # Pre-truncate the file name here to avoid hitting the 255 character limit on common platforms.
@@ -252,7 +248,6 @@ def compile(src, target=None, options=None):
     always_compile = os.environ.get("TRITON_ALWAYS_COMPILE", "0") == "1"
     if not always_compile and metadata_path is not None:
         # cache hit!
-        metadata = json.loads(Path(metadata_path).read_text())
         return CompiledKernel(src, metadata_group, hash)
     # initialize metadata
     metadata = {
@@ -261,6 +256,7 @@ def compile(src, target=None, options=None):
         **options.__dict__,
         **env_vars,
     }
+    metadata["triton_version"] = __version__
    # run compilation pipeline and populate metadata
     stages = dict()
     backend.add_stages(stages, options)
@@ -268,10 +264,15 @@ def compile(src, target=None, options=None):
     # when the source is an IR file, don't apply the passes related to this stage. This makes it easier to write IR level tests.
     if ir_source:
         first_stage += 1
-    context = ir.context()
-    ir.load_dialects(context)
-    backend.load_dialects(context)
-    codegen_fns = backend.get_codegen_implementation()
+
+    # For IRSource, we have already grabbed the context + called both
+    # ir.load_dialects and backend.load_dialects.
+    if not isinstance(src, IRSource):
+        context = ir.context()
+        ir.load_dialects(context)
+        backend.load_dialects(context)
+
+    codegen_fns = backend.get_codegen_implementation(options)
     module_map = backend.get_module_map()
     try:
         module = src.make_ir(options, codegen_fns, module_map, context)
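
Taken together with the IRSource hunks earlier in this file, the MLIR context is now created before the IRSource is built, dialects are loaded exactly once, and for .ttgir inputs num_warps comes from the module's "ttg.num-warps" attribute instead of a regex over the source text. A hedged sketch of the user-facing path this affects; "kernel.ttgir" is a hypothetical file on disk:

import triton

# Passing a path string routes compilation through IRSource.
compiled = triton.compile("kernel.ttgir")
print(compiled.metadata.num_warps)  # option value recorded in the cached metadata
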
@@ -285,7 +286,9 @@ def compile(src, target=None, options=None):
         if (fn_override_manager is not None and (full_name := fn_override_manager.get_file(ir_filename)) is not None):
             print(f"\nOverriding kernel with file {full_name}")
             next_module = parse(full_name, ext, context)
-        metadata_group[ir_filename] = fn_cache_manager.put(next_module, ir_filename)
+        # If TRITON_STORE_BINARY_ONLY is 1, only store cubin/hsaco/json
+        if (not store_only_binary) or (ext in ("cubin", "hsaco", "json")):
+            metadata_group[ir_filename] = fn_cache_manager.put(next_module, ir_filename)
         if fn_dump_manager is not None:
             fn_dump_manager.put(next_module, ir_filename)
         # use an env variable to parse ir from file
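
The new TRITON_STORE_BINARY_ONLY flag gates what lands in the on-disk cache: intermediate stages (such as ttir, ttgir, llir, ptx, amdgcn) are skipped and only the final binary plus the metadata JSON are stored. A hedged usage sketch:

import os

# Must be set before the first kernel compilation in the process.
os.environ["TRITON_STORE_BINARY_ONLY"] = "1"

# Compile kernels as usual; each kernel's cache directory should then contain
# only the cubin/hsaco entry and the metadata .json entry.
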
@@ -302,7 +305,13 @@ def compile(src, target=None, options=None):
     # This is needed to safely finalize threads pool inside context: if current process forks before
     # python GC deletes context object, thread pool in child process will be invalid, which could
     # lead to child crash or hang.
-    context.disable_multithreading()
+    #
+    # However disabling multithreading causes the code to hang if the ASAN pass is enabled
+    # this is likely due to the llvm-symbolizer forking a process
+    # TODO: Reconcile the difference here between the ASAN and non-ASAN path with enabling
+    # multithreading in the MLIR context
+    if not os.environ.get("TRITON_ENABLE_ASAN", "0") == "1":
+        context.disable_multithreading()
     # return handle to compiled kernel
     return CompiledKernel(src, metadata_group, hash)

@@ -390,6 +399,11 @@ class CompiledKernel:
         max_shared = driver.active.utils.get_device_properties(device)["max_shared_mem"]
         if self.metadata.shared > max_shared:
             raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+        if hasattr(self.metadata, "tmem_size") and self.metadata.tmem_size is not None:
+            # Use blackwell max tmem size for now, this should be moved in device properties
+            max_tmem_size = 512  # tmem size in number of columns
+            if self.metadata.tmem_size > max_tmem_size:
+                raise OutOfResources(self.metadata.tmem_size, max_tmem_size, "tensor memory")
         # TODO: n_regs, n_spills should be metadata generated when calling `ptxas`
         self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
             self.name, self.kernel, self.metadata.shared, device)
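
The added block mirrors the existing shared-memory check: kernels whose metadata report more than 512 tensor-memory columns (the Blackwell limit hard-coded here) now fail with the same OutOfResources exception. An illustrative construction of that error, using positional arguments exactly as the check does; the numbers are made up:

from triton.runtime.autotuner import OutOfResources  # same import this file uses

err = OutOfResources(600, 512, "tensor memory")  # required, limit, resource name
print(err)
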
@@ -408,11 +422,8 @@ class CompiledKernel:
         arg_dict = {}
         arg_idx = 0
         for i, arg_name in enumerate(self.src.fn.arg_names):
-            if i in self.src.fn.constexprs:
-                arg_dict[arg_name] = self.src.constants[arg_name]
-            else:
-                arg_dict[arg_name] = args[arg_idx]
-                arg_idx += 1
+            arg_dict[arg_name] = args[arg_idx]
+            arg_idx += 1
         ret.add(self.src.fn.launch_metadata, (grid, self.metadata, arg_dict))
         return ret

@@ -28,6 +28,9 @@ from .core import (
     TRITON_MAX_TENSOR_NUMEL,
     _experimental_descriptor_load,
     _experimental_descriptor_store,
+    _experimental_make_tensor_descriptor,
+    _experimental_reinterpret_tensor_descriptor,
+    _experimental_tensor_descriptor,
     add,
     advance,
     arange,
@@ -66,7 +69,7 @@ from .core import (
     float8e5,
     float8e5b16,
     full,
-    function_type,
+    gather,
     histogram,
     inline_asm_elementwise,
     int1,
@@ -91,6 +94,7 @@ from .core import (
     range,
     reduce,
     reshape,
+    slice,
     split,
     static_assert,
     static_print,
@@ -98,6 +102,8 @@ from .core import (
     store,
     tensor,
     trans,
+    tuple,
+    tuple_type,
     uint16,
     uint32,
     uint64,
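
Among the export changes in these import-list hunks, `function_type` is dropped and `gather` is now re-exported from triton.language. A minimal kernel sketch using it; the pointer names and block size are illustrative, and the indices are assumed to be in-range integers:

import triton
import triton.language as tl

@triton.jit
def gather_block(src_ptr, idx_ptr, out_ptr, BLOCK: tl.constexpr):
    offs = tl.arange(0, BLOCK)
    src = tl.load(src_ptr + offs)   # BLOCK source values
    idx = tl.load(idx_ptr + offs)   # BLOCK integer indices into src
    out = tl.gather(src, idx, 0)    # gather along axis 0
    tl.store(out_ptr + offs, out)
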
@@ -126,6 +132,9 @@ __all__ = [
     "TRITON_MAX_TENSOR_NUMEL",
     "_experimental_descriptor_load",
     "_experimental_descriptor_store",
+    "_experimental_make_tensor_descriptor",
+    "_experimental_reinterpret_tensor_descriptor",
+    "_experimental_tensor_descriptor",
     "abs",
     "add",
     "advance",
@@ -146,7 +155,6 @@ __all__ = [
     "block_type",
     "broadcast",
     "broadcast_to",
-    "builtin",
     "cat",
     "cast",
     "cdiv",
@@ -182,7 +190,7 @@ __all__ = [
     "floor",
     "fma",
     "full",
-    "function_type",
+    "gather",
     "histogram",
     "inline_asm_elementwise",
     "interleave",
@@ -191,7 +199,6 @@ __all__ = [
     "int32",
     "int64",
     "int8",
-    "ir",
     "join",
     "load",
     "log",
@@ -225,6 +232,7 @@ __all__ = [
     "reduce",
     "reshape",
     "rsqrt",
+    "slice",
     "sigmoid",
     "sin",
     "softmax",
@@ -240,7 +248,7 @@ __all__ = [
     "swizzle2d",
     "tensor",
     "trans",
-    "triton",
+    "tuple",
     "uint16",
     "uint32",
     "uint64",
@@ -257,6 +265,12 @@ __all__ = [


 def str_to_ty(name):
+    from builtins import tuple
+
+    if isinstance(name, tuple):
+        fields = type(name).__dict__.get("_fields", None)
+        return tuple_type([str_to_ty(x) for x in name], fields)
+
     if name[0] == "*":
         name = name[1:]
         const = False
@@ -269,6 +283,9 @@ def str_to_ty(name):
     if name == "nvTmaDesc":
         return nv_tma_desc_type()

+    if name == "constexpr":
+        return constexpr
+
     tys = {
         "fp8e4nv": float8e4nv,
         "fp8e4b8": float8e4b8,