triton-windows 3.2.0.post12__cp39-cp39-win_amd64.whl → 3.3.0a0.post12__cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of triton-windows might be problematic.
- triton/_C/libtriton.pyd +0 -0
- triton/__init__.py +3 -3
- triton/_internal_testing.py +59 -4
- triton/_utils.py +35 -0
- triton/backends/amd/compiler.py +121 -74
- triton/backends/amd/driver.py +77 -43
- triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +28 -49
- triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +35 -9
- triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +761 -284
- triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +9 -3
- triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +1391 -0
- triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +3 -3
- triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +44 -0
- triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +288 -0
- triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +110 -14
- triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +504 -103
- triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +2 -1
- triton/backends/amd/include/hip/amd_detail/host_defines.h +4 -0
- triton/backends/amd/include/hip/hip_ext.h +4 -2
- triton/backends/amd/include/hip/hip_fp8.h +33 -0
- triton/backends/amd/include/hip/hip_runtime_api.h +375 -33
- triton/backends/amd/include/hip/hip_version.h +3 -3
- triton/backends/amd/include/hip/hiprtc.h +25 -25
- triton/backends/amd/include/hsa/amd_hsa_elf.h +40 -14
- triton/backends/amd/include/hsa/hsa.h +11 -2
- triton/backends/amd/include/hsa/hsa_api_trace.h +30 -17
- triton/backends/amd/include/hsa/hsa_api_trace_version.h +68 -0
- triton/backends/amd/include/hsa/hsa_ext_amd.h +83 -27
- triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +46 -46
- triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +416 -0
- triton/backends/amd/include/roctracer/hip_ostream_ops.h +84 -4
- triton/backends/amd/include/roctracer/hsa_ostream_ops.h +260 -0
- triton/backends/amd/include/roctracer/hsa_prof_str.h +51 -19
- triton/backends/amd/lib/asanrtl.bc +0 -0
- triton/backends/compiler.py +25 -225
- triton/backends/driver.py +7 -2
- triton/backends/nvidia/bin/ptxas.exe +0 -0
- triton/backends/nvidia/compiler.py +135 -90
- triton/backends/nvidia/driver.c +0 -1
- triton/backends/nvidia/driver.py +135 -49
- triton/backends/nvidia/include/cuda.h +2162 -241
- triton/backends/nvidia/lib/x64/cuda.lib +0 -0
- triton/compiler/__init__.py +2 -2
- triton/compiler/code_generator.py +334 -231
- triton/compiler/compiler.py +77 -66
- triton/language/__init__.py +22 -5
- triton/language/core.py +448 -74
- triton/language/extra/cuda/_experimental_tma.py +3 -5
- triton/language/math.py +1 -1
- triton/language/random.py +2 -1
- triton/language/semantic.py +206 -52
- triton/language/standard.py +35 -18
- triton/runtime/_allocation.py +32 -0
- triton/runtime/autotuner.py +27 -32
- triton/runtime/build.py +1 -48
- triton/runtime/cache.py +6 -6
- triton/runtime/errors.py +10 -0
- triton/runtime/interpreter.py +179 -45
- triton/runtime/jit.py +149 -190
- triton/testing.py +39 -11
- triton/tools/compile.py +27 -20
- triton/tools/{compile.c → extra/cuda/compile.c} +1 -0
- triton/tools/mxfp.py +301 -0
- {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/METADATA +5 -2
- {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/RECORD +68 -59
- {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/top_level.txt +2 -0
- /triton/tools/{compile.h → extra/cuda/compile.h} +0 -0
- {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/WHEEL +0 -0
triton/runtime/jit.py
CHANGED
@@ -11,6 +11,7 @@ from functools import cached_property
 from typing import Callable, Generic, Iterable, Optional, TypeVar, Union, overload, Dict, Any, Tuple
 from ..runtime.driver import driver
 from types import ModuleType
+from .._utils import find_paths_if, get_iterable_path
 
 TRITON_MODULE = __name__[:-len(".runtime.jit")]
 
@@ -275,47 +276,63 @@ class KernelParam:
         return self._param.default != inspect.Parameter.empty
 
 
-
-
-    if align and hasattr(v, "data_ptr") and (v.data_ptr() % 16 == 0):
-        return "D"
-    elif isinstance(v, int):
-        # bool is a subclass of int, so we don't check explicitly above.
-        if align and (v % 16 == 0):
-            return "D"
-        elif v == 1:
-            return "1"
-    return "N"
+dtype2str = {}
+specialize_impl_cache = []
 
 
-
+def create_specialize_impl():
+    if specialize_impl_cache:
+        return specialize_impl_cache[-1]
 
+    from ..language import constexpr
 
-def
+    def specialize_impl(arg, specialize_extra, is_const=False, specialize_value=True, align=True):
 
-
-
-
-
-
-
-
-
-
+        if arg is None:
+            return ("constexpr", None)
+        elif isinstance(arg, JITFunction):
+            return ("constexpr", arg.cache_key)
+        elif isinstance(arg, constexpr):
+            return ("constexpr", arg)
+        elif isinstance(arg, bool):
+            return ("i1", None)
+        elif isinstance(arg, int):
+            key = specialize_extra(arg, "int", align=align) if specialize_value else None
+            if arg == 1 and specialize_value:
+                return ("constexpr", 1)
+            elif -(2**31) <= arg and arg <= 2**31 - 1:
+                return ("i32", key)
+            elif 2**63 <= arg and arg <= 2**64 - 1:
+                return ("u64", key)
+            else:
+                return ("i64", key)
+        elif isinstance(arg, float):
+            return ("fp32", None)
+        elif hasattr(arg, "tma_desc_cpu_ptr"):
+            return ("nvTmaDesc", None)
+        elif isinstance(arg, tuple):
+            spec = [specialize_impl(x, specialize_extra) for x in arg]
+            make_tuple = lambda vals: type(arg)(*vals) if hasattr(arg, "_fields") else tuple(vals)
+            tys = make_tuple([x[0] for x in spec])
+            keys = make_tuple([x[1] for x in spec])
+            return (tys, keys)
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # dtypes are hashable so we can memoize this mapping:
+            dsk = (arg.dtype, is_const)
+            res = dtype2str.get(dsk, None)
+            if res is None:
+                res = ("*k" if dsk[1] else "*") + type_canonicalisation_dict[str(dsk[0]).split('.')[-1]]
+                dtype2str[dsk] = res
+            key = specialize_extra(arg, "tensor", align=align) if specialize_value else None
+            return (res, key)
+
+    specialize_impl_cache.append(specialize_impl)
+    return specialize_impl
+
+
+def mangle_type(arg, specialize=False):
+    specialize_impl = create_specialize_impl()
+    return specialize_impl(arg, lambda _, **kwargs: None, specialize_value=specialize)[0]
 
 
 class KernelInterface(Generic[T]):
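Note on the new specialization scheme: `specialize_impl` folds the old `compute_spec_key`/`mangle_type` pair into a single function that returns a `(type, specialization_key)` tuple per argument. The standalone sketch below mirrors only the scalar rules visible in the hunk above (tensor, tuple, and TMA-descriptor branches omitted, and the backend `specialize_extra` hook stubbed out to `None`); it is an illustration, not the shipped implementation:

# Simplified rendering of the scalar branches of specialize_impl (assumption:
# no backend alignment key, so every key is None except the constexpr cases).
def classify(arg, specialize_value=True):
    if arg is None:
        return ("constexpr", None)
    if isinstance(arg, bool):          # checked before int: bool is a subclass of int
        return ("i1", None)
    if isinstance(arg, int):
        if arg == 1 and specialize_value:
            return ("constexpr", 1)    # the value 1 is baked into the kernel
        if -(2**31) <= arg <= 2**31 - 1:
            return ("i32", None)
        if 2**63 <= arg <= 2**64 - 1:
            return ("u64", None)       # only representable as unsigned 64-bit
        return ("i64", None)
    if isinstance(arg, float):
        return ("fp32", None)
    raise TypeError(f"unsupported: {type(arg)}")

assert classify(1) == ("constexpr", 1)
assert classify(7) == ("i32", None)
assert classify(2**40) == ("i64", None)   # too big for i32, fits in i64
assert classify(2**63) == ("u64", None)
assert classify(0.5) == ("fp32", None)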
@@ -335,8 +352,9 @@ def serialize_specialization_data(name, signature, constants, attrs, options, key):
     constants = {key: str(value) if value.__class__.__name__ == "dtype" else value for key, value in constants.items()}
     import json
     obj = {
-        'name': name, 'signature': signature, '
-
+        'name': name, 'signature': signature, 'constant_keys': [list(x) for x in constants.keys()], 'constant_vals':
+        list(constants.values()), 'attrs_keys': [list(x) for x in attrs.keys()], 'attrs_vals': list(attrs.values()),
+        'options': options.__dict__, 'key': key
     }
     serialized_obj = json.dumps(obj)
     return serialized_obj
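The reworked serialization flattens `constants` and `attrs` into parallel key/value lists because their keys are now paths (tuples), which JSON objects cannot represent directly. A hedged sketch of the round trip, with hypothetical values:

import json

# Hypothetical constants keyed by argument paths (tuples), as the binder produces.
constants = {(0,): 64, (2,): None}
obj = {
    'name': 'kernel', 'signature': {'x_ptr': '*fp32'},
    'constant_keys': [list(k) for k in constants.keys()],   # [[0], [2]]
    'constant_vals': list(constants.values()),              # [64, None]
}
round_tripped = json.loads(json.dumps(obj))
# preload() later restores tuple keys with map(tuple, ...).
assert [tuple(k) for k in round_tripped['constant_keys']] == [(0,), (2,)]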
@@ -349,50 +367,32 @@ def create_function_from_signature(sig, kparams, backend):
     basis to avoid having to run these expensive functions -- which constitute
     much of the kernel launch overhead -- every time we run the kernel.
     """
-
     assert len(sig.parameters) == len(kparams)
-
     # Create the function argument list and the dict entries for the return statement
-
-
-
-    non_constexpr_vals = []
-    signature_types = []
-    specialisations = []
-
-    for ((name, sp), kp) in zip(sig.parameters.items(), kparams):
-        if sp.default is inspect.Parameter.empty:
-            func_args.append(name)
-            dict_entries.append(f"'{name}': {name}")
-        else:
-            func_args.append(f"{name}=default_{name}")
-            dict_entries.append(f"'{name}': {name}")
+    specialization = []
+    # signature
+    for name, kp in zip(sig.parameters.keys(), kparams):
         if kp.is_constexpr:
-
+            specialization.append(f'("constexpr", {name})')
         else:
-
-            if
-
-
-            else:
-                specialisations.append('compute_spec_key(%s, align=False)' % name)
+            is_const = 'True' if kp.is_const else 'False'
+            specialize = 'False' if kp.do_not_specialize else 'True'
+            align = 'False' if kp.do_not_specialize_on_alignment else 'True'
+            ret = f"specialize_impl({name}, specialize_extra, {is_const}, {specialize}, {align})"
            if kp.annotation_type:
-
+                specialization.append(f'("{kp.annotation_type}",) + {ret}[1:]')
             else:
-
-
-    cache_key = ''.join([x + ', ' for x in signature_types + specialisations])
-    constexpr_vals = ''.join([x + ', ' for x in constexpr_vals])
-    non_constexpr_vals = ''.join([x + ', ' for x in non_constexpr_vals])
-
-    func_args.append('**excess_kwargs')
+                specialization.append(f"{ret}")
 
+    # compute argument string for a given parameter
+    arg = lambda x: x[0] if x[1].default is inspect.Parameter.empty else f"{x[0]}=default_{x[0]}"
     # Join all arguments into a function definition string
-
-
-
-
-
+    func_body = f"""
+def dynamic_func({", ".join(list(map(arg, sig.parameters.items())) + ["**options"])}):
+    params = {{{', '.join([f"'{name}': {name}" for name in sig.parameters.keys()])}}}
+    specialization = [{','.join(specialization)}]
+    return params, specialization, options
+"""
     # Prepare defaults to be inserted into function namespace
     func_namespace = {
         f"default_{name}": param.default
@@ -400,8 +400,9 @@ def create_function_from_signature(sig, kparams, backend):
         if param.default is not inspect.Parameter.empty
     }
 
-    func_namespace[
-    func_namespace[
+    func_namespace["JITFunction"] = JITFunction
+    func_namespace["specialize_impl"] = create_specialize_impl()
+    func_namespace["specialize_extra"] = backend.get_arg_specialization
 
     # Execute the function string in func_namespace to create the function
     exec(func_body, func_namespace)
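To make the codegen above concrete: for a hypothetical kernel declared as `def kernel(x_ptr, n, BLOCK: tl.constexpr)`, the `exec()`d binder source would look roughly like the `dynamic_func` below. The stand-ins for `specialize_impl`/`specialize_extra` are assumptions for the sake of a runnable sketch; the real ones are placed in `func_namespace` by the generator:

# Hypothetical stand-ins for names the generator injects into func_namespace.
specialize_impl = lambda arg, extra, is_const, spec, align: ("i32", None)
specialize_extra = None

# Approximately the generated source for kernel(x_ptr, n, BLOCK: tl.constexpr):
def dynamic_func(x_ptr, n, BLOCK, **options):
    params = {'x_ptr': x_ptr, 'n': n, 'BLOCK': BLOCK}
    specialization = [
        specialize_impl(x_ptr, specialize_extra, False, True, True),
        specialize_impl(n, specialize_extra, False, True, True),
        ("constexpr", BLOCK),
    ]
    return params, specialization, options

params, spec, opts = dynamic_func(0x1000, 7, 128, num_warps=4)
assert opts == {'num_warps': 4} and spec[-1] == ("constexpr", 128)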
@@ -446,43 +447,6 @@ class JITFunction(KernelInterface[T]):
     # cache_hook will always be called before compilation and compiled_hook after.
     compiled_hook = None
 
-    @staticmethod
-    def _key_of(arg):
-        if hasattr(arg, "dtype"):
-            return arg.dtype
-        elif isinstance(arg, bool):
-            return "i1"
-        elif isinstance(arg, int):
-            if -(2**31) <= arg and arg <= 2**31 - 1:
-                return "i32"
-            elif 2**63 <= arg and arg <= 2**64 - 1:
-                return "u64"
-            else:
-                return "i64"
-        elif isinstance(arg, float):
-            return "fp32"
-        elif arg is None:
-            return None
-        else:
-            raise TypeError(f"Unsupported type {type(arg)} for {arg}")
-
-    @staticmethod
-    def _type_of(key, is_const=False):
-        # `None` is nullptr. Implicitly convert to *i8.
-        if key is None:
-            return "*i8"
-        elif isinstance(key, str):
-            return key
-
-        dtype_str = str(key).split(".")[-1]
-        dtype_str = type_canonicalisation_dict[dtype_str]
-        const_str = "*k" if is_const else "*"
-        return const_str + dtype_str
-
-    def _make_constants(self, constexpr_key):
-        constants = dict(zip(self.constexprs, constexpr_key))
-        return constants
-
     def _call_hook(
         self,
         key,
@@ -501,7 +465,7 @@ class JITFunction(KernelInterface[T]):
         name = self.fn.__name__
         module = self.fn.__module__
         arg_reprs = ", ".join([f"{param.name}: {ty}" for param, ty in zip(self.params, key[1])])
-        repr = f"{name}[num_warps={options.num_warps}, num_ctas={options.num_ctas}, num_stages={options.num_stages}, enable_fp_fusion={options.enable_fp_fusion}]({arg_reprs})"
+        repr = f"{name}[num_warps={options.num_warps}, num_ctas={options.num_ctas}, num_stages={options.num_stages}, enable_fp_fusion={options.enable_fp_fusion}, launch_cooperative_grid={options.launch_cooperative_grid}]({arg_reprs})"
 
         class JitFunctionInfo:
 
@@ -521,6 +485,7 @@ class JITFunction(KernelInterface[T]):
             'num_ctas': options.num_ctas,
             'num_stages': options.num_stages,
             'enable_fp_fusion': options.enable_fp_fusion,
+            'launch_cooperative_grid': options.launch_cooperative_grid,
             'extern_libs': options.extern_libs,
             'configs': configs,
             'specialization_data': specialization_data,
@@ -544,89 +509,66 @@ class JITFunction(KernelInterface[T]):
         assert callable(hook)
         self.pre_run_hooks.append(hook)
 
-    def create_binder(self
+    def create_binder(self):
         """
         Precompute as much as possible.
         """
         from ..compiler import CompiledKernel, compile, ASTSource, make_backend
+        target = driver.active.get_current_target()
+        backend = make_backend(target)
         self.CompiledKernel = CompiledKernel
         self.compile = compile
         self.ASTSource = ASTSource
-        self.
-
-        self.constexpr_indices = [i for (i, p) in enumerate(self.params) if p.is_constexpr]
-        self.non_constexpr_indices = [i for (i, p) in enumerate(self.params) if not p.is_constexpr]
-        self.specialised_indices = [
-            i for (i, p) in enumerate(self.params) if (not p.do_not_specialize) and (not p.is_constexpr)
-        ]
+        binder = create_function_from_signature(self.signature, self.params, backend)
+        return {}, target, backend, binder
 
     def run(self, *args, grid, warmup, **kwargs):
-        kwargs["debug"] = kwargs.get("debug",
+        kwargs["debug"] = kwargs.get("debug", self.debug) or os.environ.get("TRITON_DEBUG", "0") == "1"
 
         # parse options
-        from ..compiler import make_backend
         device = driver.active.get_current_device()
         stream = driver.active.get_current_stream(device)
-        target = driver.active.get_current_target()
-        backend = make_backend(target)
 
         # Execute pre run hooks with args and kwargs
         for hook in self.pre_run_hooks:
             hook(*args, **kwargs)
 
-
-
-
-        bound_args, sig_and_spec, constexpr_vals, non_constexpr_vals, excess_kwargs = self.binder(*args, **kwargs)
+        kernel_cache, target, backend, binder = self.device_caches[device]
+        bound_args, specialization, options = binder(*args, **kwargs)
 
         # compute cache key
-        key =
-        kernel =
+        key = str(specialization) + str(options)
+        kernel = kernel_cache.get(key, None)
 
+        # Kernel is not cached; we have to compile.
         if kernel is None:
-            #
+            # options
            options = backend.parse_options(kwargs)
-
-
+            # signature
+            sigkeys = [x.name for x in self.params]
+            sigvals = [x[0] for x in specialization]
+            signature = {k: v for (k, v) in zip(sigkeys, sigvals)}
+            # check arguments
             assert "device_type" not in kwargs, "device_type option is deprecated; current target will be used"
             assert "device" not in kwargs, "device option is deprecated; current device will be used"
             assert "stream" not in kwargs, "stream option is deprecated; current stream will be used"
-            for k in
-                if k not in options.__dict__:
+            for k in kwargs:
+                if k not in options.__dict__ and k not in sigkeys:
                     raise KeyError("Keyword argument %s was specified but unrecognised" % k)
-
-
-
-            #
-
-
-
-
-            sigvals = sig_and_spec[:len(sigkeys)]
-            signature = {k: ('*i8' if (v == 'none') else v) for (k, v) in zip(sigkeys, sigvals)}
-
-            configs = (backend.get_attrs_descriptor(self.params, bound_vals), )
-            constant_params = configs[0].get_constants()
-            constants = {
-                p.name: v
-                for (v, p) in zip(bound_vals, self.params)
-                if p.is_constexpr or (p.num in constant_params) or v is None
-            }
-            for i, arg in constants.items():
-                if callable(arg):
-                    raise TypeError(f"Callable constexpr at index {i} is not supported")
-
-            if self._call_hook(key, signature, device, constants, options, configs, warmup, before=True):
+            # constexprs
+            constexprs = find_paths_if(sigvals, lambda _, val: val == "constexpr")
+            constexprs = {path: get_iterable_path(list(bound_args.values()), path) for path in constexprs}
+            # attributes
+            attrvals = [x[1] for x in specialization]
+            attrs = find_paths_if(attrvals, lambda _, x: isinstance(x, str))
+            attrs = {k: backend.parse_attr(get_iterable_path(attrvals, k)) for k in attrs}
+            if self._call_hook(key, signature, device, constexprs, options, [attrs], warmup, before=True):
                 return None
             # compile the kernel
-            src = self.ASTSource(self, signature,
-            kernel = self.compile(
-
-
-                options=options.__dict__,
-            )
-            self.cache[device][key] = kernel
-            self._call_hook(key, signature, device, constants, options, configs, warmup, before=False)
+            src = self.ASTSource(self, signature, constexprs, attrs)
+            kernel = self.compile(src, target=target, options=options.__dict__)
+            kernel_cache[key] = kernel
+            self._call_hook(key, signature, device, constexprs, options, [attrs], warmup, before=False)
 
         # Check that used global values have not changed.
         not_present = object()
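A note on the caching change in run(): the per-function `self.cache`/`self.binder` pair is replaced by a per-device state tuple built lazily on first use. A minimal standalone sketch of the pattern, with a hypothetical factory standing in for `JITFunction.create_binder`:

from collections import defaultdict

def make_device_state():
    # stand-in for create_binder(); returns (kernel_cache, target, backend, binder)
    return ({}, "target", "backend", lambda *a, **k: (a, k))

device_caches = defaultdict(make_device_state)
kernel_cache, target, backend, binder = device_caches[0]   # built on first access
kernel_cache["key"] = "compiled-kernel"
assert device_caches[0][0]["key"] == "compiled-kernel"      # same state on reuse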
@@ -639,21 +581,21 @@ class JITFunction(KernelInterface[T]):
         # canonicalize grid
         assert grid is not None
         if callable(grid):
-            # Arguments are passed as a dict to `grid`, by contract.
-            # TODO(jlebar): In the new launch API, pass the compiler flags as a
-            # second parameter to `grid`.
             grid = grid(bound_args)
         grid_size = len(grid)
         grid_0 = grid[0]
         grid_1 = grid[1] if grid_size > 1 else 1
         grid_2 = grid[2] if grid_size > 2 else 1
-
         # launch kernel
-        launch_metadata = kernel.launch_metadata(grid, stream, *
-        kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-                   self.CompiledKernel.launch_enter_hook, self.CompiledKernel.launch_exit_hook,
+        launch_metadata = kernel.launch_metadata(grid, stream, *bound_args.values())
+        kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+                   launch_metadata, self.CompiledKernel.launch_enter_hook, self.CompiledKernel.launch_exit_hook,
+                   *bound_args.values())
         return kernel
 
+    def repr(self, _):
+        return self._fn_name if self._repr is None else self._repr(_)
+
     def __init__(self, fn, version=None, do_not_specialize=None, do_not_specialize_on_alignment=None, debug=None,
                  noinline=None, repr=None, launch_metadata=None):
         do_not_specialize = do_not_specialize if do_not_specialize else []
@@ -666,11 +608,10 @@ class JITFunction(KernelInterface[T]):
         self.do_not_specialize = do_not_specialize
         self.do_not_specialize_on_alignment = do_not_specialize_on_alignment
         self.starting_line_number = inspect.getsourcelines(fn)[1]
-        self.
+        self._repr = repr
+        self._fn_name = fn.__name__
         self.launch_metadata = launch_metadata
 
-        self.binder = None
-
         self.params = []
         for i, param in enumerate(self.signature.parameters.values()):
             dns = i in do_not_specialize or param.name in do_not_specialize
@@ -678,10 +619,11 @@ class JITFunction(KernelInterface[T]):
             self.params.append(KernelParam(i, param, dns, dns_oa))
 
         # function source code (without decorators)
-
-
+        src = textwrap.dedent(inspect.getsource(fn))
+        src = src[re.search(r"^def\s+\w+\s*\(", src, re.MULTILINE).start():]
+        self._unsafe_update_src(src)
         # cache of just-in-time compiled kernels
-        self.
+        self.device_caches = defaultdict(self.create_binder)
         self.hash = None
 
         # Map of global variables used by the function and any functions it
@@ -698,6 +640,7 @@ class JITFunction(KernelInterface[T]):
         # JITFunction can be instantiated as kernel
         # when called with a grid using __getitem__
         self.kernel = None
+        self.debug = debug
         self.noinline = noinline
 
         # TODO(jlebar): Remove uses of these fields outside this file, then
@@ -729,7 +672,6 @@ class JITFunction(KernelInterface[T]):
 
     def preload(self, specialization_data):
         from ..compiler import compile, ASTSource
-        from triton.backends.compiler import AttrsDescriptor
         import json
         import triton.language as tl
         device = driver.active.get_current_device()
@@ -737,19 +679,24 @@ class JITFunction(KernelInterface[T]):
         if deserialized_obj['name'] != self.fn.__name__:
             raise RuntimeError(
                 f"Specialization data is for {deserialized_obj['name']} but trying to preload for {self.fn.__name__}")
+        constant_keys = map(tuple, deserialized_obj['constant_keys'])
+        constant_vals = deserialized_obj['constant_vals']
         constants = {
             key: tl.dtype(value) if tl.dtype.is_dtype(value) else value
-            for key, value in
+            for key, value in zip(constant_keys, constant_vals)
         }
+        attrs_keys = map(tuple, deserialized_obj['attrs_keys'])
+        attrs_vals = deserialized_obj['attrs_vals']
+        attrs = dict(zip(attrs_keys, attrs_vals))
         signature = dict(deserialized_obj['signature'].items())
-        src = ASTSource(self, signature, constants,
+        src = ASTSource(self, signature, constants, attrs)
         options = {
             key: tuple(value) if isinstance(value, list) else value
             for key, value in deserialized_obj['options'].items()
         }
         key = deserialized_obj['key']
         kernel = compile(src, None, options)
-        self.
+        self.device_caches[device][0][key] = kernel
         return kernel
 
         # we do not parse `src` in the constructor because
@@ -766,11 +713,20 @@ class JITFunction(KernelInterface[T]):
             raise RuntimeError("Cannot call @triton.jit'd outside of the scope of a kernel")
 
     def __setattr__(self, name, value):
-
-        # - when `.src` attribute is set, cache path needs
-        # to be reinitialized
+        # - when `.src` attribute is set, cache key of all callers need to be re-computed
         if name == "src":
-
+            raise AttributeError(f"Cannot set attribute '{name}' directly. "
+                                 f"Use '_unsafe_update_src()' and manually clear `.hash` of all callers"
+                                 f"instead.")
+        super(JITFunction, self).__setattr__(name, value)
+
+    def _unsafe_update_src(self, new_src):
+        """
+        The only method allowed to modify src.
+        Bypasses the __setattr__ restriction by calling super().__setattr__ directly.
+        """
+        self.hash = None
+        super().__setattr__('src', new_src)
 
     def __repr__(self):
         return f"JITFunction({self.module}:{self.fn.__name__})"
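The `__setattr__`/`_unsafe_update_src` pair implements a simple attribute-guard pattern: direct writes to `.src` are rejected, and the sanctioned update path also invalidates the cached hash. A minimal standalone sketch of the same pattern:

class Guarded:
    def __init__(self, src):
        self.hash = None
        self._unsafe_update_src(src)

    def __setattr__(self, name, value):
        if name == "src":
            raise AttributeError("use _unsafe_update_src() instead")
        super().__setattr__(name, value)

    def _unsafe_update_src(self, new_src):
        self.hash = None                    # stale hash must be recomputed
        super().__setattr__('src', new_src)  # bypass the guard deliberately

g = Guarded("def k(): pass")
try:
    g.src = "oops"                          # blocked by the guard
except AttributeError:
    pass
g._unsafe_update_src("def k2(): pass")
assert g.src == "def k2(): pass" and g.hash is None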
@@ -896,8 +852,8 @@ class TensorWrapper:
     def data_ptr(self):
         return self.base.data_ptr()
 
-    def stride(self,
-        return self.base.stride(
+    def stride(self, *args):
+        return self.base.stride(*args)
 
     def __str__(self) -> str:
         return f"TensorWrapper[{self.dtype}]({self.base})"
@@ -917,6 +873,9 @@ class TensorWrapper:
     def to(self, device):
         return TensorWrapper(self.base.to(device), self.dtype)
 
+    def new_empty(self, sizes):
+        return TensorWrapper(self.base.new_empty(sizes), self.dtype)
+
 
 def reinterpret(tensor, dtype):
     if isinstance(tensor, TensorWrapper):
|
triton/testing.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import functools
|
|
2
|
+
import math
|
|
2
3
|
import os
|
|
4
|
+
import statistics
|
|
3
5
|
import subprocess
|
|
4
6
|
import sys
|
|
5
7
|
from contextlib import contextmanager
|
|
@@ -17,16 +19,42 @@ def nvsmi(attrs):
|
|
|
17
19
|
return ret
|
|
18
20
|
|
|
19
21
|
|
|
22
|
+
# pure Python implementation of np.quantile/torch.quantile
|
|
23
|
+
# to avoid unnecessary runtime dependency on numpy/torch
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _quantile(a, q):
|
|
27
|
+
n = len(a)
|
|
28
|
+
a = sorted(a)
|
|
29
|
+
|
|
30
|
+
def get_quantile(q):
|
|
31
|
+
if not (0 <= q <= 1):
|
|
32
|
+
raise ValueError("Quantiles must be in the range [0, 1]")
|
|
33
|
+
point = q * (n - 1)
|
|
34
|
+
lower = math.floor(point)
|
|
35
|
+
upper = math.ceil(point)
|
|
36
|
+
t = point - lower
|
|
37
|
+
return (1 - t) * a[lower] + t * a[upper]
|
|
38
|
+
|
|
39
|
+
return [get_quantile(q) for q in q]
|
|
40
|
+
|
|
41
|
+
|
|
20
42
|
def _summarize_statistics(times, quantiles, return_mode):
|
|
21
|
-
import torch
|
|
22
43
|
if quantiles is not None:
|
|
23
|
-
ret =
|
|
44
|
+
ret = _quantile(times, quantiles)
|
|
24
45
|
if len(ret) == 1:
|
|
25
46
|
ret = ret[0]
|
|
26
47
|
return ret
|
|
27
48
|
if return_mode == "all":
|
|
28
|
-
return times
|
|
29
|
-
|
|
49
|
+
return times
|
|
50
|
+
elif return_mode == "min":
|
|
51
|
+
return min(times)
|
|
52
|
+
elif return_mode == "max":
|
|
53
|
+
return max(times)
|
|
54
|
+
elif return_mode == "mean":
|
|
55
|
+
return statistics.mean(times)
|
|
56
|
+
elif return_mode == "median":
|
|
57
|
+
return statistics.median(times)
|
|
30
58
|
|
|
31
59
|
|
|
32
60
|
def do_bench_cudagraph(fn, rep=20, grad_to_none=None, quantiles=None, return_mode="mean"):
|
|
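The new `_quantile` helper uses the same linear-interpolation rule as np.quantile's default: quantile q sits at fractional position q*(n-1) in the sorted data. A worked example (the helper is restated verbatim so the check runs standalone):

import math

def _quantile(a, q):
    n = len(a)
    a = sorted(a)

    def get_quantile(q):
        if not (0 <= q <= 1):
            raise ValueError("Quantiles must be in the range [0, 1]")
        point = q * (n - 1)
        lower = math.floor(point)
        upper = math.ceil(point)
        t = point - lower
        return (1 - t) * a[lower] + t * a[upper]

    return [get_quantile(q) for q in q]

# With sorted data [10, 20, 30, 40]: q=0.5 -> position 1.5, halfway between 20 and 30.
assert _quantile([40, 10, 30, 20], [0.0, 0.5, 1.0]) == [10.0, 25.0, 40.0]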
@@ -39,7 +67,7 @@ def do_bench_cudagraph(fn, rep=20, grad_to_none=None, quantiles=None, return_mode="mean"):
     :type rep: int
     :param grad_to_none: Reset the gradient of the provided tensor to None
     :type grad_to_none: torch.tensor, optional
-    :param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all" Default is "mean".
+    :param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all". Default is "mean".
     :type return_mode: str
     """
     import torch
@@ -89,7 +117,7 @@ def do_bench_cudagraph(fn, rep=20, grad_to_none=None, quantiles=None, return_mode="mean"):
             end_event.record()
             torch.cuda.synchronize()
             ret += [start_event.elapsed_time(end_event) / n_repeat]
-        return _summarize_statistics(
+        return _summarize_statistics(ret, quantiles, return_mode)
 
 
 def do_bench(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, return_mode="mean"):
@@ -107,10 +135,10 @@ def do_bench(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, return_mode="mean"):
     :type grad_to_none: torch.tensor, optional
     :param quantiles: Performance percentile to return in addition to the median.
     :type quantiles: list[float], optional
-    :param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all" Default is "mean".
+    :param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all". Default is "mean".
+    :type return_mode: str
     """
     assert return_mode in ["min", "max", "mean", "median", "all"]
-    import torch
 
     di = runtime.driver.active.get_device_interface()
 
@@ -124,7 +152,7 @@ def do_bench(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, return_mode="mean"):
     end_event = di.Event(enable_timing=True)
     start_event.record()
     for _ in range(5):
-
+        runtime.driver.active.clear_cache(cache)
         fn()
     end_event.record()
     di.synchronize()
@@ -147,14 +175,14 @@ def do_bench(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, return_mode="mean"):
         for x in grad_to_none:
             x.grad = None
         # we clear the L2 cache before each run
-
+        runtime.driver.active.clear_cache(cache)
         # record time of `fn`
         start_event[i].record()
         fn()
         end_event[i].record()
     # Record clocks
     di.synchronize()
-    times =
+    times = [s.elapsed_time(e) for s, e in zip(start_event, end_event)]
    return _summarize_statistics(times, quantiles, return_mode)
 
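Taken together, the testing.py changes drop torch from the summary path: `_summarize_statistics` now dispatches to the stdlib, which is why `import math` and `import statistics` were added at the top of the file. A quick standalone check of the stdlib calls the new branches rely on:

import statistics

times = [1.0, 2.0, 3.0, 4.0]
assert min(times) == 1.0                    # return_mode="min"
assert max(times) == 4.0                    # return_mode="max"
assert statistics.mean(times) == 2.5        # return_mode="mean"
assert statistics.median(times) == 2.5      # return_mode="median"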