PyPI - triton-windows - Versions diffs - 3.3.1.post21__cp313-cp313-win_amd64.whl → 3.4.0.post21__cp313-cp313-win_amd64.whl - Mend

triton-windows 3.3.1.post21__cp313-cp313-win_amd64.whl → 3.4.0.post21__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of triton-windows might be problematic. Click here for more details.

Files changed (68) hide show

triton/_C/libtriton.pyd +0 -0
triton/__init__.py +4 -1
triton/_filecheck.py +87 -0
triton/_internal_testing.py +26 -15
triton/_utils.py +110 -21
triton/backends/__init__.py +20 -23
triton/backends/amd/__init__.py +0 -0
triton/backends/amd/compiler.py +112 -78
triton/backends/amd/driver.c +5 -2
triton/backends/amd/driver.py +143 -46
triton/backends/compiler.py +7 -21
triton/backends/nvidia/bin/ptxas.exe +0 -0
triton/backends/nvidia/compiler.py +94 -94
triton/backends/nvidia/driver.c +90 -98
triton/backends/nvidia/driver.py +296 -125
triton/compiler/code_generator.py +212 -111
triton/compiler/compiler.py +110 -25
triton/experimental/__init__.py +0 -0
triton/experimental/gluon/__init__.py +4 -0
triton/experimental/gluon/_compiler.py +0 -0
triton/experimental/gluon/_runtime.py +99 -0
triton/experimental/gluon/language/__init__.py +18 -0
triton/experimental/gluon/language/_core.py +312 -0
triton/experimental/gluon/language/_layouts.py +230 -0
triton/experimental/gluon/language/_math.py +12 -0
triton/experimental/gluon/language/_semantic.py +287 -0
triton/experimental/gluon/language/_standard.py +47 -0
triton/experimental/gluon/language/nvidia/__init__.py +4 -0
triton/experimental/gluon/language/nvidia/blackwell/__init__.py +202 -0
triton/experimental/gluon/language/nvidia/blackwell/tma.py +32 -0
triton/experimental/gluon/language/nvidia/hopper/__init__.py +11 -0
triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +51 -0
triton/experimental/gluon/language/nvidia/hopper/tma.py +96 -0
triton/experimental/gluon/nvidia/__init__.py +4 -0
triton/experimental/gluon/nvidia/blackwell.py +3 -0
triton/experimental/gluon/nvidia/hopper.py +40 -0
triton/knobs.py +481 -0
triton/language/__init__.py +39 -14
triton/language/core.py +794 -537
triton/language/extra/cuda/__init__.py +10 -7
triton/language/extra/cuda/gdc.py +42 -0
triton/language/extra/cuda/libdevice.py +394 -394
triton/language/extra/cuda/utils.py +21 -21
triton/language/extra/hip/libdevice.py +113 -104
triton/language/math.py +65 -66
triton/language/random.py +12 -2
triton/language/semantic.py +1706 -1770
triton/language/standard.py +116 -51
triton/runtime/autotuner.py +117 -59
triton/runtime/build.py +73 -9
triton/runtime/cache.py +18 -47
triton/runtime/driver.py +32 -29
triton/runtime/interpreter.py +72 -35
triton/runtime/jit.py +146 -110
triton/testing.py +16 -12
triton/tools/disasm.py +3 -4
triton/tools/tensor_descriptor.py +36 -0
triton/windows_utils.py +47 -83
{triton_windows-3.3.1.post21.dist-info → triton_windows-3.4.0.post21.dist-info}/METADATA +7 -2
{triton_windows-3.3.1.post21.dist-info → triton_windows-3.4.0.post21.dist-info}/RECORD +64 -41
triton_windows-3.4.0.post21.dist-info/entry_points.txt +3 -0
triton_windows-3.4.0.post21.dist-info/licenses/LICENSE +23 -0
triton_windows-3.4.0.post21.dist-info/top_level.txt +1 -0
triton/language/_utils.py +0 -21
triton/language/extra/cuda/_experimental_tma.py +0 -106
triton/tools/experimental_descriptor.py +0 -32
triton_windows-3.3.1.post21.dist-info/top_level.txt +0 -14
{triton_windows-3.3.1.post21.dist-info → triton_windows-3.4.0.post21.dist-info}/WHEEL +0 -0

triton/compiler/compiler.py CHANGED Viewed

@@ -3,19 +3,19 @@ import hashlib
 import json
 from .._C.libtriton import get_cache_invalidating_env_vars, ir
 from ..backends import backends
-from ..backends.compiler import GPUTarget
-from .. import __version__
+from ..backends.compiler import Language
+from ..backends.compiler import BaseBackend, GPUTarget
+from .. import __version__, knobs
 from ..runtime.autotuner import OutOfResources
 from ..runtime.cache import get_cache_manager, get_dump_manager, get_override_manager
 from ..runtime.driver import driver
 from ..tools.disasm import get_sass
-# TODO: this shouldn't be here
-from .code_generator import ast_to_ttir
 from pathlib import Path
 import re
 import functools
 import os
 import sysconfig
+import time
 # - ^\s*tt\.func\s+ : match the start of the string, any leading whitespace, the keyword func,
 #    and any following whitespace
@@ -53,6 +53,7 @@ class ASTSource:
     def __init__(self, fn, signature, constexprs=None, attrs=None) -> None:
         self.fn = fn
+        self.language = Language.TRITON
         self.ext = "ttir"
         self.name = fn.__name__
         self.signature = signature
@@ -78,6 +79,7 @@ class ASTSource:
         return hashlib.sha256(key.encode("utf-8")).hexdigest()
     def make_ir(self, options, codegen_fns, module_map, context):
+        from .code_generator import ast_to_ttir
         return ast_to_ttir(self.fn, self, context=context, options=options, codegen_fns=codegen_fns,
                            module_map=module_map)
@@ -91,6 +93,7 @@ class IRSource:
         self.path = path
         path = Path(path)
         self.ext = path.suffix[1:]
+        self.language = Language.TRITON
         self.src = path.read_text()
         ir.load_dialects(context)
         backend.load_dialects(context)
@@ -162,6 +165,11 @@ def triton_key():
     return f'{__version__}' + '-'.join(contents)
+@functools.lru_cache()
+def max_shared_mem(device):
+    return driver.active.utils.get_device_properties(device)["max_shared_mem"]
 def parse(full_name, ext, context):
     if ext == "ttir" or ext == "ttgir":
         module = ir.parse_mlir_module(full_name, context)
@@ -179,7 +187,7 @@ def filter_traceback(e: BaseException):
     These are uninteresting to the user -- "just show me *my* code!"
     """
-    if os.getenv("TRITON_FRONT_END_DEBUGGING", "0") == "1":
+    if knobs.compilation.front_end_debugging:
         return
     if e.__cause__ is not None:
@@ -211,7 +219,50 @@ def filter_traceback(e: BaseException):
         e.__traceback__ = frames[0]
+class CompileTimer:
+    def __init__(self) -> None:
+        self.start: float = time.perf_counter()
+        self.ir_initialization_end: float | None = None
+        self.lowering_stage_ends: list[tuple[str, float]] = []
+        self.store_results_end: float | None = None
+    def finished_ir_initialization(self) -> None:
+        self.ir_initialization_end = time.perf_counter()
+    def stage_finished(self, stage_name: str) -> None:
+        self.lowering_stage_ends.append((stage_name, time.perf_counter()))
+    def end(self) -> knobs.CompileTimes:
+        timestamp = time.perf_counter()
+        if self.ir_initialization_end is None:
+            self.ir_initialization_end = timestamp
+        else:
+            self.store_results_end = timestamp
+        def delta(start: float, end: float | None) -> int:
+            if end is None:
+                return 0
+            return int((end - start) * 1000000)
+        lowering_stage_durations = []
+        stage_start = self.ir_initialization_end
+        for stage_name, stage_end in self.lowering_stage_ends:
+            lowering_stage_durations.append((stage_name, delta(stage_start, stage_end)))
+            stage_start = stage_end
+        return knobs.CompileTimes(
+            ir_initialization=delta(self.start, self.ir_initialization_end),
+            lowering_stages=lowering_stage_durations,
+            store_results=delta(stage_start, self.store_results_end),
+        )
 def compile(src, target=None, options=None):
+    compilation_listener = knobs.compilation.listener
+    if compilation_listener:
+        timer = CompileTimer()
     if target is None:
         target = driver.active.get_current_target()
     assert isinstance(target, GPUTarget), "target must be of GPUTarget type"
@@ -232,9 +283,9 @@ def compile(src, target=None, options=None):
     fn_cache_manager = get_cache_manager(hash)
     # For dumping/overriding only hash the source as we want it to be independent of triton
     # core changes to make it easier to track kernels by hash.
-    enable_override = os.environ.get("TRITON_KERNEL_OVERRIDE", "0") == "1"
-    enable_ir_dump = os.environ.get("TRITON_KERNEL_DUMP", "0") == "1"
-    store_only_binary = os.environ.get("TRITON_STORE_BINARY_ONLY", "0") == "1"
+    enable_override = knobs.compilation.override
+    enable_ir_dump = knobs.compilation.dump_ir
+    store_only_binary = knobs.compilation.store_binary_only
     fn_override_manager = get_override_manager(src.hash()) if enable_override else None
     fn_dump_manager = get_dump_manager(src.hash()) if enable_ir_dump else None
     # Pre-truncate the file name here to avoid hitting the 255 character limit on common platforms.
@@ -245,10 +296,20 @@ def compile(src, target=None, options=None):
     metadata_filename = f"{file_name}.json"
     metadata_group = fn_cache_manager.get_group(metadata_filename) or {}
     metadata_path = metadata_group.get(metadata_filename)
-    always_compile = os.environ.get("TRITON_ALWAYS_COMPILE", "0") == "1"
+    always_compile = knobs.compilation.always_compile
     if not always_compile and metadata_path is not None:
         # cache hit!
-        return CompiledKernel(src, metadata_group, hash)
+        res = CompiledKernel(src, metadata_group, hash)
+        if compilation_listener:
+            compilation_listener(
+                src=src,
+                metadata=res.metadata._asdict(),
+                metadata_group=metadata_group,
+                times=timer.end(),
+                cache_hit=True,
+            )
+        return res
     # initialize metadata
     metadata = {
         "hash": hash,
@@ -259,7 +320,7 @@ def compile(src, target=None, options=None):
     metadata["triton_version"] = __version__
     # run compilation pipeline  and populate metadata
     stages = dict()
-    backend.add_stages(stages, options)
+    backend.add_stages(stages, options, src.language)
     first_stage = list(stages.keys()).index(src.ext)
     # when the source is an IR file, don't apply the passes related to this stage. This makes it easier to write IR level tests.
     if ir_source:
@@ -279,11 +340,30 @@ def compile(src, target=None, options=None):
     except Exception as e:
         filter_traceback(e)
         raise
-    use_ir_loc = os.environ.get("USE_IR_LOC", None)
+    if ir_source:
+        ir_filename = f"{file_name}.{src.ext}"
+        metadata_group[ir_filename] = fn_cache_manager.put(module, ir_filename)
+    else:
+        ir_filename = f"{file_name}.source"
+        metadata_group[ir_filename] = fn_cache_manager.put(module, ir_filename)
+    use_ir_loc = knobs.compilation.use_ir_loc
+    if ir_source and use_ir_loc:
+        module.create_location_snapshot(src.path)
+        print(f"Creating new locations for {src.path}")
+    if compilation_listener:
+        timer.finished_ir_initialization()
     for ext, compile_ir in list(stages.items())[first_stage:]:
         next_module = compile_ir(module, metadata)
         ir_filename = f"{file_name}.{ext}"
-        if (fn_override_manager is not None and (full_name := fn_override_manager.get_file(ir_filename)) is not None):
+        if fn_override_manager is None:
+            # Users can override kernels at scale by setting `ir_override` in autotune config
+            # without TRITON_KERNEL_OVERRIDE
+            if (ir_override := metadata.get("ir_override", None)) and ir_override.endswith(f".{ext}"):
+                next_module = parse(ir_override, ext, context)
+        elif full_name := fn_override_manager.get_file(ir_filename):
             print(f"\nOverriding kernel with file {full_name}")
             next_module = parse(full_name, ext, context)
         # If TRITON_STORE_BINARY_ONLY is 1, only store cubin/hsaco/json
@@ -297,6 +377,8 @@ def compile(src, target=None, options=None):
             next_module.create_location_snapshot(ir_full_name)
             print(f"Creating new locations for {ir_full_name}")
         module = next_module
+        if compilation_listener:
+            timer.stage_finished(ext)
     # write-back metadata
     metadata_group[metadata_filename] = fn_cache_manager.put(json.dumps(metadata, default=vars), metadata_filename,
                                                              binary=False)
@@ -310,13 +392,18 @@ def compile(src, target=None, options=None):
     # this is likely due to the llvm-symbolizer forking a process
     # TODO: Reconcile the difference here between the ASAN and non-ASAN path with enabling
     # multithreading in the MLIR context
-    if not os.environ.get("TRITON_ENABLE_ASAN", "0") == "1":
+    if not knobs.compilation.enable_asan:
         context.disable_multithreading()
+    # notify any listener
+    if compilation_listener:
+        compilation_listener(src=src, metadata=metadata, metadata_group=metadata_group, times=timer.end(),
+                             cache_hit=False)
     # return handle to compiled kernel
     return CompiledKernel(src, metadata_group, hash)
-def make_backend(target):
+def make_backend(target: GPUTarget) -> BaseBackend:
     actives = [x.compiler for x in backends.values() if x.compiler.supports_target(target)]
     if len(actives) != 1:
         raise RuntimeError(
@@ -330,7 +417,7 @@ class LazyDict:
         self.data = data
         self.extras = []
-    def get(self) -> None:
+    def get(self):
         for func, args in self.extras:
             self.data = self.data | func(*args)
         self.extras.clear()
@@ -355,11 +442,6 @@ class AsmDict(dict):
 class CompiledKernel:
-    # Hooks for external tools to monitor the execution of triton kernels
-    # TODO: move out of this namespace since it's a runtime thing
-    launch_enter_hook = None
-    launch_exit_hook = None
     def __init__(self, src, metadata_group, hash):
         from collections import namedtuple
         metadata_path = next((Path(p) for c, p in metadata_group.items() if c.endswith(".json")))
@@ -396,7 +478,7 @@ class CompiledKernel:
         # create launcher
         self.run = driver.active.launcher_cls(self.src, self.metadata)
         # not enough shared memory to run the kernel
-        max_shared = driver.active.utils.get_device_properties(device)["max_shared_mem"]
+        max_shared = max_shared_mem(device)
         if self.metadata.shared > max_shared:
             raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
         if hasattr(self.metadata, "tmem_size") and self.metadata.tmem_size is not None:
@@ -405,8 +487,11 @@ class CompiledKernel:
             if self.metadata.tmem_size > max_tmem_size:
                 raise OutOfResources(self.metadata.tmem_size, max_tmem_size, "tensor memory")
         # TODO: n_regs, n_spills should be metadata generated when calling `ptxas`
-        self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+        self.module, self.function, self.n_regs, self.n_spills, self.n_max_threads = driver.active.utils.load_binary(
             self.name, self.kernel, self.metadata.shared, device)
+        warp_size = driver.active.get_current_target().warp_size
+        if self.metadata.num_warps * warp_size > self.n_max_threads:
+            raise OutOfResources(self.metadata.num_warps * warp_size, self.n_max_threads, "threads")
     def __getattribute__(self, name):
         if name == 'run':
@@ -414,7 +499,7 @@ class CompiledKernel:
         return super().__getattribute__(name)
     def launch_metadata(self, grid, stream, *args):
-        if CompiledKernel.launch_enter_hook is None:
+        if knobs.runtime.launch_enter_hook is None:
             return None
         ret = LazyDict({"name": self.name, "function": self.function, "stream": stream})
         if not isinstance(self.src, ASTSource) or self.src.fn.launch_metadata is None:
@@ -436,6 +521,6 @@ class CompiledKernel:
                 stream = driver.active.get_current_stream(device)
             launch_metadata = self.launch_metadata(grid, stream, *args)
             self.run(grid[0], grid[1], grid[2], stream, self.function, self.packed_metadata, launch_metadata,
-                     CompiledKernel.launch_enter_hook, CompiledKernel.launch_exit_hook, *args)
+                     knobs.runtime.launch_enter_hook, knobs.runtime.launch_exit_hook, *args)
         return runner

triton/experimental/__init__.py ADDED Viewed

File without changes

triton/experimental/gluon/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from . import nvidia
+from ._runtime import jit
+__all__ = ["jit", "nvidia"]

triton/experimental/gluon/_compiler.py ADDED Viewed

File without changes

triton/experimental/gluon/_runtime.py ADDED Viewed

@@ -0,0 +1,99 @@
+from __future__ import annotations
+import triton
+from triton.compiler.compiler import ASTSource
+from triton.backends.compiler import Language
+from triton.runtime.jit import JITFunction
+from typing import TypeVar, Optional, Callable, Iterable, Union
+from triton._C.libtriton import ir
+T = TypeVar("T")
+class GluonASTSource(ASTSource):
+    def __init__(self, fn, signature, constexprs=None, attrs=None) -> None:
+        super().__init__(fn, signature, constexprs, attrs)
+        self.language = Language.GLUON
+        self.ext = "ttgir"
+    def make_ir(self, options, codegen_fns, module_map, context):
+        from triton.compiler.compiler import make_backend
+        from triton.compiler.code_generator import ast_to_ttir
+        builder = ir.builder(context)
+        module = builder.create_module()
+        # Assign module attributes eagerly, as they are needed to verify layouts
+        target = triton.runtime.driver.active.get_current_target()
+        backend = make_backend(target)
+        target = backend.get_target_name(options)
+        module.set_attr("ttg.target", builder.get_string_attr(target))
+        module.set_attr("ttg.num-warps", builder.get_int32_attr(options.num_warps))
+        module.set_attr("ttg.num-ctas", builder.get_int32_attr(options.num_ctas))
+        module.set_attr("ttg.threads-per-warp", builder.get_int32_attr(32))
+        if options.maxnreg is not None:
+            module.set_attr("ttg.maxnreg", builder.get_int32_attr(options.maxnreg))
+        module = ast_to_ttir(self.fn, self, context=context, options=options, codegen_fns=codegen_fns,
+                             module_map=module_map, module=module)
+        return module
+class GluonJITFunction(JITFunction[T]):
+    def create_binder(self):
+        result = super().create_binder()
+        self.ASTSource = GluonASTSource
+        return result
+    def is_gluon(self):
+        return True
+def jit(
+    fn: Optional[T] = None,
+    *,
+    version=None,
+    repr: Optional[Callable] = None,
+    launch_metadata: Optional[Callable] = None,
+    do_not_specialize: Optional[Iterable[int | str]] = None,
+    do_not_specialize_on_alignment: Optional[Iterable[int | str]] = None,
+    debug: Optional[bool] = None,
+    noinline: Optional[bool] = None,
+) -> Union[GluonJITFunction[T], Callable[[T], JITFunction[T]]]:
+    """
+    Decorator for JIT-compiling a function using the Triton compiler.
+    :note: When a jit'd function is called, arguments are
+        implicitly converted to pointers if they have a :code:`.data_ptr()` method
+        and a `.dtype` attribute.
+    :note: This function will be compiled and run on the GPU. It will only have access to:
+           * python primitives,
+           * builtins within the triton package,
+           * arguments to this function,
+           * other jit'd functions
+    :param fn: the function to be jit-compiled
+    :type fn: Callable
+    """
+    def decorator(fn: T) -> JITFunction[T]:
+        assert callable(fn)
+        return GluonJITFunction(
+            fn,
+            version=version,
+            do_not_specialize=do_not_specialize,
+            do_not_specialize_on_alignment=do_not_specialize_on_alignment,
+            debug=debug,
+            noinline=noinline,
+            repr=repr,
+            launch_metadata=launch_metadata,
+        )
+    if fn is not None:
+        return decorator(fn)
+    else:
+        return decorator

triton/experimental/gluon/language/__init__.py ADDED Viewed

@@ -0,0 +1,18 @@
+from ._core import *  # NOQA: F403
+from ._core import __all__ as __core_all
+from ._layouts import *  # NOQA: F403
+from ._layouts import __all__ as __layouts_all
+from ._math import *  # NOQA: F403
+from ._math import __all__ as __math_all
+from ._standard import *  # NOQA: F403
+from ._standard import __all__ as __standard_all
+from . import nvidia
+__all__ = [
+    *__core_all,
+    *__layouts_all,
+    *__math_all,
+    *__standard_all,
+    "nvidia",
+]