triton-windows 3.5.0.post21__cp314-cp314-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of triton-windows might be problematic. Click here for more details.

Files changed (217) hide show
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +82 -0
  3. triton/_filecheck.py +97 -0
  4. triton/_internal_testing.py +255 -0
  5. triton/_utils.py +126 -0
  6. triton/backends/__init__.py +47 -0
  7. triton/backends/amd/__init__.py +0 -0
  8. triton/backends/amd/compiler.py +461 -0
  9. triton/backends/amd/driver.c +283 -0
  10. triton/backends/amd/driver.py +724 -0
  11. triton/backends/amd/lib/asanrtl.bc +0 -0
  12. triton/backends/amd/lib/ockl.bc +0 -0
  13. triton/backends/amd/lib/ocml.bc +0 -0
  14. triton/backends/compiler.py +90 -0
  15. triton/backends/driver.py +66 -0
  16. triton/backends/nvidia/__init__.py +0 -0
  17. triton/backends/nvidia/bin/ptxas.exe +0 -0
  18. triton/backends/nvidia/compiler.py +533 -0
  19. triton/backends/nvidia/driver.c +517 -0
  20. triton/backends/nvidia/driver.py +799 -0
  21. triton/backends/nvidia/include/cuda.h +26280 -0
  22. triton/backends/nvidia/lib/libdevice.10.bc +0 -0
  23. triton/backends/nvidia/lib/x64/cuda.lib +0 -0
  24. triton/compiler/__init__.py +7 -0
  25. triton/compiler/code_generator.py +1614 -0
  26. triton/compiler/compiler.py +509 -0
  27. triton/compiler/errors.py +51 -0
  28. triton/compiler/make_launcher.py +0 -0
  29. triton/errors.py +5 -0
  30. triton/experimental/__init__.py +0 -0
  31. triton/experimental/gluon/__init__.py +5 -0
  32. triton/experimental/gluon/_compiler.py +0 -0
  33. triton/experimental/gluon/_runtime.py +102 -0
  34. triton/experimental/gluon/language/__init__.py +119 -0
  35. triton/experimental/gluon/language/_core.py +490 -0
  36. triton/experimental/gluon/language/_layouts.py +583 -0
  37. triton/experimental/gluon/language/_math.py +20 -0
  38. triton/experimental/gluon/language/_semantic.py +380 -0
  39. triton/experimental/gluon/language/_standard.py +80 -0
  40. triton/experimental/gluon/language/amd/__init__.py +4 -0
  41. triton/experimental/gluon/language/amd/_layouts.py +96 -0
  42. triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
  43. triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
  44. triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
  45. triton/experimental/gluon/language/extra/__init__.py +3 -0
  46. triton/experimental/gluon/language/nvidia/__init__.py +4 -0
  47. triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
  48. triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
  49. triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
  50. triton/experimental/gluon/language/nvidia/blackwell/__init__.py +387 -0
  51. triton/experimental/gluon/language/nvidia/blackwell/tma.py +52 -0
  52. triton/experimental/gluon/language/nvidia/hopper/__init__.py +132 -0
  53. triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +34 -0
  54. triton/experimental/gluon/language/nvidia/hopper/tma.py +97 -0
  55. triton/experimental/gluon/nvidia/__init__.py +4 -0
  56. triton/experimental/gluon/nvidia/blackwell.py +3 -0
  57. triton/experimental/gluon/nvidia/hopper.py +45 -0
  58. triton/knobs.py +546 -0
  59. triton/language/__init__.py +342 -0
  60. triton/language/core.py +3405 -0
  61. triton/language/extra/__init__.py +26 -0
  62. triton/language/extra/cuda/__init__.py +16 -0
  63. triton/language/extra/cuda/gdc.py +42 -0
  64. triton/language/extra/cuda/libdevice.py +1629 -0
  65. triton/language/extra/cuda/utils.py +109 -0
  66. triton/language/extra/hip/__init__.py +5 -0
  67. triton/language/extra/hip/libdevice.py +491 -0
  68. triton/language/extra/hip/utils.py +35 -0
  69. triton/language/extra/libdevice.py +790 -0
  70. triton/language/math.py +249 -0
  71. triton/language/random.py +218 -0
  72. triton/language/semantic.py +1939 -0
  73. triton/language/standard.py +534 -0
  74. triton/language/target_info.py +54 -0
  75. triton/runtime/__init__.py +23 -0
  76. triton/runtime/_allocation.py +44 -0
  77. triton/runtime/_async_compile.py +55 -0
  78. triton/runtime/autotuner.py +476 -0
  79. triton/runtime/build.py +168 -0
  80. triton/runtime/cache.py +317 -0
  81. triton/runtime/driver.py +38 -0
  82. triton/runtime/errors.py +36 -0
  83. triton/runtime/interpreter.py +1414 -0
  84. triton/runtime/jit.py +1107 -0
  85. triton/runtime/tcc/include/_mingw.h +168 -0
  86. triton/runtime/tcc/include/assert.h +62 -0
  87. triton/runtime/tcc/include/conio.h +409 -0
  88. triton/runtime/tcc/include/ctype.h +281 -0
  89. triton/runtime/tcc/include/dir.h +31 -0
  90. triton/runtime/tcc/include/direct.h +68 -0
  91. triton/runtime/tcc/include/dirent.h +135 -0
  92. triton/runtime/tcc/include/dos.h +55 -0
  93. triton/runtime/tcc/include/errno.h +75 -0
  94. triton/runtime/tcc/include/excpt.h +123 -0
  95. triton/runtime/tcc/include/fcntl.h +52 -0
  96. triton/runtime/tcc/include/fenv.h +108 -0
  97. triton/runtime/tcc/include/float.h +75 -0
  98. triton/runtime/tcc/include/inttypes.h +297 -0
  99. triton/runtime/tcc/include/io.h +418 -0
  100. triton/runtime/tcc/include/iso646.h +36 -0
  101. triton/runtime/tcc/include/limits.h +116 -0
  102. triton/runtime/tcc/include/locale.h +91 -0
  103. triton/runtime/tcc/include/malloc.h +181 -0
  104. triton/runtime/tcc/include/math.h +497 -0
  105. triton/runtime/tcc/include/mem.h +13 -0
  106. triton/runtime/tcc/include/memory.h +40 -0
  107. triton/runtime/tcc/include/process.h +176 -0
  108. triton/runtime/tcc/include/sec_api/conio_s.h +42 -0
  109. triton/runtime/tcc/include/sec_api/crtdbg_s.h +19 -0
  110. triton/runtime/tcc/include/sec_api/io_s.h +33 -0
  111. triton/runtime/tcc/include/sec_api/mbstring_s.h +52 -0
  112. triton/runtime/tcc/include/sec_api/search_s.h +25 -0
  113. triton/runtime/tcc/include/sec_api/stdio_s.h +145 -0
  114. triton/runtime/tcc/include/sec_api/stdlib_s.h +67 -0
  115. triton/runtime/tcc/include/sec_api/stralign_s.h +30 -0
  116. triton/runtime/tcc/include/sec_api/string_s.h +41 -0
  117. triton/runtime/tcc/include/sec_api/sys/timeb_s.h +34 -0
  118. triton/runtime/tcc/include/sec_api/tchar_s.h +266 -0
  119. triton/runtime/tcc/include/sec_api/time_s.h +61 -0
  120. triton/runtime/tcc/include/sec_api/wchar_s.h +128 -0
  121. triton/runtime/tcc/include/setjmp.h +160 -0
  122. triton/runtime/tcc/include/share.h +28 -0
  123. triton/runtime/tcc/include/signal.h +63 -0
  124. triton/runtime/tcc/include/stdalign.h +16 -0
  125. triton/runtime/tcc/include/stdarg.h +14 -0
  126. triton/runtime/tcc/include/stdatomic.h +171 -0
  127. triton/runtime/tcc/include/stdbool.h +11 -0
  128. triton/runtime/tcc/include/stddef.h +42 -0
  129. triton/runtime/tcc/include/stdint.h +212 -0
  130. triton/runtime/tcc/include/stdio.h +429 -0
  131. triton/runtime/tcc/include/stdlib.h +591 -0
  132. triton/runtime/tcc/include/stdnoreturn.h +7 -0
  133. triton/runtime/tcc/include/string.h +164 -0
  134. triton/runtime/tcc/include/sys/fcntl.h +13 -0
  135. triton/runtime/tcc/include/sys/file.h +14 -0
  136. triton/runtime/tcc/include/sys/locking.h +30 -0
  137. triton/runtime/tcc/include/sys/stat.h +290 -0
  138. triton/runtime/tcc/include/sys/time.h +69 -0
  139. triton/runtime/tcc/include/sys/timeb.h +133 -0
  140. triton/runtime/tcc/include/sys/types.h +123 -0
  141. triton/runtime/tcc/include/sys/unistd.h +14 -0
  142. triton/runtime/tcc/include/sys/utime.h +146 -0
  143. triton/runtime/tcc/include/tcc/tcc_libm.h +618 -0
  144. triton/runtime/tcc/include/tccdefs.h +342 -0
  145. triton/runtime/tcc/include/tcclib.h +80 -0
  146. triton/runtime/tcc/include/tchar.h +1102 -0
  147. triton/runtime/tcc/include/tgmath.h +89 -0
  148. triton/runtime/tcc/include/time.h +287 -0
  149. triton/runtime/tcc/include/uchar.h +33 -0
  150. triton/runtime/tcc/include/unistd.h +1 -0
  151. triton/runtime/tcc/include/vadefs.h +11 -0
  152. triton/runtime/tcc/include/values.h +4 -0
  153. triton/runtime/tcc/include/varargs.h +12 -0
  154. triton/runtime/tcc/include/wchar.h +873 -0
  155. triton/runtime/tcc/include/wctype.h +172 -0
  156. triton/runtime/tcc/include/winapi/basetsd.h +149 -0
  157. triton/runtime/tcc/include/winapi/basetyps.h +85 -0
  158. triton/runtime/tcc/include/winapi/guiddef.h +156 -0
  159. triton/runtime/tcc/include/winapi/poppack.h +8 -0
  160. triton/runtime/tcc/include/winapi/pshpack1.h +8 -0
  161. triton/runtime/tcc/include/winapi/pshpack2.h +8 -0
  162. triton/runtime/tcc/include/winapi/pshpack4.h +8 -0
  163. triton/runtime/tcc/include/winapi/pshpack8.h +8 -0
  164. triton/runtime/tcc/include/winapi/qos.h +72 -0
  165. triton/runtime/tcc/include/winapi/shellapi.h +59 -0
  166. triton/runtime/tcc/include/winapi/winbase.h +2958 -0
  167. triton/runtime/tcc/include/winapi/wincon.h +309 -0
  168. triton/runtime/tcc/include/winapi/windef.h +293 -0
  169. triton/runtime/tcc/include/winapi/windows.h +127 -0
  170. triton/runtime/tcc/include/winapi/winerror.h +3166 -0
  171. triton/runtime/tcc/include/winapi/wingdi.h +4080 -0
  172. triton/runtime/tcc/include/winapi/winnls.h +778 -0
  173. triton/runtime/tcc/include/winapi/winnt.h +5837 -0
  174. triton/runtime/tcc/include/winapi/winreg.h +272 -0
  175. triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
  176. triton/runtime/tcc/include/winapi/winuser.h +5651 -0
  177. triton/runtime/tcc/include/winapi/winver.h +160 -0
  178. triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
  179. triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
  180. triton/runtime/tcc/lib/cuda.def +697 -0
  181. triton/runtime/tcc/lib/gdi32.def +337 -0
  182. triton/runtime/tcc/lib/kernel32.def +770 -0
  183. triton/runtime/tcc/lib/libtcc1.a +0 -0
  184. triton/runtime/tcc/lib/msvcrt.def +1399 -0
  185. triton/runtime/tcc/lib/python3.def +810 -0
  186. triton/runtime/tcc/lib/python310.def +1610 -0
  187. triton/runtime/tcc/lib/python311.def +1633 -0
  188. triton/runtime/tcc/lib/python312.def +1703 -0
  189. triton/runtime/tcc/lib/python313.def +1651 -0
  190. triton/runtime/tcc/lib/python313t.def +1656 -0
  191. triton/runtime/tcc/lib/python314.def +1800 -0
  192. triton/runtime/tcc/lib/python314t.def +1809 -0
  193. triton/runtime/tcc/lib/python39.def +1644 -0
  194. triton/runtime/tcc/lib/python3t.def +905 -0
  195. triton/runtime/tcc/lib/user32.def +658 -0
  196. triton/runtime/tcc/libtcc.dll +0 -0
  197. triton/runtime/tcc/tcc.exe +0 -0
  198. triton/testing.py +543 -0
  199. triton/tools/__init__.py +0 -0
  200. triton/tools/build_extern.py +365 -0
  201. triton/tools/compile.py +210 -0
  202. triton/tools/disasm.py +143 -0
  203. triton/tools/extra/cuda/compile.c +70 -0
  204. triton/tools/extra/cuda/compile.h +14 -0
  205. triton/tools/extra/hip/compile.cpp +66 -0
  206. triton/tools/extra/hip/compile.h +13 -0
  207. triton/tools/link.py +322 -0
  208. triton/tools/mxfp.py +301 -0
  209. triton/tools/ragged_tma.py +92 -0
  210. triton/tools/tensor_descriptor.py +34 -0
  211. triton/windows_utils.py +405 -0
  212. triton_windows-3.5.0.post21.dist-info/METADATA +46 -0
  213. triton_windows-3.5.0.post21.dist-info/RECORD +217 -0
  214. triton_windows-3.5.0.post21.dist-info/WHEEL +5 -0
  215. triton_windows-3.5.0.post21.dist-info/entry_points.txt +3 -0
  216. triton_windows-3.5.0.post21.dist-info/licenses/LICENSE +23 -0
  217. triton_windows-3.5.0.post21.dist-info/top_level.txt +1 -0
@@ -0,0 +1,476 @@
1
+ from __future__ import annotations
2
+
3
+ import builtins
4
+ import time
5
+ import inspect
6
+ import hashlib
7
+ import json
8
+ from functools import cached_property
9
+ from typing import Dict, Tuple, List, Optional
10
+
11
+ from .. import knobs
12
+ from .jit import KernelInterface, JITFunction
13
+ from .errors import OutOfResources, PTXASError
14
+ from .driver import driver
15
+ from .cache import get_cache_manager, triton_key
16
+ from triton._C.libtriton import get_cache_invalidating_env_vars
17
+
18
+
19
class Autotuner(KernelInterface):
    """
    Kernel wrapper that benchmarks a set of :class:`Config` objects and runs the fastest one.

    The winning config is memoised in ``self.cache`` under a key built from the values of the
    arguments named in ``key`` plus the ``dtype`` string of every named argument that has one.
    """

    def __init__(self, fn, arg_names, configs, key, reset_to_zero, restore_value, pre_hook=None, post_hook=None,
                 prune_configs_by: Optional[Dict] = None, warmup=None, rep=None, use_cuda_graph=False, do_bench=None,
                 cache_results=False):
        """
        :param fn: the wrapped kernel; must expose ``run``/``warmup`` and, unless it is a plain function,
            an ``fn`` attribute that is followed down to the underlying Python function.
        :param arg_names: names of the kernel's positional arguments, zipped with ``*args`` on each call.
        :param configs: candidate configs; a single default config is used when this is empty.
        :param key: argument names whose values form the tuning-cache key.
        :param reset_to_zero: argument names that are zeroed (``.zero_()``) before each benchmarked run.
        :param restore_value: argument names that are cloned before and copied back after each benchmarked run.
        :param pre_hook: replaces the default pre-hook; called as ``pre_hook(kwargs, reset_only=False)``.
        :param post_hook: replaces the default post-hook; called as ``post_hook(kwargs, exception=...)``.
        :param prune_configs_by: a dict of functions that are used to prune configs, fields:
            'perf_model': performance model used to predict running time with different configs, returns running time
            'top_k': number of configs to bench
            'early_config_prune'(optional): a function used to do early prune (eg, num_stages). It is called with
            the configs, the named positional arguments and the call's keyword arguments, and returns pruned configs.
        :param warmup: deprecated; forwarded to ``triton.testing.do_bench`` (25 when only ``rep`` is given).
        :param rep: deprecated; forwarded to the benchmark function (100 when not given).
        :param use_cuda_graph: deprecated; benchmark with ``do_bench_cudagraph`` instead of ``do_bench``.
        :param do_bench: benchmark function called as ``do_bench(kernel_call, quantiles=...)``.
        :param cache_results: whether to persist timings through the cache manager.
        """
        if not configs:
            self.configs = [Config({}, num_warps=4, num_stages=3, num_ctas=1)]
        else:
            self.configs = configs
        self.keys = key
        self.cache: Dict[Tuple, Config] = {}
        self.arg_names = arg_names
        self.cache_results = cache_results or (knobs.autotuning.cache and not knobs.runtime.interpret)

        # Reset to zero or restore values
        self.reset_to_zero = []
        if reset_to_zero is not None:
            self.reset_to_zero = list(reset_to_zero)
        self.restore_value = []
        if restore_value is not None:
            self.restore_value = list(restore_value)

        # Hook to reset or restore for required tensors
        self.pre_hook = lambda kwargs, reset_only=False: 0
        self.post_hook = lambda kwargs, exception: 0
        self.user_defined_pre_hook = False
        self.user_defined_post_hook = False
        if pre_hook:
            self.pre_hook = pre_hook
            self.user_defined_pre_hook = True
        elif (len(self.reset_to_zero) > 0 or len(self.restore_value) > 0):

            def _pre_hook(kwargs, reset_only=False):
                for name in self.reset_to_zero:
                    kwargs[name].zero_()
                if not reset_only:
                    # Snapshot the tensors so the default post-hook can copy them back.
                    self.restore_copies = {name: kwargs[name].clone() for name in self.restore_value}

            self.pre_hook = _pre_hook

        if post_hook:
            self.post_hook = post_hook
            self.user_defined_post_hook = True
        elif len(self.restore_value) > 0:

            def _post_hook(kwargs, exception):
                for name in self.restore_value:
                    kwargs[name].copy_(self.restore_copies[name])
                self.restore_copies = {}

            self.post_hook = _post_hook

        self.perf_model = None
        self.configs_top_k = 1.0
        self.early_config_prune = None
        if prune_configs_by:
            self.perf_model = prune_configs_by.get("perf_model", self.perf_model)
            self.configs_top_k = prune_configs_by.get("top_k", self.configs_top_k)
            self.early_config_prune = prune_configs_by.get("early_config_prune", self.early_config_prune)

        self.fn = fn
        self.base_fn = fn
        # Unwrap nested wrappers until the plain Python function is reached.
        while not inspect.isfunction(self.base_fn):
            self.base_fn = self.base_fn.fn

        self._do_bench = do_bench
        self.num_warmups = warmup
        self.num_reps = rep
        self.use_cuda_graph = use_cuda_graph

        # If we got explicitly called via the old interface, raise a warning
        # and proceed with the old behavior.
        if warmup is not None or rep is not None or use_cuda_graph:
            import warnings
            warnings.warn(("warmup, rep, and use_cuda_graph parameters are deprecated. See "
                           "https://github.com/triton-lang/triton/pull/4496 for details."), DeprecationWarning,
                          stacklevel=1)
            if use_cuda_graph:
                from ..testing import do_bench_cudagraph
                self._do_bench = lambda kernel_call, quantiles: do_bench_cudagraph(
                    kernel_call,
                    rep=rep if rep is not None else 100,
                    quantiles=quantiles,
                )
                return

            import triton.testing
            self._do_bench = lambda kernel_call, quantiles: triton.testing.do_bench(
                kernel_call,
                warmup=warmup if warmup is not None else 25,
                rep=rep if rep is not None else 100,
                quantiles=quantiles,
            )
            return

    @cached_property
    def do_bench(self):
        """Benchmark callable: the one chosen in ``__init__``, else the active driver's benchmarker."""
        if self._do_bench is None:
            return driver.active.get_benchmarker()
        return self._do_bench

    def _bench(self, *args, config, **meta):
        """
        Benchmark the kernel with a single ``config``.

        :param args: positional kernel arguments.
        :param config: the :class:`Config` to evaluate.
        :param meta: keyword arguments supplied by the caller; must not overlap ``config.kwargs``.
        :return: whatever ``self.do_bench`` returns for quantiles ``(0.5, 0.2, 0.8)``, or three
            ``inf`` values when the run fails with ``OutOfResources``, ``CompileTimeAssertionFailure``
            or ``PTXASError``.
        :raises ValueError: if a meta-parameter is given both by the caller and by ``config``.
        """
        from ..compiler.errors import CompileTimeAssertionFailure

        verbose = knobs.autotuning.print
        if verbose:
            print(f"Autotuning kernel {self.base_fn.__name__} with config {config}")

        # check for conflicts, i.e. meta-parameters both provided
        # as kwargs and by the autotuner
        conflicts = meta.keys() & config.kwargs.keys()
        if conflicts:
            raise ValueError(f"Conflicting meta-parameters: {', '.join(conflicts)}."
                             " Make sure that you don't re-define auto-tuned symbols.")
        # augment meta-parameters with tunable ones
        current = dict(meta, **config.all_kwargs())
        full_nargs = {**self.nargs, **current}

        def kernel_call():
            if config.pre_hook:
                config.pre_hook(full_nargs)
            self.pre_hook(full_nargs)
            try:
                self.fn.run(
                    *args,
                    **current,
                )
            except Exception as e:
                try:
                    self.post_hook(full_nargs, exception=e)
                finally:
                    # Throw exception raised by `self.fn.run`
                    raise

            self.post_hook(full_nargs, exception=None)

        try:
            return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))
        except (OutOfResources, CompileTimeAssertionFailure, PTXASError) as e:
            if verbose:
                print(f"Autotuning failed with {e}")
            return [float("inf"), float("inf"), float("inf")]

    def check_disk_cache(self, tuning_key, configs, bench_fn):
        """
        Load timings for ``tuning_key`` from the cache manager, or run ``bench_fn`` and store them.

        :param tuning_key: in-memory cache key of the current call.
        :param configs: the (pruned) configs that would be benchmarked.
        :param bench_fn: zero-argument callable that benchmarks and fills ``self.cache``/``self.configs_timings``.
        :return: ``True`` when the result came from the stored file, ``False`` when ``bench_fn`` was run.
        """
        # We can't serialize prehooks, so just give up and run the benchmarks.
        if not tuning_key or any(cfg.pre_hook for cfg in configs):
            bench_fn()
            return False

        from triton.compiler.compiler import make_backend

        fn = self.fn
        while not isinstance(fn, JITFunction):
            fn = fn.fn

        env_vars = get_cache_invalidating_env_vars()
        cache_key = [
            triton_key(),
            make_backend(driver.active.get_current_target()).hash(),
            fn.cache_key,
            str(sorted(env_vars.items())),
            str(tuning_key),
        ] + [str(c) for c in configs]
        cache_key = hashlib.sha256("-".join(cache_key).encode("utf-8")).hexdigest()
        cache = get_cache_manager(cache_key)
        file_name = f"{fn.__name__[:150]}.autotune.json"
        path = cache.get_file(file_name)
        if path:
            # json.dumps below emits ASCII-only text, so decoding as UTF-8 is always valid
            # and avoids depending on the platform's default encoding.
            with open(path, "r", encoding="utf-8") as cached_configs:
                timings = json.load(cached_configs)["configs_timings"]
                timings = {Config(**config): timing for config, timing in timings}
                self.cache[tuning_key] = builtins.min(timings, key=timings.get)
                self.configs_timings = timings
            return True

        bench_fn()
        cache.put(
            json.dumps({
                "key":
                tuning_key,
                "configs_timings":
                [(config.__dict__, timings) for config, timings in self.configs_timings.items() if not config.pre_hook],
            }), file_name, binary=False)
        return False

    def run(self, *args, **kwargs):
        """
        Run the kernel with the best known config, benchmarking first when the key is new.

        With a single config no tuning happens. ``self.nargs`` is only valid for the duration
        of the call and is cleared afterwards, including when the call raises.
        """
        self.nargs = dict(zip(self.arg_names, args))
        try:
            used_cached_result = True
            if len(self.configs) > 1:
                all_args = {**self.nargs, **kwargs}
                _args = {k: v for (k, v) in all_args.items() if k in self.arg_names}
                key = [_args[key] for key in self.keys if key in _args]
                for _, arg in _args.items():
                    if hasattr(arg, "dtype"):
                        key.append(str(arg.dtype))
                key = tuple(key)
                if key not in self.cache:
                    used_cached_result = False
                    pruned_configs = self.prune_configs(kwargs)

                    def benchmark():
                        bench_start = time.perf_counter()
                        timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
                        bench_end = time.perf_counter()
                        self.bench_time = bench_end - bench_start
                        self.cache[key] = builtins.min(timings, key=timings.get)
                        full_nargs = {**self.nargs, **kwargs, **self.cache[key].all_kwargs()}
                        # Reset-only pass: no post_hook follows this call.
                        self.pre_hook(full_nargs, reset_only=True)
                        self.configs_timings = timings

                    if self.cache_results:
                        used_cached_result = self.check_disk_cache(key, pruned_configs, benchmark)
                    else:
                        benchmark()

                config = self.cache[key]
            else:
                config = self.configs[0]
            self.best_config = config
            if knobs.autotuning.print and not used_cached_result:
                print(f"Triton autotuning for function {self.base_fn.__name__},\nwith key as {key},\n"
                      f"finished after {self.bench_time:.2f}s,\nbest config selected: {self.best_config};")
            if config.pre_hook is not None:
                full_nargs = {**self.nargs, **kwargs, **config.all_kwargs()}
                config.pre_hook(full_nargs)
            return self.fn.run(
                *args,
                **kwargs,
                **config.all_kwargs(),
            )
        finally:
            # Drop the references to the caller's arguments even if the kernel call failed.
            self.nargs = None

    def prune_configs(self, kwargs: Dict) -> List[Config]:
        """
        Narrow ``self.configs`` using ``early_config_prune`` and then the ``perf_model``/``top_k`` pair.

        :param kwargs: keyword arguments of the current kernel call.
        :return: the configs left to benchmark.
        :raises TypeError: if ``top_k`` is neither a float ``<= 1.0`` nor an int.
        """
        pruned_configs = self.configs
        if self.early_config_prune:
            pruned_configs = self.early_config_prune(self.configs, self.nargs, **kwargs)
        if self.perf_model:
            top_k = self.configs_top_k
            if isinstance(top_k, float) and top_k <= 1.0:
                # A fractional top_k is relative to the full (unpruned) config list.
                top_k = int(len(self.configs) * top_k)
            elif not isinstance(top_k, int):
                # Slice index must be an integer
                raise TypeError("Error while pruning configs, top_k must be either 1) a float <= 1.0 or 2) an int")

            if len(pruned_configs) > top_k:
                est_timing = {
                    config: self.perf_model(
                        **self.nargs,
                        **kwargs,
                        **config.all_kwargs(),
                    )
                    for config in pruned_configs
                }
                pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[:top_k]
        return pruned_configs

    def warmup(self, *args, **kwargs):
        """Call ``self.fn.warmup`` once per pruned config and return the list of results."""
        self.nargs = dict(zip(self.arg_names, args))
        try:
            ret = []
            for autotune_config in self.prune_configs(kwargs):
                ret.append(self.fn.warmup(
                    *args,
                    **kwargs,
                    **autotune_config.all_kwargs(),
                ))
            return ret
        finally:
            # Clear the per-call argument map even when a warmup fails.
            self.nargs = None
293
+
294
+
295
class Config:
    """
    An object that represents a possible kernel configuration for the auto-tuner to try.

    :ivar kwargs: a dictionary of meta-parameters to pass to the kernel as keyword arguments.
    :type kwargs: dict[str, Any]
    :ivar num_warps: the number of warps to use for the kernel when compiled for GPUs. For example, if
                      `num_warps=8`, then each kernel instance will be automatically parallelized to
                      cooperatively execute using `8 * 32 = 256` threads.
    :type num_warps: int
    :ivar num_stages: the number of stages that the compiler should use when software-pipelining loops.
                       Mostly useful for matrix multiplication workloads on SM80+ GPUs.
    :type num_stages: int
    :ivar num_ctas: number of blocks in a block cluster. SM90+ only.
    :type num_ctas: int
    :ivar maxnreg: maximum number of registers one thread can use.  Corresponds
                       to ptx .maxnreg directive.  Not supported on all platforms.
    :type maxnreg: Optional[int]
    :ivar pre_hook: a function that will be called before the kernel is called. It is invoked with a
                    single dict of the kernel's arguments.
    :ivar ir_override: filename of a user-defined IR (*.{ttgir|llir|ptx|amdgcn}).
    """

    def __init__(self, kwargs, num_warps=4, num_stages=3, num_ctas=1, maxnreg=None, pre_hook=None, ir_override=None):
        self.kwargs = kwargs
        self.num_warps = num_warps
        self.num_ctas = num_ctas
        self.num_stages = num_stages
        self.maxnreg = maxnreg
        self.pre_hook = pre_hook
        self.ir_override = ir_override

    def __setstate__(self, state):
        """Restore from a pickled state dict, filling in defaults for any missing field."""
        self.kwargs = state.get("kwargs", {})
        self.num_warps = state.get("num_warps", 4)
        self.num_stages = state.get("num_stages", 3)
        self.num_ctas = state.get("num_ctas", 1)
        self.maxnreg = state.get("maxnreg", None)
        self.pre_hook = state.get("pre_hook", None)
        self.ir_override = state.get("ir_override", None)

    def all_kwargs(self):
        """Return ``kwargs`` merged with every launch option that is not ``None``."""
        return {
            **self.kwargs, **{
                k: v
                for (k, v) in (
                    ("num_warps", self.num_warps),
                    ("num_ctas", self.num_ctas),
                    ("num_stages", self.num_stages),
                    ("maxnreg", self.maxnreg),
                    ("ir_override", self.ir_override),
                ) if v is not None
            }
        }

    def __str__(self):
        res = []
        for k, v in self.kwargs.items():
            res.append(f"{k}: {v}")
        res.append(f"num_warps: {self.num_warps}")
        res.append(f"num_ctas: {self.num_ctas}")
        res.append(f"num_stages: {self.num_stages}")
        res.append(f"maxnreg: {self.maxnreg}")
        return ", ".join(res)

    def __hash__(self):
        return hash((*self.all_kwargs().items(), self.pre_hook))

    def __eq__(self, other):
        # Defer to Python's fallback for foreign types instead of failing on a
        # missing `all_kwargs` attribute; `Config(...) == 5` is then simply False.
        if not isinstance(other, Config):
            return NotImplemented
        self_tuple = tuple((
            *self.all_kwargs().items(),
            self.pre_hook,
        ))
        other_tuple = tuple((
            *other.all_kwargs().items(),
            other.pre_hook,
        ))
        return self_tuple == other_tuple
373
+
374
+
375
def autotune(configs, key, prune_configs_by=None, reset_to_zero=None, restore_value=None, pre_hook=None, post_hook=None,
             warmup=None, rep=None, use_cuda_graph=False, do_bench=None, cache_results=False):
    """
    Decorator for auto-tuning a :code:`triton.jit`'d function.

    .. highlight:: python
    .. code-block:: python

        @triton.autotune(configs=[
            triton.Config(kwargs={'BLOCK_SIZE': 128}, num_warps=4),
            triton.Config(kwargs={'BLOCK_SIZE': 1024}, num_warps=8),
          ],
          key=['x_size'] # the two above configs will be evaluated anytime
                         # the value of x_size changes
        )
        @triton.jit
        def kernel(x_ptr, x_size, BLOCK_SIZE: tl.constexpr):
            ...
    :note: When all the configurations are evaluated, the kernel will run multiple times.
           This means that whatever value the kernel updates will be updated multiple times.
           To avoid this undesired behavior, you can use the `reset_to_zero` argument, which
           resets the value of the provided tensor to `zero` before running any configuration.

    If the environment variable :code:`TRITON_PRINT_AUTOTUNING` is set to
    :code:`"1"`, Triton will print a message to stdout after autotuning each
    kernel, including the time spent autotuning and the best configuration.

    :param configs: a list of :code:`triton.Config` objects
    :type configs: list[triton.Config]
    :param key: a list of argument names whose change in value will trigger the evaluation of all provided configs.
    :type key: list[str]
    :param prune_configs_by: a dict of functions that are used to prune configs, fields:
        'perf_model': performance model used to predict running time with different configs, returns running time
        'top_k': number of configs to bench
        'early_config_prune'(optional): a function used to do early prune (eg, num_stages). It takes configs:List[Config] as its input, and returns pruned configs.
    :param reset_to_zero: a list of argument names whose value will be reset to zero before evaluating any configs.
    :type reset_to_zero: list[str]
    :param restore_value: a list of argument names whose value will be restored after evaluating any configs.
    :type restore_value: list[str]
    :param pre_hook: a function that will be called before the kernel is called.
        This overrides the default pre_hook used for 'reset_to_zero' and 'restore_value'.
        'kwargs': a dict of all arguments passed to the kernel.
        'reset_only': a boolean indicating whether the pre_hook is called to reset the values only, without a corresponding post_hook.
    :type pre_hook: lambda args, reset_only
    :param post_hook: a function that will be called after the kernel is called.
        This overrides the default post_hook used for 'restore_value'.
        'kwargs': a dict of all arguments passed to the kernel.
        'exception': the exception raised by the kernel in case of a compilation or runtime error.
    :type post_hook: lambda args, exception
    :param warmup: warmup time (in ms) to pass to benchmarking (deprecated).
    :type warmup: int
    :param rep: repetition time (in ms) to pass to benchmarking (deprecated).
    :type rep: int
    :param use_cuda_graph: forwarded unchanged to :code:`Autotuner` (deprecated).
    :type use_cuda_graph: bool
    :param do_bench: a benchmark function to measure the time of each run.
    :type do_bench: lambda fn, quantiles
    :param cache_results: whether to cache autotune timings to disk.  Defaults to False.
    :type cache_results: bool
    """
    # Everything except the kernel itself is already known; collect it once
    # and hand it to the Autotuner when the decorated function arrives.
    tuner_options = dict(
        pre_hook=pre_hook,
        post_hook=post_hook,
        prune_configs_by=prune_configs_by,
        warmup=warmup,
        rep=rep,
        use_cuda_graph=use_cuda_graph,
        do_bench=do_bench,
        cache_results=cache_results,
    )

    def decorator(fn):
        return Autotuner(fn, fn.arg_names, configs, key, reset_to_zero, restore_value, **tuner_options)

    return decorator
440
+
441
+
442
class Heuristics(KernelInterface):
    """
    Kernel wrapper that fills in meta-parameters computed from the call's arguments.

    ``values`` maps a meta-parameter name to a function; on every ``run`` each function is
    called with a dict of the named positional arguments merged with the keyword arguments
    (including meta-parameters computed earlier in the same call).
    """

    def __init__(self, fn, arg_names, values) -> None:
        self.fn, self.values, self.arg_names = fn, values, arg_names

    def run(self, *args, **kwargs):
        """Compute every heuristic in declaration order, then forward the call to ``self.fn.run``."""
        for meta_name in self.values:
            compute = self.values[meta_name]
            named_positionals = dict(zip(self.arg_names, args))
            call_view = {**named_positionals, **kwargs}
            kwargs[meta_name] = compute(call_view)
        return self.fn.run(*args, **kwargs)
453
+
454
+
455
def heuristics(values):
    """
    Decorator for specifying how the values of certain meta-parameters may be computed.
    This is useful for cases where auto-tuning is prohibitively expensive, or just not applicable.

    .. highlight:: python
    .. code-block:: python

        # smallest power-of-two >= x_size
        @triton.heuristics(values={'BLOCK_SIZE': lambda args: triton.next_power_of_2(args['x_size'])})
        @triton.jit
        def kernel(x_ptr, x_size, BLOCK_SIZE: tl.constexpr):
            ...
    :param values: a dictionary of meta-parameter names and functions that compute the value of the meta-parameter.
                   each such function takes a single dict mapping argument names to their values.
    :type values: dict[str, Callable[[dict[str, Any]], Any]]
    """
    return lambda fn: Heuristics(fn, fn.arg_names, values)
@@ -0,0 +1,168 @@
1
+ from __future__ import annotations
2
+
3
+ import functools
4
+ import hashlib
5
+ import importlib.util
6
+ import logging
7
+ import os
8
+ import shutil
9
+ import subprocess
10
+ import sysconfig
11
+ import tempfile
12
+
13
+ from types import ModuleType
14
+
15
+ from .cache import get_cache_manager
16
+ from .. import knobs
17
+
18
+ if os.name == "nt":
19
+ from triton.windows_utils import find_msvc_winsdk, find_python
20
+
21
+
22
@functools.lru_cache
def get_cc():
    """
    Locate the C compiler used to build Triton's helper extension modules.

    Candidates are tried in order and the first hit wins:

    1. the ``CC`` environment variable, returned as-is;
    2. on Windows only, the compiler reported by ``find_msvc_winsdk(env_only=True)``;
    3. the bundled TinyCC at ``<platlib>/triton/runtime/tcc/tcc.exe``, if that file exists;
    4. ``cl``, then ``gcc``, then ``clang`` looked up on ``PATH``.

    The result is memoized by ``functools.lru_cache``; call ``get_cc.cache_clear()`` to make
    a later change of the environment visible.

    :return: the compiler path or command.
    :rtype: str
    :raises RuntimeError: if no C compiler can be found.
    """
    cc = os.environ.get("CC")
    if cc is None and os.name == "nt":
        # Find and check MSVC and Windows SDK from environment variables set by Launch-VsDevShell.ps1 or VsDevCmd.bat
        # find_msvc_winsdk is only imported on Windows, so calling it elsewhere would raise NameError.
        cc, _, _ = find_msvc_winsdk(env_only=True)
    if cc is None:
        # Bundled TinyCC
        bundled_tcc = os.path.join(sysconfig.get_paths()["platlib"], "triton", "runtime", "tcc", "tcc.exe")
        if os.path.exists(bundled_tcc):
            cc = bundled_tcc
    if cc is None:
        # Fall back to whatever compiler is on PATH, in order of preference.
        for candidate in ("cl", "gcc", "clang"):
            cc = shutil.which(candidate)
            if cc is not None:
                break
    if cc is None:
        raise RuntimeError("Failed to find C compiler. Please specify via CC environment variable.")
    return cc
42
+
43
+
44
def is_tcc(cc):
    """Return True if the file name of ``cc`` is ``tcc`` or ``tcc.exe``, ignoring case."""
    return os.path.basename(cc).lower() in ("tcc", "tcc.exe")
47
+
48
+
49
def is_msvc(cc):
    """Return True if the file name of ``cc`` is ``cl`` or ``cl.exe``, ignoring case."""
    return os.path.basename(cc).lower() in ("cl", "cl.exe")
52
+
53
+
54
def is_clang(cc):
    """Return True if the file name of ``cc`` is ``clang`` or ``clang.exe``, ignoring case."""
    return os.path.basename(cc).lower() in ("clang", "clang.exe")
57
+
58
+
59
def _cc_cmd(cc: str, src: str, out: str, include_dirs: list[str], library_dirs: list[str], libraries: list[str],
            ccflags: list[str]) -> list[str]:
    """
    Assemble the argument list that compiles ``src`` into the shared library ``out``.

    :param cc: compiler path or command; its file name selects the MSVC or GCC-style flag set.
    :param src: path of the C source file.
    :param out: path of the shared library to produce.
    :param include_dirs: header search directories; ``None`` entries are skipped.
    :param library_dirs: library search directories.
    :param libraries: library names, without prefix or extension.
    :param ccflags: extra arguments appended to the end of the command.
    :return: the command as a list of arguments.
    """
    includes = [inc for inc in include_dirs if inc is not None]
    if is_msvc(cc):
        # Side products (.obj, .lib, .pdb) are placed next to the output file.
        stem, _ = os.path.splitext(out)
        cmd = [cc, src, "/nologo", "/O2", "/LD", "/std:c11", "/wd4819"]
        cmd.extend(f"/I{inc}" for inc in includes)
        cmd.append(f"/Fo{stem}.obj")
        # Everything after /link is handed to the linker.
        cmd.append("/link")
        cmd.extend(f"/LIBPATH:{lib_dir}" for lib_dir in library_dirs)
        cmd.extend(f"{lib}.lib" for lib in libraries)
        cmd.extend((f"/OUT:{out}", f"/IMPLIB:{stem}.lib", f"/PDB:{stem}.pdb"))
    else:
        # for -Wno-psabi, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111047
        cmd = [cc, src, "-O3", "-shared", "-Wno-psabi", "-o", out]
        clang_on_windows = os.name == "nt" and is_clang(cc)
        if not clang_on_windows:
            # Clang does not support -fPIC on Windows
            cmd.append("-fPIC")
        if is_tcc(cc):
            cmd.append("-D_Py_USE_GCC_BUILTIN_ATOMICS")
        cmd.extend(f"-l{lib}" for lib in libraries)
        cmd.extend(f"-L{lib_dir}" for lib_dir in library_dirs)
        cmd.extend(f"-I{inc}" for inc in includes)
    cmd.extend(ccflags)
    return cmd
85
+
86
+
87
def _build(name: str, src: str, srcdir: str, library_dirs: list[str], include_dirs: list[str], libraries: list[str],
           ccflags: list[str]) -> str:
    """
    Compile the C source ``src`` into a Python extension module inside ``srcdir``.

    If ``knobs.build.impl`` is set, the whole build is delegated to it. Otherwise the compiler
    from ``get_cc()`` is run with the Python headers, ``srcdir`` and the directories in
    ``knobs.build.backend_dirs`` added to the include path. On Windows the Python import
    library and its directories are added too, and for MSVC also the directories returned by
    ``find_msvc_winsdk()``.

    :param name: module name; the output file is ``<name><EXT_SUFFIX>``.
    :param src: path of the C source file.
    :param srcdir: directory that receives the build output.
    :param library_dirs: extra library search directories (not modified).
    :param include_dirs: extra header search directories (not modified).
    :param libraries: extra libraries to link against (not modified).
    :param ccflags: extra compiler arguments.
    :return: path of the built extension module, or the custom implementation's result.
    :raises Exception: whatever the compiler invocation raises, after the command is printed.
    """
    custom_impl = knobs.build.impl
    if custom_impl:
        # ccflags is not part of the custom implementation's signature, so it is not forwarded.
        return custom_impl(name, src, srcdir, library_dirs, include_dirs, libraries)

    ext_suffix = sysconfig.get_config_var('EXT_SUFFIX')
    so = os.path.join(srcdir, f"{name}{ext_suffix}")
    cc = get_cc()

    # This function was renamed and made public in Python 3.10
    scheme_getter = getattr(sysconfig, 'get_default_scheme', None)
    if scheme_getter is None:
        scheme_getter = sysconfig._get_default_scheme  # type: ignore
    scheme = scheme_getter()
    # 'posix_local' is a custom scheme on Debian. However, starting Python 3.10, the default install
    # path changes to include 'local'. This change is required to use triton with system-wide python.
    if scheme == 'posix_local':
        scheme = 'posix_prefix'
    py_include_dir = sysconfig.get_paths(scheme=scheme)["include"]

    # Build new lists rather than extending the caller's in place.
    include_dirs = [*include_dirs, srcdir, py_include_dir, *knobs.build.backend_dirs]
    if os.name == "nt":
        library_dirs = library_dirs + find_python()
        py_version = sysconfig.get_python_version().replace(".", "")
        if sysconfig.get_config_var("Py_GIL_DISABLED"):
            # Free-threaded builds carry a "t" suffix in the library name.
            py_version += "t"
        libraries = libraries + [f"python{py_version}"]
    if is_msvc(cc):
        _, sdk_include_dirs, sdk_library_dirs = find_msvc_winsdk()
        include_dirs = include_dirs + sdk_include_dirs
        library_dirs = library_dirs + sdk_library_dirs

    cc_cmd = _cc_cmd(cc, src, so, include_dirs, library_dirs, libraries, ccflags)
    try:
        subprocess.check_call(cc_cmd)
    except Exception as err:
        # Show the exact command so that a failed build can be reproduced by hand.
        print("Failed to compile. cc_cmd:", cc_cmd)
        raise err

    return so
126
+
127
+
128
@functools.lru_cache
def platform_key() -> str:
    """
    Return a comma-separated string identifying the current platform.

    The fields are ``platform.machine()``, ``platform.system()`` and the two items of
    ``platform.architecture()``. The value is computed once and then memoized.
    """
    import platform

    fields = (platform.machine(), platform.system()) + tuple(platform.architecture())
    return ",".join(fields)
132
+
133
+
134
def _load_module_from_path(name: str, path: str) -> ModuleType:
    """
    Import the file at ``path`` as a module called ``name`` and return it.

    :param name: name given to the loaded module.
    :param path: location of the module file; it is made absolute before loading.
    :return: the executed module object.
    :raises RuntimeError: if no import spec or loader can be created for the file.
    """
    # Loading module with relative path may cause error
    absolute = os.path.abspath(path)
    spec = importlib.util.spec_from_file_location(name, absolute)
    if not (spec and spec.loader):
        raise RuntimeError(f"Failed to load newly compiled {name} from {absolute}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
143
+
144
+
145
def compile_module_from_src(src: str, name: str, library_dirs: list[str] | None = None,
                            include_dirs: list[str] | None = None, libraries: list[str] | None = None,
                            ccflags: list[str] | None = None) -> ModuleType:
    """
    Build the C source ``src`` into an extension module and import it, using the Triton cache.

    The cache key is the SHA-256 of the source text plus ``platform_key()``. A cached binary is
    loaded when present; if it cannot be loaded, a warning is logged and the module is rebuilt
    in a temporary directory, stored in the cache, and loaded from there.

    :param src: C source code of the module.
    :param name: module name, also used for the source and binary file names.
    :param library_dirs: extra library search directories, or ``None`` for none.
    :param include_dirs: extra header search directories, or ``None`` for none.
    :param libraries: extra libraries to link against, or ``None`` for none.
    :param ccflags: extra compiler arguments, or ``None`` for none.
    :return: the imported module.
    """
    digest = hashlib.sha256((src + platform_key()).encode("utf-8")).hexdigest()
    cache = get_cache_manager(digest)
    ext_suffix = sysconfig.get_config_var("EXT_SUFFIX")
    module_file = f"{name}{ext_suffix}"

    cached = cache.get_file(module_file)
    if cached is not None:
        try:
            return _load_module_from_path(name, cached)
        except (RuntimeError, ImportError):
            # An unloadable cache entry is not fatal: fall through and rebuild.
            logging.getLogger(__name__).warning(
                f"Triton cache error: compiled module {name}.so could not be loaded")

    with tempfile.TemporaryDirectory() as build_dir:
        c_file = os.path.join(build_dir, name + ".c")
        with open(c_file, "w") as out:
            out.write(src)
        built = _build(name, c_file, build_dir, library_dirs or [], include_dirs or [], libraries or [],
                       ccflags or [])
        # Copy the binary into the cache before the temporary directory is removed.
        with open(built, "rb") as binary:
            cached = cache.put(binary.read(), module_file, binary=True)

    return _load_module_from_path(name, cached)