triton-windows 3.3.1.post21__cp311-cp311-win_amd64.whl → 3.4.0.post21__cp311-cp311-win_amd64.whl
This diff covers the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release: this version of triton-windows has been flagged as potentially problematic.
- triton/_C/libtriton.pyd +0 -0
- triton/__init__.py +4 -1
- triton/_filecheck.py +87 -0
- triton/_internal_testing.py +26 -15
- triton/_utils.py +110 -21
- triton/backends/__init__.py +20 -23
- triton/backends/amd/__init__.py +0 -0
- triton/backends/amd/compiler.py +112 -78
- triton/backends/amd/driver.c +5 -2
- triton/backends/amd/driver.py +143 -46
- triton/backends/compiler.py +7 -21
- triton/backends/nvidia/bin/ptxas.exe +0 -0
- triton/backends/nvidia/compiler.py +94 -94
- triton/backends/nvidia/driver.c +90 -98
- triton/backends/nvidia/driver.py +296 -125
- triton/compiler/code_generator.py +212 -111
- triton/compiler/compiler.py +110 -25
- triton/experimental/__init__.py +0 -0
- triton/experimental/gluon/__init__.py +4 -0
- triton/experimental/gluon/_compiler.py +0 -0
- triton/experimental/gluon/_runtime.py +99 -0
- triton/experimental/gluon/language/__init__.py +18 -0
- triton/experimental/gluon/language/_core.py +312 -0
- triton/experimental/gluon/language/_layouts.py +230 -0
- triton/experimental/gluon/language/_math.py +12 -0
- triton/experimental/gluon/language/_semantic.py +287 -0
- triton/experimental/gluon/language/_standard.py +47 -0
- triton/experimental/gluon/language/nvidia/__init__.py +4 -0
- triton/experimental/gluon/language/nvidia/blackwell/__init__.py +202 -0
- triton/experimental/gluon/language/nvidia/blackwell/tma.py +32 -0
- triton/experimental/gluon/language/nvidia/hopper/__init__.py +11 -0
- triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +51 -0
- triton/experimental/gluon/language/nvidia/hopper/tma.py +96 -0
- triton/experimental/gluon/nvidia/__init__.py +4 -0
- triton/experimental/gluon/nvidia/blackwell.py +3 -0
- triton/experimental/gluon/nvidia/hopper.py +40 -0
- triton/knobs.py +481 -0
- triton/language/__init__.py +39 -14
- triton/language/core.py +794 -537
- triton/language/extra/cuda/__init__.py +10 -7
- triton/language/extra/cuda/gdc.py +42 -0
- triton/language/extra/cuda/libdevice.py +394 -394
- triton/language/extra/cuda/utils.py +21 -21
- triton/language/extra/hip/libdevice.py +113 -104
- triton/language/math.py +65 -66
- triton/language/random.py +12 -2
- triton/language/semantic.py +1706 -1770
- triton/language/standard.py +116 -51
- triton/runtime/autotuner.py +117 -59
- triton/runtime/build.py +73 -9
- triton/runtime/cache.py +18 -47
- triton/runtime/driver.py +32 -29
- triton/runtime/interpreter.py +72 -35
- triton/runtime/jit.py +146 -110
- triton/testing.py +16 -12
- triton/tools/disasm.py +3 -4
- triton/tools/tensor_descriptor.py +36 -0
- triton/windows_utils.py +47 -83
- {triton_windows-3.3.1.post21.dist-info → triton_windows-3.4.0.post21.dist-info}/METADATA +7 -2
- {triton_windows-3.3.1.post21.dist-info → triton_windows-3.4.0.post21.dist-info}/RECORD +64 -41
- triton_windows-3.4.0.post21.dist-info/entry_points.txt +3 -0
- triton_windows-3.4.0.post21.dist-info/licenses/LICENSE +23 -0
- triton_windows-3.4.0.post21.dist-info/top_level.txt +1 -0
- triton/language/_utils.py +0 -21
- triton/language/extra/cuda/_experimental_tma.py +0 -106
- triton/tools/experimental_descriptor.py +0 -32
- triton_windows-3.3.1.post21.dist-info/top_level.txt +0 -14
- {triton_windows-3.3.1.post21.dist-info → triton_windows-3.4.0.post21.dist-info}/WHEEL +0 -0
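The removals of triton/tools/experimental_descriptor.py and triton/language/extra/cuda/_experimental_tma.py, together with the new triton/tools/tensor_descriptor.py, track upstream Triton 3.4 promoting the TMA tensor-descriptor API out of its experimental namespace (see the __init__.py diff below). A minimal sketch of a kernel written against the renamed API follows; only the exported names (make_tensor_descriptor, load_tensor_descriptor, store_tensor_descriptor) are confirmed by this diff, while the signatures and the host-side allocator requirement are assumptions based on upstream Triton 3.4:

import triton
import triton.language as tl

@triton.jit
def copy_tile_kernel(in_ptr, out_ptr, M, N,
                     BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
    # Triton 3.3.x spelled these tl._experimental_make_tensor_descriptor and
    # tl._experimental_descriptor_load / _store; 3.4.0 drops the prefix.
    in_desc = tl.make_tensor_descriptor(in_ptr, shape=[M, N], strides=[N, 1],
                                        block_shape=[BLOCK_M, BLOCK_N])
    out_desc = tl.make_tensor_descriptor(out_ptr, shape=[M, N], strides=[N, 1],
                                         block_shape=[BLOCK_M, BLOCK_N])
    pid_m, pid_n = tl.program_id(0), tl.program_id(1)
    # Load one block at a (row, col) offset, then store it back out.
    tile = tl.load_tensor_descriptor(in_desc, [pid_m * BLOCK_M, pid_n * BLOCK_N])
    tl.store_tensor_descriptor(out_desc, [pid_m * BLOCK_M, pid_n * BLOCK_N], tile)

Note that on NVIDIA backends, device-side descriptor creation is expected to require a host-side scratch allocator registered via triton.set_allocator(...) before launch (again an assumption from upstream 3.4, not something this diff shows).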
triton/language/__init__.py
CHANGED
@@ -6,6 +6,7 @@ from . import extra
 from .standard import (
     argmax,
     argmin,
+    bitonic_merge,
     cdiv,
     cumprod,
     cumsum,
@@ -14,11 +15,13 @@ from .standard import (
     max,
     min,
     ravel,
+    reduce_or,
     sigmoid,
     softmax,
     sort,
     sum,
     swizzle2d,
+    topk,
     xor_sum,
     zeros,
     zeros_like,
@@ -26,16 +29,17 @@ from .standard import (
 from .core import (
     PropagateNan,
     TRITON_MAX_TENSOR_NUMEL,
-    _experimental_descriptor_load,
-    _experimental_descriptor_store,
-    _experimental_make_tensor_descriptor,
-    _experimental_reinterpret_tensor_descriptor,
-    _experimental_tensor_descriptor,
+    load_tensor_descriptor,
+    store_tensor_descriptor,
+    make_tensor_descriptor,
+    tensor_descriptor,
+    tensor_descriptor_type,
     add,
     advance,
     arange,
     associative_scan,
     assume,
+    async_task,
     atomic_add,
     atomic_and,
     atomic_cas,
@@ -53,6 +57,7 @@ from .core import (
     clamp,
     const,
     constexpr,
+    constexpr_function,
     debug_barrier,
     device_assert,
     device_print,
@@ -89,7 +94,6 @@ from .core import (
     permute,
     pi32_t,
     pointer_type,
-    nv_tma_desc_type,
     program_id,
     range,
     reduce,
@@ -130,11 +134,10 @@ from .random import (
 __all__ = [
     "PropagateNan",
     "TRITON_MAX_TENSOR_NUMEL",
-    "_experimental_descriptor_load",
-    "_experimental_descriptor_store",
-    "_experimental_make_tensor_descriptor",
-    "_experimental_reinterpret_tensor_descriptor",
-    "_experimental_tensor_descriptor",
+    "load_tensor_descriptor",
+    "store_tensor_descriptor",
+    "make_tensor_descriptor",
+    "tensor_descriptor",
     "abs",
     "add",
     "advance",
@@ -143,6 +146,7 @@ __all__ = [
     "argmin",
     "associative_scan",
     "assume",
+    "async_task",
     "atomic_add",
     "atomic_and",
     "atomic_cas",
@@ -152,6 +156,7 @@ __all__ = [
     "atomic_xchg",
     "atomic_xor",
     "bfloat16",
+    "bitonic_merge",
     "block_type",
     "broadcast",
     "broadcast_to",
@@ -162,6 +167,7 @@ __all__ = [
     "clamp",
     "const",
     "constexpr",
+    "constexpr_function",
     "cos",
     "cumprod",
     "cumsum",
@@ -219,7 +225,6 @@ __all__ = [
     "philox_impl",
     "pi32_t",
     "pointer_type",
-    "nv_tma_desc_type",
     "program_id",
     "rand",
     "rand4x",
@@ -230,6 +235,7 @@ __all__ = [
     "range",
     "ravel",
     "reduce",
+    "reduce_or",
     "reshape",
     "rsqrt",
     "slice",
@@ -247,6 +253,7 @@ __all__ = [
     "sum",
     "swizzle2d",
     "tensor",
+    "topk",
     "trans",
     "tuple",
     "uint16",
@@ -280,8 +287,26 @@ def str_to_ty(name):
         ty = str_to_ty(name)
         return pointer_type(element_ty=ty, const=const)

-    if name == "nvTmaDesc":
-        return nv_tma_desc_type()
+    if name.startswith("tensordesc"):
+        inner = name.split("<")[1].rstrip(">")
+        dtype, rest = inner.split("[", maxsplit=2)
+        block_shape, rest = rest.split("]", maxsplit=2)
+        block_shape = [int(s.strip()) for s in block_shape.rstrip("]").split(",")]
+        layout = rest.lstrip(",")
+        is_gluon = len(layout)
+        dtype = str_to_ty(dtype)
+        ndim = len(block_shape)
+        shape_type = tuple_type([int32] * ndim)
+        # FIXME: Last dim stride should be constexpr(1)
+        stride_type = tuple_type(([int64] * ndim))
+        block = block_type(dtype, block_shape)
+        if is_gluon:
+            from triton.experimental.gluon.language._layouts import NVMMASharedLayout
+            from triton.experimental.gluon.language.nvidia.hopper.tma import tensor_descriptor_type as gluon_tensor_descriptor_type
+            layout = eval(layout, dict(NVMMASharedLayout=NVMMASharedLayout))
+            assert isinstance(layout, NVMMASharedLayout)
+            return gluon_tensor_descriptor_type(block, shape_type, stride_type, layout)
+        return tensor_descriptor_type(block, shape_type, stride_type)

     if name == "constexpr":
         return constexpr
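
For reference, a hedged walkthrough of what the new tensordesc branch above computes for a representative kernel-signature string (the input is illustrative, not taken from the diff):

# Mirrors the parsing steps in the str_to_ty hunk above for a
# hypothetical input name = "tensordesc<fp16[64, 64]>".
name = "tensordesc<fp16[64, 64]>"
inner = name.split("<")[1].rstrip(">")           # "fp16[64, 64]"
dtype, rest = inner.split("[", maxsplit=2)       # "fp16", "64, 64]"
block_shape, rest = rest.split("]", maxsplit=2)  # "64, 64", ""
block_shape = [int(s.strip()) for s in block_shape.rstrip("]").split(",")]  # [64, 64]
layout = rest.lstrip(",")                        # "" -> is_gluon is falsy
# Result: tensor_descriptor_type(block_type(fp16, [64, 64]),
#     tuple_type([int32, int32]), tuple_type([int64, int64])).
# A trailing ", NVMMASharedLayout(...)" after the shape would instead take
# the Gluon path and produce the Gluon tensor_descriptor_type with that layout.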
|