triton-windows 3.3.1.post21__cp312-cp312-win_amd64.whl → 3.4.0.post21__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of triton-windows might be problematic.

Files changed (68)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +4 -1
  3. triton/_filecheck.py +87 -0
  4. triton/_internal_testing.py +26 -15
  5. triton/_utils.py +110 -21
  6. triton/backends/__init__.py +20 -23
  7. triton/backends/amd/__init__.py +0 -0
  8. triton/backends/amd/compiler.py +112 -78
  9. triton/backends/amd/driver.c +5 -2
  10. triton/backends/amd/driver.py +143 -46
  11. triton/backends/compiler.py +7 -21
  12. triton/backends/nvidia/bin/ptxas.exe +0 -0
  13. triton/backends/nvidia/compiler.py +94 -94
  14. triton/backends/nvidia/driver.c +90 -98
  15. triton/backends/nvidia/driver.py +296 -125
  16. triton/compiler/code_generator.py +212 -111
  17. triton/compiler/compiler.py +110 -25
  18. triton/experimental/__init__.py +0 -0
  19. triton/experimental/gluon/__init__.py +4 -0
  20. triton/experimental/gluon/_compiler.py +0 -0
  21. triton/experimental/gluon/_runtime.py +99 -0
  22. triton/experimental/gluon/language/__init__.py +18 -0
  23. triton/experimental/gluon/language/_core.py +312 -0
  24. triton/experimental/gluon/language/_layouts.py +230 -0
  25. triton/experimental/gluon/language/_math.py +12 -0
  26. triton/experimental/gluon/language/_semantic.py +287 -0
  27. triton/experimental/gluon/language/_standard.py +47 -0
  28. triton/experimental/gluon/language/nvidia/__init__.py +4 -0
  29. triton/experimental/gluon/language/nvidia/blackwell/__init__.py +202 -0
  30. triton/experimental/gluon/language/nvidia/blackwell/tma.py +32 -0
  31. triton/experimental/gluon/language/nvidia/hopper/__init__.py +11 -0
  32. triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +51 -0
  33. triton/experimental/gluon/language/nvidia/hopper/tma.py +96 -0
  34. triton/experimental/gluon/nvidia/__init__.py +4 -0
  35. triton/experimental/gluon/nvidia/blackwell.py +3 -0
  36. triton/experimental/gluon/nvidia/hopper.py +40 -0
  37. triton/knobs.py +481 -0
  38. triton/language/__init__.py +39 -14
  39. triton/language/core.py +794 -537
  40. triton/language/extra/cuda/__init__.py +10 -7
  41. triton/language/extra/cuda/gdc.py +42 -0
  42. triton/language/extra/cuda/libdevice.py +394 -394
  43. triton/language/extra/cuda/utils.py +21 -21
  44. triton/language/extra/hip/libdevice.py +113 -104
  45. triton/language/math.py +65 -66
  46. triton/language/random.py +12 -2
  47. triton/language/semantic.py +1706 -1770
  48. triton/language/standard.py +116 -51
  49. triton/runtime/autotuner.py +117 -59
  50. triton/runtime/build.py +73 -9
  51. triton/runtime/cache.py +18 -47
  52. triton/runtime/driver.py +32 -29
  53. triton/runtime/interpreter.py +72 -35
  54. triton/runtime/jit.py +146 -110
  55. triton/testing.py +16 -12
  56. triton/tools/disasm.py +3 -4
  57. triton/tools/tensor_descriptor.py +36 -0
  58. triton/windows_utils.py +47 -83
  59. {triton_windows-3.3.1.post21.dist-info → triton_windows-3.4.0.post21.dist-info}/METADATA +7 -2
  60. {triton_windows-3.3.1.post21.dist-info → triton_windows-3.4.0.post21.dist-info}/RECORD +64 -41
  61. triton_windows-3.4.0.post21.dist-info/entry_points.txt +3 -0
  62. triton_windows-3.4.0.post21.dist-info/licenses/LICENSE +23 -0
  63. triton_windows-3.4.0.post21.dist-info/top_level.txt +1 -0
  64. triton/language/_utils.py +0 -21
  65. triton/language/extra/cuda/_experimental_tma.py +0 -106
  66. triton/tools/experimental_descriptor.py +0 -32
  67. triton_windows-3.3.1.post21.dist-info/top_level.txt +0 -14
  68. {triton_windows-3.3.1.post21.dist-info → triton_windows-3.4.0.post21.dist-info}/WHEEL +0 -0
triton/language/__init__.py

@@ -6,6 +6,7 @@ from . import extra
 from .standard import (
     argmax,
     argmin,
+    bitonic_merge,
     cdiv,
     cumprod,
     cumsum,
@@ -14,11 +15,13 @@ from .standard import (
     max,
     min,
     ravel,
+    reduce_or,
     sigmoid,
     softmax,
     sort,
     sum,
     swizzle2d,
+    topk,
     xor_sum,
     zeros,
     zeros_like,
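
Three new block-level ops are exported from the standard library here: bitonic_merge, reduce_or, and topk. Their signatures are not visible in this hunk (the implementations live in triton/language/standard.py, changed +116 -51 above), so the sketch below is illustrative only: it assumes tl.topk(x, k) reduces along the last axis of a power-of-two-sized block and returns the k largest values, which this diff does not confirm.

import torch
import triton
import triton.language as tl

@triton.jit
def row_topk_kernel(x_ptr, out_ptr, N: tl.constexpr, K: tl.constexpr):
    # One program handles one row of N elements; N is assumed to be a
    # power of two, as bitonic-sort-based ops generally require.
    row = tl.program_id(0)
    x = tl.load(x_ptr + row * N + tl.arange(0, N))
    # Assumed signature: tl.topk(x, k) -> k largest values (unverified).
    top = tl.topk(x, K)
    tl.store(out_ptr + row * K + tl.arange(0, K), top)
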
@@ -26,16 +29,17 @@ from .standard import (
 from .core import (
     PropagateNan,
     TRITON_MAX_TENSOR_NUMEL,
-    _experimental_descriptor_load,
-    _experimental_descriptor_store,
-    _experimental_make_tensor_descriptor,
-    _experimental_reinterpret_tensor_descriptor,
-    _experimental_tensor_descriptor,
+    load_tensor_descriptor,
+    store_tensor_descriptor,
+    make_tensor_descriptor,
+    tensor_descriptor,
+    tensor_descriptor_type,
     add,
     advance,
     arange,
     associative_scan,
     assume,
+    async_task,
     atomic_add,
     atomic_and,
     atomic_cas,
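
This hunk promotes the tensor-descriptor API out of its _experimental_ prefix: _experimental_make_tensor_descriptor becomes make_tensor_descriptor, and loads/stores go through the descriptor object. A minimal sketch of the renamed device-side API, assuming the upstream Triton 3.4 calling convention (tl.make_tensor_descriptor(ptr, shape, strides, block_shape) plus desc.load/desc.store); the tiled-copy workload and block sizes are illustrative, not from the diff:

import torch
import triton
import triton.language as tl

@triton.jit
def tiled_copy(in_ptr, out_ptr, M, N,
               BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
    # Formerly tl._experimental_make_tensor_descriptor.
    in_desc = tl.make_tensor_descriptor(in_ptr, shape=[M, N], strides=[N, 1],
                                        block_shape=[BLOCK_M, BLOCK_N])
    out_desc = tl.make_tensor_descriptor(out_ptr, shape=[M, N], strides=[N, 1],
                                         block_shape=[BLOCK_M, BLOCK_N])
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    # Descriptor loads/stores take the element offsets of the tile origin.
    tile = in_desc.load([pid_m * BLOCK_M, pid_n * BLOCK_N])
    out_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], tile)

As in upstream Triton, on-device descriptor creation on NVIDIA backends additionally requires a host-side allocator for descriptor workspace, e.g. triton.set_allocator(lambda size, align, stream: torch.empty(size, dtype=torch.int8, device="cuda")).
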
@@ -53,6 +57,7 @@ from .core import (
     clamp,
     const,
     constexpr,
+    constexpr_function,
     debug_barrier,
     device_assert,
     device_print,
@@ -89,7 +94,6 @@ from .core import (
     permute,
     pi32_t,
     pointer_type,
-    nv_tma_desc_type,
     program_id,
     range,
     reduce,
@@ -130,11 +134,10 @@ from .random import (
 __all__ = [
     "PropagateNan",
     "TRITON_MAX_TENSOR_NUMEL",
-    "_experimental_descriptor_load",
-    "_experimental_descriptor_store",
-    "_experimental_make_tensor_descriptor",
-    "_experimental_reinterpret_tensor_descriptor",
-    "_experimental_tensor_descriptor",
+    "load_tensor_descriptor",
+    "store_tensor_descriptor",
+    "make_tensor_descriptor",
+    "tensor_descriptor",
     "abs",
     "add",
     "advance",
@@ -143,6 +146,7 @@ __all__ = [
     "argmin",
     "associative_scan",
     "assume",
+    "async_task",
     "atomic_add",
     "atomic_and",
     "atomic_cas",
@@ -152,6 +156,7 @@ __all__ = [
     "atomic_xchg",
     "atomic_xor",
     "bfloat16",
+    "bitonic_merge",
     "block_type",
     "broadcast",
     "broadcast_to",
@@ -162,6 +167,7 @@ __all__ = [
     "clamp",
     "const",
     "constexpr",
+    "constexpr_function",
     "cos",
     "cumprod",
     "cumsum",
@@ -219,7 +225,6 @@ __all__ = [
     "philox_impl",
     "pi32_t",
     "pointer_type",
-    "nv_tma_desc_type",
     "program_id",
     "rand",
     "rand4x",
@@ -230,6 +235,7 @@ __all__ = [
     "range",
     "ravel",
     "reduce",
+    "reduce_or",
     "reshape",
     "rsqrt",
     "slice",
@@ -247,6 +253,7 @@ __all__ = [
     "sum",
     "swizzle2d",
     "tensor",
+    "topk",
     "trans",
     "tuple",
     "uint16",
@@ -280,8 +287,26 @@ def str_to_ty(name):
         ty = str_to_ty(name)
         return pointer_type(element_ty=ty, const=const)

-    if name == "nvTmaDesc":
-        return nv_tma_desc_type()
+    if name.startswith("tensordesc"):
+        inner = name.split("<")[1].rstrip(">")
+        dtype, rest = inner.split("[", maxsplit=2)
+        block_shape, rest = rest.split("]", maxsplit=2)
+        block_shape = [int(s.strip()) for s in block_shape.rstrip("]").split(",")]
+        layout = rest.lstrip(",")
+        is_gluon = len(layout)
+        dtype = str_to_ty(dtype)
+        ndim = len(block_shape)
+        shape_type = tuple_type([int32] * ndim)
+        # FIXME: Last dim stride should be constexpr(1)
+        stride_type = tuple_type(([int64] * ndim))
+        block = block_type(dtype, block_shape)
+        if is_gluon:
+            from triton.experimental.gluon.language._layouts import NVMMASharedLayout
+            from triton.experimental.gluon.language.nvidia.hopper.tma import tensor_descriptor_type as gluon_tensor_descriptor_type
+            layout = eval(layout, dict(NVMMASharedLayout=NVMMASharedLayout))
+            assert isinstance(layout, NVMMASharedLayout)
+            return gluon_tensor_descriptor_type(block, shape_type, stride_type, layout)
+        return tensor_descriptor_type(block, shape_type, stride_type)

     if name == "constexpr":
         return constexpr
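
For reference, the new branch replaces the fixed "nvTmaDesc" signature tag with a structured tensordesc string: a signature like tensordesc<fp16[64, 32]> becomes a tensor_descriptor_type with a [64, 32] fp16 block, an int32 shape tuple, and an int64 stride tuple, while a trailing layout clause (Gluon descriptors) selects the NVMMASharedLayout path. A standalone re-creation of just the string-parsing step, runnable without Triton (parse_tensordesc is a local stand-in, not a Triton function):

def parse_tensordesc(name: str):
    inner = name.split("<")[1].rstrip(">")      # e.g. "fp16[64, 32]" or "fp16[64, 32],NVMMASharedLayout(...)"
    dtype, rest = inner.split("[", maxsplit=2)  # dtype token and the remainder
    block_shape, rest = rest.split("]", maxsplit=2)
    block_shape = [int(s.strip()) for s in block_shape.split(",")]
    layout = rest.lstrip(",")                   # non-empty only for Gluon descriptors
    return dtype, block_shape, layout

print(parse_tensordesc("tensordesc<fp16[64, 32]>"))
# -> ('fp16', [64, 32], '')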