triton-windows 3.4.0.post20__cp313-cp313-win_amd64.whl → 3.5.0.post21__cp313-cp313-win_amd64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of triton-windows might be problematic. Click here for more details.

Files changed (107):
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +8 -2
  3. triton/_filecheck.py +24 -14
  4. triton/_internal_testing.py +70 -4
  5. triton/_utils.py +3 -1
  6. triton/backends/amd/compiler.py +68 -60
  7. triton/backends/amd/driver.c +113 -44
  8. triton/backends/amd/driver.py +133 -57
  9. triton/backends/driver.py +13 -0
  10. triton/backends/nvidia/compiler.py +80 -22
  11. triton/backends/nvidia/driver.c +88 -15
  12. triton/backends/nvidia/driver.py +130 -123
  13. triton/compiler/__init__.py +5 -2
  14. triton/compiler/code_generator.py +270 -163
  15. triton/compiler/compiler.py +45 -62
  16. triton/experimental/gluon/__init__.py +3 -2
  17. triton/experimental/gluon/_runtime.py +9 -6
  18. triton/experimental/gluon/language/__init__.py +117 -16
  19. triton/experimental/gluon/language/_core.py +246 -68
  20. triton/experimental/gluon/language/_layouts.py +398 -45
  21. triton/experimental/gluon/language/_math.py +17 -9
  22. triton/experimental/gluon/language/_semantic.py +130 -37
  23. triton/experimental/gluon/language/_standard.py +55 -22
  24. triton/experimental/gluon/language/amd/__init__.py +4 -0
  25. triton/experimental/gluon/language/amd/_layouts.py +96 -0
  26. triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
  27. triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
  28. triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
  29. triton/experimental/gluon/language/extra/__init__.py +3 -0
  30. triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
  31. triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
  32. triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
  33. triton/experimental/gluon/language/nvidia/blackwell/__init__.py +192 -7
  34. triton/experimental/gluon/language/nvidia/blackwell/tma.py +20 -0
  35. triton/experimental/gluon/language/nvidia/hopper/__init__.py +124 -3
  36. triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +20 -37
  37. triton/experimental/gluon/language/nvidia/hopper/tma.py +4 -3
  38. triton/experimental/gluon/nvidia/hopper.py +6 -1
  39. triton/knobs.py +132 -67
  40. triton/language/__init__.py +16 -10
  41. triton/language/core.py +163 -83
  42. triton/language/extra/cuda/gdc.py +6 -6
  43. triton/language/extra/hip/__init__.py +3 -1
  44. triton/language/extra/hip/libdevice.py +7 -0
  45. triton/language/extra/hip/utils.py +35 -0
  46. triton/language/extra/libdevice.py +4 -0
  47. triton/language/semantic.py +76 -23
  48. triton/language/standard.py +14 -14
  49. triton/language/target_info.py +54 -0
  50. triton/runtime/_allocation.py +15 -3
  51. triton/runtime/_async_compile.py +55 -0
  52. triton/runtime/autotuner.py +4 -5
  53. triton/runtime/build.py +11 -9
  54. triton/runtime/cache.py +44 -1
  55. triton/runtime/driver.py +16 -41
  56. triton/runtime/interpreter.py +31 -23
  57. triton/runtime/jit.py +318 -157
  58. triton/runtime/tcc/include/_mingw.h +8 -10
  59. triton/runtime/tcc/include/assert.h +5 -0
  60. triton/runtime/tcc/include/errno.h +1 -1
  61. triton/runtime/tcc/include/float.h +21 -3
  62. triton/runtime/tcc/include/iso646.h +36 -0
  63. triton/runtime/tcc/include/limits.h +5 -0
  64. triton/runtime/tcc/include/malloc.h +2 -2
  65. triton/runtime/tcc/include/math.h +21 -261
  66. triton/runtime/tcc/include/stdalign.h +16 -0
  67. triton/runtime/tcc/include/stdarg.h +5 -70
  68. triton/runtime/tcc/include/stdatomic.h +171 -0
  69. triton/runtime/tcc/include/stddef.h +7 -19
  70. triton/runtime/tcc/include/stdlib.h +15 -4
  71. triton/runtime/tcc/include/stdnoreturn.h +7 -0
  72. triton/runtime/tcc/include/sys/stat.h +2 -2
  73. triton/runtime/tcc/include/sys/types.h +5 -0
  74. triton/runtime/tcc/include/tcc/tcc_libm.h +444 -27
  75. triton/runtime/tcc/include/tccdefs.h +342 -0
  76. triton/runtime/tcc/include/tgmath.h +89 -0
  77. triton/runtime/tcc/include/uchar.h +33 -0
  78. triton/runtime/tcc/include/unistd.h +1 -0
  79. triton/runtime/tcc/include/winapi/qos.h +72 -0
  80. triton/runtime/tcc/include/winapi/shellapi.h +59 -0
  81. triton/runtime/tcc/include/winapi/winbase.h +9 -2
  82. triton/runtime/tcc/include/winapi/wincon.h +8 -0
  83. triton/runtime/tcc/include/winapi/windows.h +1 -1
  84. triton/runtime/tcc/include/winapi/winnls.h +778 -0
  85. triton/runtime/tcc/include/winapi/winnt.h +9 -7
  86. triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
  87. triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
  88. triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
  89. triton/runtime/tcc/lib/libtcc1.a +0 -0
  90. triton/runtime/tcc/lib/python314.def +1800 -0
  91. triton/runtime/tcc/lib/python314t.def +1809 -0
  92. triton/runtime/tcc/libtcc.dll +0 -0
  93. triton/runtime/tcc/tcc.exe +0 -0
  94. triton/tools/compile.py +62 -14
  95. triton/tools/extra/cuda/compile.c +1 -0
  96. triton/tools/extra/hip/compile.cpp +66 -0
  97. triton/tools/extra/hip/compile.h +13 -0
  98. triton/tools/ragged_tma.py +92 -0
  99. triton/tools/tensor_descriptor.py +7 -9
  100. triton/windows_utils.py +42 -79
  101. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/METADATA +3 -4
  102. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/RECORD +106 -75
  103. triton/runtime/tcc/lib/libtcc1-64.a +0 -0
  104. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/WHEEL +0 -0
  105. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/entry_points.txt +0 -0
  106. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/licenses/LICENSE +0 -0
  107. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,171 @@
1
+ /* This file is derived from clang's stdatomic.h */
2
+
3
+ /*===---- stdatomic.h - Standard header for atomic types and operations -----===
4
+ *
5
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
6
+ * See https://llvm.org/LICENSE.txt for license information.
7
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8
+ *
9
+ *===-----------------------------------------------------------------------===
10
+ */
11
+
12
+ #ifndef _STDATOMIC_H
13
+ #define _STDATOMIC_H
14
+
15
+ #include <stddef.h>
16
+ #include <stdint.h>
17
+ #include <stdbool.h>
18
+
19
+ #define __ATOMIC_RELAXED 0
20
+ #define __ATOMIC_CONSUME 1
21
+ #define __ATOMIC_ACQUIRE 2
22
+ #define __ATOMIC_RELEASE 3
23
+ #define __ATOMIC_ACQ_REL 4
24
+ #define __ATOMIC_SEQ_CST 5
25
+
26
+ /* Memory ordering */
27
+ typedef enum {
28
+ memory_order_relaxed = __ATOMIC_RELAXED,
29
+ memory_order_consume = __ATOMIC_CONSUME,
30
+ memory_order_acquire = __ATOMIC_ACQUIRE,
31
+ memory_order_release = __ATOMIC_RELEASE,
32
+ memory_order_acq_rel = __ATOMIC_ACQ_REL,
33
+ memory_order_seq_cst = __ATOMIC_SEQ_CST,
34
+ } memory_order;
35
+
36
+ /* Atomic typedefs */
37
+ typedef _Atomic(_Bool) atomic_bool;
38
+ typedef _Atomic(char) atomic_char;
39
+ typedef _Atomic(signed char) atomic_schar;
40
+ typedef _Atomic(unsigned char) atomic_uchar;
41
+ typedef _Atomic(short) atomic_short;
42
+ typedef _Atomic(unsigned short) atomic_ushort;
43
+ typedef _Atomic(int) atomic_int;
44
+ typedef _Atomic(unsigned int) atomic_uint;
45
+ typedef _Atomic(long) atomic_long;
46
+ typedef _Atomic(unsigned long) atomic_ulong;
47
+ typedef _Atomic(long long) atomic_llong;
48
+ typedef _Atomic(unsigned long long) atomic_ullong;
49
+ typedef _Atomic(uint_least16_t) atomic_char16_t;
50
+ typedef _Atomic(uint_least32_t) atomic_char32_t;
51
+ typedef _Atomic(wchar_t) atomic_wchar_t;
52
+ typedef _Atomic(int_least8_t) atomic_int_least8_t;
53
+ typedef _Atomic(uint_least8_t) atomic_uint_least8_t;
54
+ typedef _Atomic(int_least16_t) atomic_int_least16_t;
55
+ typedef _Atomic(uint_least16_t) atomic_uint_least16_t;
56
+ typedef _Atomic(int_least32_t) atomic_int_least32_t;
57
+ typedef _Atomic(uint_least32_t) atomic_uint_least32_t;
58
+ typedef _Atomic(int_least64_t) atomic_int_least64_t;
59
+ typedef _Atomic(uint_least64_t) atomic_uint_least64_t;
60
+ typedef _Atomic(int_fast8_t) atomic_int_fast8_t;
61
+ typedef _Atomic(uint_fast8_t) atomic_uint_fast8_t;
62
+ typedef _Atomic(int_fast16_t) atomic_int_fast16_t;
63
+ typedef _Atomic(uint_fast16_t) atomic_uint_fast16_t;
64
+ typedef _Atomic(int_fast32_t) atomic_int_fast32_t;
65
+ typedef _Atomic(uint_fast32_t) atomic_uint_fast32_t;
66
+ typedef _Atomic(int_fast64_t) atomic_int_fast64_t;
67
+ typedef _Atomic(uint_fast64_t) atomic_uint_fast64_t;
68
+ typedef _Atomic(intptr_t) atomic_intptr_t;
69
+ typedef _Atomic(uintptr_t) atomic_uintptr_t;
70
+ typedef _Atomic(size_t) atomic_size_t;
71
+ typedef _Atomic(ptrdiff_t) atomic_ptrdiff_t;
72
+ typedef _Atomic(intmax_t) atomic_intmax_t;
73
+ typedef _Atomic(uintmax_t) atomic_uintmax_t;
74
+
75
+ /* Atomic flag */
76
+ typedef struct {
77
+ atomic_bool value;
78
+ } atomic_flag;
79
+
80
+ #define ATOMIC_FLAG_INIT {0}
81
+ #define ATOMIC_VAR_INIT(value) (value)
82
+
83
+ /* Generic routines */
84
+ #define atomic_init(object, desired) \
85
+ atomic_store_explicit(object, desired, __ATOMIC_RELAXED)
86
+
87
+ #define __atomic_store_n(ptr, val, order) \
88
+ (*(ptr) = (val), __atomic_store((ptr), &(typeof(*(ptr))){val}, (order)))
89
+ #define atomic_store_explicit(object, desired, order) \
90
+ ({ __typeof__ (object) ptr = (object); \
91
+ __typeof__ (*ptr) tmp = (desired); \
92
+ __atomic_store (ptr, &tmp, (order)); \
93
+ })
94
+ #define atomic_store(object, desired) \
95
+ atomic_store_explicit (object, desired, __ATOMIC_SEQ_CST)
96
+
97
+ #define __atomic_load_n(ptr, order) \
98
+ ({ typeof(*(ptr)) __val; \
99
+ __atomic_load((ptr), &__val, (order)); \
100
+ __val; })
101
+ #define atomic_load_explicit(object, order) \
102
+ ({ __typeof__ (object) ptr = (object); \
103
+ __typeof__ (*ptr) tmp; \
104
+ __atomic_load (ptr, &tmp, (order)); \
105
+ tmp; \
106
+ })
107
+ #define atomic_load(object) atomic_load_explicit (object, __ATOMIC_SEQ_CST)
108
+
109
+ #define atomic_exchange_explicit(object, desired, order) \
110
+ ({ __typeof__ (object) ptr = (object); \
111
+ __typeof__ (*ptr) val = (desired); \
112
+ __typeof__ (*ptr) tmp; \
113
+ __atomic_exchange (ptr, &val, &tmp, (order)); \
114
+ tmp; \
115
+ })
116
+ #define atomic_exchange(object, desired) \
117
+ atomic_exchange_explicit (object, desired, __ATOMIC_SEQ_CST)
118
+ #define __atomic_compare_exchange_n(ptr, expected, desired, weak, success, failure) \
119
+ ({ typeof(*(ptr)) __desired = (desired); \
120
+ __atomic_compare_exchange((ptr), (expected), &__desired, \
121
+ (weak), (success), (failure)); })
122
+ #define atomic_compare_exchange_strong_explicit(object, expected, desired, success, failure) \
123
+ ({ __typeof__ (object) ptr = (object); \
124
+ __typeof__ (*ptr) tmp = desired; \
125
+ __atomic_compare_exchange(ptr, expected, &tmp, 0, success, failure); \
126
+ })
127
+ #define atomic_compare_exchange_strong(object, expected, desired) \
128
+ atomic_compare_exchange_strong_explicit (object, expected, desired, \
129
+ __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
130
+ #define atomic_compare_exchange_weak_explicit(object, expected, desired, success, failure) \
131
+ ({ __typeof__ (object) ptr = (object); \
132
+ __typeof__ (*ptr) tmp = desired; \
133
+ __atomic_compare_exchange(ptr, expected, &tmp, 1, success, failure); \
134
+ })
135
+ #define atomic_compare_exchange_weak(object, expected, desired) \
136
+ atomic_compare_exchange_weak_explicit (object, expected, desired, \
137
+ __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
138
+
139
+ #define atomic_fetch_add(object, operand) \
140
+ __atomic_fetch_add(object, operand, __ATOMIC_SEQ_CST)
141
+ #define atomic_fetch_add_explicit __atomic_fetch_add
142
+
143
+ #define atomic_fetch_sub(object, operand) \
144
+ __atomic_fetch_sub(object, operand, __ATOMIC_SEQ_CST)
145
+ #define atomic_fetch_sub_explicit __atomic_fetch_sub
146
+
147
+ #define atomic_fetch_or(object, operand) \
148
+ __atomic_fetch_or(object, operand, __ATOMIC_SEQ_CST)
149
+ #define atomic_fetch_or_explicit __atomic_fetch_or
150
+
151
+ #define atomic_fetch_xor(object, operand) \
152
+ __atomic_fetch_xor(object, operand, __ATOMIC_SEQ_CST)
153
+ #define atomic_fetch_xor_explicit __atomic_fetch_xor
154
+
155
+ #define atomic_fetch_and(object, operand) \
156
+ __atomic_fetch_and(object, operand, __ATOMIC_SEQ_CST)
157
+ #define atomic_fetch_and_explicit __atomic_fetch_and
158
+
159
+ extern void atomic_thread_fence (memory_order);
160
+ #define __atomic_thread_fence(order) atomic_thread_fence (order)
161
+ extern void atomic_signal_fence (memory_order);
162
+ #define __atomic_signal_fence(order) atomic_signal_fence(order)
163
+ #define atomic_signal_fence(order) __atomic_signal_fence (order)
164
+ extern bool __atomic_is_lock_free(size_t size, void *ptr);
165
+ #define atomic_is_lock_free(OBJ) __atomic_is_lock_free (sizeof (*(OBJ)), (OBJ))
166
+
167
+ extern bool atomic_flag_test_and_set(void *object);
168
+ extern bool atomic_flag_test_and_set_explicit(void *object, memory_order order);
169
+ extern void atomic_flag_clear(void *object);
170
+ extern void atomic_flag_clear_explicit(void *object, memory_order order);
171
+ #endif /* _STDATOMIC_H */
@@ -1,5 +1,6 @@
1
1
  #ifndef _STDDEF_H
2
2
  #define _STDDEF_H
3
+ #define _TINYC_STDDEF
3
4
 
4
5
  typedef __SIZE_TYPE__ size_t;
5
6
  typedef __PTRDIFF_TYPE__ ssize_t;
@@ -8,33 +9,20 @@ typedef __PTRDIFF_TYPE__ ptrdiff_t;
8
9
  typedef __PTRDIFF_TYPE__ intptr_t;
9
10
  typedef __SIZE_TYPE__ uintptr_t;
10
11
 
11
- #ifndef __int8_t_defined
12
- #define __int8_t_defined
13
- typedef signed char int8_t;
14
- typedef signed short int int16_t;
15
- typedef signed int int32_t;
16
- #ifdef __LP64__
17
- typedef signed long int int64_t;
18
- #else
19
- typedef signed long long int int64_t;
20
- #endif
21
- typedef unsigned char uint8_t;
22
- typedef unsigned short int uint16_t;
23
- typedef unsigned int uint32_t;
24
- #ifdef __LP64__
25
- typedef unsigned long int uint64_t;
26
- #else
27
- typedef unsigned long long int uint64_t;
28
- #endif
12
+ #if __STDC_VERSION__ >= 201112L
13
+ typedef union { long long __ll; long double __ld; } max_align_t;
29
14
  #endif
30
15
 
31
16
  #ifndef NULL
32
17
  #define NULL ((void*)0)
33
18
  #endif
34
19
 
35
- #define offsetof(type, field) ((size_t)&((type *)0)->field)
20
+ #undef offsetof
21
+ #define offsetof(type, field) __builtin_offsetof(type, field)
36
22
 
23
+ #if defined __i386__ || defined __x86_64__
37
24
  void *alloca(size_t size);
25
+ #endif
38
26
 
39
27
  #endif
40
28
 
@@ -136,7 +136,7 @@ extern "C" {
136
136
 
137
137
  #ifndef _CRT_ERRNO_DEFINED
138
138
  #define _CRT_ERRNO_DEFINED
139
- _CRTIMP extern int *__cdecl _errno(void);
139
+ _CRTIMP int *__cdecl _errno(void);
140
140
  #define errno (*_errno())
141
141
  errno_t __cdecl _set_errno(int _Value);
142
142
  errno_t __cdecl _get_errno(int *_Value);
@@ -376,10 +376,16 @@ extern "C" {
376
376
  _CRTIMP int __cdecl _set_error_mode(int _Mode);
377
377
  void __cdecl srand(unsigned int _Seed);
378
378
  double __cdecl strtod(const char *_Str,char **_EndPtr);
379
- float __cdecl strtof(const char *nptr, char **endptr);
380
379
  #if !defined __NO_ISOCEXT /* in libmingwex.a */
380
+ #if __TINYC__
381
+ __CRT_INLINE float __cdecl strtof (const char *p, char ** e) { return strtod(p, e); }
382
+ __CRT_INLINE long double __cdecl strtold(const char *p, char ** e) { return strtod(p, e); }
383
+ #else
381
384
  float __cdecl strtof (const char * __restrict__, char ** __restrict__);
382
385
  long double __cdecl strtold(const char * __restrict__, char ** __restrict__);
386
+ #endif
387
+ #else
388
+ float __cdecl strtof(const char *nptr, char **endptr);
383
389
  #endif /* __NO_ISOCEXT */
384
390
  _CRTIMP double __cdecl _strtod_l(const char *_Str,char **_EndPtr,_locale_t _Locale);
385
391
  long __cdecl strtol(const char *_Str,char **_EndPtr,int _Radix);
@@ -403,8 +409,8 @@ extern "C" {
403
409
  void *__cdecl malloc(size_t _Size);
404
410
  void *__cdecl realloc(void *_Memory,size_t _NewSize);
405
411
  _CRTIMP void *__cdecl _recalloc(void *_Memory,size_t _Count,size_t _Size);
406
- //_CRTIMP void __cdecl _aligned_free(void *_Memory);
407
- //_CRTIMP void *__cdecl _aligned_malloc(size_t _Size,size_t _Alignment);
412
+ _CRTIMP void __cdecl _aligned_free(void *_Memory);
413
+ _CRTIMP void *__cdecl _aligned_malloc(size_t _Size,size_t _Alignment);
408
414
  _CRTIMP void *__cdecl _aligned_offset_malloc(size_t _Size,size_t _Alignment,size_t _Offset);
409
415
  _CRTIMP void *__cdecl _aligned_realloc(void *_Memory,size_t _Size,size_t _Alignment);
410
416
  _CRTIMP void *__cdecl _aligned_recalloc(void *_Memory,size_t _Count,size_t _Size,size_t _Alignment);
@@ -544,8 +550,13 @@ extern "C" {
544
550
 
545
551
  __CRT_INLINE long long __cdecl llabs(long long _j) { return (_j >= 0 ? _j : -_j); }
546
552
 
553
+ #ifdef __TINYC__ /* gr */
554
+ #define strtoll _strtoi64
555
+ #define strtoull _strtoui64
556
+ #else
547
557
  long long __cdecl strtoll(const char* __restrict__, char** __restrict, int);
548
558
  unsigned long long __cdecl strtoull(const char* __restrict__, char** __restrict__, int);
559
+ #endif
549
560
 
550
561
  /* these are stubs for MS _i64 versions */
551
562
  long long __cdecl atoll (const char *);
@@ -0,0 +1,7 @@
1
+ #ifndef _STDNORETURN_H
2
+ #define _STDNORETURN_H
3
+
4
+ /* ISOC11 noreturn */
5
+ #define noreturn _Noreturn
6
+
7
+ #endif /* _STDNORETURN_H */
@@ -81,9 +81,9 @@ extern "C" {
81
81
  #else
82
82
  #define _fstat _fstat64i32
83
83
  #define _fstati64 _fstat64
84
- #define _stat _stat64i32
84
+ #define _stat _stat64
85
85
  #define _stati64 _stat64
86
- #define _wstat _wstat64i32
86
+ #define _wstat _wstat64
87
87
  #define _wstati64 _wstat64
88
88
  #endif
89
89
 
@@ -102,6 +102,11 @@ typedef _mode_t mode_t;
102
102
  #endif
103
103
  #endif
104
104
 
105
+ /* required by (unbundled) unistd.h for usleep arg type */
106
+ #ifndef __NO_ISOCEXT
107
+ typedef unsigned int useconds_t;
108
+ #endif
109
+
105
110
  #ifndef _TIMESPEC_DEFINED
106
111
  #define _TIMESPEC_DEFINED
107
112
  struct timespec {