triton-windows 3.4.0.post20__cp310-cp310-win_amd64.whl → 3.5.0.post21__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of triton-windows might be problematic. Click here for more details.
- triton/_C/libtriton.pyd +0 -0
- triton/__init__.py +8 -2
- triton/_filecheck.py +24 -14
- triton/_internal_testing.py +70 -4
- triton/_utils.py +3 -1
- triton/backends/amd/compiler.py +68 -60
- triton/backends/amd/driver.c +113 -44
- triton/backends/amd/driver.py +133 -57
- triton/backends/driver.py +13 -0
- triton/backends/nvidia/compiler.py +80 -22
- triton/backends/nvidia/driver.c +88 -15
- triton/backends/nvidia/driver.py +130 -123
- triton/compiler/__init__.py +5 -2
- triton/compiler/code_generator.py +270 -163
- triton/compiler/compiler.py +45 -62
- triton/experimental/gluon/__init__.py +3 -2
- triton/experimental/gluon/_runtime.py +9 -6
- triton/experimental/gluon/language/__init__.py +117 -16
- triton/experimental/gluon/language/_core.py +246 -68
- triton/experimental/gluon/language/_layouts.py +398 -45
- triton/experimental/gluon/language/_math.py +17 -9
- triton/experimental/gluon/language/_semantic.py +130 -37
- triton/experimental/gluon/language/_standard.py +55 -22
- triton/experimental/gluon/language/amd/__init__.py +4 -0
- triton/experimental/gluon/language/amd/_layouts.py +96 -0
- triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
- triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
- triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
- triton/experimental/gluon/language/extra/__init__.py +3 -0
- triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
- triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
- triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
- triton/experimental/gluon/language/nvidia/blackwell/__init__.py +192 -7
- triton/experimental/gluon/language/nvidia/blackwell/tma.py +20 -0
- triton/experimental/gluon/language/nvidia/hopper/__init__.py +124 -3
- triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +20 -37
- triton/experimental/gluon/language/nvidia/hopper/tma.py +4 -3
- triton/experimental/gluon/nvidia/hopper.py +6 -1
- triton/knobs.py +132 -67
- triton/language/__init__.py +16 -10
- triton/language/core.py +163 -83
- triton/language/extra/cuda/gdc.py +6 -6
- triton/language/extra/hip/__init__.py +3 -1
- triton/language/extra/hip/libdevice.py +7 -0
- triton/language/extra/hip/utils.py +35 -0
- triton/language/extra/libdevice.py +4 -0
- triton/language/semantic.py +76 -23
- triton/language/standard.py +14 -14
- triton/language/target_info.py +54 -0
- triton/runtime/_allocation.py +15 -3
- triton/runtime/_async_compile.py +55 -0
- triton/runtime/autotuner.py +4 -5
- triton/runtime/build.py +11 -9
- triton/runtime/cache.py +44 -1
- triton/runtime/driver.py +16 -41
- triton/runtime/interpreter.py +31 -23
- triton/runtime/jit.py +318 -157
- triton/runtime/tcc/include/_mingw.h +8 -10
- triton/runtime/tcc/include/assert.h +5 -0
- triton/runtime/tcc/include/errno.h +1 -1
- triton/runtime/tcc/include/float.h +21 -3
- triton/runtime/tcc/include/iso646.h +36 -0
- triton/runtime/tcc/include/limits.h +5 -0
- triton/runtime/tcc/include/malloc.h +2 -2
- triton/runtime/tcc/include/math.h +21 -261
- triton/runtime/tcc/include/stdalign.h +16 -0
- triton/runtime/tcc/include/stdarg.h +5 -70
- triton/runtime/tcc/include/stdatomic.h +171 -0
- triton/runtime/tcc/include/stddef.h +7 -19
- triton/runtime/tcc/include/stdlib.h +15 -4
- triton/runtime/tcc/include/stdnoreturn.h +7 -0
- triton/runtime/tcc/include/sys/stat.h +2 -2
- triton/runtime/tcc/include/sys/types.h +5 -0
- triton/runtime/tcc/include/tcc/tcc_libm.h +444 -27
- triton/runtime/tcc/include/tccdefs.h +342 -0
- triton/runtime/tcc/include/tgmath.h +89 -0
- triton/runtime/tcc/include/uchar.h +33 -0
- triton/runtime/tcc/include/unistd.h +1 -0
- triton/runtime/tcc/include/winapi/qos.h +72 -0
- triton/runtime/tcc/include/winapi/shellapi.h +59 -0
- triton/runtime/tcc/include/winapi/winbase.h +9 -2
- triton/runtime/tcc/include/winapi/wincon.h +8 -0
- triton/runtime/tcc/include/winapi/windows.h +1 -1
- triton/runtime/tcc/include/winapi/winnls.h +778 -0
- triton/runtime/tcc/include/winapi/winnt.h +9 -7
- triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
- triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
- triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
- triton/runtime/tcc/lib/libtcc1.a +0 -0
- triton/runtime/tcc/lib/python314.def +1800 -0
- triton/runtime/tcc/lib/python314t.def +1809 -0
- triton/runtime/tcc/libtcc.dll +0 -0
- triton/runtime/tcc/tcc.exe +0 -0
- triton/tools/compile.py +62 -14
- triton/tools/extra/cuda/compile.c +1 -0
- triton/tools/extra/hip/compile.cpp +66 -0
- triton/tools/extra/hip/compile.h +13 -0
- triton/tools/ragged_tma.py +92 -0
- triton/tools/tensor_descriptor.py +7 -9
- triton/windows_utils.py +42 -79
- {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/METADATA +3 -4
- {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/RECORD +106 -75
- triton/runtime/tcc/lib/libtcc1-64.a +0 -0
- {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/WHEEL +0 -0
- {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/entry_points.txt +0 -0
- {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/licenses/LICENSE +0 -0
- {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/top_level.txt +0 -0
|
@@ -1310,7 +1310,7 @@ typedef DWORD LCID;
|
|
|
1310
1310
|
#define INITIAL_MXCSR 0x1f80
|
|
1311
1311
|
#define INITIAL_FPCSR 0x027f
|
|
1312
1312
|
|
|
1313
|
-
typedef DECLSPEC_ALIGN(16)
|
|
1313
|
+
typedef struct DECLSPEC_ALIGN(16) _M128A {
|
|
1314
1314
|
ULONGLONG Low;
|
|
1315
1315
|
LONGLONG High;
|
|
1316
1316
|
} M128A,*PM128A;
|
|
@@ -1336,7 +1336,7 @@ typedef DWORD LCID;
|
|
|
1336
1336
|
|
|
1337
1337
|
#define LEGACY_SAVE_AREA_LENGTH sizeof(XMM_SAVE_AREA32)
|
|
1338
1338
|
|
|
1339
|
-
typedef DECLSPEC_ALIGN(16)
|
|
1339
|
+
typedef struct DECLSPEC_ALIGN(16) _CONTEXT {
|
|
1340
1340
|
DWORD64 P1Home;
|
|
1341
1341
|
DWORD64 P2Home;
|
|
1342
1342
|
DWORD64 P3Home;
|
|
@@ -1474,7 +1474,7 @@ typedef DWORD LCID;
|
|
|
1474
1474
|
#if(defined(_X86_) && !defined(__x86_64))
|
|
1475
1475
|
__CRT_INLINE VOID MemoryBarrier(VOID) {
|
|
1476
1476
|
LONG Barrier;
|
|
1477
|
-
__asm__ __volatile__("xchgl
|
|
1477
|
+
__asm__ __volatile__("xchgl %%eax,%0 "
|
|
1478
1478
|
:"=r" (Barrier));
|
|
1479
1479
|
}
|
|
1480
1480
|
#define YieldProcessor() __asm__ __volatile__("rep nop ");
|
|
@@ -1486,7 +1486,7 @@ typedef DWORD LCID;
|
|
|
1486
1486
|
#define PF_NON_TEMPORAL_LEVEL_ALL
|
|
1487
1487
|
|
|
1488
1488
|
__CRT_INLINE VOID DbgRaiseAssertionFailure(void) {
|
|
1489
|
-
__asm__ __volatile__("int 0x2c ");
|
|
1489
|
+
__asm__ __volatile__("int $0x2c ");
|
|
1490
1490
|
}
|
|
1491
1491
|
PVOID GetCurrentFiber(void);
|
|
1492
1492
|
__CRT_INLINE PVOID GetCurrentFiber(void)
|
|
@@ -2761,6 +2761,8 @@ typedef DWORD LCID;
|
|
|
2761
2761
|
#define PROCESS_SET_INFORMATION (0x0200)
|
|
2762
2762
|
#define PROCESS_QUERY_INFORMATION (0x0400)
|
|
2763
2763
|
#define PROCESS_SUSPEND_RESUME (0x0800)
|
|
2764
|
+
#define PROCESS_QUERY_LIMITED_INFORMATION (0x1000)
|
|
2765
|
+
#define PROCESS_SET_LIMITED_INFORMATION (0x2000)
|
|
2764
2766
|
#define PROCESS_ALL_ACCESS (STANDARD_RIGHTS_REQUIRED | SYNCHRONIZE | 0xFFF)
|
|
2765
2767
|
|
|
2766
2768
|
#ifdef _WIN64
|
|
@@ -3150,7 +3152,7 @@ typedef DWORD LCID;
|
|
|
3150
3152
|
DWORD Type;
|
|
3151
3153
|
} MEMORY_BASIC_INFORMATION32,*PMEMORY_BASIC_INFORMATION32;
|
|
3152
3154
|
|
|
3153
|
-
typedef DECLSPEC_ALIGN(16)
|
|
3155
|
+
typedef struct DECLSPEC_ALIGN(16) _MEMORY_BASIC_INFORMATION64 {
|
|
3154
3156
|
ULONGLONG BaseAddress;
|
|
3155
3157
|
ULONGLONG AllocationBase;
|
|
3156
3158
|
DWORD AllocationProtect;
|
|
@@ -4949,7 +4951,7 @@ typedef DWORD LCID;
|
|
|
4949
4951
|
|
|
4950
4952
|
#ifdef _WIN64
|
|
4951
4953
|
typedef struct _SLIST_ENTRY *PSLIST_ENTRY;
|
|
4952
|
-
typedef DECLSPEC_ALIGN(16)
|
|
4954
|
+
typedef struct DECLSPEC_ALIGN(16) _SLIST_ENTRY {
|
|
4953
4955
|
PSLIST_ENTRY Next;
|
|
4954
4956
|
} SLIST_ENTRY;
|
|
4955
4957
|
#else
|
|
@@ -4961,7 +4963,7 @@ typedef DWORD LCID;
|
|
|
4961
4963
|
|
|
4962
4964
|
#if defined(_WIN64)
|
|
4963
4965
|
|
|
4964
|
-
typedef DECLSPEC_ALIGN(16)
|
|
4966
|
+
typedef struct DECLSPEC_ALIGN(16) _SLIST_HEADER {
|
|
4965
4967
|
ULONGLONG Alignment;
|
|
4966
4968
|
ULONGLONG Region;
|
|
4967
4969
|
} SLIST_HEADER;
|