triton-windows 3.4.0.post20__cp312-cp312-win_amd64.whl → 3.5.0.post21__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of triton-windows might be problematic. Click here for more details.

Files changed (107)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +8 -2
  3. triton/_filecheck.py +24 -14
  4. triton/_internal_testing.py +70 -4
  5. triton/_utils.py +3 -1
  6. triton/backends/amd/compiler.py +68 -60
  7. triton/backends/amd/driver.c +113 -44
  8. triton/backends/amd/driver.py +133 -57
  9. triton/backends/driver.py +13 -0
  10. triton/backends/nvidia/compiler.py +80 -22
  11. triton/backends/nvidia/driver.c +88 -15
  12. triton/backends/nvidia/driver.py +130 -123
  13. triton/compiler/__init__.py +5 -2
  14. triton/compiler/code_generator.py +270 -163
  15. triton/compiler/compiler.py +45 -62
  16. triton/experimental/gluon/__init__.py +3 -2
  17. triton/experimental/gluon/_runtime.py +9 -6
  18. triton/experimental/gluon/language/__init__.py +117 -16
  19. triton/experimental/gluon/language/_core.py +246 -68
  20. triton/experimental/gluon/language/_layouts.py +398 -45
  21. triton/experimental/gluon/language/_math.py +17 -9
  22. triton/experimental/gluon/language/_semantic.py +130 -37
  23. triton/experimental/gluon/language/_standard.py +55 -22
  24. triton/experimental/gluon/language/amd/__init__.py +4 -0
  25. triton/experimental/gluon/language/amd/_layouts.py +96 -0
  26. triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
  27. triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
  28. triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
  29. triton/experimental/gluon/language/extra/__init__.py +3 -0
  30. triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
  31. triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
  32. triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
  33. triton/experimental/gluon/language/nvidia/blackwell/__init__.py +192 -7
  34. triton/experimental/gluon/language/nvidia/blackwell/tma.py +20 -0
  35. triton/experimental/gluon/language/nvidia/hopper/__init__.py +124 -3
  36. triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +20 -37
  37. triton/experimental/gluon/language/nvidia/hopper/tma.py +4 -3
  38. triton/experimental/gluon/nvidia/hopper.py +6 -1
  39. triton/knobs.py +132 -67
  40. triton/language/__init__.py +16 -10
  41. triton/language/core.py +163 -83
  42. triton/language/extra/cuda/gdc.py +6 -6
  43. triton/language/extra/hip/__init__.py +3 -1
  44. triton/language/extra/hip/libdevice.py +7 -0
  45. triton/language/extra/hip/utils.py +35 -0
  46. triton/language/extra/libdevice.py +4 -0
  47. triton/language/semantic.py +76 -23
  48. triton/language/standard.py +14 -14
  49. triton/language/target_info.py +54 -0
  50. triton/runtime/_allocation.py +15 -3
  51. triton/runtime/_async_compile.py +55 -0
  52. triton/runtime/autotuner.py +4 -5
  53. triton/runtime/build.py +11 -9
  54. triton/runtime/cache.py +44 -1
  55. triton/runtime/driver.py +16 -41
  56. triton/runtime/interpreter.py +31 -23
  57. triton/runtime/jit.py +318 -157
  58. triton/runtime/tcc/include/_mingw.h +8 -10
  59. triton/runtime/tcc/include/assert.h +5 -0
  60. triton/runtime/tcc/include/errno.h +1 -1
  61. triton/runtime/tcc/include/float.h +21 -3
  62. triton/runtime/tcc/include/iso646.h +36 -0
  63. triton/runtime/tcc/include/limits.h +5 -0
  64. triton/runtime/tcc/include/malloc.h +2 -2
  65. triton/runtime/tcc/include/math.h +21 -261
  66. triton/runtime/tcc/include/stdalign.h +16 -0
  67. triton/runtime/tcc/include/stdarg.h +5 -70
  68. triton/runtime/tcc/include/stdatomic.h +171 -0
  69. triton/runtime/tcc/include/stddef.h +7 -19
  70. triton/runtime/tcc/include/stdlib.h +15 -4
  71. triton/runtime/tcc/include/stdnoreturn.h +7 -0
  72. triton/runtime/tcc/include/sys/stat.h +2 -2
  73. triton/runtime/tcc/include/sys/types.h +5 -0
  74. triton/runtime/tcc/include/tcc/tcc_libm.h +444 -27
  75. triton/runtime/tcc/include/tccdefs.h +342 -0
  76. triton/runtime/tcc/include/tgmath.h +89 -0
  77. triton/runtime/tcc/include/uchar.h +33 -0
  78. triton/runtime/tcc/include/unistd.h +1 -0
  79. triton/runtime/tcc/include/winapi/qos.h +72 -0
  80. triton/runtime/tcc/include/winapi/shellapi.h +59 -0
  81. triton/runtime/tcc/include/winapi/winbase.h +9 -2
  82. triton/runtime/tcc/include/winapi/wincon.h +8 -0
  83. triton/runtime/tcc/include/winapi/windows.h +1 -1
  84. triton/runtime/tcc/include/winapi/winnls.h +778 -0
  85. triton/runtime/tcc/include/winapi/winnt.h +9 -7
  86. triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
  87. triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
  88. triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
  89. triton/runtime/tcc/lib/libtcc1.a +0 -0
  90. triton/runtime/tcc/lib/python314.def +1800 -0
  91. triton/runtime/tcc/lib/python314t.def +1809 -0
  92. triton/runtime/tcc/libtcc.dll +0 -0
  93. triton/runtime/tcc/tcc.exe +0 -0
  94. triton/tools/compile.py +62 -14
  95. triton/tools/extra/cuda/compile.c +1 -0
  96. triton/tools/extra/hip/compile.cpp +66 -0
  97. triton/tools/extra/hip/compile.h +13 -0
  98. triton/tools/ragged_tma.py +92 -0
  99. triton/tools/tensor_descriptor.py +7 -9
  100. triton/windows_utils.py +42 -79
  101. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/METADATA +3 -4
  102. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/RECORD +106 -75
  103. triton/runtime/tcc/lib/libtcc1-64.a +0 -0
  104. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/WHEEL +0 -0
  105. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/entry_points.txt +0 -0
  106. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/licenses/LICENSE +0 -0
  107. {triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/top_level.txt +0 -0
@@ -1310,7 +1310,7 @@ typedef DWORD LCID;
1310
1310
  #define INITIAL_MXCSR 0x1f80
1311
1311
  #define INITIAL_FPCSR 0x027f
1312
1312
 
1313
- typedef DECLSPEC_ALIGN(16) struct _M128A {
1313
+ typedef struct DECLSPEC_ALIGN(16) _M128A {
1314
1314
  ULONGLONG Low;
1315
1315
  LONGLONG High;
1316
1316
  } M128A,*PM128A;
@@ -1336,7 +1336,7 @@ typedef DWORD LCID;
1336
1336
 
1337
1337
  #define LEGACY_SAVE_AREA_LENGTH sizeof(XMM_SAVE_AREA32)
1338
1338
 
1339
- typedef DECLSPEC_ALIGN(16) struct _CONTEXT {
1339
+ typedef struct DECLSPEC_ALIGN(16) _CONTEXT {
1340
1340
  DWORD64 P1Home;
1341
1341
  DWORD64 P2Home;
1342
1342
  DWORD64 P3Home;
@@ -1474,7 +1474,7 @@ typedef DWORD LCID;
1474
1474
  #if(defined(_X86_) && !defined(__x86_64))
1475
1475
  __CRT_INLINE VOID MemoryBarrier(VOID) {
1476
1476
  LONG Barrier;
1477
- __asm__ __volatile__("xchgl %eax,%0 "
1477
+ __asm__ __volatile__("xchgl %%eax,%0 "
1478
1478
  :"=r" (Barrier));
1479
1479
  }
1480
1480
  #define YieldProcessor() __asm__ __volatile__("rep nop ");
@@ -1486,7 +1486,7 @@ typedef DWORD LCID;
1486
1486
  #define PF_NON_TEMPORAL_LEVEL_ALL
1487
1487
 
1488
1488
  __CRT_INLINE VOID DbgRaiseAssertionFailure(void) {
1489
- __asm__ __volatile__("int 0x2c ");
1489
+ __asm__ __volatile__("int $0x2c ");
1490
1490
  }
1491
1491
  PVOID GetCurrentFiber(void);
1492
1492
  __CRT_INLINE PVOID GetCurrentFiber(void)
@@ -2761,6 +2761,8 @@ typedef DWORD LCID;
2761
2761
  #define PROCESS_SET_INFORMATION (0x0200)
2762
2762
  #define PROCESS_QUERY_INFORMATION (0x0400)
2763
2763
  #define PROCESS_SUSPEND_RESUME (0x0800)
2764
+ #define PROCESS_QUERY_LIMITED_INFORMATION (0x1000)
2765
+ #define PROCESS_SET_LIMITED_INFORMATION (0x2000)
2764
2766
  #define PROCESS_ALL_ACCESS (STANDARD_RIGHTS_REQUIRED | SYNCHRONIZE | 0xFFF)
2765
2767
 
2766
2768
  #ifdef _WIN64
@@ -3150,7 +3152,7 @@ typedef DWORD LCID;
3150
3152
  DWORD Type;
3151
3153
  } MEMORY_BASIC_INFORMATION32,*PMEMORY_BASIC_INFORMATION32;
3152
3154
 
3153
- typedef DECLSPEC_ALIGN(16) struct _MEMORY_BASIC_INFORMATION64 {
3155
+ typedef struct DECLSPEC_ALIGN(16) _MEMORY_BASIC_INFORMATION64 {
3154
3156
  ULONGLONG BaseAddress;
3155
3157
  ULONGLONG AllocationBase;
3156
3158
  DWORD AllocationProtect;
@@ -4949,7 +4951,7 @@ typedef DWORD LCID;
4949
4951
 
4950
4952
  #ifdef _WIN64
4951
4953
  typedef struct _SLIST_ENTRY *PSLIST_ENTRY;
4952
- typedef DECLSPEC_ALIGN(16) struct _SLIST_ENTRY {
4954
+ typedef struct DECLSPEC_ALIGN(16) _SLIST_ENTRY {
4953
4955
  PSLIST_ENTRY Next;
4954
4956
  } SLIST_ENTRY;
4955
4957
  #else
@@ -4961,7 +4963,7 @@ typedef DWORD LCID;
4961
4963
 
4962
4964
  #if defined(_WIN64)
4963
4965
 
4964
- typedef DECLSPEC_ALIGN(16) struct _SLIST_HEADER {
4966
+ typedef struct DECLSPEC_ALIGN(16) _SLIST_HEADER {
4965
4967
  ULONGLONG Alignment;
4966
4968
  ULONGLONG Region;
4967
4969
  } SLIST_HEADER;