triton-windows 3.2.0.post11__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of triton-windows might be problematic. Click here for more details.

Files changed (154) hide show
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +85 -0
  3. triton/_internal_testing.py +123 -0
  4. triton/backends/__init__.py +50 -0
  5. triton/backends/amd/compiler.py +368 -0
  6. triton/backends/amd/driver.c +211 -0
  7. triton/backends/amd/driver.py +512 -0
  8. triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h +358 -0
  9. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +1031 -0
  10. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +1612 -0
  11. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +1337 -0
  12. triton/backends/amd/include/hip/amd_detail/amd_hip_bfloat16.h +293 -0
  13. triton/backends/amd/include/hip/amd_detail/amd_hip_common.h +32 -0
  14. triton/backends/amd/include/hip/amd_detail/amd_hip_complex.h +174 -0
  15. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +829 -0
  16. triton/backends/amd/include/hip/amd_detail/amd_hip_fp16.h +1809 -0
  17. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +108 -0
  18. triton/backends/amd/include/hip/amd_detail/amd_hip_math_constants.h +124 -0
  19. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h +405 -0
  20. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +196 -0
  21. triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +565 -0
  22. triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h +2226 -0
  23. triton/backends/amd/include/hip/amd_detail/amd_math_functions.h +104 -0
  24. triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h +244 -0
  25. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +494 -0
  26. triton/backends/amd/include/hip/amd_detail/concepts.hpp +30 -0
  27. triton/backends/amd/include/hip/amd_detail/device_library_decls.h +133 -0
  28. triton/backends/amd/include/hip/amd_detail/functional_grid_launch.hpp +218 -0
  29. triton/backends/amd/include/hip/amd_detail/grid_launch.h +67 -0
  30. triton/backends/amd/include/hip/amd_detail/grid_launch.hpp +50 -0
  31. triton/backends/amd/include/hip/amd_detail/grid_launch_GGL.hpp +26 -0
  32. triton/backends/amd/include/hip/amd_detail/helpers.hpp +137 -0
  33. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +1350 -0
  34. triton/backends/amd/include/hip/amd_detail/hip_assert.h +101 -0
  35. triton/backends/amd/include/hip/amd_detail/hip_cooperative_groups_helper.h +242 -0
  36. triton/backends/amd/include/hip/amd_detail/hip_fp16_gcc.h +254 -0
  37. triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h +96 -0
  38. triton/backends/amd/include/hip/amd_detail/hip_ldg.h +100 -0
  39. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +10169 -0
  40. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +77 -0
  41. triton/backends/amd/include/hip/amd_detail/host_defines.h +180 -0
  42. triton/backends/amd/include/hip/amd_detail/hsa_helpers.hpp +102 -0
  43. triton/backends/amd/include/hip/amd_detail/macro_based_grid_launch.hpp +798 -0
  44. triton/backends/amd/include/hip/amd_detail/math_fwd.h +698 -0
  45. triton/backends/amd/include/hip/amd_detail/ockl_image.h +177 -0
  46. triton/backends/amd/include/hip/amd_detail/program_state.hpp +107 -0
  47. triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h +491 -0
  48. triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h +478 -0
  49. triton/backends/amd/include/hip/channel_descriptor.h +39 -0
  50. triton/backends/amd/include/hip/device_functions.h +38 -0
  51. triton/backends/amd/include/hip/driver_types.h +468 -0
  52. triton/backends/amd/include/hip/hip_bf16.h +36 -0
  53. triton/backends/amd/include/hip/hip_bfloat16.h +44 -0
  54. triton/backends/amd/include/hip/hip_common.h +100 -0
  55. triton/backends/amd/include/hip/hip_complex.h +38 -0
  56. triton/backends/amd/include/hip/hip_cooperative_groups.h +46 -0
  57. triton/backends/amd/include/hip/hip_deprecated.h +95 -0
  58. triton/backends/amd/include/hip/hip_ext.h +159 -0
  59. triton/backends/amd/include/hip/hip_fp16.h +36 -0
  60. triton/backends/amd/include/hip/hip_gl_interop.h +32 -0
  61. triton/backends/amd/include/hip/hip_hcc.h +24 -0
  62. triton/backends/amd/include/hip/hip_math_constants.h +36 -0
  63. triton/backends/amd/include/hip/hip_profile.h +27 -0
  64. triton/backends/amd/include/hip/hip_runtime.h +75 -0
  65. triton/backends/amd/include/hip/hip_runtime_api.h +8919 -0
  66. triton/backends/amd/include/hip/hip_texture_types.h +29 -0
  67. triton/backends/amd/include/hip/hip_vector_types.h +41 -0
  68. triton/backends/amd/include/hip/hip_version.h +17 -0
  69. triton/backends/amd/include/hip/hiprtc.h +421 -0
  70. triton/backends/amd/include/hip/library_types.h +78 -0
  71. triton/backends/amd/include/hip/math_functions.h +42 -0
  72. triton/backends/amd/include/hip/surface_types.h +63 -0
  73. triton/backends/amd/include/hip/texture_types.h +194 -0
  74. triton/backends/amd/include/hsa/Brig.h +1131 -0
  75. triton/backends/amd/include/hsa/amd_hsa_common.h +91 -0
  76. triton/backends/amd/include/hsa/amd_hsa_elf.h +436 -0
  77. triton/backends/amd/include/hsa/amd_hsa_kernel_code.h +269 -0
  78. triton/backends/amd/include/hsa/amd_hsa_queue.h +109 -0
  79. triton/backends/amd/include/hsa/amd_hsa_signal.h +80 -0
  80. triton/backends/amd/include/hsa/hsa.h +5729 -0
  81. triton/backends/amd/include/hsa/hsa_amd_tool.h +91 -0
  82. triton/backends/amd/include/hsa/hsa_api_trace.h +566 -0
  83. triton/backends/amd/include/hsa/hsa_ext_amd.h +3090 -0
  84. triton/backends/amd/include/hsa/hsa_ext_finalize.h +531 -0
  85. triton/backends/amd/include/hsa/hsa_ext_image.h +1454 -0
  86. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +488 -0
  87. triton/backends/amd/include/hsa/hsa_ven_amd_loader.h +667 -0
  88. triton/backends/amd/include/roctracer/ext/prof_protocol.h +107 -0
  89. triton/backends/amd/include/roctracer/hip_ostream_ops.h +4435 -0
  90. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +1467 -0
  91. triton/backends/amd/include/roctracer/hsa_prof_str.h +3027 -0
  92. triton/backends/amd/include/roctracer/roctracer.h +779 -0
  93. triton/backends/amd/include/roctracer/roctracer_ext.h +81 -0
  94. triton/backends/amd/include/roctracer/roctracer_hcc.h +24 -0
  95. triton/backends/amd/include/roctracer/roctracer_hip.h +37 -0
  96. triton/backends/amd/include/roctracer/roctracer_hsa.h +112 -0
  97. triton/backends/amd/include/roctracer/roctracer_plugin.h +137 -0
  98. triton/backends/amd/include/roctracer/roctracer_roctx.h +67 -0
  99. triton/backends/amd/include/roctracer/roctx.h +229 -0
  100. triton/backends/amd/lib/ockl.bc +0 -0
  101. triton/backends/amd/lib/ocml.bc +0 -0
  102. triton/backends/compiler.py +304 -0
  103. triton/backends/driver.py +48 -0
  104. triton/backends/nvidia/__init__.py +0 -0
  105. triton/backends/nvidia/bin/ptxas.exe +0 -0
  106. triton/backends/nvidia/compiler.py +410 -0
  107. triton/backends/nvidia/driver.c +451 -0
  108. triton/backends/nvidia/driver.py +524 -0
  109. triton/backends/nvidia/include/cuda.h +24359 -0
  110. triton/backends/nvidia/lib/libdevice.10.bc +0 -0
  111. triton/backends/nvidia/lib/x64/cuda.lib +0 -0
  112. triton/compiler/__init__.py +4 -0
  113. triton/compiler/code_generator.py +1303 -0
  114. triton/compiler/compiler.py +430 -0
  115. triton/compiler/errors.py +51 -0
  116. triton/compiler/make_launcher.py +0 -0
  117. triton/errors.py +5 -0
  118. triton/language/__init__.py +294 -0
  119. triton/language/_utils.py +21 -0
  120. triton/language/core.py +2694 -0
  121. triton/language/extra/__init__.py +26 -0
  122. triton/language/extra/cuda/__init__.py +13 -0
  123. triton/language/extra/cuda/_experimental_tma.py +108 -0
  124. triton/language/extra/cuda/libdevice.py +1629 -0
  125. triton/language/extra/cuda/utils.py +109 -0
  126. triton/language/extra/hip/__init__.py +3 -0
  127. triton/language/extra/hip/libdevice.py +475 -0
  128. triton/language/extra/libdevice.py +786 -0
  129. triton/language/math.py +250 -0
  130. triton/language/random.py +207 -0
  131. triton/language/semantic.py +1796 -0
  132. triton/language/standard.py +452 -0
  133. triton/runtime/__init__.py +23 -0
  134. triton/runtime/autotuner.py +408 -0
  135. triton/runtime/build.py +111 -0
  136. triton/runtime/cache.py +295 -0
  137. triton/runtime/driver.py +60 -0
  138. triton/runtime/errors.py +26 -0
  139. triton/runtime/interpreter.py +1235 -0
  140. triton/runtime/jit.py +951 -0
  141. triton/testing.py +511 -0
  142. triton/tools/__init__.py +0 -0
  143. triton/tools/build_extern.py +365 -0
  144. triton/tools/compile.c +67 -0
  145. triton/tools/compile.h +14 -0
  146. triton/tools/compile.py +155 -0
  147. triton/tools/disasm.py +144 -0
  148. triton/tools/experimental_descriptor.py +32 -0
  149. triton/tools/link.py +322 -0
  150. triton/windows_utils.py +375 -0
  151. triton_windows-3.2.0.post11.dist-info/METADATA +39 -0
  152. triton_windows-3.2.0.post11.dist-info/RECORD +154 -0
  153. triton_windows-3.2.0.post11.dist-info/WHEEL +5 -0
  154. triton_windows-3.2.0.post11.dist-info/top_level.txt +12 -0
@@ -0,0 +1,26 @@
1
+ import pkgutil
2
+ from importlib.util import module_from_spec
3
+ from sys import modules
4
+
5
# Collect the backend sub-packages found next to this file and load each one.
# Because ``prefix`` is passed to ``iter_modules``, every ``module_name`` below
# is already prefixed with this package's dotted name.
_backends = []
for module_finder, module_name, is_pkg in pkgutil.iter_modules(__path__, prefix=__name__ + "."):
    # Only sub-packages are treated as backends; plain .py files
    # (like libdevice.py) are ignored.
    if is_pkg:
        # import backends (like cuda and hip) that are included during setup.py
        spec = module_finder.find_spec(module_name)
        if spec is not None and spec.loader is not None:
            module = module_from_spec(spec)
            spec.loader.exec_module(module)

            # Record the backend and register it only after it executed
            # successfully.
            _backends.append(module_name)
            modules[module_name] = module

# The exported names are exactly the backends that were loaded above.
__all__ = _backends

# ``__all__`` keeps the list alive; remove the temporary alias.
del _backends
@@ -0,0 +1,13 @@
1
+ from . import libdevice
2
+
3
+ from .utils import (globaltimer, num_threads, num_warps, smid, convert_custom_float8_sm70, convert_custom_float8_sm80)
4
+
5
+ from ._experimental_tma import * # noqa: F403
6
+ from ._experimental_tma import __all__ as _tma_all
7
+
8
# Public API of this package: the libdevice module, the helpers imported from
# ``.utils``, and every name exported by ``._experimental_tma``.
__all__ = [
    "libdevice",
    "globaltimer",
    "num_threads",
    "num_warps",
    "smid",
    "convert_custom_float8_sm70",
    "convert_custom_float8_sm80",
    *_tma_all,
]

# ``_tma_all`` was only needed to build ``__all__``; drop it from the namespace.
del _tma_all
@@ -0,0 +1,108 @@
1
+ from typing import Sequence
2
+
3
+ from triton.language import core
4
+ from triton.language import semantic
5
+ from triton._C.libtriton import ir
6
+
7
# Names exported by ``from ._experimental_tma import *``; each entry is one of
# the ``@core.builtin`` functions defined in this module.
__all__ = [
    "experimental_device_tensormap_create1d",
    "experimental_device_tensormap_create2d",
    "experimental_tensormap_fenceproxy_acquire",
]
12
+
13
+
14
def _determine_elem_type(element_ty: core.dtype):
    """Map an element type to the integer code passed on as ``elem_type``.

    The code depends only on ``element_ty.primitive_bitwidth``:
    8 bits -> 0, 16 bits -> 1, 32 bits -> 2.

    Args:
        element_ty: object exposing a ``primitive_bitwidth`` attribute.

    Returns:
        The integer code 0, 1 or 2.

    Raises:
        ValueError: if the bit width is not 8, 16 or 32. The message names the
            offending type and its bit width.
    """
    bitwidth = element_ty.primitive_bitwidth
    if bitwidth == 8:
        return 0
    elif bitwidth == 16:
        return 1
    elif bitwidth == 32:
        return 2
    else:
        # The original message stopped at "but got" without saying what was
        # received; include the value so the error is actionable.
        raise ValueError("element_ty must be a primitive of size 1, 2, or 4 bytes but got "
                         f"{element_ty} ({bitwidth} bits)")
23
+
24
+
25
@core.builtin
def experimental_device_tensormap_create1d(
    desc_ptr: core.tensor,
    global_address: core.tensor,
    load_size: core.tensor,
    global_size: core.tensor,
    element_ty: core.dtype,
    _builder: ir.builder,
):
    """Emit a one-dimensional tensor-map creation through ``semantic.tensormap_create``.

    Args:
        desc_ptr: forwarded unchanged as ``desc_ptr``.
        global_address: forwarded unchanged as ``global_address``.
        load_size: unwrapped with ``core._constexpr_to_value`` and converted to
            a tensor; becomes the single ``box_dim`` entry.
        global_size: converted to a tensor; becomes the single ``global_dim``
            entry.
        element_ty: unwrapped with ``core._constexpr_to_value``; its bit width
            selects ``elem_type``.
        _builder: IR builder used for every value created here.

    Raises:
        ValueError: from ``_determine_elem_type`` when the element width is not
            8, 16 or 32 bits.
    """
    # Keep the same creation order as before so the builder sees the same
    # sequence of operations.
    box_len = core._constexpr_to_value(load_size)
    extent = semantic.to_tensor(global_size, _builder)
    dtype = core._constexpr_to_value(element_ty)
    unit_stride = core.full([], 1, core.int32, _builder=_builder)
    box_extent = semantic.to_tensor(box_len, _builder)
    elem_code = _determine_elem_type(dtype)

    # A single dimension has no outer stride, so ``global_stride`` is empty.
    # interleave_layout, swizzle_mode and fill_mode are all passed as 0.
    semantic.tensormap_create(
        desc_ptr=desc_ptr,
        global_address=global_address,
        box_dim=[box_extent],
        global_dim=[extent],
        global_stride=[],
        element_stride=[unit_stride],
        elem_type=elem_code,
        interleave_layout=0,
        swizzle_mode=0,
        fill_mode=0,
        builder=_builder,
    )
52
+
53
+
54
@core.builtin
def experimental_device_tensormap_create2d(
    desc_ptr: core.tensor,
    global_address: core.tensor,
    load_size: Sequence[core.constexpr],
    global_size: Sequence[core.tensor],
    element_ty: core.dtype,
    _builder: ir.builder,
):
    """Emit a two-dimensional tensor-map creation through ``semantic.tensormap_create``.

    Args:
        desc_ptr: forwarded unchanged as ``desc_ptr``.
        global_address: forwarded unchanged as ``global_address``.
        load_size: two compile-time sizes. Passed reversed as ``box_dim``; the
            last entry is clamped so the contiguous dimension covers at most
            128 bytes.
        global_size: two sizes, converted to tensors and passed reversed as
            ``global_dim``.
        element_ty: element type; its bit width determines the element size in
            bytes and the ``elem_type`` code.
        _builder: IR builder used for every value created here.

    Raises:
        AssertionError: if ``load_size`` or ``global_size`` does not have
            exactly two entries.
        ValueError: from ``_determine_elem_type`` for an unsupported element
            width, or from ``_determine_swizzle_mode_2d`` when the contiguous
            dimension is narrower than 32 bytes.
    """
    assert len(load_size) == 2
    assert len(global_size) == 2
    load_size = [core._constexpr_to_value(x) for x in load_size]
    global_size = [semantic.to_tensor(x, _builder) for x in global_size]
    # Unwrap a possible constexpr wrapper before touching dtype attributes,
    # matching what experimental_device_tensormap_create1d does. For a plain
    # dtype this is a no-op.
    element_ty = core._constexpr_to_value(element_ty)

    element_size = element_ty.primitive_bitwidth // 8
    element_size_t = core.full([], element_size, core.int64, _builder=_builder)
    # Row pitch in bytes: element size times the extent of the last dimension.
    global_stride = semantic.mul(element_size_t, global_size[-1], True, _builder)
    # Undocumented, but global_stride seems to be divided by 16
    # (an arithmetic shift right by 4 is a division by 16).
    global_stride = semantic.ashr(global_stride, semantic.to_tensor(4, _builder), _builder)

    contig_dim_size_in_bytes = element_size * load_size[-1]
    if contig_dim_size_in_bytes > 128:
        # Narrow the box to 128 bytes along the contiguous dimension. The
        # swizzle mode below is still chosen from the unclamped byte count.
        load_size[-1] = 128 // element_size

    elem_stride = core.full([], 1, core.int32, _builder=_builder)

    # Dimension lists are passed in reversed order; interleave_layout and
    # fill_mode are both passed as 0.
    semantic.tensormap_create(
        desc_ptr=desc_ptr,
        global_address=global_address,
        box_dim=[semantic.to_tensor(x, _builder) for x in load_size[::-1]],
        global_dim=global_size[::-1],
        global_stride=[global_stride],
        element_stride=[elem_stride, elem_stride],
        elem_type=_determine_elem_type(element_ty),
        interleave_layout=0,
        swizzle_mode=_determine_swizzle_mode_2d(contig_dim_size_in_bytes, load_size),
        fill_mode=0,
        builder=_builder,
    )
93
+
94
+
95
def _determine_swizzle_mode_2d(contig_dim_size_in_bytes, load_size):
    """Choose the ``swizzle_mode`` code from the contiguous dimension's byte width.

    Thresholds are tried from largest to smallest:
    >= 128 bytes -> 3, >= 64 bytes -> 2, >= 32 bytes -> 1.

    Args:
        contig_dim_size_in_bytes: byte width of the contiguous dimension.
        load_size: accepted for the caller's convenience; not consulted here.

    Raises:
        ValueError: if the width is below 32 bytes.
    """
    thresholds = ((128, 3), (64, 2), (32, 1))
    for min_bytes, mode in thresholds:
        if contig_dim_size_in_bytes >= min_bytes:
            return mode
    raise ValueError("block size too small")
104
+
105
+
106
@core.builtin
def experimental_tensormap_fenceproxy_acquire(desc_ptr: core.tensor, _builder: ir.builder):
    """Forward ``desc_ptr`` to ``semantic.tensormap_fenceproxy_acquire``.

    Args:
        desc_ptr: descriptor pointer handed to the semantic layer unchanged.
        _builder: IR builder handed to the semantic layer unchanged.

    Returns:
        None; the result of the semantic call is not propagated.
    """
    semantic.tensormap_fenceproxy_acquire(
        desc_ptr,
        _builder,
    )