triton-windows 3.2.0.post11__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of triton-windows might be problematic.

Files changed (154)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +85 -0
  3. triton/_internal_testing.py +123 -0
  4. triton/backends/__init__.py +50 -0
  5. triton/backends/amd/compiler.py +368 -0
  6. triton/backends/amd/driver.c +211 -0
  7. triton/backends/amd/driver.py +512 -0
  8. triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h +358 -0
  9. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +1031 -0
  10. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +1612 -0
  11. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +1337 -0
  12. triton/backends/amd/include/hip/amd_detail/amd_hip_bfloat16.h +293 -0
  13. triton/backends/amd/include/hip/amd_detail/amd_hip_common.h +32 -0
  14. triton/backends/amd/include/hip/amd_detail/amd_hip_complex.h +174 -0
  15. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +829 -0
  16. triton/backends/amd/include/hip/amd_detail/amd_hip_fp16.h +1809 -0
  17. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +108 -0
  18. triton/backends/amd/include/hip/amd_detail/amd_hip_math_constants.h +124 -0
  19. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h +405 -0
  20. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +196 -0
  21. triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +565 -0
  22. triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h +2226 -0
  23. triton/backends/amd/include/hip/amd_detail/amd_math_functions.h +104 -0
  24. triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h +244 -0
  25. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +494 -0
  26. triton/backends/amd/include/hip/amd_detail/concepts.hpp +30 -0
  27. triton/backends/amd/include/hip/amd_detail/device_library_decls.h +133 -0
  28. triton/backends/amd/include/hip/amd_detail/functional_grid_launch.hpp +218 -0
  29. triton/backends/amd/include/hip/amd_detail/grid_launch.h +67 -0
  30. triton/backends/amd/include/hip/amd_detail/grid_launch.hpp +50 -0
  31. triton/backends/amd/include/hip/amd_detail/grid_launch_GGL.hpp +26 -0
  32. triton/backends/amd/include/hip/amd_detail/helpers.hpp +137 -0
  33. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +1350 -0
  34. triton/backends/amd/include/hip/amd_detail/hip_assert.h +101 -0
  35. triton/backends/amd/include/hip/amd_detail/hip_cooperative_groups_helper.h +242 -0
  36. triton/backends/amd/include/hip/amd_detail/hip_fp16_gcc.h +254 -0
  37. triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h +96 -0
  38. triton/backends/amd/include/hip/amd_detail/hip_ldg.h +100 -0
  39. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +10169 -0
  40. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +77 -0
  41. triton/backends/amd/include/hip/amd_detail/host_defines.h +180 -0
  42. triton/backends/amd/include/hip/amd_detail/hsa_helpers.hpp +102 -0
  43. triton/backends/amd/include/hip/amd_detail/macro_based_grid_launch.hpp +798 -0
  44. triton/backends/amd/include/hip/amd_detail/math_fwd.h +698 -0
  45. triton/backends/amd/include/hip/amd_detail/ockl_image.h +177 -0
  46. triton/backends/amd/include/hip/amd_detail/program_state.hpp +107 -0
  47. triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h +491 -0
  48. triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h +478 -0
  49. triton/backends/amd/include/hip/channel_descriptor.h +39 -0
  50. triton/backends/amd/include/hip/device_functions.h +38 -0
  51. triton/backends/amd/include/hip/driver_types.h +468 -0
  52. triton/backends/amd/include/hip/hip_bf16.h +36 -0
  53. triton/backends/amd/include/hip/hip_bfloat16.h +44 -0
  54. triton/backends/amd/include/hip/hip_common.h +100 -0
  55. triton/backends/amd/include/hip/hip_complex.h +38 -0
  56. triton/backends/amd/include/hip/hip_cooperative_groups.h +46 -0
  57. triton/backends/amd/include/hip/hip_deprecated.h +95 -0
  58. triton/backends/amd/include/hip/hip_ext.h +159 -0
  59. triton/backends/amd/include/hip/hip_fp16.h +36 -0
  60. triton/backends/amd/include/hip/hip_gl_interop.h +32 -0
  61. triton/backends/amd/include/hip/hip_hcc.h +24 -0
  62. triton/backends/amd/include/hip/hip_math_constants.h +36 -0
  63. triton/backends/amd/include/hip/hip_profile.h +27 -0
  64. triton/backends/amd/include/hip/hip_runtime.h +75 -0
  65. triton/backends/amd/include/hip/hip_runtime_api.h +8919 -0
  66. triton/backends/amd/include/hip/hip_texture_types.h +29 -0
  67. triton/backends/amd/include/hip/hip_vector_types.h +41 -0
  68. triton/backends/amd/include/hip/hip_version.h +17 -0
  69. triton/backends/amd/include/hip/hiprtc.h +421 -0
  70. triton/backends/amd/include/hip/library_types.h +78 -0
  71. triton/backends/amd/include/hip/math_functions.h +42 -0
  72. triton/backends/amd/include/hip/surface_types.h +63 -0
  73. triton/backends/amd/include/hip/texture_types.h +194 -0
  74. triton/backends/amd/include/hsa/Brig.h +1131 -0
  75. triton/backends/amd/include/hsa/amd_hsa_common.h +91 -0
  76. triton/backends/amd/include/hsa/amd_hsa_elf.h +436 -0
  77. triton/backends/amd/include/hsa/amd_hsa_kernel_code.h +269 -0
  78. triton/backends/amd/include/hsa/amd_hsa_queue.h +109 -0
  79. triton/backends/amd/include/hsa/amd_hsa_signal.h +80 -0
  80. triton/backends/amd/include/hsa/hsa.h +5729 -0
  81. triton/backends/amd/include/hsa/hsa_amd_tool.h +91 -0
  82. triton/backends/amd/include/hsa/hsa_api_trace.h +566 -0
  83. triton/backends/amd/include/hsa/hsa_ext_amd.h +3090 -0
  84. triton/backends/amd/include/hsa/hsa_ext_finalize.h +531 -0
  85. triton/backends/amd/include/hsa/hsa_ext_image.h +1454 -0
  86. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +488 -0
  87. triton/backends/amd/include/hsa/hsa_ven_amd_loader.h +667 -0
  88. triton/backends/amd/include/roctracer/ext/prof_protocol.h +107 -0
  89. triton/backends/amd/include/roctracer/hip_ostream_ops.h +4435 -0
  90. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +1467 -0
  91. triton/backends/amd/include/roctracer/hsa_prof_str.h +3027 -0
  92. triton/backends/amd/include/roctracer/roctracer.h +779 -0
  93. triton/backends/amd/include/roctracer/roctracer_ext.h +81 -0
  94. triton/backends/amd/include/roctracer/roctracer_hcc.h +24 -0
  95. triton/backends/amd/include/roctracer/roctracer_hip.h +37 -0
  96. triton/backends/amd/include/roctracer/roctracer_hsa.h +112 -0
  97. triton/backends/amd/include/roctracer/roctracer_plugin.h +137 -0
  98. triton/backends/amd/include/roctracer/roctracer_roctx.h +67 -0
  99. triton/backends/amd/include/roctracer/roctx.h +229 -0
  100. triton/backends/amd/lib/ockl.bc +0 -0
  101. triton/backends/amd/lib/ocml.bc +0 -0
  102. triton/backends/compiler.py +304 -0
  103. triton/backends/driver.py +48 -0
  104. triton/backends/nvidia/__init__.py +0 -0
  105. triton/backends/nvidia/bin/ptxas.exe +0 -0
  106. triton/backends/nvidia/compiler.py +410 -0
  107. triton/backends/nvidia/driver.c +451 -0
  108. triton/backends/nvidia/driver.py +524 -0
  109. triton/backends/nvidia/include/cuda.h +24359 -0
  110. triton/backends/nvidia/lib/libdevice.10.bc +0 -0
  111. triton/backends/nvidia/lib/x64/cuda.lib +0 -0
  112. triton/compiler/__init__.py +4 -0
  113. triton/compiler/code_generator.py +1303 -0
  114. triton/compiler/compiler.py +430 -0
  115. triton/compiler/errors.py +51 -0
  116. triton/compiler/make_launcher.py +0 -0
  117. triton/errors.py +5 -0
  118. triton/language/__init__.py +294 -0
  119. triton/language/_utils.py +21 -0
  120. triton/language/core.py +2694 -0
  121. triton/language/extra/__init__.py +26 -0
  122. triton/language/extra/cuda/__init__.py +13 -0
  123. triton/language/extra/cuda/_experimental_tma.py +108 -0
  124. triton/language/extra/cuda/libdevice.py +1629 -0
  125. triton/language/extra/cuda/utils.py +109 -0
  126. triton/language/extra/hip/__init__.py +3 -0
  127. triton/language/extra/hip/libdevice.py +475 -0
  128. triton/language/extra/libdevice.py +786 -0
  129. triton/language/math.py +250 -0
  130. triton/language/random.py +207 -0
  131. triton/language/semantic.py +1796 -0
  132. triton/language/standard.py +452 -0
  133. triton/runtime/__init__.py +23 -0
  134. triton/runtime/autotuner.py +408 -0
  135. triton/runtime/build.py +111 -0
  136. triton/runtime/cache.py +295 -0
  137. triton/runtime/driver.py +60 -0
  138. triton/runtime/errors.py +26 -0
  139. triton/runtime/interpreter.py +1235 -0
  140. triton/runtime/jit.py +951 -0
  141. triton/testing.py +511 -0
  142. triton/tools/__init__.py +0 -0
  143. triton/tools/build_extern.py +365 -0
  144. triton/tools/compile.c +67 -0
  145. triton/tools/compile.h +14 -0
  146. triton/tools/compile.py +155 -0
  147. triton/tools/disasm.py +144 -0
  148. triton/tools/experimental_descriptor.py +32 -0
  149. triton/tools/link.py +322 -0
  150. triton/windows_utils.py +375 -0
  151. triton_windows-3.2.0.post11.dist-info/METADATA +39 -0
  152. triton_windows-3.2.0.post11.dist-info/RECORD +154 -0
  153. triton_windows-3.2.0.post11.dist-info/WHEEL +5 -0
  154. triton_windows-3.2.0.post11.dist-info/top_level.txt +12 -0
triton/_C/libtriton.pyd ADDED
Binary file
triton/__init__.py ADDED
@@ -0,0 +1,85 @@
+ """isort:skip_file"""
+ __version__ = '3.2.0'
+
+ # Users may not know how to add cl and CUDA to PATH. Let's do it before loading anything
+ import os
+ if os.name == "nt":
+     from .windows_utils import find_cuda, find_msvc_winsdk
+     msvc_winsdk_inc_dirs, _ = find_msvc_winsdk()
+     if msvc_winsdk_inc_dirs:
+         cl_path = msvc_winsdk_inc_dirs[0].replace(r"\include", r"\bin\Hostx64\x64")
+         os.environ["PATH"] = cl_path + os.pathsep + os.environ["PATH"]
+     cuda_bin_path, _, _ = find_cuda()
+     if cuda_bin_path:
+         os.environ["PATH"] = cuda_bin_path + os.pathsep + os.environ["PATH"]
+
+ # ---------------------------------------
+ # Note: import order is significant here.
+
+ # submodules
+ from .runtime import (
+     autotune,
+     Config,
+     heuristics,
+     JITFunction,
+     KernelInterface,
+     reinterpret,
+     TensorWrapper,
+     OutOfResources,
+     InterpreterError,
+     MockTensor,
+ )
+ from .runtime.jit import jit
+ from .compiler import compile, CompilationError
+ from .errors import TritonError
+
+ from . import language
+ from . import testing
+ from . import tools
+
+ __all__ = [
+     "autotune",
+     "cdiv",
+     "CompilationError",
+     "compile",
+     "Config",
+     "heuristics",
+     "impl",
+     "InterpreterError",
+     "jit",
+     "JITFunction",
+     "KernelInterface",
+     "language",
+     "MockTensor",
+     "next_power_of_2",
+     "ops",
+     "OutOfResources",
+     "reinterpret",
+     "runtime",
+     "TensorWrapper",
+     "TritonError",
+     "testing",
+     "tools",
+ ]
+
+ # -------------------------------------
+ # misc. utilities that don't fit well
+ # into any specific module
+ # -------------------------------------
+
+
+ def cdiv(x: int, y: int):
+     return (x + y - 1) // y
+
+
+ def next_power_of_2(n: int):
+     """Return the smallest power of 2 greater than or equal to n"""
+     n -= 1
+     n |= n >> 1
+     n |= n >> 2
+     n |= n >> 4
+     n |= n >> 8
+     n |= n >> 16
+     n |= n >> 32
+     n += 1
+     return n
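
For readers skimming the diff, a minimal sketch of how the two helpers defined at the end of this file behave (illustrative only; this snippet is not part of the wheel):

    import triton

    triton.cdiv(10, 3)          # 4  -- ceiling division: (10 + 3 - 1) // 3
    triton.next_power_of_2(17)  # 32 -- smallest power of 2 >= 17
    triton.next_power_of_2(32)  # 32 -- exact powers of 2 are returned unchanged

Both helpers are commonly used to size launch grids and pick block dimensions for kernels.
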
triton/_internal_testing.py ADDED
@@ -0,0 +1,123 @@
+ import os
+ import re
+ import numpy as np
+ import torch
+ import triton
+ import triton.language as tl
+ import pytest
+
+ from numpy.random import RandomState
+ from typing import Optional, Union
+ from triton.runtime.jit import TensorWrapper, reinterpret
+
+ int_dtypes = ['int8', 'int16', 'int32', 'int64']
+ uint_dtypes = ['uint8', 'uint16', 'uint32', 'uint64']
+ integral_dtypes = int_dtypes + uint_dtypes
+ float_dtypes = ['float16', 'float32', 'float64']
+ dtypes = integral_dtypes + float_dtypes
+ dtypes_with_bfloat16 = dtypes + ['bfloat16']
+ torch_float8_dtypes = ['float8_e4m3fn', 'float8_e5m2']
+ torch_dtypes = ['bool'] + int_dtypes + ['uint8'] + float_dtypes + ['bfloat16']
+
+
+ def is_interpreter():
+     return os.environ.get('TRITON_INTERPRET', '0') == '1'
+
+
+ def get_current_target():
+     if is_interpreter():
+         return None
+     return triton.runtime.driver.active.get_current_target()
+
+
+ def is_cuda():
+     target = get_current_target()
+     return False if target is None else target.backend == "cuda"
+
+
+ def is_hip():
+     target = get_current_target()
+     return False if target is None else target.backend == "hip"
+
+
+ def get_arch():
+     target = get_current_target()
+     return "" if target is None else str(target.arch)
+
+
+ def numpy_random(shape, dtype_str, rs: Optional[RandomState] = None, low=None, high=None):
+     """
+     Override `rs` if you're calling this function twice and don't want the same
+     result for both calls.
+     """
+     if isinstance(shape, int):
+         shape = (shape, )
+     if rs is None:
+         rs = RandomState(seed=17)
+     if dtype_str in int_dtypes + uint_dtypes:
+         iinfo = np.iinfo(getattr(np, dtype_str))
+         low = iinfo.min if low is None else max(low, iinfo.min)
+         high = iinfo.max if high is None else min(high, iinfo.max)
+         dtype = getattr(np, dtype_str)
+         x = rs.randint(low, high, shape, dtype=dtype)
+         x[x == 0] = 1  # Workaround. Never return zero so tests of division don't error out.
+         return x
+     elif dtype_str and 'float8' in dtype_str:
+         x = rs.randint(20, 40, shape, dtype=np.int8)
+         return x
+     elif dtype_str in float_dtypes:
+         return rs.normal(0, 1, shape).astype(dtype_str)
+     elif dtype_str == 'bfloat16':
+         return (rs.normal(0, 1, shape).astype('float32').view('uint32') & np.uint32(0xffff0000)).view('float32')
+     elif dtype_str in ['bool', 'int1', 'bool_']:
+         return rs.normal(0, 1, shape) > 0.0
+     else:
+         raise RuntimeError(f'Unknown dtype {dtype_str}')
+
+
+ def to_triton(x: np.ndarray, device, dst_type=None) -> Union[TensorWrapper, torch.Tensor]:
+     '''
+     Note: We need dst_type because the type of x can be different from dst_type.
+     For example: x is of type `float32`, dst_type is `bfloat16`.
+     If dst_type is None, we infer dst_type from x.
+     '''
+     t = x.dtype.name
+     if t in uint_dtypes:
+         signed_type_name = t.lstrip('u')  # e.g. "uint16" -> "int16"
+         x_signed = x.astype(getattr(np, signed_type_name))
+         return reinterpret(torch.tensor(x_signed, device=device), getattr(tl, t))
+     else:
+         if dst_type and 'float8' in dst_type:
+             return reinterpret(torch.tensor(x, device=device), getattr(tl, dst_type))
+         if t == 'float32' and dst_type == 'bfloat16':
+             return torch.tensor(x, device=device).bfloat16()
+         return torch.tensor(x, device=device)
+
+
+ def torch_dtype_name(dtype) -> str:
+     if isinstance(dtype, triton.language.dtype):
+         return dtype.name
+     elif isinstance(dtype, torch.dtype):
+         # 'torch.int64' -> 'int64'
+         m = re.match(r'^torch\.(\w+)$', str(dtype))
+         return m.group(1)
+     else:
+         raise TypeError(f'not a triton or torch dtype: {type(dtype)}')
+
+
+ def to_numpy(x):
+     if isinstance(x, TensorWrapper):
+         return x.base.cpu().numpy().astype(getattr(np, torch_dtype_name(x.dtype)))
+     elif isinstance(x, torch.Tensor):
+         if x.dtype is torch.bfloat16:
+             return x.cpu().float().numpy()
+         return x.cpu().numpy()
+     else:
+         raise ValueError(f"Not a triton-compatible tensor: {x}")
+
+
+ def supports_tma():
+     return is_cuda() and torch.cuda.get_device_capability()[0] >= 9
+
+
+ requires_tma = pytest.mark.skipif(not supports_tma(), reason="Requires TMA support (NVIDIA Hopper or higher)")
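
As a rough illustration of how these testing helpers are typically combined (a sketch assuming a CUDA-capable PyTorch install; not code from the package):

    from triton._internal_testing import numpy_random, to_triton, to_numpy

    x_np = numpy_random((4, 4), dtype_str='float32')   # reproducible random block (seed 17 by default)
    x_tri = to_triton(x_np, device='cuda')             # plain torch.Tensor on the GPU for float32
    assert (to_numpy(x_tri) == x_np).all()             # float32 values round-trip exactly

For unsigned or float8 dtypes, to_triton instead returns a TensorWrapper produced by reinterpret, which is why to_numpy has a dedicated branch for that case.
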
triton/backends/__init__.py ADDED
@@ -0,0 +1,50 @@
+ import os
+ import importlib.util
+ import inspect
+ from dataclasses import dataclass
+ from .driver import DriverBase
+ from .compiler import BaseBackend
+
+
+ def _load_module(name, path):
+     spec = importlib.util.spec_from_file_location(name, path)
+     module = importlib.util.module_from_spec(spec)
+     spec.loader.exec_module(module)
+     return module
+
+
+ def _find_concrete_subclasses(module, base_class):
+     ret = []
+     for attr_name in dir(module):
+         attr = getattr(module, attr_name)
+         if isinstance(attr, type) and issubclass(attr, base_class) and not inspect.isabstract(attr):
+             ret.append(attr)
+     if len(ret) == 0:
+         raise RuntimeError(f"Found 0 concrete subclasses of {base_class} in {module}: {ret}")
+     if len(ret) > 1:
+         raise RuntimeError(f"Found >1 concrete subclasses of {base_class} in {module}: {ret}")
+     return ret[0]
+
+
+ @dataclass(frozen=True)
+ class Backend:
+     compiler: BaseBackend = None
+     driver: DriverBase = None
+
+
+ def _discover_backends():
+     backends = dict()
+     root = os.path.dirname(__file__)
+     for name in os.listdir(root):
+         if not os.path.isdir(os.path.join(root, name)):
+             continue
+         if name.startswith('__'):
+             continue
+         compiler = _load_module(name, os.path.join(root, name, 'compiler.py'))
+         driver = _load_module(name, os.path.join(root, name, 'driver.py'))
+         backends[name] = Backend(_find_concrete_subclasses(compiler, BaseBackend),
+                                  _find_concrete_subclasses(driver, DriverBase))
+     return backends
+
+
+ backends = _discover_backends()
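
To make the discovery mechanism concrete, a small hedged sketch of inspecting its result (assumes the wheel is installed; the exact class names come from the per-backend compiler.py and driver.py modules loaded above):

    from triton.backends import backends

    for name, backend in backends.items():
        # For this wheel, the 'amd' entry resolves to the HIPBackend class shown
        # later in this diff, paired with the concrete driver subclass from amd/driver.py.
        print(name, backend.compiler.__name__, backend.driver.__name__)

Because each Backend is keyed by directory name, adding a new backend amounts to dropping a directory with its own compiler.py and driver.py next to the existing nvidia and amd ones.
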
triton/backends/amd/compiler.py ADDED
@@ -0,0 +1,368 @@
+ from triton.backends.compiler import BaseBackend, GPUTarget, AttrsDescriptor, register_descriptor
+ from triton._C.libtriton import ir, passes, llvm, amd
+ from dataclasses import dataclass
+ from typing import Any, Dict, Tuple
+ from types import ModuleType
+ import hashlib
+ import tempfile
+ import os
+ import re
+ import subprocess
+ import functools
+ from pathlib import Path
+
+
+ def min_dot_size(target: GPUTarget):
+     arch_str = target.arch
+     # CDNA 3.0 supports k==8 in all mfma variants except for int8
+     # (where the smallest `k` supported is 16)
+     if "gfx94" in arch_str:
+         return lambda lhsType, rhsType: (16, 16, 16) if (lhsType.is_int8() or rhsType.is_int8()) else (16, 16, 8)
+     # CDNA 2.0 always supports `k==8`
+     if "gfx9" in arch_str:
+         return lambda lhsType, rhsType: (16, 16, 8)
+     # Other architectures will only support 16,16,16
+     return lambda lhsType, rhsType: (16, 16, 16)
+
+
+ @dataclass(frozen=True)
+ class HIPOptions:
+     num_warps: int = 4
+     waves_per_eu: int = 1
+     num_stages: int = 2
+     num_ctas: int = 1
+     num_buffers_warp_spec: int = 0
+     num_consumer_groups: int = 0
+     reg_dec_producer: int = 0
+     reg_inc_consumer: int = 0
+     extern_libs: dict = None
+     cluster_dims: tuple = (1, 1, 1)
+     debug: bool = False
+     sanitize_overflow: bool = True
+     arch: str = None
+     supported_fp8_dtypes: Tuple[str] = ("fp8e5", )
+     deprecated_fp8_dtypes: Tuple[str] = ()
+     default_dot_input_precision: str = "ieee"
+     allowed_dot_input_precisions: Tuple[str] = ("ieee", )
+     enable_fp_fusion: bool = True
+     matrix_instr_nonkdim: int = 0
+     kpack: int = 1
+     allow_flush_denorm: bool = False
+     max_num_imprecise_acc_default: int = 0
+     backend_name: str = 'hip'
+
+     # The following option provides hints to the AMDGPU backend regarding instruction scheduling
+     # for all `tt.dot` operations in a kernel. The "default" variant preserves the default
+     # instruction scheduling of the AMDGPU backend, which aims at maximizing occupancy.
+     # The option is experimental; its semantics may change, or it may be removed entirely, at any time.
+     instruction_sched_variant: str = 'default'
+
+     def __post_init__(self):
+         default_libdir = Path(__file__).parent / 'lib'
+         extern_libs = {} if self.extern_libs is None else dict(self.extern_libs)
+         # Ignore user-defined warp size for gfx9
+         warp_size = 32 if 'gfx10' in self.arch or 'gfx11' in self.arch or 'gfx12' in self.arch else 64
+         object.__setattr__(self, 'warp_size', warp_size)
+         libs = ["ocml", "ockl"]
+         for lib in libs:
+             extern_libs[lib] = str(default_libdir / f'{lib}.bc')
+         object.__setattr__(self, 'extern_libs', tuple(extern_libs.items()))
+         assert self.num_warps > 0 and (self.num_warps & (self.num_warps - 1)) == 0, \
+             "num_warps must be a power of 2"
+
+     def hash(self):
+         key = '_'.join([f'{name}-{val}' for name, val in self.__dict__.items()])
+         return hashlib.sha256(key.encode("utf-8")).hexdigest()
+
+
+ @register_descriptor
+ class HIPAttrsDescriptor(AttrsDescriptor):
+     # This property records whether the underlying storage area of a given pointer
+     # can be represented with a 32-bit integer. When this is true, we can be
+     # sure that all indices into the tensor behind that pointer can use 32-bit
+     # indexing. That opens the door for the AMD backend to use buffer load/store
+     # intrinsics, which require this property. Buffer load/store intrinsics
+     # give direct out-of-bounds support and simplify index calculation for
+     # lower register pressure.
+     __slots__ = ("pointer_range_32")
+
+     def _add_backend_properties(self, params=None, values=None):
+         self.property_values["tt.pointer_range"] = 32
+         if params is None or values is None:
+             return
+
+         self.arg_properties["tt.pointer_range"] = [
+             param.num for param, arg in zip(params, values) if HIPAttrsDescriptor.is_within2gb(arg)
+             and not param.do_not_specialize and not param.do_not_specialize_on_alignment
+         ]
+
+     @staticmethod
+     def is_within2gb(arg):
+         if hasattr(arg, "ptr_range"):
+             return arg.ptr_range() <= 2**31 - 1
+         if "torch.Tensor" in str(type(arg)) and hasattr(arg, "untyped_storage"):
+             # Please note that 2**31-1 is the max int32 positive limit
+             return arg.untyped_storage().size() <= 2**31 - 1
+         return False
+
+     @staticmethod
+     def get_property_key(val, align):
+         generic_key = AttrsDescriptor.get_property_key(val, align)
+         hip_key = "S" if HIPAttrsDescriptor.is_within2gb(val) else "N"
+         key = (generic_key + hip_key).replace("N", "")
+         return key if key else "N"
+
+
+ class HIPBackend(BaseBackend):
+
+     @staticmethod
+     def supports_target(target: GPUTarget):
+         return target.backend == 'hip'
+
+     def __init__(self, target: GPUTarget) -> None:
+         super().__init__(target)
+         assert isinstance(target.arch, str)
+         self.binary_ext = "hsaco"
+
+     def parse_options(self, opts) -> Any:
+         args = {'arch': self.target.arch}
+
+         if "supported_fp8_dtypes" not in opts:
+             supported_fp8_dtypes = set(HIPOptions.supported_fp8_dtypes)
+             if self.target.arch in ('gfx940', 'gfx941', 'gfx942'):
+                 supported_fp8_dtypes.update({'fp8e4b8', 'fp8e5b16'})
+             args["supported_fp8_dtypes"] = tuple(sorted(supported_fp8_dtypes))
+
+         if "enable_fp_fusion" not in opts:
+             args["enable_fp_fusion"] = os.getenv("TRITON_DEFAULT_FP_FUSION", "1") == "1"
+         args.update({k: opts[k] for k in HIPOptions.__dataclass_fields__.keys() if k in opts})
+         return HIPOptions(**args)
+
+     def pack_metadata(self, metadata):
+         return (
+             metadata.num_warps,
+             metadata.num_ctas,
+             metadata.shared,
+             metadata.cluster_dims[0],
+             metadata.cluster_dims[1],
+             metadata.cluster_dims[2],
+         )
+
+     def get_codegen_implementation(self):
+         codegen_fns = {"min_dot_size": min_dot_size(self.target)}
+         return codegen_fns
+
+     def get_module_map(self) -> Dict[str, ModuleType]:
+         from triton.language.extra.hip import libdevice
+         return {"triton.language.extra.libdevice": libdevice}
+
+     def load_dialects(self, ctx):
+         amd.load_dialects(ctx)
+
+     def get_attrs_descriptor(self, params, args):
+         return HIPAttrsDescriptor(params, args)
+
+     @staticmethod
+     def compute_spec_key(arg, align):
+         return HIPAttrsDescriptor.get_property_key(arg, align)
+
+     @staticmethod
+     def path_to_rocm_lld():
+         # Check env path for ld.lld
+         lld_env_path = os.getenv("TRITON_HIP_LLD_PATH")
+         if lld_env_path is not None:
+             lld = Path(lld_env_path)
+             if lld.is_file():
+                 return lld
+         # Check backend for ld.lld (used for pytorch wheels)
+         lld = Path(__file__).parent / "llvm/bin/ld.lld"
+         if lld.is_file():
+             return lld
+         lld = Path("/opt/rocm/llvm/bin/ld.lld")
+         if lld.is_file():
+             return lld
+         lld = Path("/usr/bin/ld.lld")
+         if lld.is_file():
+             return lld
+         raise Exception("ROCm linker /opt/rocm/llvm/bin/ld.lld not found. Set 'TRITON_HIP_LLD_PATH' to its path.")
+
+     @staticmethod
+     def make_ttir(mod, metadata, options):
+         pm = ir.pass_manager(mod.context)
+         pm.enable_debug()
+         passes.common.add_inliner(pm)
+         passes.ttir.add_rewrite_tensor_pointer(pm)
+         passes.ttir.add_combine(pm)
+         passes.common.add_canonicalizer(pm)
+         passes.ttir.add_reorder_broadcast(pm)
+         passes.common.add_cse(pm)
+         passes.common.add_licm(pm)
+         passes.common.add_symbol_dce(pm)
+         passes.ttir.add_loop_unroll(pm)
+         pm.run(mod)
+         return mod
+
+     @staticmethod
+     def make_ttgir(mod, metadata, options):
+         pm = ir.pass_manager(mod.context)
+         pm.enable_debug()
+         passes.ttir.add_convert_to_ttgpuir(pm, f"hip:{options.arch}", options.num_warps, options.warp_size,
+                                            options.num_ctas)
+         pm.run(mod)
+         pm = ir.pass_manager(mod.context)
+         pm.enable_debug()
+         passes.ttgpuir.add_coalesce(pm)
+         passes.ttgpuir.add_remove_layout_conversions(pm)
+         passes.ttgpuir.add_optimize_thread_locality(pm)
+         amd.passes.ttgpuir.add_accelerate_matmul(pm, options.arch, options.matrix_instr_nonkdim, options.kpack)
+         passes.ttgpuir.add_remove_layout_conversions(pm)
+         amd.passes.ttgpuir.add_optimize_epilogue(pm)
+         passes.ttgpuir.add_optimize_dot_operands(pm, True)
+         if amd.has_matrix_core_feature(options.arch):
+             assert options.num_stages != 0, ("Triton AMD backend pipeliner has been updated. "
+                                              "We used to trigger software pipelining with "
+                                              "num_stages == 0. Now it will not happen anymore; "
+                                              "please update to use num_stages == 2 for "
+                                              "equivalent behavior in the past.")
+             amd.passes.ttgpuir.add_stream_pipelinev2(pm, options.num_stages)
+             passes.common.add_canonicalizer(pm)
+         amd.passes.ttgpuir.insert_instruction_sched_hints(pm)
+         passes.ttgpuir.add_optimize_dot_operands(pm, True)
+         passes.ttgpuir.add_remove_layout_conversions(pm)
+         passes.ttgpuir.add_reduce_data_duplication(pm)
+         if amd.has_matrix_core_feature(options.arch):
+             amd.passes.ttgpuir.add_reorder_instructions(pm)
+         if os.environ.get("AMDGCN_USE_BUFFER_OPS", "0") == "1":
+             amd.passes.ttgpuir.add_canonicalize_pointers(pm)
+             passes.common.add_canonicalizer(pm)
+             amd.passes.ttgpuir.add_convert_to_buffer_ops(pm)
+         passes.common.add_canonicalizer(pm)
+         passes.common.add_cse(pm)
+         passes.common.add_symbol_dce(pm)
+         pm.run(mod)
+         return mod
+
+     @staticmethod
+     def make_llir(src, metadata, options):
+         mod = src
+         # TritonGPU -> LLVM-IR (MLIR)
+         pm = ir.pass_manager(mod.context)
+         pm.enable_debug()
+         amd.passes.ttgpuir.add_decompose_unsupported_conversions(pm, options.arch)
+         # custom_lds_size is an experimental parameter that defines the amount of LDS available
+         # for one thread block. Measured in bytes.
+         #
+         # If custom_lds_size = 0, the pass will consider all LDS to be available for one thread block;
+         # the LDS size is determined by the provided arch name.
+         custom_lds_size = 0
+         amd.passes.ttgpuir.add_optimize_lds_usage(pm, options.arch, custom_lds_size)
+         passes.convert.add_scf_to_cf(pm)
+         passes.convert.add_index_to_llvmir(pm)
+
+         passes.ttgpuir.add_allocate_shared_memory(pm)
+         ## __HIP_FTZ is used to control the denorm flushing behavior of the exp2 op as follows:
+         ## 1. If __HIP_FTZ = 1, exp2 flushes denorms in input and output regardless
+         ##    of the value of kernel arg `allow_flush_denorm`.
+         ## 2. If __HIP_FTZ = 0, whether exp2 flushes denorms in input and output
+         ##    depends on the value of kernel arg `allow_flush_denorm`.
+         ## 3. __HIP_FTZ defaults to 1 and is not exposed as a kernel argument.
+         ##    For now it is used as a controller for developers only.
+         __HIP_FTZ = True
+         amd.passes.ttgpuir.add_to_llvmir(pm, options.arch, __HIP_FTZ)
+         passes.common.add_canonicalizer(pm)
+         passes.common.add_cse(pm)
+
+         passes.convert.add_cf_to_llvmir(pm)
+         passes.convert.add_arith_to_llvmir(pm)
+         passes.common.add_canonicalizer(pm)
+         passes.common.add_cse(pm)
+         passes.common.add_symbol_dce(pm)
+         amd.passes.ttgpuir.lower_instruction_sched_hints(pm, options.instruction_sched_variant)
+         if os.environ.get("TRITON_DISABLE_LINE_INFO", "0") == "0":
+             passes.llvmir.add_di_scope(pm)
+         amd.passes.ttgpuir.add_builtin_func_to_llvmir(pm, __HIP_FTZ)
+         pm.run(mod)
+
+         # LLVM-IR (MLIR) -> LLVM-IR (LLVM)
+         llvm.init_targets()
+         context = llvm.context()
+         llvm_mod = llvm.to_module(mod, context)
+         amd.attach_target_triple(llvm_mod)
+         llvm.attach_datalayout(llvm_mod, amd.TARGET_TRIPLE, options.arch, '')
+
+         # Set various control constants on the LLVM module so that device
+         # libraries can resolve references to them.
+         amd.set_isa_version(llvm_mod, options.arch)
+         amd.set_abi_version(llvm_mod, 400)
+         amd.set_bool_control_constant(llvm_mod, "__oclc_finite_only_opt", False)
+         amd.set_bool_control_constant(llvm_mod, "__oclc_correctly_rounded_sqrt32", True)
+         amd.set_bool_control_constant(llvm_mod, "__oclc_unsafe_math_opt", False)
+         amd.set_bool_control_constant(llvm_mod, "__oclc_wavefrontsize64", options.warp_size == 64)
+
+         # Set kernel attributes first given this may affect later optimizations.
+         fns = [fn for fn in llvm_mod.get_functions() if not fn.is_declaration()]
+         # The public kernel should be kernel 0.
+         fns[0].set_calling_conv(amd.CALLING_CONV_AMDGPU_KERNEL)
+         fns[0].add_fn_attr("amdgpu-flat-work-group-size", f"1,{options.num_warps*options.warp_size}")
+         fns[0].add_fn_attr("amdgpu-waves-per-eu", f"{options.waves_per_eu}")
+         denormal_mode = "preserve-sign" if options.allow_flush_denorm else "ieee"
+         fns[0].add_fn_attr("denormal-fp-math-f32", denormal_mode)
+
+         # Hint the compiler that we'd like the firmware to set the kernel arguments
+         # to user SGPRs so that the kernel does not need to s_load its arguments
+         # from memory.
+         amd.set_all_fn_arg_inreg(fns[0])
+
+         if options.extern_libs:
+             paths = [path for (name, path) in options.extern_libs if amd.need_extern_lib(llvm_mod, name)]
+             llvm.link_extern_libs(llvm_mod, paths)
+
+         llvm.optimize_module(llvm_mod, llvm.OPTIMIZE_O3, options.arch, '', [], options.enable_fp_fusion)
+
+         # Get some metadata
+         metadata["shared"] = src.get_int_attr("triton_gpu.shared")
+
+         amd.cleanup_bitcode_metadata(llvm_mod)
+         return str(llvm_mod)
+
+     @staticmethod
+     def make_amdgcn(src, metadata, options):
+         # Find kernel names (there should only be one).
+         # We get the name at the last possible step to accommodate `triton.compile`
+         # on user-provided LLVM.
+         names = re.findall(r"define amdgpu_kernel void @([a-zA-Z_][a-zA-Z0-9_]*)", src)
+         assert len(names) == 1
+         metadata["name"] = names[0]
+         # llvm -> hsaco
+         amdgcn = llvm.translate_to_asm(src, amd.TARGET_TRIPLE, options.arch, '', [], options.enable_fp_fusion, False)
+         if os.environ.get("AMDGCN_ENABLE_DUMP", "0") == "1":
+             print("// -----// AMDGCN Dump //----- //")
+             print(amdgcn)
+         return amdgcn
+
+     @staticmethod
+     def make_hsaco(src, metadata, options):
+         hsaco = amd.assemble_amdgcn(src, options.arch, '')
+
+         rocm_path = HIPBackend.path_to_rocm_lld()
+         with tempfile.NamedTemporaryFile() as tmp_out:
+             with tempfile.NamedTemporaryFile() as tmp_in:
+                 with open(tmp_in.name, 'wb') as fd_in:
+                     fd_in.write(hsaco)
+                 subprocess.check_call([rocm_path, '-flavor', 'gnu', '-shared', tmp_in.name, '-o', tmp_out.name])
+             with open(tmp_out.name, 'rb') as fd_out:
+                 ret = fd_out.read()
+         return ret
+
+     def add_stages(self, stages, options):
+         stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options)
+         stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options)
+         stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options)
+         stages["amdgcn"] = lambda src, metadata: self.make_amdgcn(src, metadata, options)
+         stages["hsaco"] = lambda src, metadata: self.make_hsaco(src, metadata, options)
+
+     @functools.lru_cache()
+     def hash(self):
+         version = subprocess.check_output([HIPBackend.path_to_rocm_lld(), "--version"], encoding='utf-8')
+         return f'{version}-{self.target}'
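
add_stages above registers the AMD lowering pipeline in order: ttir -> ttgir -> llir -> amdgcn -> hsaco. As a hedged sketch of how a caller walks that dictionary (stub lowering functions stand in for the real make_* methods; this illustrates the stage ordering only, not the actual triton.compile implementation):

    # Each stage consumes the previous stage's output and may record metadata.
    stages = {
        "ttir":   lambda src, md: f"{src}->ttir",
        "ttgir":  lambda src, md: f"{src}->ttgir",
        "llir":   lambda src, md: f"{src}->llir",
        "amdgcn": lambda src, md: f"{src}->amdgcn",
        "hsaco":  lambda src, md: f"{src}->hsaco",
    }
    module, metadata = "src", {}
    for name, lower in stages.items():   # dict preserves the insertion order used by add_stages
        module = lower(module, metadata)
    print(module)   # src->ttir->ttgir->llir->amdgcn->hsaco

In the real backend, make_llir and make_amdgcn also fill metadata (the shared-memory size and the kernel name) that the runtime later uses when launching the resulting hsaco binary.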