triton-windows 3.2.0.post11__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of triton-windows might be problematic. Click here for more details.

Files changed (154) hide show
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +85 -0
  3. triton/_internal_testing.py +123 -0
  4. triton/backends/__init__.py +50 -0
  5. triton/backends/amd/compiler.py +368 -0
  6. triton/backends/amd/driver.c +211 -0
  7. triton/backends/amd/driver.py +512 -0
  8. triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h +358 -0
  9. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +1031 -0
  10. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +1612 -0
  11. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +1337 -0
  12. triton/backends/amd/include/hip/amd_detail/amd_hip_bfloat16.h +293 -0
  13. triton/backends/amd/include/hip/amd_detail/amd_hip_common.h +32 -0
  14. triton/backends/amd/include/hip/amd_detail/amd_hip_complex.h +174 -0
  15. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +829 -0
  16. triton/backends/amd/include/hip/amd_detail/amd_hip_fp16.h +1809 -0
  17. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +108 -0
  18. triton/backends/amd/include/hip/amd_detail/amd_hip_math_constants.h +124 -0
  19. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h +405 -0
  20. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +196 -0
  21. triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +565 -0
  22. triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h +2226 -0
  23. triton/backends/amd/include/hip/amd_detail/amd_math_functions.h +104 -0
  24. triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h +244 -0
  25. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +494 -0
  26. triton/backends/amd/include/hip/amd_detail/concepts.hpp +30 -0
  27. triton/backends/amd/include/hip/amd_detail/device_library_decls.h +133 -0
  28. triton/backends/amd/include/hip/amd_detail/functional_grid_launch.hpp +218 -0
  29. triton/backends/amd/include/hip/amd_detail/grid_launch.h +67 -0
  30. triton/backends/amd/include/hip/amd_detail/grid_launch.hpp +50 -0
  31. triton/backends/amd/include/hip/amd_detail/grid_launch_GGL.hpp +26 -0
  32. triton/backends/amd/include/hip/amd_detail/helpers.hpp +137 -0
  33. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +1350 -0
  34. triton/backends/amd/include/hip/amd_detail/hip_assert.h +101 -0
  35. triton/backends/amd/include/hip/amd_detail/hip_cooperative_groups_helper.h +242 -0
  36. triton/backends/amd/include/hip/amd_detail/hip_fp16_gcc.h +254 -0
  37. triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h +96 -0
  38. triton/backends/amd/include/hip/amd_detail/hip_ldg.h +100 -0
  39. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +10169 -0
  40. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +77 -0
  41. triton/backends/amd/include/hip/amd_detail/host_defines.h +180 -0
  42. triton/backends/amd/include/hip/amd_detail/hsa_helpers.hpp +102 -0
  43. triton/backends/amd/include/hip/amd_detail/macro_based_grid_launch.hpp +798 -0
  44. triton/backends/amd/include/hip/amd_detail/math_fwd.h +698 -0
  45. triton/backends/amd/include/hip/amd_detail/ockl_image.h +177 -0
  46. triton/backends/amd/include/hip/amd_detail/program_state.hpp +107 -0
  47. triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h +491 -0
  48. triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h +478 -0
  49. triton/backends/amd/include/hip/channel_descriptor.h +39 -0
  50. triton/backends/amd/include/hip/device_functions.h +38 -0
  51. triton/backends/amd/include/hip/driver_types.h +468 -0
  52. triton/backends/amd/include/hip/hip_bf16.h +36 -0
  53. triton/backends/amd/include/hip/hip_bfloat16.h +44 -0
  54. triton/backends/amd/include/hip/hip_common.h +100 -0
  55. triton/backends/amd/include/hip/hip_complex.h +38 -0
  56. triton/backends/amd/include/hip/hip_cooperative_groups.h +46 -0
  57. triton/backends/amd/include/hip/hip_deprecated.h +95 -0
  58. triton/backends/amd/include/hip/hip_ext.h +159 -0
  59. triton/backends/amd/include/hip/hip_fp16.h +36 -0
  60. triton/backends/amd/include/hip/hip_gl_interop.h +32 -0
  61. triton/backends/amd/include/hip/hip_hcc.h +24 -0
  62. triton/backends/amd/include/hip/hip_math_constants.h +36 -0
  63. triton/backends/amd/include/hip/hip_profile.h +27 -0
  64. triton/backends/amd/include/hip/hip_runtime.h +75 -0
  65. triton/backends/amd/include/hip/hip_runtime_api.h +8919 -0
  66. triton/backends/amd/include/hip/hip_texture_types.h +29 -0
  67. triton/backends/amd/include/hip/hip_vector_types.h +41 -0
  68. triton/backends/amd/include/hip/hip_version.h +17 -0
  69. triton/backends/amd/include/hip/hiprtc.h +421 -0
  70. triton/backends/amd/include/hip/library_types.h +78 -0
  71. triton/backends/amd/include/hip/math_functions.h +42 -0
  72. triton/backends/amd/include/hip/surface_types.h +63 -0
  73. triton/backends/amd/include/hip/texture_types.h +194 -0
  74. triton/backends/amd/include/hsa/Brig.h +1131 -0
  75. triton/backends/amd/include/hsa/amd_hsa_common.h +91 -0
  76. triton/backends/amd/include/hsa/amd_hsa_elf.h +436 -0
  77. triton/backends/amd/include/hsa/amd_hsa_kernel_code.h +269 -0
  78. triton/backends/amd/include/hsa/amd_hsa_queue.h +109 -0
  79. triton/backends/amd/include/hsa/amd_hsa_signal.h +80 -0
  80. triton/backends/amd/include/hsa/hsa.h +5729 -0
  81. triton/backends/amd/include/hsa/hsa_amd_tool.h +91 -0
  82. triton/backends/amd/include/hsa/hsa_api_trace.h +566 -0
  83. triton/backends/amd/include/hsa/hsa_ext_amd.h +3090 -0
  84. triton/backends/amd/include/hsa/hsa_ext_finalize.h +531 -0
  85. triton/backends/amd/include/hsa/hsa_ext_image.h +1454 -0
  86. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +488 -0
  87. triton/backends/amd/include/hsa/hsa_ven_amd_loader.h +667 -0
  88. triton/backends/amd/include/roctracer/ext/prof_protocol.h +107 -0
  89. triton/backends/amd/include/roctracer/hip_ostream_ops.h +4435 -0
  90. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +1467 -0
  91. triton/backends/amd/include/roctracer/hsa_prof_str.h +3027 -0
  92. triton/backends/amd/include/roctracer/roctracer.h +779 -0
  93. triton/backends/amd/include/roctracer/roctracer_ext.h +81 -0
  94. triton/backends/amd/include/roctracer/roctracer_hcc.h +24 -0
  95. triton/backends/amd/include/roctracer/roctracer_hip.h +37 -0
  96. triton/backends/amd/include/roctracer/roctracer_hsa.h +112 -0
  97. triton/backends/amd/include/roctracer/roctracer_plugin.h +137 -0
  98. triton/backends/amd/include/roctracer/roctracer_roctx.h +67 -0
  99. triton/backends/amd/include/roctracer/roctx.h +229 -0
  100. triton/backends/amd/lib/ockl.bc +0 -0
  101. triton/backends/amd/lib/ocml.bc +0 -0
  102. triton/backends/compiler.py +304 -0
  103. triton/backends/driver.py +48 -0
  104. triton/backends/nvidia/__init__.py +0 -0
  105. triton/backends/nvidia/bin/ptxas.exe +0 -0
  106. triton/backends/nvidia/compiler.py +410 -0
  107. triton/backends/nvidia/driver.c +451 -0
  108. triton/backends/nvidia/driver.py +524 -0
  109. triton/backends/nvidia/include/cuda.h +24359 -0
  110. triton/backends/nvidia/lib/libdevice.10.bc +0 -0
  111. triton/backends/nvidia/lib/x64/cuda.lib +0 -0
  112. triton/compiler/__init__.py +4 -0
  113. triton/compiler/code_generator.py +1303 -0
  114. triton/compiler/compiler.py +430 -0
  115. triton/compiler/errors.py +51 -0
  116. triton/compiler/make_launcher.py +0 -0
  117. triton/errors.py +5 -0
  118. triton/language/__init__.py +294 -0
  119. triton/language/_utils.py +21 -0
  120. triton/language/core.py +2694 -0
  121. triton/language/extra/__init__.py +26 -0
  122. triton/language/extra/cuda/__init__.py +13 -0
  123. triton/language/extra/cuda/_experimental_tma.py +108 -0
  124. triton/language/extra/cuda/libdevice.py +1629 -0
  125. triton/language/extra/cuda/utils.py +109 -0
  126. triton/language/extra/hip/__init__.py +3 -0
  127. triton/language/extra/hip/libdevice.py +475 -0
  128. triton/language/extra/libdevice.py +786 -0
  129. triton/language/math.py +250 -0
  130. triton/language/random.py +207 -0
  131. triton/language/semantic.py +1796 -0
  132. triton/language/standard.py +452 -0
  133. triton/runtime/__init__.py +23 -0
  134. triton/runtime/autotuner.py +408 -0
  135. triton/runtime/build.py +111 -0
  136. triton/runtime/cache.py +295 -0
  137. triton/runtime/driver.py +60 -0
  138. triton/runtime/errors.py +26 -0
  139. triton/runtime/interpreter.py +1235 -0
  140. triton/runtime/jit.py +951 -0
  141. triton/testing.py +511 -0
  142. triton/tools/__init__.py +0 -0
  143. triton/tools/build_extern.py +365 -0
  144. triton/tools/compile.c +67 -0
  145. triton/tools/compile.h +14 -0
  146. triton/tools/compile.py +155 -0
  147. triton/tools/disasm.py +144 -0
  148. triton/tools/experimental_descriptor.py +32 -0
  149. triton/tools/link.py +322 -0
  150. triton/windows_utils.py +375 -0
  151. triton_windows-3.2.0.post11.dist-info/METADATA +39 -0
  152. triton_windows-3.2.0.post11.dist-info/RECORD +154 -0
  153. triton_windows-3.2.0.post11.dist-info/WHEEL +5 -0
  154. triton_windows-3.2.0.post11.dist-info/top_level.txt +12 -0
@@ -0,0 +1,524 @@
1
+ import functools
2
+ import os
3
+ import hashlib
4
+ import subprocess
5
+ import tempfile
6
+ from pathlib import Path
7
+ from triton.runtime.build import _build
8
+ from triton.runtime.cache import get_cache_manager
9
+ from triton.backends.compiler import GPUTarget
10
+ from triton.backends.driver import GPUDriver
11
+
12
# Location of this backend package; the bundled CUDA headers live in ./include
# and the bundled bitcode libraries (libdevice) in ./lib.
dirname = os.path.dirname(os.path.realpath(__file__))
include_dir = [os.path.join(dirname, "include")]
if os.name == "nt":
    # Windows: the CUDA toolkit is not on a standard search path, so locate
    # its include/lib directories at import time via the helper module.
    from triton.windows_utils import find_cuda
    _, cuda_inc_dirs, _ = find_cuda()
    include_dir += cuda_inc_dirs
libdevice_dir = os.path.join(dirname, "lib")
# Native launcher extensions link against the CUDA driver library.
libraries = ['cuda']
20
+
21
+
22
@functools.lru_cache()
def libcuda_dirs():
    """Return candidate directories that contain the CUDA driver library.

    Resolution order:
    1. Windows: query ``triton.windows_utils.find_cuda()``.
    2. ``TRITON_LIBCUDA_PATH`` environment variable override.
    3. The ldconfig cache, falling back to ``LD_LIBRARY_PATH``.

    Raises:
        AssertionError: if no directory containing ``libcuda.so.1`` is found.
    """
    if os.name == "nt":
        _, _, cuda_lib_dirs = find_cuda()
        return cuda_lib_dirs

    env_libcuda_path = os.getenv("TRITON_LIBCUDA_PATH")
    if env_libcuda_path:
        return [env_libcuda_path]

    libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode()
    # each line looks like the following:
    # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
    locs = [line.split()[-1] for line in libs.splitlines() if "libcuda.so.1" in line]
    dirs = [os.path.dirname(loc) for loc in locs]
    env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
    if env_ld_library_path and not dirs:
        # `d` (not `dir`) to avoid shadowing the builtin.
        dirs = [d for d in env_ld_library_path.split(":") if os.path.exists(os.path.join(d, "libcuda.so.1"))]
    # Fixed grammar ("cannot found" -> "cannot be found") and the missing
    # separator between the two concatenated sentences below.
    msg = 'libcuda.so cannot be found!\n'
    if locs:
        msg += 'Possible files are located at %s. ' % str(locs)
        msg += 'Please create a symlink of libcuda.so to any of the files.'
    else:
        msg += 'Please make sure GPU is set up and then run "/sbin/ldconfig"'
        msg += ' (requires sudo) to refresh the linker cache.'
    # NOTE(review): kept as `assert` (not `raise`) so callers catching
    # AssertionError keep working; beware it is stripped under `python -O`.
    assert any(os.path.exists(os.path.join(path, 'libcuda.so.1')) for path in dirs), msg
    return dirs
49
+
50
+
51
@functools.lru_cache()
def library_dirs():
    """Directories searched for native libraries when building launcher extensions."""
    dirs = [libdevice_dir]
    dirs.extend(libcuda_dirs())
    return dirs
54
+
55
+
56
def compile_module_from_src(src, name):
    """Build (or fetch from the Triton cache) a native extension from C source and import it.

    The cache key is the SHA-256 of the source text, so identical sources
    reuse the previously built binary.
    """
    import importlib.util

    key = hashlib.sha256(src.encode("utf-8")).hexdigest()
    cache = get_cache_manager(key)
    # Windows extension modules use the .pyd suffix.
    ext = "pyd" if os.name == "nt" else "so"
    so_name = f"{name}.{ext}"
    cache_path = cache.get_file(so_name)
    if cache_path is None:
        # Cache miss: compile in a throwaway directory, then persist the binary.
        with tempfile.TemporaryDirectory() as tmpdir:
            src_path = os.path.join(tmpdir, f"{name}.c")
            Path(src_path).write_text(src)
            so = _build(name, src_path, tmpdir, library_dirs(), include_dir, libraries)
            cache_path = cache.put(Path(so).read_bytes(), so_name, binary=True)
    spec = importlib.util.spec_from_file_location(name, cache_path)
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod
77
+
78
+
79
+ # ------------------------
80
+ # Utils
81
+ # ------------------------
82
+
83
+
84
class CudaUtils(object):
    """Singleton wrapper around the compiled ``cuda_utils`` native helper module."""

    def __new__(cls):
        # Create the shared instance lazily on first construction.
        if not hasattr(cls, "instance"):
            cls.instance = super().__new__(cls)
        return cls.instance

    def __init__(self):
        driver_src = Path(os.path.join(dirname, "driver.c")).read_text()
        mod = compile_module_from_src(driver_src, "cuda_utils")
        # Re-export the native entry points as attributes of this object.
        for entry in (
                "load_binary",
                "get_device_properties",
                "cuOccupancyMaxActiveClusters",
                "set_printf_fifo_size",
                "fill_1d_tma_descriptor",
                "fill_2d_tma_descriptor",
        ):
            setattr(self, entry, getattr(mod, entry))
99
+
100
+
101
+ # ------------------------
102
+ # Launcher
103
+ # ------------------------
104
+
105
+
106
def ty_to_cpp(ty):
    """Map a Triton signature type string to the C type used in the launcher.

    Any pointer type (leading ``*``) becomes a raw ``CUdeviceptr``; scalar
    half/bfloat types are passed from Python as plain ``float``.
    """
    if ty[0] == '*':
        return "CUdeviceptr"
    scalar_types = {
        "i1": "int32_t",
        "i8": "int8_t",
        "i16": "int16_t",
        "i32": "int32_t",
        "i64": "int64_t",
        "u1": "uint32_t",
        "u8": "uint8_t",
        "u16": "uint16_t",
        "u32": "uint32_t",
        "u64": "uint64_t",
        "fp16": "float",
        "bf16": "float",
        "fp32": "float",
        "f32": "float",
        "fp64": "double",
        "nvTmaDesc": "CUtensorMap",
    }
    return scalar_types[ty]
127
+
128
+
129
def make_launcher(constants, signature, ids):
    """Generate the C source of a CPython extension that launches kernels of one signature.

    The generated module exposes a single ``launch(...)`` function that parses
    the Python-level arguments, resolves device pointers / TMA descriptors,
    and calls ``cuLaunchKernel`` (or ``cuLaunchKernelEx`` for multi-CTA
    cluster launches). The string is later compiled by
    ``compile_module_from_src``.
    """
    # Record the end of regular arguments;
    # subsequent arguments are architecture-specific descriptors, such as tensor descriptors for CUDA.
    arg_decls = ', '.join(f"{ty_to_cpp(ty)} arg{i}" for i, ty in signature.items())

    # C type each argument has on the Python/C boundary (pointers and TMA
    # descriptors arrive as Python objects).
    def _extracted_type(ty):
        if ty[0] == '*':
            return "PyObject*"
        if ty == "nvTmaDesc":
            return "PyObject*"

        return ty_to_cpp(ty)

    # PyArg_ParseTuple format character for a given C type.
    def format_of(ty):
        return {
            "PyObject*": "O",
            "float": "f",
            "double": "d",
            "long": "l",
            "int8_t": "b",
            "int16_t": "h",
            "int32_t": "i",
            "int64_t": "L",
            "uint8_t": "B",
            "uint16_t": "H",
            "uint32_t": "I",
            "uint64_t": "K",
        }[ty]

    args_format = ''.join([format_of(_extracted_type(ty)) for ty in signature.values()])
    # "iiiKKOOOO" covers grid dims, stream, function, and the four metadata/hook objects.
    format = "iiiKKOOOO" + args_format
    args_list = ', ' + ', '.join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else ''

    # Expressions forwarded from launch() into _launch() for each argument.
    internal_args_list = []
    for i, ty in signature.items():
        if ty[0] == "*":
            internal_args_list.append(f"ptr_info{i}.dev_ptr")
        elif ty == "nvTmaDesc":
            # Note: we have to dereference the pointer
            internal_args_list.append(f"*tma_ptr{i}")
        else:
            internal_args_list.append(f"_arg{i}")

    # generate glue code
    # Constant-folded arguments are excluded from the kernel parameter array;
    # an empty array degenerates to a NULL params pointer.
    params = [i for i in signature.keys() if i not in constants]
    if params:
        params_decl = ", ".join(f"&arg{i}" for i in params)
        params_decl = f"void *params[] = {{ {params_decl} }};"
    else:
        params_decl = "void **params = NULL;"
    src = f"""
#include \"cuda.h\"
#include <stdbool.h>
#include <Python.h>

#ifndef _WIN32
#include <dlfcn.h>
#else
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif

static inline void gpuAssert(CUresult code, const char *file, int line)
{{
   if (code != CUDA_SUCCESS)
   {{
      const char* prefix = "Triton Error [CUDA]: ";
      const char* str;
      cuGetErrorString(code, &str);
      char err[1024] = {{0}};
      strcat(err, prefix);
      strcat(err, str);
      PyGILState_STATE gil_state;
      gil_state = PyGILState_Ensure();
      PyErr_SetString(PyExc_RuntimeError, err);
      PyGILState_Release(gil_state);
   }}
}}

#define CUDA_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }}

typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra);

#ifndef _WIN32
static cuLaunchKernelEx_t getLaunchKernelExHandle() {{
  // Open the shared library
  void* handle = dlopen("libcuda.so.1", RTLD_LAZY);
  if (!handle) {{
    PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");
    return NULL;
  }}
  // Clear any existing error
  dlerror();
  cuLaunchKernelEx_t cuLaunchKernelExHandle = (cuLaunchKernelEx_t)dlsym(handle, "cuLaunchKernelEx");
  // Check for errors
  const char *dlsym_error = dlerror();
  if (dlsym_error) {{
    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from libcuda.so.1");
    return NULL;
  }}
  return cuLaunchKernelExHandle;
}}
#else
static cuLaunchKernelEx_t getLaunchKernelExHandle() {{
  // Open the shared library
  HMODULE handle = LoadLibraryA("nvcuda.dll");
  if (!handle) {{
    PyErr_SetString(PyExc_RuntimeError, "Failed to open nvcuda.dll");
    return NULL;
  }}
  cuLaunchKernelEx_t cuLaunchKernelExHandle =
      (cuLaunchKernelEx_t)GetProcAddress((HMODULE)handle, "cuLaunchKernelEx");
  // Check for errors
  long error = GetLastError();
  if (error) {{
    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from nvcuda.dll");
    return NULL;
  }}
  return cuLaunchKernelExHandle;
}}
#endif

static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction function{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
  {params_decl}
  if (gridX*gridY*gridZ > 0) {{
    if (num_ctas == 1) {{
      CUDA_CHECK(cuLaunchKernel(function, gridX, gridY, gridZ, 32*num_warps, 1, 1, shared_memory, stream, params, 0));
    }} else {{
      CUlaunchAttribute launchAttr[2];
      launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
      launchAttr[0].value.clusterDim.x = clusterDimX;
      launchAttr[0].value.clusterDim.y = clusterDimY;
      launchAttr[0].value.clusterDim.z = clusterDimZ;
      launchAttr[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
      launchAttr[1].value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
      CUlaunchConfig config;
      config.gridDimX = gridX * clusterDimX;
      config.gridDimY = gridY * clusterDimY;
      config.gridDimZ = gridZ * clusterDimZ;
      config.blockDimX = 32 * num_warps;
      config.blockDimY = 1;
      config.blockDimZ = 1;
      config.sharedMemBytes = shared_memory;
      config.hStream = stream;
      config.attrs = launchAttr;
      config.numAttrs = 2;
      static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
      if (cuLaunchKernelExHandle == NULL) {{
        cuLaunchKernelExHandle = getLaunchKernelExHandle();
      }}
      CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
    }}
  }}
}}

typedef struct _DevicePtrInfo {{
  CUdeviceptr dev_ptr;
  bool valid;
}} DevicePtrInfo;

static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {{
  DevicePtrInfo ptr_info;
  ptr_info.dev_ptr = 0;
  ptr_info.valid = true;
  if (PyLong_Check(obj)) {{
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj);
    return ptr_info;
  }}
  if (obj == Py_None) {{
    // valid nullptr
    return ptr_info;
  }}
  PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr");
  if(ptr){{
    PyObject *empty_tuple = PyTuple_New(0);
    PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL);
    Py_DECREF(empty_tuple);
    Py_DECREF(ptr);
    if (!PyLong_Check(ret)) {{
      PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
      ptr_info.valid = false;
      return ptr_info;
    }}
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret);
    if(!ptr_info.dev_ptr)
      return ptr_info;
    uint64_t dev_ptr;
    int status = cuPointerGetAttribute(&dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
    if (status == CUDA_ERROR_INVALID_VALUE) {{
        PyErr_Format(PyExc_ValueError,
                     "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
        ptr_info.valid = false;
    }} else if (status != CUDA_SUCCESS) {{
        CUDA_CHECK(status);  // Catch any other cuda API errors
        ptr_info.valid = false;
    }}
    ptr_info.dev_ptr = dev_ptr;
    Py_DECREF(ret);  // Thanks ChatGPT!
    return ptr_info;
  }}
  PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
  ptr_info.valid = false;
  return ptr_info;
}}

static inline CUtensorMap* getTmaDesc(PyObject *obj) {{
  if (sizeof(CUtensorMap*) != 8) {{
    PyErr_SetString(PyExc_SystemError, "getTmaDesc() requires 64-bit compilation");
    return NULL;
  }}

  PyObject *method_handle = PyObject_GetAttrString(obj, "tma_desc_cpu_ptr");
  if (!method_handle) {{
    PyErr_SetString(PyExc_TypeError, "tma_desc_cpu_ptr() method does not exist");
    return NULL;
  }}

  PyObject *empty_tuple = PyTuple_New(0);
  if (!empty_tuple) {{
    Py_DECREF(method_handle);
    PyErr_SetString(PyExc_SystemError, "Internal Python error!");
    return NULL;
  }}
  PyObject *method_ret = PyObject_Call(method_handle, empty_tuple, NULL);
  Py_DECREF(empty_tuple);
  Py_DECREF(method_handle);
  if (!method_ret) {{
    PyErr_SetString(PyExc_SystemError, "Internal Python error!");
    return NULL;
  }}

  if (!PyLong_Check(method_ret)) {{
    PyErr_SetString(PyExc_TypeError, "tma_desc_cpu_ptr() must return 64-bit int");
    Py_DECREF(method_ret);
    return NULL;
  }}

  uint64_t ptr_as_uint = PyLong_AsUnsignedLongLong(method_ret);
  Py_DECREF(method_ret);
  if (!ptr_as_uint) {{
    PyErr_SetString(PyExc_ValueError, "received NULL ptr from tma_desc_cpu_ptr()");
    return NULL;
  }}
  if (ptr_as_uint % 64 != 0) {{
    PyErr_SetString(PyExc_ValueError, "tma_desc_cpu_ptr() must be 64-byte aligned");
    return NULL;
  }}

  return (CUtensorMap*)(ptr_as_uint);
}}

static void ensureCudaContext() {{
  CUcontext pctx;
  CUDA_CHECK(cuCtxGetCurrent(&pctx));
  if (!pctx) {{
    // Ensure device context.
    CUdevice device;
    CUDA_CHECK(cuDeviceGet(&device, 0));
    CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device));
    CUDA_CHECK(cuCtxSetCurrent(pctx));
  }}
}}

static PyObject* launch(PyObject* self, PyObject* args) {{
  // ensure cuda context is valid before calling any CUDA APIs, e.g. before getPointer calls cuPointerGetAttributes
  ensureCudaContext();

  int gridX, gridY, gridZ;
  uint64_t _stream;
  uint64_t _function;
  PyObject *launch_enter_hook = NULL;
  PyObject *launch_exit_hook = NULL;
  PyObject *kernel_metadata = NULL;
  PyObject *launch_metadata = NULL;
  {' '.join([f"{_extracted_type(ty)} _arg{i}; " for i, ty in signature.items()])}
  if(!PyArg_ParseTuple(args, \"{format}\", &gridX, &gridY, &gridZ, &_stream, &_function,
                                           &kernel_metadata, &launch_metadata,
                                           &launch_enter_hook, &launch_exit_hook {args_list})) {{
    return NULL;
  }}

  int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ;
  if (!PyArg_ParseTuple(kernel_metadata, \"iiiiii\", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {{
    PyErr_SetString(PyExc_TypeError, "kernel_metadata must be a tuple");
    return NULL;
  }}

  // extract launch metadata
  if (launch_enter_hook != Py_None){{
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_enter_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;
  }}

  // raise exception asap
  {"".join([f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;" if ty[0] == "*" else "" for i, ty in signature.items()])};
  {"".join([f"CUtensorMap* tma_ptr{i} = getTmaDesc(_arg{i}); if (!tma_ptr{i}) return NULL;" if ty == "nvTmaDesc" else "" for i, ty in signature.items()])};
  Py_BEGIN_ALLOW_THREADS;
  _launch(gridX, gridY, gridZ, num_warps, num_ctas, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (CUstream)_stream, (CUfunction)_function{', ' + ', '.join(internal_args_list) if len(internal_args_list) > 0 else ''});
  Py_END_ALLOW_THREADS;
  if (PyErr_Occurred()) {{
    return NULL;
  }}

  if(launch_exit_hook != Py_None){{
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_exit_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;

  }}

  // return None
  Py_INCREF(Py_None);
  return Py_None;
}}

static PyMethodDef ModuleMethods[] = {{
  {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}},
  {{NULL, NULL, 0, NULL}} // sentinel
}};

static struct PyModuleDef ModuleDef = {{
  PyModuleDef_HEAD_INIT,
  \"__triton_launcher\",
  NULL, //documentation
  -1, //size
  ModuleMethods
}};

PyMODINIT_FUNC PyInit___triton_launcher(void) {{
  PyObject *m = PyModule_Create(&ModuleDef);
  if(m == NULL) {{
    return NULL;
  }}
  PyModule_AddFunctions(m, ModuleMethods);
  return m;
}}
"""
    return src
472
+
473
+
474
class CudaLauncher(object):
    """Compiles and wraps the native launch entry point for one kernel signature."""

    def __init__(self, src, metadata):
        ids = {"ids_of_const_exprs": src.fn.constexprs if hasattr(src, "fn") else tuple()}
        constants = src.constants if hasattr(src, "constants") else dict()

        def cst_key(i):
            # Constants/signature entries may be keyed by argument name or position.
            return src.fn.arg_names.index(i) if isinstance(i, str) else i

        constants = {cst_key(key): value for key, value in constants.items()}
        signature = {cst_key(key): value for key, value in src.signature.items()}
        launcher_src = make_launcher(constants, signature, ids)
        mod = compile_module_from_src(launcher_src, "__triton_launcher")
        self.launch = mod.launch

    def __call__(self, *args, **kwargs):
        self.launch(*args, **kwargs)
488
+
489
+
490
class CudaDriver(GPUDriver):
    """Triton GPU driver for the NVIDIA (CUDA) backend."""

    def __init__(self):
        self.utils = CudaUtils()  # TODO: make static
        self.launcher_cls = CudaLauncher
        super().__init__()

    def get_current_target(self):
        # Encode compute capability as major*10 + minor (e.g. (8, 6) -> 86).
        device = self.get_current_device()
        major, minor = self.get_device_capability(device)
        warp_size = 32
        return GPUTarget("cuda", major * 10 + minor, warp_size)

    def get_device_interface(self):
        import torch
        return torch.cuda

    @staticmethod
    def is_active():
        # Active only when torch sees a CUDA device that is not a ROCm/HIP build.
        import torch
        if not torch.cuda.is_available():
            return False
        return torch.version.hip is None

    def get_benchmarker(self):
        from triton.testing import do_bench
        return do_bench

    def get_empty_cache_for_benchmark(self):
        import torch

        # We maintain a buffer of 256 MB that we clear
        # before each kernel call to make sure that the L2 cache
        # doesn't contain any input data before the run
        cache_size = 256 * 1024 * 1024
        return torch.empty(cache_size // 4, dtype=torch.int, device='cuda')