triton-windows 3.2.0.post11__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of triton-windows might be problematic. Click here for more details.

Files changed (154) hide show
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +85 -0
  3. triton/_internal_testing.py +123 -0
  4. triton/backends/__init__.py +50 -0
  5. triton/backends/amd/compiler.py +368 -0
  6. triton/backends/amd/driver.c +211 -0
  7. triton/backends/amd/driver.py +512 -0
  8. triton/backends/amd/include/hip/amd_detail/amd_channel_descriptor.h +358 -0
  9. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +1031 -0
  10. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +1612 -0
  11. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +1337 -0
  12. triton/backends/amd/include/hip/amd_detail/amd_hip_bfloat16.h +293 -0
  13. triton/backends/amd/include/hip/amd_detail/amd_hip_common.h +32 -0
  14. triton/backends/amd/include/hip/amd_detail/amd_hip_complex.h +174 -0
  15. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +829 -0
  16. triton/backends/amd/include/hip/amd_detail/amd_hip_fp16.h +1809 -0
  17. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +108 -0
  18. triton/backends/amd/include/hip/amd_detail/amd_hip_math_constants.h +124 -0
  19. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime.h +405 -0
  20. triton/backends/amd/include/hip/amd_detail/amd_hip_runtime_pt_api.h +196 -0
  21. triton/backends/amd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +565 -0
  22. triton/backends/amd/include/hip/amd_detail/amd_hip_vector_types.h +2226 -0
  23. triton/backends/amd/include/hip/amd_detail/amd_math_functions.h +104 -0
  24. triton/backends/amd/include/hip/amd_detail/amd_surface_functions.h +244 -0
  25. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +494 -0
  26. triton/backends/amd/include/hip/amd_detail/concepts.hpp +30 -0
  27. triton/backends/amd/include/hip/amd_detail/device_library_decls.h +133 -0
  28. triton/backends/amd/include/hip/amd_detail/functional_grid_launch.hpp +218 -0
  29. triton/backends/amd/include/hip/amd_detail/grid_launch.h +67 -0
  30. triton/backends/amd/include/hip/amd_detail/grid_launch.hpp +50 -0
  31. triton/backends/amd/include/hip/amd_detail/grid_launch_GGL.hpp +26 -0
  32. triton/backends/amd/include/hip/amd_detail/helpers.hpp +137 -0
  33. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +1350 -0
  34. triton/backends/amd/include/hip/amd_detail/hip_assert.h +101 -0
  35. triton/backends/amd/include/hip/amd_detail/hip_cooperative_groups_helper.h +242 -0
  36. triton/backends/amd/include/hip/amd_detail/hip_fp16_gcc.h +254 -0
  37. triton/backends/amd/include/hip/amd_detail/hip_fp16_math_fwd.h +96 -0
  38. triton/backends/amd/include/hip/amd_detail/hip_ldg.h +100 -0
  39. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +10169 -0
  40. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +77 -0
  41. triton/backends/amd/include/hip/amd_detail/host_defines.h +180 -0
  42. triton/backends/amd/include/hip/amd_detail/hsa_helpers.hpp +102 -0
  43. triton/backends/amd/include/hip/amd_detail/macro_based_grid_launch.hpp +798 -0
  44. triton/backends/amd/include/hip/amd_detail/math_fwd.h +698 -0
  45. triton/backends/amd/include/hip/amd_detail/ockl_image.h +177 -0
  46. triton/backends/amd/include/hip/amd_detail/program_state.hpp +107 -0
  47. triton/backends/amd/include/hip/amd_detail/texture_fetch_functions.h +491 -0
  48. triton/backends/amd/include/hip/amd_detail/texture_indirect_functions.h +478 -0
  49. triton/backends/amd/include/hip/channel_descriptor.h +39 -0
  50. triton/backends/amd/include/hip/device_functions.h +38 -0
  51. triton/backends/amd/include/hip/driver_types.h +468 -0
  52. triton/backends/amd/include/hip/hip_bf16.h +36 -0
  53. triton/backends/amd/include/hip/hip_bfloat16.h +44 -0
  54. triton/backends/amd/include/hip/hip_common.h +100 -0
  55. triton/backends/amd/include/hip/hip_complex.h +38 -0
  56. triton/backends/amd/include/hip/hip_cooperative_groups.h +46 -0
  57. triton/backends/amd/include/hip/hip_deprecated.h +95 -0
  58. triton/backends/amd/include/hip/hip_ext.h +159 -0
  59. triton/backends/amd/include/hip/hip_fp16.h +36 -0
  60. triton/backends/amd/include/hip/hip_gl_interop.h +32 -0
  61. triton/backends/amd/include/hip/hip_hcc.h +24 -0
  62. triton/backends/amd/include/hip/hip_math_constants.h +36 -0
  63. triton/backends/amd/include/hip/hip_profile.h +27 -0
  64. triton/backends/amd/include/hip/hip_runtime.h +75 -0
  65. triton/backends/amd/include/hip/hip_runtime_api.h +8919 -0
  66. triton/backends/amd/include/hip/hip_texture_types.h +29 -0
  67. triton/backends/amd/include/hip/hip_vector_types.h +41 -0
  68. triton/backends/amd/include/hip/hip_version.h +17 -0
  69. triton/backends/amd/include/hip/hiprtc.h +421 -0
  70. triton/backends/amd/include/hip/library_types.h +78 -0
  71. triton/backends/amd/include/hip/math_functions.h +42 -0
  72. triton/backends/amd/include/hip/surface_types.h +63 -0
  73. triton/backends/amd/include/hip/texture_types.h +194 -0
  74. triton/backends/amd/include/hsa/Brig.h +1131 -0
  75. triton/backends/amd/include/hsa/amd_hsa_common.h +91 -0
  76. triton/backends/amd/include/hsa/amd_hsa_elf.h +436 -0
  77. triton/backends/amd/include/hsa/amd_hsa_kernel_code.h +269 -0
  78. triton/backends/amd/include/hsa/amd_hsa_queue.h +109 -0
  79. triton/backends/amd/include/hsa/amd_hsa_signal.h +80 -0
  80. triton/backends/amd/include/hsa/hsa.h +5729 -0
  81. triton/backends/amd/include/hsa/hsa_amd_tool.h +91 -0
  82. triton/backends/amd/include/hsa/hsa_api_trace.h +566 -0
  83. triton/backends/amd/include/hsa/hsa_ext_amd.h +3090 -0
  84. triton/backends/amd/include/hsa/hsa_ext_finalize.h +531 -0
  85. triton/backends/amd/include/hsa/hsa_ext_image.h +1454 -0
  86. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +488 -0
  87. triton/backends/amd/include/hsa/hsa_ven_amd_loader.h +667 -0
  88. triton/backends/amd/include/roctracer/ext/prof_protocol.h +107 -0
  89. triton/backends/amd/include/roctracer/hip_ostream_ops.h +4435 -0
  90. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +1467 -0
  91. triton/backends/amd/include/roctracer/hsa_prof_str.h +3027 -0
  92. triton/backends/amd/include/roctracer/roctracer.h +779 -0
  93. triton/backends/amd/include/roctracer/roctracer_ext.h +81 -0
  94. triton/backends/amd/include/roctracer/roctracer_hcc.h +24 -0
  95. triton/backends/amd/include/roctracer/roctracer_hip.h +37 -0
  96. triton/backends/amd/include/roctracer/roctracer_hsa.h +112 -0
  97. triton/backends/amd/include/roctracer/roctracer_plugin.h +137 -0
  98. triton/backends/amd/include/roctracer/roctracer_roctx.h +67 -0
  99. triton/backends/amd/include/roctracer/roctx.h +229 -0
  100. triton/backends/amd/lib/ockl.bc +0 -0
  101. triton/backends/amd/lib/ocml.bc +0 -0
  102. triton/backends/compiler.py +304 -0
  103. triton/backends/driver.py +48 -0
  104. triton/backends/nvidia/__init__.py +0 -0
  105. triton/backends/nvidia/bin/ptxas.exe +0 -0
  106. triton/backends/nvidia/compiler.py +410 -0
  107. triton/backends/nvidia/driver.c +451 -0
  108. triton/backends/nvidia/driver.py +524 -0
  109. triton/backends/nvidia/include/cuda.h +24359 -0
  110. triton/backends/nvidia/lib/libdevice.10.bc +0 -0
  111. triton/backends/nvidia/lib/x64/cuda.lib +0 -0
  112. triton/compiler/__init__.py +4 -0
  113. triton/compiler/code_generator.py +1303 -0
  114. triton/compiler/compiler.py +430 -0
  115. triton/compiler/errors.py +51 -0
  116. triton/compiler/make_launcher.py +0 -0
  117. triton/errors.py +5 -0
  118. triton/language/__init__.py +294 -0
  119. triton/language/_utils.py +21 -0
  120. triton/language/core.py +2694 -0
  121. triton/language/extra/__init__.py +26 -0
  122. triton/language/extra/cuda/__init__.py +13 -0
  123. triton/language/extra/cuda/_experimental_tma.py +108 -0
  124. triton/language/extra/cuda/libdevice.py +1629 -0
  125. triton/language/extra/cuda/utils.py +109 -0
  126. triton/language/extra/hip/__init__.py +3 -0
  127. triton/language/extra/hip/libdevice.py +475 -0
  128. triton/language/extra/libdevice.py +786 -0
  129. triton/language/math.py +250 -0
  130. triton/language/random.py +207 -0
  131. triton/language/semantic.py +1796 -0
  132. triton/language/standard.py +452 -0
  133. triton/runtime/__init__.py +23 -0
  134. triton/runtime/autotuner.py +408 -0
  135. triton/runtime/build.py +111 -0
  136. triton/runtime/cache.py +295 -0
  137. triton/runtime/driver.py +60 -0
  138. triton/runtime/errors.py +26 -0
  139. triton/runtime/interpreter.py +1235 -0
  140. triton/runtime/jit.py +951 -0
  141. triton/testing.py +511 -0
  142. triton/tools/__init__.py +0 -0
  143. triton/tools/build_extern.py +365 -0
  144. triton/tools/compile.c +67 -0
  145. triton/tools/compile.h +14 -0
  146. triton/tools/compile.py +155 -0
  147. triton/tools/disasm.py +144 -0
  148. triton/tools/experimental_descriptor.py +32 -0
  149. triton/tools/link.py +322 -0
  150. triton/windows_utils.py +375 -0
  151. triton_windows-3.2.0.post11.dist-info/METADATA +39 -0
  152. triton_windows-3.2.0.post11.dist-info/RECORD +154 -0
  153. triton_windows-3.2.0.post11.dist-info/WHEEL +5 -0
  154. triton_windows-3.2.0.post11.dist-info/top_level.txt +12 -0
@@ -0,0 +1,524 @@
1
+ import functools
2
+ import os
3
+ import hashlib
4
+ import subprocess
5
+ import tempfile
6
+ from pathlib import Path
7
+ from triton.runtime.build import _build
8
+ from triton.runtime.cache import get_cache_manager
9
+ from triton.backends.compiler import GPUTarget
10
+ from triton.backends.driver import GPUDriver
11
+
12
# Location of this backend package; the bundled CUDA headers live in ./include
# and the bundled bitcode libraries (libdevice) in ./lib.
dirname = os.path.dirname(os.path.realpath(__file__))
include_dir = [os.path.join(dirname, "include")]
if os.name == "nt":
    # Windows: the CUDA toolkit is not on a standard search path, so locate
    # its include/lib directories at import time via the helper module.
    from triton.windows_utils import find_cuda
    _, cuda_inc_dirs, _ = find_cuda()
    include_dir += cuda_inc_dirs
libdevice_dir = os.path.join(dirname, "lib")
# Native launcher extensions link against the CUDA driver library.
libraries = ['cuda']
20
+
21
+
22
@functools.lru_cache()
def libcuda_dirs():
    """Return candidate directories that contain the CUDA driver library.

    Resolution order:
    1. Windows: query ``triton.windows_utils.find_cuda()``.
    2. ``TRITON_LIBCUDA_PATH`` environment variable override.
    3. The ldconfig cache, falling back to ``LD_LIBRARY_PATH``.

    Raises:
        AssertionError: if no directory containing ``libcuda.so.1`` is found.
    """
    if os.name == "nt":
        _, _, cuda_lib_dirs = find_cuda()
        return cuda_lib_dirs

    env_libcuda_path = os.getenv("TRITON_LIBCUDA_PATH")
    if env_libcuda_path:
        return [env_libcuda_path]

    libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode()
    # each line looks like the following:
    # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
    locs = [line.split()[-1] for line in libs.splitlines() if "libcuda.so.1" in line]
    dirs = [os.path.dirname(loc) for loc in locs]
    env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
    if env_ld_library_path and not dirs:
        # `d` (not `dir`) to avoid shadowing the builtin.
        dirs = [d for d in env_ld_library_path.split(":") if os.path.exists(os.path.join(d, "libcuda.so.1"))]
    # Fixed grammar ("cannot found" -> "cannot be found") and the missing
    # separator between the two concatenated sentences below.
    msg = 'libcuda.so cannot be found!\n'
    if locs:
        msg += 'Possible files are located at %s. ' % str(locs)
        msg += 'Please create a symlink of libcuda.so to any of the files.'
    else:
        msg += 'Please make sure GPU is set up and then run "/sbin/ldconfig"'
        msg += ' (requires sudo) to refresh the linker cache.'
    # NOTE(review): kept as `assert` (not `raise`) so callers catching
    # AssertionError keep working; beware it is stripped under `python -O`.
    assert any(os.path.exists(os.path.join(path, 'libcuda.so.1')) for path in dirs), msg
    return dirs
49
+
50
+
51
@functools.lru_cache()
def library_dirs():
    """Directories searched for native libraries when building launcher extensions."""
    dirs = [libdevice_dir]
    dirs.extend(libcuda_dirs())
    return dirs
54
+
55
+
56
def compile_module_from_src(src, name):
    """Build (or fetch from the Triton cache) a native extension from C source and import it.

    The cache key is the SHA-256 of the source text, so identical sources
    reuse the previously built binary.
    """
    import importlib.util

    key = hashlib.sha256(src.encode("utf-8")).hexdigest()
    cache = get_cache_manager(key)
    # Windows extension modules use the .pyd suffix.
    ext = "pyd" if os.name == "nt" else "so"
    so_name = f"{name}.{ext}"
    cache_path = cache.get_file(so_name)
    if cache_path is None:
        # Cache miss: compile in a throwaway directory, then persist the binary.
        with tempfile.TemporaryDirectory() as tmpdir:
            src_path = os.path.join(tmpdir, f"{name}.c")
            Path(src_path).write_text(src)
            so = _build(name, src_path, tmpdir, library_dirs(), include_dir, libraries)
            cache_path = cache.put(Path(so).read_bytes(), so_name, binary=True)
    spec = importlib.util.spec_from_file_location(name, cache_path)
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod
77
+
78
+
79
+ # ------------------------
80
+ # Utils
81
+ # ------------------------
82
+
83
+
84
class CudaUtils(object):
    """Singleton wrapper around the compiled ``cuda_utils`` native helper module."""

    def __new__(cls):
        # Create the shared instance lazily on first construction.
        if not hasattr(cls, "instance"):
            cls.instance = super().__new__(cls)
        return cls.instance

    def __init__(self):
        driver_src = Path(os.path.join(dirname, "driver.c")).read_text()
        mod = compile_module_from_src(driver_src, "cuda_utils")
        # Re-export the native entry points as attributes of this object.
        for entry in (
                "load_binary",
                "get_device_properties",
                "cuOccupancyMaxActiveClusters",
                "set_printf_fifo_size",
                "fill_1d_tma_descriptor",
                "fill_2d_tma_descriptor",
        ):
            setattr(self, entry, getattr(mod, entry))
99
+
100
+
101
+ # ------------------------
102
+ # Launcher
103
+ # ------------------------
104
+
105
+
106
def ty_to_cpp(ty):
    """Map a Triton signature type string to the C type used in the launcher.

    Any pointer type (leading ``*``) becomes a raw ``CUdeviceptr``; scalar
    half/bfloat types are passed from Python as plain ``float``.
    """
    if ty[0] == '*':
        return "CUdeviceptr"
    scalar_types = {
        "i1": "int32_t",
        "i8": "int8_t",
        "i16": "int16_t",
        "i32": "int32_t",
        "i64": "int64_t",
        "u1": "uint32_t",
        "u8": "uint8_t",
        "u16": "uint16_t",
        "u32": "uint32_t",
        "u64": "uint64_t",
        "fp16": "float",
        "bf16": "float",
        "fp32": "float",
        "f32": "float",
        "fp64": "double",
        "nvTmaDesc": "CUtensorMap",
    }
    return scalar_types[ty]
127
+
128
+
129
def make_launcher(constants, signature, ids):
    """Generate the C source of a CPython extension that launches kernels of one signature.

    The generated module exposes a single ``launch(...)`` function that parses
    the Python-level arguments, resolves device pointers / TMA descriptors,
    and calls ``cuLaunchKernel`` (or ``cuLaunchKernelEx`` for multi-CTA
    cluster launches). The string is later compiled by
    ``compile_module_from_src``.
    """
    # Record the end of regular arguments;
    # subsequent arguments are architecture-specific descriptors, such as tensor descriptors for CUDA.
    arg_decls = ', '.join(f"{ty_to_cpp(ty)} arg{i}" for i, ty in signature.items())

    # C type each argument has on the Python/C boundary (pointers and TMA
    # descriptors arrive as Python objects).
    def _extracted_type(ty):
        if ty[0] == '*':
            return "PyObject*"
        if ty == "nvTmaDesc":
            return "PyObject*"

        return ty_to_cpp(ty)

    # PyArg_ParseTuple format character for a given C type.
    def format_of(ty):
        return {
            "PyObject*": "O",
            "float": "f",
            "double": "d",
            "long": "l",
            "int8_t": "b",
            "int16_t": "h",
            "int32_t": "i",
            "int64_t": "L",
            "uint8_t": "B",
            "uint16_t": "H",
            "uint32_t": "I",
            "uint64_t": "K",
        }[ty]

    args_format = ''.join([format_of(_extracted_type(ty)) for ty in signature.values()])
    # "iiiKKOOOO" covers grid dims, stream, function, and the four metadata/hook objects.
    format = "iiiKKOOOO" + args_format
    args_list = ', ' + ', '.join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else ''

    # Expressions forwarded from launch() into _launch() for each argument.
    internal_args_list = []
    for i, ty in signature.items():
        if ty[0] == "*":
            internal_args_list.append(f"ptr_info{i}.dev_ptr")
        elif ty == "nvTmaDesc":
            # Note: we have to dereference the pointer
            internal_args_list.append(f"*tma_ptr{i}")
        else:
            internal_args_list.append(f"_arg{i}")

    # generate glue code
    # Constant-folded arguments are excluded from the kernel parameter array;
    # an empty array degenerates to a NULL params pointer.
    params = [i for i in signature.keys() if i not in constants]
    if params:
        params_decl = ", ".join(f"&arg{i}" for i in params)
        params_decl = f"void *params[] = {{ {params_decl} }};"
    else:
        params_decl = "void **params = NULL;"
    src = f"""
#include \"cuda.h\"
#include <stdbool.h>
#include <Python.h>

#ifndef _WIN32
#include <dlfcn.h>
#else
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif

static inline void gpuAssert(CUresult code, const char *file, int line)
{{
   if (code != CUDA_SUCCESS)
   {{
      const char* prefix = "Triton Error [CUDA]: ";
      const char* str;
      cuGetErrorString(code, &str);
      char err[1024] = {{0}};
      strcat(err, prefix);
      strcat(err, str);
      PyGILState_STATE gil_state;
      gil_state = PyGILState_Ensure();
      PyErr_SetString(PyExc_RuntimeError, err);
      PyGILState_Release(gil_state);
   }}
}}

#define CUDA_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }}

typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra);

#ifndef _WIN32
static cuLaunchKernelEx_t getLaunchKernelExHandle() {{
  // Open the shared library
  void* handle = dlopen("libcuda.so.1", RTLD_LAZY);
  if (!handle) {{
    PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");
    return NULL;
  }}
  // Clear any existing error
  dlerror();
  cuLaunchKernelEx_t cuLaunchKernelExHandle = (cuLaunchKernelEx_t)dlsym(handle, "cuLaunchKernelEx");
  // Check for errors
  const char *dlsym_error = dlerror();
  if (dlsym_error) {{
    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from libcuda.so.1");
    return NULL;
  }}
  return cuLaunchKernelExHandle;
}}
#else
static cuLaunchKernelEx_t getLaunchKernelExHandle() {{
  // Open the shared library
  HMODULE handle = LoadLibraryA("nvcuda.dll");
  if (!handle) {{
    PyErr_SetString(PyExc_RuntimeError, "Failed to open nvcuda.dll");
    return NULL;
  }}
  cuLaunchKernelEx_t cuLaunchKernelExHandle =
      (cuLaunchKernelEx_t)GetProcAddress((HMODULE)handle, "cuLaunchKernelEx");
  // Check for errors
  long error = GetLastError();
  if (error) {{
    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from nvcuda.dll");
    return NULL;
  }}
  return cuLaunchKernelExHandle;
}}
#endif

static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction function{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
  {params_decl}
  if (gridX*gridY*gridZ > 0) {{
    if (num_ctas == 1) {{
      CUDA_CHECK(cuLaunchKernel(function, gridX, gridY, gridZ, 32*num_warps, 1, 1, shared_memory, stream, params, 0));
    }} else {{
      CUlaunchAttribute launchAttr[2];
      launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
      launchAttr[0].value.clusterDim.x = clusterDimX;
      launchAttr[0].value.clusterDim.y = clusterDimY;
      launchAttr[0].value.clusterDim.z = clusterDimZ;
      launchAttr[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
      launchAttr[1].value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
      CUlaunchConfig config;
      config.gridDimX = gridX * clusterDimX;
      config.gridDimY = gridY * clusterDimY;
      config.gridDimZ = gridZ * clusterDimZ;
      config.blockDimX = 32 * num_warps;
      config.blockDimY = 1;
      config.blockDimZ = 1;
      config.sharedMemBytes = shared_memory;
      config.hStream = stream;
      config.attrs = launchAttr;
      config.numAttrs = 2;
      static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
      if (cuLaunchKernelExHandle == NULL) {{
        cuLaunchKernelExHandle = getLaunchKernelExHandle();
      }}
      CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
    }}
  }}
}}

typedef struct _DevicePtrInfo {{
  CUdeviceptr dev_ptr;
  bool valid;
}} DevicePtrInfo;

static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {{
  DevicePtrInfo ptr_info;
  ptr_info.dev_ptr = 0;
  ptr_info.valid = true;
  if (PyLong_Check(obj)) {{
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj);
    return ptr_info;
  }}
  if (obj == Py_None) {{
    // valid nullptr
    return ptr_info;
  }}
  PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr");
  if(ptr){{
    PyObject *empty_tuple = PyTuple_New(0);
    PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL);
    Py_DECREF(empty_tuple);
    Py_DECREF(ptr);
    if (!PyLong_Check(ret)) {{
      PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
      ptr_info.valid = false;
      return ptr_info;
    }}
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret);
    if(!ptr_info.dev_ptr)
      return ptr_info;
    uint64_t dev_ptr;
    int status = cuPointerGetAttribute(&dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
    if (status == CUDA_ERROR_INVALID_VALUE) {{
        PyErr_Format(PyExc_ValueError,
                     "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
        ptr_info.valid = false;
    }} else if (status != CUDA_SUCCESS) {{
        CUDA_CHECK(status);  // Catch any other cuda API errors
        ptr_info.valid = false;
    }}
    ptr_info.dev_ptr = dev_ptr;
    Py_DECREF(ret);  // Thanks ChatGPT!
    return ptr_info;
  }}
  PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
  ptr_info.valid = false;
  return ptr_info;
}}

static inline CUtensorMap* getTmaDesc(PyObject *obj) {{
  if (sizeof(CUtensorMap*) != 8) {{
    PyErr_SetString(PyExc_SystemError, "getTmaDesc() requires 64-bit compilation");
    return NULL;
  }}

  PyObject *method_handle = PyObject_GetAttrString(obj, "tma_desc_cpu_ptr");
  if (!method_handle) {{
    PyErr_SetString(PyExc_TypeError, "tma_desc_cpu_ptr() method does not exist");
    return NULL;
  }}

  PyObject *empty_tuple = PyTuple_New(0);
  if (!empty_tuple) {{
    Py_DECREF(method_handle);
    PyErr_SetString(PyExc_SystemError, "Internal Python error!");
    return NULL;
  }}
  PyObject *method_ret = PyObject_Call(method_handle, empty_tuple, NULL);
  Py_DECREF(empty_tuple);
  Py_DECREF(method_handle);
  if (!method_ret) {{
    PyErr_SetString(PyExc_SystemError, "Internal Python error!");
    return NULL;
  }}

  if (!PyLong_Check(method_ret)) {{
    PyErr_SetString(PyExc_TypeError, "tma_desc_cpu_ptr() must return 64-bit int");
    Py_DECREF(method_ret);
    return NULL;
  }}

  uint64_t ptr_as_uint = PyLong_AsUnsignedLongLong(method_ret);
  Py_DECREF(method_ret);
  if (!ptr_as_uint) {{
    PyErr_SetString(PyExc_ValueError, "received NULL ptr from tma_desc_cpu_ptr()");
    return NULL;
  }}
  if (ptr_as_uint % 64 != 0) {{
    PyErr_SetString(PyExc_ValueError, "tma_desc_cpu_ptr() must be 64-byte aligned");
    return NULL;
  }}

  return (CUtensorMap*)(ptr_as_uint);
}}

static void ensureCudaContext() {{
  CUcontext pctx;
  CUDA_CHECK(cuCtxGetCurrent(&pctx));
  if (!pctx) {{
    // Ensure device context.
    CUdevice device;
    CUDA_CHECK(cuDeviceGet(&device, 0));
    CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device));
    CUDA_CHECK(cuCtxSetCurrent(pctx));
  }}
}}

static PyObject* launch(PyObject* self, PyObject* args) {{
  // ensure cuda context is valid before calling any CUDA APIs, e.g. before getPointer calls cuPointerGetAttributes
  ensureCudaContext();

  int gridX, gridY, gridZ;
  uint64_t _stream;
  uint64_t _function;
  PyObject *launch_enter_hook = NULL;
  PyObject *launch_exit_hook = NULL;
  PyObject *kernel_metadata = NULL;
  PyObject *launch_metadata = NULL;
  {' '.join([f"{_extracted_type(ty)} _arg{i}; " for i, ty in signature.items()])}
  if(!PyArg_ParseTuple(args, \"{format}\", &gridX, &gridY, &gridZ, &_stream, &_function,
                                           &kernel_metadata, &launch_metadata,
                                           &launch_enter_hook, &launch_exit_hook {args_list})) {{
    return NULL;
  }}

  int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ;
  if (!PyArg_ParseTuple(kernel_metadata, \"iiiiii\", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {{
    PyErr_SetString(PyExc_TypeError, "kernel_metadata must be a tuple");
    return NULL;
  }}

  // extract launch metadata
  if (launch_enter_hook != Py_None){{
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_enter_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;
  }}

  // raise exception asap
  {"".join([f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;" if ty[0] == "*" else "" for i, ty in signature.items()])};
  {"".join([f"CUtensorMap* tma_ptr{i} = getTmaDesc(_arg{i}); if (!tma_ptr{i}) return NULL;" if ty == "nvTmaDesc" else "" for i, ty in signature.items()])};
  Py_BEGIN_ALLOW_THREADS;
  _launch(gridX, gridY, gridZ, num_warps, num_ctas, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (CUstream)_stream, (CUfunction)_function{', ' + ', '.join(internal_args_list) if len(internal_args_list) > 0 else ''});
  Py_END_ALLOW_THREADS;
  if (PyErr_Occurred()) {{
    return NULL;
  }}

  if(launch_exit_hook != Py_None){{
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_exit_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;

  }}

  // return None
  Py_INCREF(Py_None);
  return Py_None;
}}

static PyMethodDef ModuleMethods[] = {{
  {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}},
  {{NULL, NULL, 0, NULL}} // sentinel
}};

static struct PyModuleDef ModuleDef = {{
  PyModuleDef_HEAD_INIT,
  \"__triton_launcher\",
  NULL, //documentation
  -1, //size
  ModuleMethods
}};

PyMODINIT_FUNC PyInit___triton_launcher(void) {{
  PyObject *m = PyModule_Create(&ModuleDef);
  if(m == NULL) {{
    return NULL;
  }}
  PyModule_AddFunctions(m, ModuleMethods);
  return m;
}}
"""
    return src
472
+
473
+
474
class CudaLauncher(object):
    """Compiles and wraps the native launch entry point for one kernel signature."""

    def __init__(self, src, metadata):
        ids = {"ids_of_const_exprs": src.fn.constexprs if hasattr(src, "fn") else tuple()}
        constants = src.constants if hasattr(src, "constants") else dict()

        def cst_key(i):
            # Constants/signature entries may be keyed by argument name or position.
            return src.fn.arg_names.index(i) if isinstance(i, str) else i

        constants = {cst_key(key): value for key, value in constants.items()}
        signature = {cst_key(key): value for key, value in src.signature.items()}
        launcher_src = make_launcher(constants, signature, ids)
        mod = compile_module_from_src(launcher_src, "__triton_launcher")
        self.launch = mod.launch

    def __call__(self, *args, **kwargs):
        self.launch(*args, **kwargs)
488
+
489
+
490
class CudaDriver(GPUDriver):
    """Triton GPU driver for the NVIDIA (CUDA) backend."""

    def __init__(self):
        self.utils = CudaUtils()  # TODO: make static
        self.launcher_cls = CudaLauncher
        super().__init__()

    def get_current_target(self):
        # Encode compute capability as major*10 + minor (e.g. (8, 6) -> 86).
        device = self.get_current_device()
        major, minor = self.get_device_capability(device)
        warp_size = 32
        return GPUTarget("cuda", major * 10 + minor, warp_size)

    def get_device_interface(self):
        import torch
        return torch.cuda

    @staticmethod
    def is_active():
        # Active only when torch sees a CUDA device that is not a ROCm/HIP build.
        import torch
        if not torch.cuda.is_available():
            return False
        return torch.version.hip is None

    def get_benchmarker(self):
        from triton.testing import do_bench
        return do_bench

    def get_empty_cache_for_benchmark(self):
        import torch

        # We maintain a buffer of 256 MB that we clear
        # before each kernel call to make sure that the L2 cache
        # doesn't contain any input data before the run
        cache_size = 256 * 1024 * 1024
        return torch.empty(cache_size // 4, dtype=torch.int, device='cuda')