triton-windows 3.3.1.post21__cp312-cp312-win_amd64.whl → 3.4.0.post21__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of triton-windows might be problematic.
- triton/_C/libtriton.pyd +0 -0
- triton/__init__.py +4 -1
- triton/_filecheck.py +87 -0
- triton/_internal_testing.py +26 -15
- triton/_utils.py +110 -21
- triton/backends/__init__.py +20 -23
- triton/backends/amd/__init__.py +0 -0
- triton/backends/amd/compiler.py +112 -78
- triton/backends/amd/driver.c +5 -2
- triton/backends/amd/driver.py +143 -46
- triton/backends/compiler.py +7 -21
- triton/backends/nvidia/bin/ptxas.exe +0 -0
- triton/backends/nvidia/compiler.py +94 -94
- triton/backends/nvidia/driver.c +90 -98
- triton/backends/nvidia/driver.py +296 -125
- triton/compiler/code_generator.py +212 -111
- triton/compiler/compiler.py +110 -25
- triton/experimental/__init__.py +0 -0
- triton/experimental/gluon/__init__.py +4 -0
- triton/experimental/gluon/_compiler.py +0 -0
- triton/experimental/gluon/_runtime.py +99 -0
- triton/experimental/gluon/language/__init__.py +18 -0
- triton/experimental/gluon/language/_core.py +312 -0
- triton/experimental/gluon/language/_layouts.py +230 -0
- triton/experimental/gluon/language/_math.py +12 -0
- triton/experimental/gluon/language/_semantic.py +287 -0
- triton/experimental/gluon/language/_standard.py +47 -0
- triton/experimental/gluon/language/nvidia/__init__.py +4 -0
- triton/experimental/gluon/language/nvidia/blackwell/__init__.py +202 -0
- triton/experimental/gluon/language/nvidia/blackwell/tma.py +32 -0
- triton/experimental/gluon/language/nvidia/hopper/__init__.py +11 -0
- triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +51 -0
- triton/experimental/gluon/language/nvidia/hopper/tma.py +96 -0
- triton/experimental/gluon/nvidia/__init__.py +4 -0
- triton/experimental/gluon/nvidia/blackwell.py +3 -0
- triton/experimental/gluon/nvidia/hopper.py +40 -0
- triton/knobs.py +481 -0
- triton/language/__init__.py +39 -14
- triton/language/core.py +794 -537
- triton/language/extra/cuda/__init__.py +10 -7
- triton/language/extra/cuda/gdc.py +42 -0
- triton/language/extra/cuda/libdevice.py +394 -394
- triton/language/extra/cuda/utils.py +21 -21
- triton/language/extra/hip/libdevice.py +113 -104
- triton/language/math.py +65 -66
- triton/language/random.py +12 -2
- triton/language/semantic.py +1706 -1770
- triton/language/standard.py +116 -51
- triton/runtime/autotuner.py +117 -59
- triton/runtime/build.py +73 -9
- triton/runtime/cache.py +18 -47
- triton/runtime/driver.py +32 -29
- triton/runtime/interpreter.py +72 -35
- triton/runtime/jit.py +146 -110
- triton/testing.py +16 -12
- triton/tools/disasm.py +3 -4
- triton/tools/tensor_descriptor.py +36 -0
- triton/windows_utils.py +47 -83
- {triton_windows-3.3.1.post21.dist-info → triton_windows-3.4.0.post21.dist-info}/METADATA +7 -2
- {triton_windows-3.3.1.post21.dist-info → triton_windows-3.4.0.post21.dist-info}/RECORD +64 -41
- triton_windows-3.4.0.post21.dist-info/entry_points.txt +3 -0
- triton_windows-3.4.0.post21.dist-info/licenses/LICENSE +23 -0
- triton_windows-3.4.0.post21.dist-info/top_level.txt +1 -0
- triton/language/_utils.py +0 -21
- triton/language/extra/cuda/_experimental_tma.py +0 -106
- triton/tools/experimental_descriptor.py +0 -32
- triton_windows-3.3.1.post21.dist-info/top_level.txt +0 -14
- {triton_windows-3.3.1.post21.dist-info → triton_windows-3.4.0.post21.dist-info}/WHEEL +0 -0
triton/backends/nvidia/driver.py
CHANGED
@@ -1,36 +1,33 @@
 import functools
+import operator
 import os
-import sysconfig
-import hashlib
 import subprocess
-import tempfile
+import triton
+import re
 from pathlib import Path
-from triton.runtime.build import _build
-from triton.runtime.cache import get_cache_manager
+from triton import knobs
+from triton.runtime.build import compile_module_from_src
 from triton.runtime import _allocation
 from triton.backends.compiler import GPUTarget
 from triton.backends.driver import GPUDriver

 dirname = os.path.dirname(os.path.realpath(__file__))
-include_dir = [os.path.join(dirname, "include")]
+include_dirs = [os.path.join(dirname, "include")]
 if os.name == "nt":
     from triton.windows_utils import find_cuda
     _, cuda_inc_dirs, _ = find_cuda()
-    include_dir += cuda_inc_dirs
+    include_dirs += cuda_inc_dirs
 libdevice_dir = os.path.join(dirname, "lib")
 libraries = ['cuda']


 @functools.lru_cache()
 def libcuda_dirs():
-    env_libcuda_path = os.getenv("TRITON_LIBCUDA_PATH")
-    if env_libcuda_path:
+    if env_libcuda_path := knobs.nvidia.libcuda_path:
         return [env_libcuda_path]
-
     if os.name == "nt":
         _, _, cuda_lib_dirs = find_cuda()
         return cuda_lib_dirs
-
     libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode()
     # each line looks like the following:
     # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
@@ -55,36 +52,6 @@ def library_dirs():
     return [libdevice_dir, *libcuda_dirs()]


-@functools.lru_cache()
-def platform_key():
-    from platform import machine, system, architecture
-    return ",".join([machine(), system(), *architecture()])
-
-
-def compile_module_from_src(src, name):
-    key = hashlib.sha256((src + platform_key()).encode("utf-8")).hexdigest()
-    cache = get_cache_manager(key)
-    ext = sysconfig.get_config_var("EXT_SUFFIX").split(".")[-1]
-    cache_path = cache.get_file(f"{name}.{ext}")
-    if cache_path is None:
-        with tempfile.TemporaryDirectory() as tmpdir:
-            src_path = os.path.join(tmpdir, f"{name}.c")
-            with open(src_path, "w") as f:
-                f.write(src)
-            so = _build(name, src_path, tmpdir, library_dirs(), include_dir, libraries)
-            with open(so, "rb") as f:
-                cache_path = cache.put(f.read(), f"{name}.{ext}", binary=True)
-
-    # Loading module with relative path may cause error
-    cache_path = os.path.abspath(cache_path)
-
-    import importlib.util
-    spec = importlib.util.spec_from_file_location(name, cache_path)
-    mod = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(mod)
-    return mod
-
-
 # ------------------------
 # Utils
 # ------------------------
@@ -98,13 +65,18 @@ class CudaUtils(object):
         return cls.instance

     def __init__(self):
-        mod = compile_module_from_src(Path(os.path.join(dirname, "driver.c")).read_text(), "cuda_utils")
+        mod = compile_module_from_src(
+            src=Path(os.path.join(dirname, "driver.c")).read_text(),
+            name="cuda_utils",
+            library_dirs=library_dirs(),
+            include_dirs=include_dirs,
+            libraries=libraries,
+        )
         self.load_binary = mod.load_binary
         self.get_device_properties = mod.get_device_properties
         self.cuOccupancyMaxActiveClusters = mod.cuOccupancyMaxActiveClusters
         self.set_printf_fifo_size = mod.set_printf_fifo_size
-        self.fill_1d_tma_descriptor = mod.fill_1d_tma_descriptor
-        self.fill_2d_tma_descriptor = mod.fill_2d_tma_descriptor
+        self.fill_tma_descriptor = mod.fill_tma_descriptor


 # ------------------------
@@ -115,6 +87,8 @@ class CudaUtils(object):
 def ty_to_cpp(ty):
     if ty[0] == '*':
         return "CUdeviceptr"
+    if ty.startswith("tensordesc"):
+        return "CUtensorMap"
     return {
         "i1": "int32_t",
         "i8": "int8_t",
@@ -126,21 +100,80 @@ def ty_to_cpp(ty):
         "u16": "uint16_t",
         "u32": "uint32_t",
         "u64": "uint64_t",
-        "fp16": "float",
-        "bf16": "float",
-        "fp32": "float",
-        "f32": "float",
+        "fp16": "double",
+        "bf16": "double",
+        "fp32": "double",
+        "f32": "double",
         "fp64": "double",
         "nvTmaDesc": "CUtensorMap",
     }[ty]


-def make_launcher(constants, signature):
-
-
+FLOAT_STORAGE_TYPE = {
+    "fp16": "uint16_t",
+    "bf16": "uint16_t",
+    "fp32": "uint32_t",
+    "f32": "uint32_t",
+    "fp64": "uint64_t",
+}
+FLOAT_PACK_FUNCTION = {
+    "fp16": "pack_fp16",
+    "bf16": "pack_bf16",
+    "fp32": "pack_fp32",
+    "f32": "pack_fp32",
+    "fp64": "pack_fp64",
+}
+
+_BASE_ARGS_FORMAT = "iiiKKppOOOOO"
+
+
+def make_launcher(constants, signature, tensordesc_meta):
+
+    def _expand_signature(signature):
+        output = []
+        tensordesc_idx = 0
+        # Expand tensor descriptor arguments into either nvTmaDesc, shape and
+        # strides, or base pointer, shape and strides depending on whether the
+        # kernel was lowered to use the nvTmaDesc or not.
+        for sig in signature:
+            if isinstance(sig, str) and sig.startswith("tensordesc"):
+                meta = tensordesc_meta[tensordesc_idx] if tensordesc_meta else None
+                tensordesc_idx += 1
+
+                match = re.match("tensordesc<([^[>]*)\\[([^]]*)\\]", sig)
+                dtype = match.group(1)
+                shape = match.group(2)
+                ndim = shape.count(",") + 1
+
+                if meta is None:
+                    output.append("*" + dtype)
+                    # Currently the host side tensor descriptors get passed in as a
+                    # tensor desc, shape, and strides. We have no way to use these
+                    # shape and strides when processing tensor descriptors which is
+                    # why we provide our own decomposition above. Sadly this means
+                    # we have to pass the shape and strides twice.
+                    for _ in range(2 * ndim):
+                        output.append("i64")
+                else:
+                    output.append("nvTmaDesc")
+
+                    for _ in range(ndim):
+                        output.append("i32")
+                    for _ in range(ndim):
+                        output.append("i64")
+            else:
+                output.append(sig)
+
+        assert not tensordesc_meta or tensordesc_idx == len(tensordesc_meta)
+        return output
+
+    def _flatten_signature(sig, output):
+        # Flatten tuples
         if isinstance(sig, tuple):
-
-
+            for x in sig:
+                _flatten_signature(x, output)
+        else:
+            output.append(sig)

     def _extracted_type(ty):
         if isinstance(ty, tuple):
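A note on the expansion above (illustrative only, not part of the wheel): the sketch below shows what `_expand_signature` produces for a hypothetical `tensordesc<fp16[64,64]>` signature entry, with and without TMA metadata, using the same regex and slot counts as the code in the diff.

# Hedged sketch, not package code: expansion of one tensordesc signature entry.
import re

sig = "tensordesc<fp16[64,64]>"                        # hypothetical example entry
match = re.match("tensordesc<([^[>]*)\\[([^]]*)\\]", sig)
dtype, shape = match.group(1), match.group(2)          # "fp16", "64,64"
ndim = shape.count(",") + 1                            # 2

# Kernel not lowered to TMA (meta is None): base pointer plus 2*ndim i64 slots.
no_tma = ["*" + dtype] + ["i64"] * (2 * ndim)
print(no_tma)    # ['*fp16', 'i64', 'i64', 'i64', 'i64']

# Kernel lowered to TMA: one nvTmaDesc plus i32 shape slots and i64 stride slots.
with_tma = ["nvTmaDesc"] + ["i32"] * ndim + ["i64"] * ndim
print(with_tma)  # ['nvTmaDesc', 'i32', 'i32', 'i64', 'i64']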
@@ -160,8 +193,9 @@ def make_launcher(constants, signature):
             return "O"
         if ty in ("constexpr", "nvTmaDesc"):
             return "O"
+        if ty.startswith("tensordesc"):
+            return "O"
         return {
-            "float": "f",
             "double": "d",
             "long": "l",
             "int8_t": "b",
@@ -174,19 +208,34 @@ def make_launcher(constants, signature):
             "uint64_t": "K",
         }[ty_to_cpp(ty)]

+    expand_signature = _expand_signature(signature.values())
+    signature = {i: s for i, s in enumerate(expand_signature)}
+
     args_format = ''.join([format_of(ty) for ty in signature.values()])
-    format = "iiiKKpOOOOO" + args_format
-
-
-
+    format = _BASE_ARGS_FORMAT + args_format
+
+    flat_signature = []
+    for sig in signature.values():
+        _flatten_signature(sig, flat_signature)
+    signature = {i: s for i, s in enumerate(flat_signature)}
     args_list = ', ' + ', '.join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else ''
     # Record the end of regular arguments;
     # subsequent arguments are architecture-specific descriptors, such as tensor descriptors for CUDA.
-
+    arg_decl_list = []
+    for i, ty in signature.items():
+        if ty == "constexpr":
+            continue
+        if ty in FLOAT_STORAGE_TYPE:
+            arg_decl_list.append(f"{FLOAT_STORAGE_TYPE[ty]} arg{i}")
+        else:
+            arg_decl_list.append(f"{ty_to_cpp(ty)} arg{i}")
+    arg_decls = ', '.join(arg_decl_list)
     internal_args_list = []
     for i, ty in signature.items():
         if ty[0] == "*":
             internal_args_list.append(f"ptr_info{i}.dev_ptr")
+        elif ty in FLOAT_STORAGE_TYPE:
+            internal_args_list.append(f"_arg{i}_storage")
         elif ty == "nvTmaDesc":
             # Note: we have to dereference the pointer
             internal_args_list.append(f"*tma_ptr{i}")
@@ -205,14 +254,17 @@ def make_launcher(constants, signature):
         f"CUtensorMap* tma_ptr{i} = getTmaDesc(_arg{i}); if (!tma_ptr{i}) return NULL;" for i, ty in signature.items()
         if ty == "nvTmaDesc"
     ]
+    float_storage_decls = [
+        f"{FLOAT_STORAGE_TYPE[ty]} _arg{i}_storage = {FLOAT_PACK_FUNCTION[ty]}(_arg{i});"
+        for i, ty in signature.items()
+        if ty in FLOAT_STORAGE_TYPE
+    ]
     params = [f"&arg{i}" for i, ty in signature.items() if ty != "constexpr"]
     params.append("&global_scratch")
     src = f"""
 #define _CRT_SECURE_NO_WARNINGS
 #include \"cuda.h\"
 #include <stdbool.h>
-#define PY_SSIZE_T_CLEAN
-#define Py_LIMITED_API 0x03090000
 #include <Python.h>

 #ifndef _WIN32
@@ -282,67 +334,65 @@ static cuLaunchKernelEx_t getLaunchKernelExHandle() {{
 }}
 #endif

-static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction function, CUdeviceptr global_scratch{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
+static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int launch_pdl, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction function, CUdeviceptr global_scratch{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
   void *params[] = {{ {', '.join(params)} }};
   if (gridX*gridY*gridZ > 0) {{
-
-
-
-
+    // 4 attributes that we can currently pass maxmimum
+    CUlaunchAttribute launchAttr[4];
+    static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
+    if (cuLaunchKernelExHandle == NULL) {{
+      cuLaunchKernelExHandle = getLaunchKernelExHandle();
+    }}
+    CUlaunchConfig config;
+    config.gridDimX = gridX;
+    config.gridDimY = gridY;
+    config.gridDimZ = gridZ;
+
+    if (num_ctas != 1) {{
+      config.gridDimX *= clusterDimX;
+      config.gridDimY *= clusterDimY;
+      config.gridDimZ *= clusterDimZ;
+    }}
+
+    config.blockDimX = 32 * num_warps;
+    config.blockDimY = 1;
+    config.blockDimZ = 1;
+    config.sharedMemBytes = shared_memory;
+    config.hStream = stream;
+    config.attrs = launchAttr;
+    int num_attrs = 0;
+
+    if (launch_pdl != 0) {{
+      CUlaunchAttribute pdlAttr = {{ .id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION, .value = 1}};
+      launchAttr[num_attrs] = pdlAttr;
+      ++num_attrs;
+    }}
+
+    if (launch_cooperative_grid != 0) {{
       CUlaunchAttribute coopAttr = {{ .id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE, .value = 1}};
-      launchAttr[
-
-      CUlaunchConfig config;
-      config.gridDimX = gridX;
-      config.gridDimY = gridY;
-      config.gridDimZ = gridZ;
-      config.blockDimX = 32 * num_warps;
-      config.blockDimY = 1;
-      config.blockDimZ = 1;
-      config.sharedMemBytes = shared_memory;
-      config.hStream = stream;
-      config.attrs = launchAttr;
-      config.numAttrs = 1;
-
-      static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
-      if (cuLaunchKernelExHandle == NULL) {{
-        cuLaunchKernelExHandle = getLaunchKernelExHandle();
-      }}
-      CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
-
-    }} else {{
-      CUlaunchAttribute launchAttr[3];
-      launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
-      launchAttr[0].value.clusterDim.x = clusterDimX;
-      launchAttr[0].value.clusterDim.y = clusterDimY;
-      launchAttr[0].value.clusterDim.z = clusterDimZ;
-      launchAttr[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
-      launchAttr[1].value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
-
-      unsigned numAttrs = 2;
-      if (0 != launch_cooperative_grid) {{
-        CUlaunchAttribute coopAttr = {{ .id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE, .value = 1}};
-        launchAttr[2] = coopAttr;
-        numAttrs = 3;
-      }}
-
-      CUlaunchConfig config;
-      config.gridDimX = gridX * clusterDimX;
-      config.gridDimY = gridY * clusterDimY;
-      config.gridDimZ = gridZ * clusterDimZ;
-      config.blockDimX = 32 * num_warps;
-      config.blockDimY = 1;
-      config.blockDimZ = 1;
-      config.sharedMemBytes = shared_memory;
-      config.hStream = stream;
-      config.attrs = launchAttr;
-      config.numAttrs = numAttrs;
-      static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
-      if (cuLaunchKernelExHandle == NULL) {{
-        cuLaunchKernelExHandle = getLaunchKernelExHandle();
-      }}
-      CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
+      launchAttr[num_attrs] = coopAttr;
+      ++num_attrs;
     }}
+
+    if (num_ctas != 1) {{
+      CUlaunchAttribute clusterAttr = {{}};
+      clusterAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+      clusterAttr.value.clusterDim.x = clusterDimX;
+      clusterAttr.value.clusterDim.y = clusterDimY;
+      clusterAttr.value.clusterDim.z = clusterDimZ;
+      launchAttr[num_attrs] = clusterAttr;
+      ++num_attrs;
+
+      CUlaunchAttribute clusterSchedulingAttr = {{}};
+      clusterSchedulingAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
+      clusterSchedulingAttr.value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
+      launchAttr[num_attrs] = clusterSchedulingAttr;
+      ++num_attrs;
+    }}
+
+    config.numAttrs = num_attrs;
+
+    CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
   }}
 }}

@@ -457,6 +507,32 @@ static void ensureCudaContext() {{
   }}
 }}

+static uint16_t pack_fp16(double f) {{
+  uint16_t result;
+  // from https://github.com/python/pythoncapi-compat
+#if 0x030600B1 <= PY_VERSION_HEX && PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION)
+  _PyFloat_Pack2(f, (unsigned char*)&result, 1);
+#else
+  PyFloat_Pack2(f, (unsigned char*)&result, 1);
+#endif
+  return result;
+}}
+
+static uint16_t pack_bf16(double f) {{
+  float f32 = (float)f;
+  uint32_t u32 = *(uint32_t*)&f32;
+  return (uint16_t)(u32 >> 16);
+}}
+
+static uint32_t pack_fp32(double f) {{
+  float f32 = (float)f;
+  return *(uint32_t*)&f32;
+}}
+
+static uint64_t pack_fp64(double f) {{
+  return *(uint64_t*)&f;
+}}
+
 static PyObject* launch(PyObject* self, PyObject* args) {{
   // ensure cuda context is valid before calling any CUDA APIs, e.g. before getPointer calls cuPointerGetAttributes
   ensureCudaContext();
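For reference (an illustration, not part of the wheel): the generated `pack_*` helpers above re-encode Python floats, which reach the launcher as C `double`s, into the integer storage types listed in `FLOAT_STORAGE_TYPE`. A rough Python equivalent using `struct`, assuming a little-endian host:

# Hedged sketch, not package code: Python analogues of the generated pack_* helpers.
import struct

def pack_fp16(f: float) -> int:
    # struct's "e" format is IEEE 754 binary16, comparable to PyFloat_Pack2 above
    return struct.unpack("<H", struct.pack("<e", f))[0]

def pack_fp32(f: float) -> int:
    # reinterpret the float32 bits as uint32, like *(uint32_t*)&f32
    return struct.unpack("<I", struct.pack("<f", f))[0]

def pack_bf16(f: float) -> int:
    # bfloat16 keeps the top 16 bits of the float32 encoding (truncation, no rounding)
    return pack_fp32(f) >> 16

def pack_fp64(f: float) -> int:
    return struct.unpack("<Q", struct.pack("<d", f))[0]

assert pack_fp32(1.0) == 0x3F800000 and pack_bf16(1.0) == 0x3F80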
@@ -465,6 +541,7 @@ static PyObject* launch(PyObject* self, PyObject* args) {{
   uint64_t _stream;
   uint64_t _function;
   int launch_cooperative_grid;
+  int launch_pdl;
   PyObject *launch_enter_hook = NULL;
   PyObject *launch_exit_hook = NULL;
   PyObject *kernel_metadata = NULL;
@@ -472,7 +549,7 @@ static PyObject* launch(PyObject* self, PyObject* args) {{
   PyObject *global_scratch_obj = NULL;
   {newline.join([f"{_extracted_type(ty)} _arg{i};" for i, ty in signature.items()])}
   if(!PyArg_ParseTuple(args, \"{format}\", &gridX, &gridY, &gridZ,
-                       &_stream, &_function, &launch_cooperative_grid, &global_scratch_obj,
+                       &_stream, &_function, &launch_cooperative_grid, &launch_pdl, &global_scratch_obj,
                        &kernel_metadata, &launch_metadata,
                        &launch_enter_hook, &launch_exit_hook{args_list})) {{
     return NULL;
@@ -506,8 +583,9 @@ static PyObject* launch(PyObject* self, PyObject* args) {{
   // raise exception asap
   {newline.join(ptr_decls)}
   {newline.join(tma_decls)}
+  {newline.join(float_storage_decls)}
   Py_BEGIN_ALLOW_THREADS;
-  _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (CUstream)_stream, (CUfunction)_function, global_scratch{', ' + ', '.join(internal_args_list) if len(internal_args_list) > 0 else ''});
+  _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, launch_pdl, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (CUstream)_stream, (CUfunction)_function, global_scratch{', ' + ', '.join(internal_args_list) if len(internal_args_list) > 0 else ''});
   Py_END_ALLOW_THREADS;
   if (PyErr_Occurred()) {{
     return NULL;
@@ -550,6 +628,87 @@ PyMODINIT_FUNC PyInit___triton_launcher(void) {{
     return src


+class TmaDescKernelParam:
+    TMA_DESC_SIZE = 128
+
+    def __init__(self):
+        import torch
+        self.desc = torch.empty(self.TMA_DESC_SIZE, dtype=torch.uint8, device="cpu")
+
+    # Return a CUtensorMap* pointer in host memory
+    def tma_desc_cpu_ptr(self):
+        return self.desc.data_ptr()
+
+
+# The TMA dtype enum values are slightly different on host vs device...
+TMA_DTYPE_DEVICE_TO_HOST = dict((i, i) for i in range(16))
+TMA_DTYPE_DEVICE_TO_HOST[8] = 10
+TMA_DTYPE_DEVICE_TO_HOST[9] = 8
+TMA_DTYPE_DEVICE_TO_HOST[10] = 9
+
+
+def make_tensordesc_arg(arg, metadata):
+    if metadata is None:
+        # Currently the host side tensor descriptors get decomposed in
+        # the frontend to tensor desc, shape, and strides. We have no
+        # way to use these shape and strides when processing tensor
+        # descriptors which is why we provide our own decomposition
+        # above. Sadly this means we have to pass the shape and strides
+        # twice.
+        return [arg.base, *arg.shape, *arg.strides, *arg.shape, *arg.strides]
+
+    swizzle = metadata["swizzle"]
+    elem_size = metadata["elem_size"]
+    elem_type = metadata["elem_type"]
+    block_size = metadata["block_size"]
+    fp4_padded = metadata["fp4_padded"]
+
+    data_ptr = arg.base.data_ptr()
+    shape = arg.shape
+    strides = arg.strides
+    assert strides[-1] == 1
+
+    desc = TmaDescKernelParam()
+    result = [desc, *shape, *strides]
+
+    if fp4_padded:
+        shape = list(shape)
+        shape[-1] *= 2
+    triton.runtime.driver.active.utils.fill_tma_descriptor(
+        desc.tma_desc_cpu_ptr(),
+        data_ptr,
+        swizzle,
+        elem_size,
+        TMA_DTYPE_DEVICE_TO_HOST[elem_type],
+        block_size,
+        shape,
+        strides,
+    )
+    return result
+
+
+def wrap_handle_tensordesc(launcher, tensordesc_meta):
+    from triton.tools.tensor_descriptor import TensorDescriptor
+    from triton.experimental.gluon.nvidia.hopper import TensorDescriptor as GluonTensorDescriptor
+
+    def inner(*args):
+        meta_args = args[:len(_BASE_ARGS_FORMAT)]
+        raw_kernel_args = args[len(_BASE_ARGS_FORMAT):]
+        tensordesc_idx = 0
+        final_args = []
+        for i, arg in enumerate(raw_kernel_args):
+            if isinstance(arg, (TensorDescriptor, GluonTensorDescriptor)):
+                meta = tensordesc_meta[tensordesc_idx] if tensordesc_meta else None
+                tensordesc_idx += 1
+                final_args.extend(make_tensordesc_arg(arg, meta))
+            else:
+                final_args.append(arg)
+        assert not tensordesc_meta or tensordesc_idx == len(tensordesc_meta)
+        return launcher(*meta_args, *final_args)
+
+    return inner
+
+
 class CudaLauncher(object):

     def __init__(self, src, metadata):
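A side note (illustrative, not package code): in the no-metadata path, `make_tensordesc_arg` flattens a host-side descriptor into its base tensor followed by shape and strides passed twice, per the comment in the code. A minimal sketch with a stand-in object in place of `triton.tools.tensor_descriptor.TensorDescriptor`:

# Hedged sketch, not package code: the meta-is-None flattening of a descriptor.
from types import SimpleNamespace

# stand-in for a TensorDescriptor with .base (a device tensor), .shape and .strides
arg = SimpleNamespace(base="<device tensor>", shape=[64, 64], strides=[64, 1])

flat = [arg.base, *arg.shape, *arg.strides, *arg.shape, *arg.strides]
print(flat)  # ['<device tensor>', 64, 64, 64, 1, 64, 64, 64, 1]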
@@ -557,21 +716,33 @@ class CudaLauncher(object):
         arg_idx = lambda x: (src.fn.arg_names.index(x), ) if isinstance(x, str) else x
         constants = {arg_idx(idx): value for idx, value in constants.items()}
         signature = {idx: value for idx, value in src.signature.items()}
-
-
-
+        tensordesc_meta = getattr(metadata, "tensordesc_meta", None)
+        src = make_launcher(constants, signature, tensordesc_meta)
+        mod = compile_module_from_src(
+            src=src,
+            name="__triton_launcher",
+            library_dirs=library_dirs(),
+            include_dirs=include_dirs,
+            libraries=libraries,
+        )
+        has_tensor_desc_arg = any(isinstance(sig, str) and sig.startswith("tensordesc") for sig in signature.values())
+
+        self.num_ctas = functools.reduce(operator.mul, metadata.cluster_dims, 1)
+        self.launch = wrap_handle_tensordesc(mod.launch, tensordesc_meta) if has_tensor_desc_arg else mod.launch
         self.global_scratch_size = metadata.global_scratch_size
         self.global_scratch_align = metadata.global_scratch_align
         self.launch_cooperative_grid = metadata.launch_cooperative_grid
+        self.launch_pdl = metadata.launch_pdl

     def __call__(self, gridX, gridY, gridZ, stream, function, *args):
         if self.global_scratch_size > 0:
             grid_size = gridX * gridY * gridZ
-            alloc_size = grid_size * self.global_scratch_size
+            alloc_size = grid_size * self.num_ctas * self.global_scratch_size
             global_scratch = _allocation._allocator(alloc_size, self.global_scratch_align, stream)
         else:
             global_scratch = None
-        self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
+        self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,
+                    global_scratch, *args)


 class CudaDriver(GPUDriver):