triton-windows 3.2.0.post12__cp39-cp39-win_amd64.whl → 3.3.0a0.post12__cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of triton-windows might be problematic.
- triton/_C/libtriton.pyd +0 -0
- triton/__init__.py +3 -3
- triton/_internal_testing.py +59 -4
- triton/_utils.py +35 -0
- triton/backends/amd/compiler.py +121 -74
- triton/backends/amd/driver.py +77 -43
- triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +28 -49
- triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +35 -9
- triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +761 -284
- triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +9 -3
- triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +1391 -0
- triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +3 -3
- triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +44 -0
- triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +288 -0
- triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +110 -14
- triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +504 -103
- triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +2 -1
- triton/backends/amd/include/hip/amd_detail/host_defines.h +4 -0
- triton/backends/amd/include/hip/hip_ext.h +4 -2
- triton/backends/amd/include/hip/hip_fp8.h +33 -0
- triton/backends/amd/include/hip/hip_runtime_api.h +375 -33
- triton/backends/amd/include/hip/hip_version.h +3 -3
- triton/backends/amd/include/hip/hiprtc.h +25 -25
- triton/backends/amd/include/hsa/amd_hsa_elf.h +40 -14
- triton/backends/amd/include/hsa/hsa.h +11 -2
- triton/backends/amd/include/hsa/hsa_api_trace.h +30 -17
- triton/backends/amd/include/hsa/hsa_api_trace_version.h +68 -0
- triton/backends/amd/include/hsa/hsa_ext_amd.h +83 -27
- triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +46 -46
- triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +416 -0
- triton/backends/amd/include/roctracer/hip_ostream_ops.h +84 -4
- triton/backends/amd/include/roctracer/hsa_ostream_ops.h +260 -0
- triton/backends/amd/include/roctracer/hsa_prof_str.h +51 -19
- triton/backends/amd/lib/asanrtl.bc +0 -0
- triton/backends/compiler.py +25 -225
- triton/backends/driver.py +7 -2
- triton/backends/nvidia/bin/ptxas.exe +0 -0
- triton/backends/nvidia/compiler.py +135 -90
- triton/backends/nvidia/driver.c +0 -1
- triton/backends/nvidia/driver.py +135 -49
- triton/backends/nvidia/include/cuda.h +2162 -241
- triton/backends/nvidia/lib/x64/cuda.lib +0 -0
- triton/compiler/__init__.py +2 -2
- triton/compiler/code_generator.py +334 -231
- triton/compiler/compiler.py +77 -66
- triton/language/__init__.py +22 -5
- triton/language/core.py +448 -74
- triton/language/extra/cuda/_experimental_tma.py +3 -5
- triton/language/math.py +1 -1
- triton/language/random.py +2 -1
- triton/language/semantic.py +206 -52
- triton/language/standard.py +35 -18
- triton/runtime/_allocation.py +32 -0
- triton/runtime/autotuner.py +27 -32
- triton/runtime/build.py +1 -48
- triton/runtime/cache.py +6 -6
- triton/runtime/errors.py +10 -0
- triton/runtime/interpreter.py +179 -45
- triton/runtime/jit.py +149 -190
- triton/testing.py +39 -11
- triton/tools/compile.py +27 -20
- triton/tools/{compile.c → extra/cuda/compile.c} +1 -0
- triton/tools/mxfp.py +301 -0
- {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/METADATA +5 -2
- {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/RECORD +68 -59
- {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/top_level.txt +2 -0
- /triton/tools/{compile.h → extra/cuda/compile.h} +0 -0
- {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/WHEEL +0 -0
triton/tools/compile.py
CHANGED
@@ -8,7 +8,6 @@ from typing import List
 
 import triton
 import triton.backends
-from triton.compiler.code_generator import kernel_suffix
 from triton.backends.nvidia.driver import ty_to_cpp
 
 desc = """
@@ -91,28 +90,29 @@ if __name__ == "__main__":
             pass
         return None
 
-    hints = {i: constexpr(s.split(":")[1]) for i, s in enumerate(signature) if ":" in s}
+    hints = {(i, ): constexpr(s.split(":")[1]) for i, s in enumerate(signature) if ":" in s}
     hints = {k: v for k, v in hints.items() if v is not None}
    constants = {kernel.arg_names[i]: constexpr(s) for i, s in enumerate(signature)}
     constants = {k: v for k, v in constants.items() if v is not None}
-
-
-
-
-
+    for key, value in hints.items():
+        if value == 1:
+            constants[kernel.arg_names[key[0]]] = value
+    signature = {kernel.arg_names[i]: s.split(":")[0] for i, s in enumerate(signature)}
+    for key in constants:
+        signature[key] = 'constexpr'
     const_sig = 'x'.join([str(v) for v in constants.values()])
     doc_string = [f"{k}={v}" for k, v in constants.items()]
     doc_string += [f"num_warps={args.num_warps}", f"num_stages={args.num_stages}"]
-
     # compile ast into cubin
     for h in hints.values():
         assert h in [1, 16], f"Only 1 and 16 are valid hints, got {h}"
-    attrs =
-
-        constants.update({kernel.arg_names[p]: v})
-    src = triton.compiler.ASTSource(fn=kernel, constants=constants, signature=signature, attrs=attrs)
+    attrs = {k: [["tt.divisibility", 16]] for k, v in hints.items() if v == 16}
+    src = triton.compiler.ASTSource(fn=kernel, constexprs=constants, signature=signature, attrs=attrs)
     opts = {"num_warps": args.num_warps, "num_stages": args.num_stages}
     ccinfo = triton.compile(src, options=opts)
+    if ccinfo.metadata.global_scratch_size > 0:
+        raise RuntimeError("AOT compiling kernels with global scratch requirements is not yet implemented")
+
     arg_names = []
     arg_types = []
     arg_names_not_1 = []
@@ -123,23 +123,30 @@ if __name__ == "__main__":
             arg_types.append(signature[arg_name])
             arg_names_not_1.append(arg_name)
             arg_types_not_1.append(signature[arg_name])
-        elif i
+        elif hints.get((i, ), None) == 1:
             arg_names.append(arg_name)
-            arg_types.append(
+            arg_types.append("i32")
 
     # dump C stub code
-    suffix =
+    suffix = ''
+    for i, ty in enumerate(signature.values()):
+        suffix += str(i)
+        if hints.get((i, ), None) == 1:
+            suffix += 'c'
+        if hints.get((i, ), None) == 16:
+            suffix += 'd'
     func_name = '_'.join([out_name, sig_hash, suffix])
-
+    asm = ccinfo.asm["cubin"]  # store binary data once
+    hex_ = str(binascii.hexlify(asm))[2:-1]
     params = {
         "kernel_name": func_name,
         "triton_kernel_name": args.kernel_name,
-        "bin_size": len(
+        "bin_size": len(asm),
         "bin_data": ", ".join([f"0x{x}{y}" for x, y in zip(hex_[::2], hex_[1::2])]),
         "signature": ", ".join([f"{ty_to_cpp(ty)} {name}" for name, ty in zip(arg_names_not_1, arg_types_not_1)]),
         "full_signature": ", ".join([f"{ty_to_cpp(ty)} {name}" for name, ty in zip(arg_names, arg_types)]),
-        "arg_pointers": ", ".join([f"&{arg}" for arg in arg_names_not_1]),
-        "num_args": len(arg_names_not_1),
+        "arg_pointers": ", ".join([f"&{arg}" for arg in arg_names_not_1] + ["&global_scratch"]),
+        "num_args": len(arg_names_not_1) + 1,
         "kernel_docstring": doc_string,
         "shared": ccinfo.metadata.shared,
         "num_warps": args.num_warps,
@@ -150,6 +157,6 @@ if __name__ == "__main__":
         "_placeholder": "",
     }
     for ext in ['h', 'c']:
-        template_path = Path(__file__).parent / f"compile.{ext}"
+        template_path = Path(__file__).parent / "extra" / "cuda" / f"compile.{ext}"
         with out_path.with_suffix(f".{sig_hash}_{suffix}.{ext}").open("w") as fp:
             fp.write(Path(template_path).read_text().format(**params))
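The hint-handling rewrite above is the heart of the 3.3 AOT interface change: hint keys become one-element tuples, equal-to-1 hints fold their arguments into constants, and those arguments' signature slots are rewritten to 'constexpr'. Below is a minimal standalone sketch of that parsing step; the argument names, the example signature strings, and the simplified constexpr helper are illustrative stand-ins, not the package's code.

    # Sketch of the 3.3-style parsing of AOT signature strings "<type>[:<hint>]".
    # arg_names stands in for kernel.arg_names; this constexpr only parses ints.
    arg_names = ["x_ptr", "y_ptr", "n"]
    raw_signature = ["*fp32:16", "*fp32:16", "i32:1"]

    def constexpr(s):
        try:
            return int(s)
        except ValueError:
            return None

    hints = {(i, ): constexpr(s.split(":")[1]) for i, s in enumerate(raw_signature) if ":" in s}
    hints = {k: v for k, v in hints.items() if v is not None}

    constants = {}
    for key, value in hints.items():
        if value == 1:
            constants[arg_names[key[0]]] = value  # equal-to-1 args become constexpr
    signature = {arg_names[i]: s.split(":")[0] for i, s in enumerate(raw_signature)}
    for key in constants:
        signature[key] = 'constexpr'

    print(hints)      # {(0,): 16, (1,): 16, (2,): 1}
    print(signature)  # {'x_ptr': '*fp32', 'y_ptr': '*fp32', 'n': 'constexpr'}

The one-element tuple keys match the hints.get((i, ), None) lookups later in the file, and the 'c'/'d' suffix letters derived from these hints (equal-to-1 and divisible-by-16, respectively) are what land in the generated file name.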
triton/tools/{compile.c → extra/cuda/compile.c}
RENAMED
@@ -60,6 +60,7 @@ CUresult {kernel_name}(CUstream stream, {signature}) {{
   unsigned int gX = {gridX};
   unsigned int gY = {gridY};
   unsigned int gZ = {gridZ};
+  CUdeviceptr global_scratch = 0;
   void *args[{num_args}] = {{ {arg_pointers} }};
   // TODO: shared memory
   if(gX * gY * gZ > 0)
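Together with the arg_pointers/num_args change in compile.py above, every generated stub now declares a null global_scratch device pointer and appends it to the kernel argument array, even though kernels that actually request global scratch are rejected at compile time. A toy rendering of the affected template line, assuming a hypothetical two-argument kernel, shows how the placeholders expand:

    # Toy rendering of the stub template line; the argument names are hypothetical.
    arg_names_not_1 = ["x_ptr", "n"]
    params = {
        "arg_pointers": ", ".join([f"&{a}" for a in arg_names_not_1] + ["&global_scratch"]),
        "num_args": len(arg_names_not_1) + 1,
    }
    print("void *args[{num_args}] = {{ {arg_pointers} }};".format(**params))
    # void *args[3] = { &x_ptr, &n, &global_scratch };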
triton/tools/mxfp.py
ADDED
@@ -0,0 +1,301 @@
+"""
+Helper classes for working with low precision floating point types that
+align with the opencompute (OCP) microscaling (MX) specification.
+* MXFP4Tensor: 4-bit E2M1 floating point data
+* MXScaleTensor: 8-bit E8M0 floating point data
+Reference: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+"""
+
+import torch
+
+
+class MXFP4Tensor:
+
+    def __init__(self, data=None, size=None, device=None):
+        """
+        Tensor class for working with four bit E2M1 floating point data as defined by the
+        opencompute microscaling specification.
+
+
+        Parameters:
+        - data: A torch tensor of float32 numbers to convert to fp4e2m1 microscaling format.
+        - size: The size of the tensor to create.
+        - device: The device on which to create the tensor.
+        """
+        self.device = device
+        if data is not None:
+            assert isinstance(data, torch.Tensor), "Parameter data must be a torch tensor"
+            self.device = data.device
+            self.data = self._from_float(data)
+        elif size is not None:
+            self.size = size if isinstance(size, tuple) else (size, )
+        else:
+            raise ValueError("Either parameter data or size must be provided")
+
+    def random(self):
+        S = torch.randint(0, 2, size=self.size, dtype=torch.uint8, device=self.device)
+        E = torch.randint(0, 4, size=self.size, dtype=torch.uint8, device=self.device)
+        M = torch.randint(0, 2, size=self.size, dtype=torch.uint8, device=self.device)
+
+        self.data = ((S << 3) | (E << 1) | M).type(torch.uint8)
+        return self
+
+    def to(self, dtype):
+        """
+        Convert fp4e2m1 data to float32.
+
+        Returns:
+        - A torch tensor of type dtype representing the fp4e2m1 data.
+        """
+        assert dtype == torch.float32, "Currently only float32 is supported for fp4e2m1 to float conversion"
+
+        data = self.data
+        S = ((data >> 3) & 0x1).type(dtype)
+        E = ((data >> 1) & 0x3).type(dtype)
+        M = (data & 0x1).type(dtype)
+
+        # The MXF4 E2M1 spec defines 0bS000 as zero
+        value = torch.zeros_like(S)
+        is_zero = (E == 0) & (M == 0)
+        non_zero_mask = ~is_zero
+        if non_zero_mask.any():
+            S_nz = S[non_zero_mask]
+            E_nz = E[non_zero_mask]
+            M_nz = M[non_zero_mask]
+
+            sign = torch.pow(-1, S_nz)
+            # Normal and subnormal handling for the exponent and mantissa
+            exponent = torch.where(E_nz == 0, E_nz, E_nz - 1)
+            mantissa = torch.where(E_nz == 0, M_nz * 0.5, 1.0 + M_nz * 0.5)
+            value_nz = sign * torch.pow(2, exponent) * mantissa
+
+            value[non_zero_mask] = value_nz
+
+        # For zeros, the values must remain zero with the correct sign
+        value[is_zero & (S == 1)] *= -1
+        return value.type(torch.float32)
+
+    def _from_float(self, values):
+        """
+        Convert float32 numbers to mxf4 e2m1 format.
+        * No encodings are reserved for Inf or NaN in mxf4.
+        * Conversion from float supports roundTiesToEven rounding mode.
+        * If a value exceeds the mxf4 representable range after rounding,
+          clamps to the maximum mxf4 magnitude, preserving the sign.
+        * If a value has magnitude less than the minimum subnormal magnitude
+          in mxf4 after rounding, converts to zero.
+
+        Parameters:
+        - values: A torch tensor of float32 numbers to convert to fp4 format.
+        """
+        S = torch.signbit(values).type(torch.uint8)
+        abs_values = torch.abs(values)
+
+        is_zero = (abs_values == 0)
+        is_invalid = torch.isnan(values) | torch.isinf(values)
+
+        # Enumerate all possible E2M1 exponent and mantissa values. We will
+        # use these to compare the distance between float32 and all possible
+        # E2M1 floats to find the nearest E2M1 representable value
+        E_bits = torch.tensor([0, 1, 2, 3], dtype=torch.uint8, device=self.device)
+        M_bits = torch.tensor([0, 1], dtype=torch.uint8, device=self.device)
+
+        candidate_values = []
+        candidate_E = []
+        candidate_M = []
+
+        for E in E_bits:
+            if E == 0:
+                # Subnormals
+                exponent = 0
+                for M in M_bits:
+                    significand = M * 0.5
+                    value = significand * (2**exponent)
+                    candidate_values.append(value)
+                    candidate_E.append(E)
+                    candidate_M.append(M)
+            else:
+                # Normals
+                exponent = E.item() - 1
+                for M in M_bits:
+                    significand = 1.0 + M * 0.5
+                    value = significand * (2**exponent)
+                    candidate_values.append(value)
+                    candidate_E.append(E)
+                    candidate_M.append(M)
+
+        candidates = torch.tensor(candidate_values, dtype=torch.float32, device=self.device)
+        candidate_E = torch.tensor(candidate_E, dtype=torch.uint8, device=self.device)
+        candidate_M = torch.tensor(candidate_M, dtype=torch.uint8, device=self.device)
+
+        abs_values_flat = abs_values.view(-1)
+        N = abs_values_flat.shape[0]
+        abs_values_expanded = abs_values_flat.unsqueeze(1)
+
+        # Clamp invalid values to the max e2m1 representable value
+        max_candidate_value = candidates.max().item()
+        abs_values_flat[is_invalid.view(-1)] = max_candidate_value
+
+        # Compute distance between all abs_values and candidate e2m1 values
+        errors = torch.abs(abs_values_expanded - candidates.unsqueeze(0))
+
+        # To implement roundTiesToEven, we need to break ties by preferring
+        # even mantissas (M == 0). We do so by adding an epsilon bias to shift
+        # the closest candidate with an even mantissa closer to the float value
+        min_errors, _ = torch.min(errors, dim=1, keepdim=True)
+        is_tie = (errors == min_errors)
+        # More than one candidate has the min error for some float value
+        if is_tie.sum() > 1:
+            M_bits_expanded = candidate_M.unsqueeze(0).expand(N, -1)
+            tie_breaker = (M_bits_expanded == 0).type(torch.int32)
+
+            errors = errors - (tie_breaker * 1e-6)
+
+        best_indices = torch.argmin(errors, dim=1)
+
+        E_selected = candidate_E[best_indices]
+        M_selected = candidate_M[best_indices]
+        E = E_selected.view(abs_values.shape)
+        M = M_selected.view(abs_values.shape)
+
+        E[is_zero] = 0
+        M[is_zero] = 0
+
+        return ((S << 3) | (E << 1) | M).type(torch.uint8)
+
+    def to_packed_tensor(self, dim):
+        """
+        Packs two e2m1 elements into a single uint8 along the specified dimension.
+
+        Parameters:
+        - dim: The dimension along which to pack the elements.
+
+        Returns:
+        - A torch tensor of dtype uint8 with two e2m1 elements packed into one uint8.
+        """
+        data = self.data
+        assert 0 <= dim < data.ndim, \
+            "The dimension to pack along is not within the range of tensor dimensions"
+
+        size_along_dim = data.size(dim)
+        new_size_along_dim = (size_along_dim + 1) // 2
+
+        # If the size is odd, we pad the data along dim with zeros at the end
+        if size_along_dim % 2 != 0:
+            pad_sizes = [0] * (2 * data.ndim)
+            pad_index = (data.ndim - dim - 1) * 2 + 1
+            pad_sizes[pad_index] = 1
+            data = torch.nn.functional.pad(data, pad_sizes, mode='constant', value=0)
+
+        new_shape = list(data.shape)
+        new_shape[dim] = new_size_along_dim
+        new_shape.insert(dim + 1, 2)  # packed dimension of length 2
+        data = data.reshape(*new_shape)
+
+        low = data.select(dim + 1, 0)
+        high = data.select(dim + 1, 1)
+        packed = (high << 4) | low
+
+        return packed
+
+    def unpack_packed_tensor(self, packed_tensor, dim, original_shape):
+        """
+        Unpacks a tensor where two fp4 elements are packed into a single uint8.
+
+        Parameters:
+        - packed_tensor: The packed tensor
+        - dim: The dimension along which the tensor was packed.
+        - original_shape: The shape of the original tensor before packing.
+
+        Returns:
+        - A tensor with the original data unpacked into uint8 elements containing one
+          fp4e2m1 element in the least significant bits.
+        """
+        high = (packed_tensor >> 4) & 0xF
+        low = packed_tensor & 0xF
+
+        stacked = torch.stack((low, high), dim=dim + 1)
+
+        # Flatten along dim and dim+1 and then merge
+        shape = list(stacked.shape)
+        new_shape = shape[:dim] + [shape[dim] * 2] + shape[dim + 2:]
+        data = stacked.reshape(*new_shape)
+
+        # Remove any padding
+        if original_shape[dim] % 2 != 0:
+            indices = [slice(None)] * data.ndim
+            indices[dim] = slice(0, original_shape[dim])
+            data = data[tuple(indices)]
+
+        return data.type(torch.uint8)
+
+
+class MXScaleTensor:
+
+    def __init__(self, data=None, size=None, device=None):
+        """
+        Tensor class for working with microscaling E8M0 block scale factors.
+
+        Parameters:
+        - data: A torch tensor of float32 numbers to convert to fp8e8m0 microscaling format.
+        - size: The size of the tensor to create.
+        - device: The device on which to create the tensor.
+        """
+        self.device = device
+        if data is not None:
+            assert isinstance(data, torch.Tensor), "Parameter data must be a torch tensor"
+            self.device = data.device
+            self.data = self._from_float(data)
+        elif size is not None:
+            self.size = size if isinstance(size, tuple) else (size, )
+        else:
+            raise ValueError("Either parameter data or size must be provided")
+
+    def random(self, low=None, high=None):
+        """
+        Generate random E8M0 data within a specified range.
+        * Excludes the NaN encoding (255).
+        """
+        bias = 127
+
+        min_exponent = 0 if low is None else max(0, int(torch.log2(torch.tensor(low))) + bias)
+        max_exponent = 254 if high is None else min(254, max(0, int(torch.log2(torch.tensor(high))) + bias))
+        assert min_exponent <= max_exponent, "Low must be less than or equal to high"
+
+        E = torch.randint(min_exponent, max_exponent + 1, size=self.size, dtype=torch.uint8, device=self.device)
+        self.data = E
+        return self
+
+    def to(self, dtype):
+        assert dtype == torch.float32, "Currently only float32 is supported for f8e8m0 to float conversion"
+        data = self.data.type(dtype)
+        is_nan = (data == 255)
+        e_biased = data.clone()
+        e_biased[is_nan] = 0
+        e = e_biased - 127
+        value = torch.pow(2.0, e)
+        value[is_nan] = torch.nan
+        return value.type(dtype)
+
+    def _from_float(self, values):
+        """
+        Convert float32 numbers to E8M0 format.
+        * Values <= 0, NaNs, and Infs are converted to the NaN encoding (255).
+        * Positive values are converted by computing the floor of log2(value) to get the exponent.
+
+        Parameters:
+        - values: A torch tensor of float32 numbers to convert to E8M0 format.
+        """
+        result = torch.empty_like(values, dtype=torch.uint8, device=self.device)
+
+        is_invalid = torch.isnan(values) | torch.isinf(values) | (values <= 0)
+        result[is_invalid] = 255
+
+        valid_values = values[~is_invalid]
+        e = torch.floor(torch.log2(valid_values))
+        e_biased = e + 127
+        e_biased_int = e_biased.type(torch.int32)
+        e_biased_clamped = torch.clamp(e_biased_int, 0, 254)
+        result[~is_invalid] = e_biased_clamped.type(torch.uint8)
+
+        return result
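Since mxfp.py ships as a plain helper module, a short usage sketch may help orient readers. It assumes only the two classes above plus PyTorch; the import path is simply the file's location in the wheel.

    import torch
    from triton.tools.mxfp import MXFP4Tensor, MXScaleTensor

    # Decode all sixteen E2M1 code points (bit layout 0bSEEM)
    # by setting the raw 4-bit codes directly:
    t = MXFP4Tensor(size=(16, ), device="cpu")
    t.data = torch.arange(16, dtype=torch.uint8)
    print(t.to(torch.float32))
    # expected: 0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0, then the same values negated

    # Round-trip through the packed layout (two 4-bit codes per uint8):
    x = MXFP4Tensor(size=(4, 8), device="cpu").random()
    packed = x.to_packed_tensor(dim=1)  # shape (4, 4)
    restored = x.unpack_packed_tensor(packed, dim=1, original_shape=(4, 8))
    assert torch.equal(restored, x.data)

    # E8M0 block scales: random powers of two in [2**-2, 2**2]:
    s = MXScaleTensor(size=(8, ), device="cpu").random(low=0.25, high=4.0)
    print(s.to(torch.float32))

The sixteen decoded values follow directly from the class's subnormal/normal split: E == 0 yields ±0 and ±0.5, while E in {1, 2, 3} yields ±(1.0 or 1.5) · 2^(E-1), so the largest representable magnitude is 6.0, which matches the clamping behavior described in _from_float.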
{triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: triton-windows
-Version: 3.2.0.post12
+Version: 3.3.0a0.post12
 Summary: A language and compiler for custom Deep Learning operations
 Home-page: https://github.com/woct0rdho/triton-windows
 Author: Philippe Tillet, Dian Wu
@@ -15,15 +15,17 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Requires-Dist: setuptools>=40.8.0
 Provides-Extra: build
 Requires-Dist: cmake>=3.20; extra == "build"
 Requires-Dist: lit; extra == "build"
 Provides-Extra: tests
 Requires-Dist: autopep8; extra == "tests"
-Requires-Dist: flake8; extra == "tests"
 Requires-Dist: isort; extra == "tests"
 Requires-Dist: numpy; extra == "tests"
 Requires-Dist: pytest; extra == "tests"
+Requires-Dist: pytest-forked; extra == "tests"
+Requires-Dist: pytest-xdist; extra == "tests"
 Requires-Dist: scipy>=1.7.1; extra == "tests"
 Requires-Dist: llnl-hatchet; extra == "tests"
 Provides-Extra: tutorials
@@ -36,4 +38,5 @@ Dynamic: classifier
 Dynamic: home-page
 Dynamic: keywords
 Dynamic: provides-extra
+Dynamic: requires-dist
 Dynamic: summary