triton-windows 3.5.0.post21__cp314-cp314-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of triton-windows might be problematic.

Files changed (217)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +82 -0
  3. triton/_filecheck.py +97 -0
  4. triton/_internal_testing.py +255 -0
  5. triton/_utils.py +126 -0
  6. triton/backends/__init__.py +47 -0
  7. triton/backends/amd/__init__.py +0 -0
  8. triton/backends/amd/compiler.py +461 -0
  9. triton/backends/amd/driver.c +283 -0
  10. triton/backends/amd/driver.py +724 -0
  11. triton/backends/amd/lib/asanrtl.bc +0 -0
  12. triton/backends/amd/lib/ockl.bc +0 -0
  13. triton/backends/amd/lib/ocml.bc +0 -0
  14. triton/backends/compiler.py +90 -0
  15. triton/backends/driver.py +66 -0
  16. triton/backends/nvidia/__init__.py +0 -0
  17. triton/backends/nvidia/bin/ptxas.exe +0 -0
  18. triton/backends/nvidia/compiler.py +533 -0
  19. triton/backends/nvidia/driver.c +517 -0
  20. triton/backends/nvidia/driver.py +799 -0
  21. triton/backends/nvidia/include/cuda.h +26280 -0
  22. triton/backends/nvidia/lib/libdevice.10.bc +0 -0
  23. triton/backends/nvidia/lib/x64/cuda.lib +0 -0
  24. triton/compiler/__init__.py +7 -0
  25. triton/compiler/code_generator.py +1614 -0
  26. triton/compiler/compiler.py +509 -0
  27. triton/compiler/errors.py +51 -0
  28. triton/compiler/make_launcher.py +0 -0
  29. triton/errors.py +5 -0
  30. triton/experimental/__init__.py +0 -0
  31. triton/experimental/gluon/__init__.py +5 -0
  32. triton/experimental/gluon/_compiler.py +0 -0
  33. triton/experimental/gluon/_runtime.py +102 -0
  34. triton/experimental/gluon/language/__init__.py +119 -0
  35. triton/experimental/gluon/language/_core.py +490 -0
  36. triton/experimental/gluon/language/_layouts.py +583 -0
  37. triton/experimental/gluon/language/_math.py +20 -0
  38. triton/experimental/gluon/language/_semantic.py +380 -0
  39. triton/experimental/gluon/language/_standard.py +80 -0
  40. triton/experimental/gluon/language/amd/__init__.py +4 -0
  41. triton/experimental/gluon/language/amd/_layouts.py +96 -0
  42. triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
  43. triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
  44. triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
  45. triton/experimental/gluon/language/extra/__init__.py +3 -0
  46. triton/experimental/gluon/language/nvidia/__init__.py +4 -0
  47. triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
  48. triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
  49. triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
  50. triton/experimental/gluon/language/nvidia/blackwell/__init__.py +387 -0
  51. triton/experimental/gluon/language/nvidia/blackwell/tma.py +52 -0
  52. triton/experimental/gluon/language/nvidia/hopper/__init__.py +132 -0
  53. triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +34 -0
  54. triton/experimental/gluon/language/nvidia/hopper/tma.py +97 -0
  55. triton/experimental/gluon/nvidia/__init__.py +4 -0
  56. triton/experimental/gluon/nvidia/blackwell.py +3 -0
  57. triton/experimental/gluon/nvidia/hopper.py +45 -0
  58. triton/knobs.py +546 -0
  59. triton/language/__init__.py +342 -0
  60. triton/language/core.py +3405 -0
  61. triton/language/extra/__init__.py +26 -0
  62. triton/language/extra/cuda/__init__.py +16 -0
  63. triton/language/extra/cuda/gdc.py +42 -0
  64. triton/language/extra/cuda/libdevice.py +1629 -0
  65. triton/language/extra/cuda/utils.py +109 -0
  66. triton/language/extra/hip/__init__.py +5 -0
  67. triton/language/extra/hip/libdevice.py +491 -0
  68. triton/language/extra/hip/utils.py +35 -0
  69. triton/language/extra/libdevice.py +790 -0
  70. triton/language/math.py +249 -0
  71. triton/language/random.py +218 -0
  72. triton/language/semantic.py +1939 -0
  73. triton/language/standard.py +534 -0
  74. triton/language/target_info.py +54 -0
  75. triton/runtime/__init__.py +23 -0
  76. triton/runtime/_allocation.py +44 -0
  77. triton/runtime/_async_compile.py +55 -0
  78. triton/runtime/autotuner.py +476 -0
  79. triton/runtime/build.py +168 -0
  80. triton/runtime/cache.py +317 -0
  81. triton/runtime/driver.py +38 -0
  82. triton/runtime/errors.py +36 -0
  83. triton/runtime/interpreter.py +1414 -0
  84. triton/runtime/jit.py +1107 -0
  85. triton/runtime/tcc/include/_mingw.h +168 -0
  86. triton/runtime/tcc/include/assert.h +62 -0
  87. triton/runtime/tcc/include/conio.h +409 -0
  88. triton/runtime/tcc/include/ctype.h +281 -0
  89. triton/runtime/tcc/include/dir.h +31 -0
  90. triton/runtime/tcc/include/direct.h +68 -0
  91. triton/runtime/tcc/include/dirent.h +135 -0
  92. triton/runtime/tcc/include/dos.h +55 -0
  93. triton/runtime/tcc/include/errno.h +75 -0
  94. triton/runtime/tcc/include/excpt.h +123 -0
  95. triton/runtime/tcc/include/fcntl.h +52 -0
  96. triton/runtime/tcc/include/fenv.h +108 -0
  97. triton/runtime/tcc/include/float.h +75 -0
  98. triton/runtime/tcc/include/inttypes.h +297 -0
  99. triton/runtime/tcc/include/io.h +418 -0
  100. triton/runtime/tcc/include/iso646.h +36 -0
  101. triton/runtime/tcc/include/limits.h +116 -0
  102. triton/runtime/tcc/include/locale.h +91 -0
  103. triton/runtime/tcc/include/malloc.h +181 -0
  104. triton/runtime/tcc/include/math.h +497 -0
  105. triton/runtime/tcc/include/mem.h +13 -0
  106. triton/runtime/tcc/include/memory.h +40 -0
  107. triton/runtime/tcc/include/process.h +176 -0
  108. triton/runtime/tcc/include/sec_api/conio_s.h +42 -0
  109. triton/runtime/tcc/include/sec_api/crtdbg_s.h +19 -0
  110. triton/runtime/tcc/include/sec_api/io_s.h +33 -0
  111. triton/runtime/tcc/include/sec_api/mbstring_s.h +52 -0
  112. triton/runtime/tcc/include/sec_api/search_s.h +25 -0
  113. triton/runtime/tcc/include/sec_api/stdio_s.h +145 -0
  114. triton/runtime/tcc/include/sec_api/stdlib_s.h +67 -0
  115. triton/runtime/tcc/include/sec_api/stralign_s.h +30 -0
  116. triton/runtime/tcc/include/sec_api/string_s.h +41 -0
  117. triton/runtime/tcc/include/sec_api/sys/timeb_s.h +34 -0
  118. triton/runtime/tcc/include/sec_api/tchar_s.h +266 -0
  119. triton/runtime/tcc/include/sec_api/time_s.h +61 -0
  120. triton/runtime/tcc/include/sec_api/wchar_s.h +128 -0
  121. triton/runtime/tcc/include/setjmp.h +160 -0
  122. triton/runtime/tcc/include/share.h +28 -0
  123. triton/runtime/tcc/include/signal.h +63 -0
  124. triton/runtime/tcc/include/stdalign.h +16 -0
  125. triton/runtime/tcc/include/stdarg.h +14 -0
  126. triton/runtime/tcc/include/stdatomic.h +171 -0
  127. triton/runtime/tcc/include/stdbool.h +11 -0
  128. triton/runtime/tcc/include/stddef.h +42 -0
  129. triton/runtime/tcc/include/stdint.h +212 -0
  130. triton/runtime/tcc/include/stdio.h +429 -0
  131. triton/runtime/tcc/include/stdlib.h +591 -0
  132. triton/runtime/tcc/include/stdnoreturn.h +7 -0
  133. triton/runtime/tcc/include/string.h +164 -0
  134. triton/runtime/tcc/include/sys/fcntl.h +13 -0
  135. triton/runtime/tcc/include/sys/file.h +14 -0
  136. triton/runtime/tcc/include/sys/locking.h +30 -0
  137. triton/runtime/tcc/include/sys/stat.h +290 -0
  138. triton/runtime/tcc/include/sys/time.h +69 -0
  139. triton/runtime/tcc/include/sys/timeb.h +133 -0
  140. triton/runtime/tcc/include/sys/types.h +123 -0
  141. triton/runtime/tcc/include/sys/unistd.h +14 -0
  142. triton/runtime/tcc/include/sys/utime.h +146 -0
  143. triton/runtime/tcc/include/tcc/tcc_libm.h +618 -0
  144. triton/runtime/tcc/include/tccdefs.h +342 -0
  145. triton/runtime/tcc/include/tcclib.h +80 -0
  146. triton/runtime/tcc/include/tchar.h +1102 -0
  147. triton/runtime/tcc/include/tgmath.h +89 -0
  148. triton/runtime/tcc/include/time.h +287 -0
  149. triton/runtime/tcc/include/uchar.h +33 -0
  150. triton/runtime/tcc/include/unistd.h +1 -0
  151. triton/runtime/tcc/include/vadefs.h +11 -0
  152. triton/runtime/tcc/include/values.h +4 -0
  153. triton/runtime/tcc/include/varargs.h +12 -0
  154. triton/runtime/tcc/include/wchar.h +873 -0
  155. triton/runtime/tcc/include/wctype.h +172 -0
  156. triton/runtime/tcc/include/winapi/basetsd.h +149 -0
  157. triton/runtime/tcc/include/winapi/basetyps.h +85 -0
  158. triton/runtime/tcc/include/winapi/guiddef.h +156 -0
  159. triton/runtime/tcc/include/winapi/poppack.h +8 -0
  160. triton/runtime/tcc/include/winapi/pshpack1.h +8 -0
  161. triton/runtime/tcc/include/winapi/pshpack2.h +8 -0
  162. triton/runtime/tcc/include/winapi/pshpack4.h +8 -0
  163. triton/runtime/tcc/include/winapi/pshpack8.h +8 -0
  164. triton/runtime/tcc/include/winapi/qos.h +72 -0
  165. triton/runtime/tcc/include/winapi/shellapi.h +59 -0
  166. triton/runtime/tcc/include/winapi/winbase.h +2958 -0
  167. triton/runtime/tcc/include/winapi/wincon.h +309 -0
  168. triton/runtime/tcc/include/winapi/windef.h +293 -0
  169. triton/runtime/tcc/include/winapi/windows.h +127 -0
  170. triton/runtime/tcc/include/winapi/winerror.h +3166 -0
  171. triton/runtime/tcc/include/winapi/wingdi.h +4080 -0
  172. triton/runtime/tcc/include/winapi/winnls.h +778 -0
  173. triton/runtime/tcc/include/winapi/winnt.h +5837 -0
  174. triton/runtime/tcc/include/winapi/winreg.h +272 -0
  175. triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
  176. triton/runtime/tcc/include/winapi/winuser.h +5651 -0
  177. triton/runtime/tcc/include/winapi/winver.h +160 -0
  178. triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
  179. triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
  180. triton/runtime/tcc/lib/cuda.def +697 -0
  181. triton/runtime/tcc/lib/gdi32.def +337 -0
  182. triton/runtime/tcc/lib/kernel32.def +770 -0
  183. triton/runtime/tcc/lib/libtcc1.a +0 -0
  184. triton/runtime/tcc/lib/msvcrt.def +1399 -0
  185. triton/runtime/tcc/lib/python3.def +810 -0
  186. triton/runtime/tcc/lib/python310.def +1610 -0
  187. triton/runtime/tcc/lib/python311.def +1633 -0
  188. triton/runtime/tcc/lib/python312.def +1703 -0
  189. triton/runtime/tcc/lib/python313.def +1651 -0
  190. triton/runtime/tcc/lib/python313t.def +1656 -0
  191. triton/runtime/tcc/lib/python314.def +1800 -0
  192. triton/runtime/tcc/lib/python314t.def +1809 -0
  193. triton/runtime/tcc/lib/python39.def +1644 -0
  194. triton/runtime/tcc/lib/python3t.def +905 -0
  195. triton/runtime/tcc/lib/user32.def +658 -0
  196. triton/runtime/tcc/libtcc.dll +0 -0
  197. triton/runtime/tcc/tcc.exe +0 -0
  198. triton/testing.py +543 -0
  199. triton/tools/__init__.py +0 -0
  200. triton/tools/build_extern.py +365 -0
  201. triton/tools/compile.py +210 -0
  202. triton/tools/disasm.py +143 -0
  203. triton/tools/extra/cuda/compile.c +70 -0
  204. triton/tools/extra/cuda/compile.h +14 -0
  205. triton/tools/extra/hip/compile.cpp +66 -0
  206. triton/tools/extra/hip/compile.h +13 -0
  207. triton/tools/link.py +322 -0
  208. triton/tools/mxfp.py +301 -0
  209. triton/tools/ragged_tma.py +92 -0
  210. triton/tools/tensor_descriptor.py +34 -0
  211. triton/windows_utils.py +405 -0
  212. triton_windows-3.5.0.post21.dist-info/METADATA +46 -0
  213. triton_windows-3.5.0.post21.dist-info/RECORD +217 -0
  214. triton_windows-3.5.0.post21.dist-info/WHEEL +5 -0
  215. triton_windows-3.5.0.post21.dist-info/entry_points.txt +3 -0
  216. triton_windows-3.5.0.post21.dist-info/licenses/LICENSE +23 -0
  217. triton_windows-3.5.0.post21.dist-info/top_level.txt +1 -0
triton/tools/mxfp.py ADDED
@@ -0,0 +1,301 @@
+"""
+Helper classes for working with low precision floating point types that
+align with the opencompute (OCP) microscaling (MX) specification.
+* MXFP4Tensor: 4-bit E2M1 floating point data
+* MXScaleTensor: 8-bit E8M0 floating point data
+Reference: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+"""
+
+import torch
+
+
+class MXFP4Tensor:
+
+    def __init__(self, data=None, size=None, device=None):
+        """
+        Tensor class for working with four bit E2M1 floating point data as defined by the
+        opencompute microscaling specification.
+
+
+        Parameters:
+        - data: A torch tensor of float32 numbers to convert to fp4e2m1 microscaling format.
+        - size: The size of the tensor to create.
+        - device: The device on which to create the tensor.
+        """
+        self.device = device
+        if data is not None:
+            assert isinstance(data, torch.Tensor), "Parameter data must be a torch tensor"
+            self.device = data.device
+            self.data = self._from_float(data)
+        elif size is not None:
+            self.size = size if isinstance(size, tuple) else (size, )
+        else:
+            raise ValueError("Either parameter data or size must be provided")
+
+    def random(self):
+        S = torch.randint(0, 2, size=self.size, dtype=torch.uint8, device=self.device)
+        E = torch.randint(0, 4, size=self.size, dtype=torch.uint8, device=self.device)
+        M = torch.randint(0, 2, size=self.size, dtype=torch.uint8, device=self.device)
+
+        self.data = ((S << 3) | (E << 1) | M).type(torch.uint8)
+        return self
+
+    def to(self, dtype):
+        """
+        Convert fp4e2m1 data to float32.
+
+        Returns:
+        - A torch tensor of type dtype representing the fp4e2m1 data.
+        """
+        assert dtype == torch.float32, "Currently only float32 is supported for fp4e2m1 to float conversion"
+
+        data = self.data
+        S = ((data >> 3) & 0x1).type(dtype)
+        E = ((data >> 1) & 0x3).type(dtype)
+        M = (data & 0x1).type(dtype)
+
+        # The MXF4 E2M1 spec defines 0bS000 as zero
+        value = torch.zeros_like(S)
+        is_zero = (E == 0) & (M == 0)
+        non_zero_mask = ~is_zero
+        if non_zero_mask.any():
+            S_nz = S[non_zero_mask]
+            E_nz = E[non_zero_mask]
+            M_nz = M[non_zero_mask]
+
+            sign = torch.pow(-1, S_nz)
+            # Normal and subnormal handling for the exponent and mantissa
+            exponent = torch.where(E_nz == 0, E_nz, E_nz - 1)
+            mantissa = torch.where(E_nz == 0, M_nz * 0.5, 1.0 + M_nz * 0.5)
+            value_nz = sign * torch.pow(2, exponent) * mantissa
+
+            value[non_zero_mask] = value_nz
+
+        # For zeros, the values must remain zero with the correct sign
+        value[is_zero & (S == 1)] *= -1
+        return value.type(torch.float32)
+
+    def _from_float(self, values):
+        """
+        Convert float32 numbers to mxf4 e2m1 format.
+        * No encodings are reserved for Inf or NaN in mxf4.
+        * Conversion from float supports roundTiesToEven rounding mode.
+        * If a value exceeds the mxf4 representable range after rounding,
+          clamps to the maximum mxf4 magnitude, preserving the sign.
+        * If a value has magnitude less than the minimum subnormal magnitude
+          in mxf4 after rounding, converts to zero.
+
+        Parameters:
+        - values: A torch tensor of float32 numbers to convert to fp4 format.
+        """
+        S = torch.signbit(values).type(torch.uint8)
+        abs_values = torch.abs(values)
+
+        is_zero = (abs_values == 0)
+        is_invalid = torch.isnan(values) | torch.isinf(values)
+
+        # Enumerate all possible E2M1 exponent and mantissa values. We will
+        # use these to compare the distance between float32 and all possible
+        # E2M1 floats to find the nearest E2M1 representable value
+        E_bits = torch.tensor([0, 1, 2, 3], dtype=torch.uint8, device=self.device)
+        M_bits = torch.tensor([0, 1], dtype=torch.uint8, device=self.device)
+
+        candidate_values = []
+        candidate_E = []
+        candidate_M = []
+
+        for E in E_bits:
+            if E == 0:
+                # Subnormals
+                exponent = 0
+                for M in M_bits:
+                    significand = M * 0.5
+                    value = significand * (2**exponent)
+                    candidate_values.append(value)
+                    candidate_E.append(E)
+                    candidate_M.append(M)
+            else:
+                # Normals
+                exponent = E.item() - 1
+                for M in M_bits:
+                    significand = 1.0 + M * 0.5
+                    value = significand * (2**exponent)
+                    candidate_values.append(value)
+                    candidate_E.append(E)
+                    candidate_M.append(M)
+
+        candidates = torch.tensor(candidate_values, dtype=torch.float32, device=self.device)
+        candidate_E = torch.tensor(candidate_E, dtype=torch.uint8, device=self.device)
+        candidate_M = torch.tensor(candidate_M, dtype=torch.uint8, device=self.device)
+
+        abs_values_flat = abs_values.view(-1)
+        N = abs_values_flat.shape[0]
+        abs_values_expanded = abs_values_flat.unsqueeze(1)
+
+        # Clamp invalid values to the max e2m1 representable value
+        max_candidate_value = candidates.max().item()
+        abs_values_flat[is_invalid.view(-1)] = max_candidate_value
+
+        # Compute distance between all abs_values and candidate e2m1 values
+        errors = torch.abs(abs_values_expanded - candidates.unsqueeze(0))
+
+        # To implement roundTiesToEven, we need to break ties by preferring
+        # even mantissas (M == 0). We do so by adding an epsilon bias to shift
+        # the closest candidate with an even mantissa closer to the float value
+        min_errors, _ = torch.min(errors, dim=1, keepdim=True)
+        is_tie = (errors == min_errors)
+        # More than one candidate has the min error for some float value
+        if is_tie.sum() > 1:
+            M_bits_expanded = candidate_M.unsqueeze(0).expand(N, -1)
+            tie_breaker = (M_bits_expanded == 0).type(torch.int32)
+
+            errors = errors - (tie_breaker * 1e-6)
+
+        best_indices = torch.argmin(errors, dim=1)
+
+        E_selected = candidate_E[best_indices]
+        M_selected = candidate_M[best_indices]
+        E = E_selected.view(abs_values.shape)
+        M = M_selected.view(abs_values.shape)
+
+        E[is_zero] = 0
+        M[is_zero] = 0
+
+        return ((S << 3) | (E << 1) | M).type(torch.uint8)
+
+    def to_packed_tensor(self, dim):
+        """
+        Packs two e2m1 elements into a single uint8 along the specified dimension.
+
+        Parameters:
+        - dim: The dimension along which to pack the elements.
+
+        Returns:
+        - A torch tensor of dtype uint8 with two e2m1 elements packed into one uint8.
+        """
+        data = self.data
+        assert 0 <= dim < data.ndim, \
+            "The dimension to pack along is not within the range of tensor dimensions"
+
+        size_along_dim = data.size(dim)
+        new_size_along_dim = (size_along_dim + 1) // 2
+
+        # If the size is odd, we pad the data along dim with zeros at the end
+        if size_along_dim % 2 != 0:
+            pad_sizes = [0] * (2 * data.ndim)
+            pad_index = (data.ndim - dim - 1) * 2 + 1
+            pad_sizes[pad_index] = 1
+            data = torch.nn.functional.pad(data, pad_sizes, mode='constant', value=0)
+
+        new_shape = list(data.shape)
+        new_shape[dim] = new_size_along_dim
+        new_shape.insert(dim + 1, 2)  # packed dimension of length 2
+        data = data.reshape(*new_shape)
+
+        low = data.select(dim + 1, 0)
+        high = data.select(dim + 1, 1)
+        packed = (high << 4) | low
+
+        return packed
+
+    def unpack_packed_tensor(self, packed_tensor, dim, original_shape):
+        """
+        Unpacks a tensor where two fp4 elements are packed into a single uint8.
+
+        Parameters:
+        - packed_tensor: The packed tensor
+        - dim: The dimension along which the tensor was packed.
+        - original_shape: The shape of the original tensor before packing.
+
+        Returns:
+        - A tensor with the original data unpacked into uint8 elements containing one
+          fp4e2m1 element in the least significant bits.
+        """
+        high = (packed_tensor >> 4) & 0xF
+        low = packed_tensor & 0xF
+
+        stacked = torch.stack((low, high), dim=dim + 1)
+
+        # Flatten along dim and dim+1 and then merge
+        shape = list(stacked.shape)
+        new_shape = shape[:dim] + [shape[dim] * 2] + shape[dim + 2:]
+        data = stacked.reshape(*new_shape)
+
+        # Remove any padding
+        if original_shape[dim] % 2 != 0:
+            indices = [slice(None)] * data.ndim
+            indices[dim] = slice(0, original_shape[dim])
+            data = data[tuple(indices)]
+
+        return data.type(torch.uint8)
+
+
+class MXScaleTensor:
+
+    def __init__(self, data=None, size=None, device=None):
+        """
+        Tensor class for working with microscaling E8M0 block scale factors.
+
+        Parameters:
+        - data: A torch tensor of float32 numbers to convert to fp8e8m0 microscaling format.
+        - size: The size of the tensor to create.
+        - device: The device on which to create the tensor.
+        """
+        self.device = device
+        if data is not None:
+            assert isinstance(data, torch.Tensor), "Parameter data must be a torch tensor"
+            self.device = data.device
+            self.data = self._from_float(data)
+        elif size is not None:
+            self.size = size if isinstance(size, tuple) else (size, )
+        else:
+            raise ValueError("Either parameter data or size must be provided")
+
+    def random(self, low=None, high=None):
+        """
+        Generate random E8M0 data within a specified range.
+        * Excludes the NaN encoding (255).
+        """
+        bias = 127
+
+        min_exponent = 0 if low is None else max(0, int(torch.log2(torch.tensor(low))) + bias)
+        max_exponent = 254 if high is None else min(254, max(0, int(torch.log2(torch.tensor(high))) + bias))
+        assert min_exponent <= max_exponent, "Low must be less than or equal to high"
+
+        E = torch.randint(min_exponent, max_exponent + 1, size=self.size, dtype=torch.uint8, device=self.device)
+        self.data = E
+        return self
+
+    def to(self, dtype):
+        assert dtype == torch.float32, "Currently only float32 is supported for f8e8m0 to float conversion"
+        data = self.data.type(dtype)
+        is_nan = (data == 255)
+        e_biased = data.clone()
+        e_biased[is_nan] = 0
+        e = e_biased - 127
+        value = torch.pow(2.0, e)
+        value[is_nan] = torch.nan
+        return value.type(dtype)
+
+    def _from_float(self, values):
+        """
+        Convert float32 numbers to E8M0 format.
+        * Values <= 0, NaNs, and Infs are converted to the NaN encoding (255).
+        * Positive values are converted by computing the floor of log2(value) to get the exponent.
+
+        Parameters:
+        - values: A torch tensor of float32 numbers to convert to E8M0 format.
+        """
+        result = torch.empty_like(values, dtype=torch.uint8, device=self.device)
+
+        is_invalid = torch.isnan(values) | torch.isinf(values) | (values <= 0)
+        result[is_invalid] = 255
+
+        valid_values = values[~is_invalid]
+        e = torch.floor(torch.log2(valid_values))
+        e_biased = e + 127
+        e_biased_int = e_biased.type(torch.int32)
+        e_biased_clamped = torch.clamp(e_biased_int, 0, 254)
+        result[~is_invalid] = e_biased_clamped.type(torch.uint8)
+
+        return result
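The two classes above are host-side reference utilities for MX quantization. The snippet below is an illustrative usage sketch, not part of the packaged file; it only uses PyTorch and the helpers defined above.

# Illustrative sketch only: quantize float32 data to E2M1 / E8M0 and back.
import torch
from triton.tools.mxfp import MXFP4Tensor, MXScaleTensor

x = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0, 7.0], dtype=torch.float32)

fp4 = MXFP4Tensor(data=x)                 # quantize to E2M1 (round-ties-to-even)
print(fp4.to(torch.float32))              # 7.0 maps to the max E2M1 magnitude 6.0

packed = fp4.to_packed_tensor(dim=0)      # two E2M1 codes per uint8; odd length is zero-padded
codes = fp4.unpack_packed_tensor(packed, dim=0, original_shape=x.shape)

scales = MXScaleTensor(data=torch.tensor([0.25, 1.0, 2.0**20], dtype=torch.float32))
print(scales.to(torch.float32))           # E8M0 scales are exact powers of two

The E2M1 candidate set enumerated in _from_float is {0, 0.5, 1, 1.5, 2, 3, 4, 6} per sign, which is why 7.0 lands on 6.0 in the sketch.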
triton/tools/ragged_tma.py ADDED
@@ -0,0 +1,92 @@
+import triton
+import triton.language as tl
+from triton.tools.tensor_descriptor import TensorDescriptor
+
+# fmt: off
+
+
+def create_ragged_descriptor(T, block_shape, ragged_dim=0):
+    """
+    Given a 2- or 3-dimensional tensor T, this creates a 'ragged descriptor'
+    which behaves like a concatenation (along the first axis) of subarrays
+    of potentially unequal size.
+
+    The load_ragged and store_ragged device functions can be used to read
+    and write from subarrays T[batch_offset : batch_offset + batch_size]
+    with hardware bounds-checking preventing any sort of leakage outside
+    the subarray.
+    """
+
+    block_shape = list(block_shape)
+    tensor_shape = list(T.shape)
+    rank = len(tensor_shape)
+
+    if ragged_dim < 0:
+        ragged_dim += rank
+
+    assert 0 <= ragged_dim < rank - 1, "last dimension cannot be ragged"
+    assert rank <= 3, "read-write ragged descriptors must have at most 3 dimensions"
+
+    assert len(block_shape) == rank, "block shape must have same length as tensor shape"
+
+    max_int = 0x7fff0000
+    billion = 0x40000000  # == 2**30
+
+    assert tensor_shape[ragged_dim] <= billion, "number of rows may not exceed 2**30"
+    tensor_shape[ragged_dim] = billion
+    ragged_stride = T.stride(ragged_dim)
+
+    # we prepend an extra two dimensions and rely on the fact that pointers
+    # have 64-bit wraparound semantics:
+    tma_stride = [2**34 - ragged_stride, ragged_stride] + [T.stride(i) for i in range(rank)]
+    tma_shape = [max_int, max_int] + tensor_shape
+    box_shape = [1, 1] + block_shape
+
+    return TensorDescriptor(T, tma_shape, tma_stride, box_shape)
+
+
+@triton.jit
+def to_ragged_indices(batch_offset, batch_size, row):
+    """
+    Helper function for load_ragged and store_ragged.
+    """
+
+    billion = 0x40000000  # == 2**30
+    x = billion - batch_size + row
+    y = batch_offset + batch_size
+
+    return billion, y, x
+
+
+@triton.jit
+def load_ragged(TMA, batch_offset, batch_size, coords, ragged_dim: tl.constexpr = 0):
+    """
+    Read from a subarray T[batch_offset : batch_offset + batch_size] with
+    hardware bounds-checking, where reading outside the subarray gives zeros.
+
+    Coords should be an appropriately-sized list of integers, just like in
+    TMA.load().
+    """
+
+    tl.static_assert(len(TMA.shape) == len(coords) + 2, "TMA must be a read-write ragged descriptor")
+
+    c0, c1, c2 = to_ragged_indices(batch_offset, batch_size, coords[ragged_dim])
+    data = TMA.load([c0, c1] + coords[:ragged_dim] + [c2] + coords[ragged_dim + 1:])
+    data = tl.reshape(data, data.shape[2:])
+    return data
+
+
+@triton.jit
+def store_ragged(TMA, batch_offset, batch_size, coords, data, ragged_dim: tl.constexpr = 0):
+    """
+    Write to a subarray T[batch_offset : batch_offset + batch_size] with
+    hardware bounds-checking, where writes outside the subarray are masked
+    correctly.
+
+    Coords should be an appropriately-sized list of integers, just like in
+    TMA.store().
+    """
+
+    c0, c1, c2 = to_ragged_indices(batch_offset, batch_size, coords[ragged_dim])
+    data = tl.reshape(data, [1, 1] + data.shape)
+    TMA.store([c0, c1] + coords[:ragged_dim] + [c2] + coords[ragged_dim + 1:], data)
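For orientation, here is a sketch of how these device helpers could be driven end to end. It is not part of the packaged file and rests on several assumptions: an NVIDIA GPU with TMA support (Hopper or newer), a Triton build that accepts host-side TensorDescriptor kernel arguments, and illustrative tile sizes; the kernel name copy_batch_kernel is hypothetical.

# Illustrative sketch only: copy rows [batch_offset, batch_offset + batch_size)
# of X into Y through ragged TMA descriptors. Assumes TMA-capable hardware.
import torch
import triton
import triton.language as tl
from triton.tools.ragged_tma import create_ragged_descriptor, load_ragged, store_ragged


@triton.jit
def copy_batch_kernel(x_desc, y_desc, batch_offset, batch_size, BLOCK_ROWS: tl.constexpr):
    row = tl.program_id(0) * BLOCK_ROWS
    # Rows outside the batch read back as zeros and are masked on the store.
    tile = load_ragged(x_desc, batch_offset, batch_size, [row, 0])
    store_ragged(y_desc, batch_offset, batch_size, [row, 0], tile)


X = torch.randn(1024, 64, device="cuda", dtype=torch.float16)
Y = torch.zeros_like(X)
x_desc = create_ragged_descriptor(X, [64, 64])   # 64x64 tiles, ragged along dim 0
y_desc = create_ragged_descriptor(Y, [64, 64])

batch_offset, batch_size = 100, 300
grid = (triton.cdiv(batch_size, 64), )
copy_batch_kernel[grid](x_desc, y_desc, batch_offset, batch_size, BLOCK_ROWS=64)

The two prepended dimensions and the 2**34 wraparound stride in create_ragged_descriptor are what let the hardware bound both ends of the ragged batch, so the kernel body itself carries no explicit masks.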
triton/tools/tensor_descriptor.py ADDED
@@ -0,0 +1,34 @@
+from dataclasses import dataclass
+from typing import List, Any
+from triton._utils import validate_block_shape
+
+
+@dataclass
+class TensorDescriptor:
+    base: Any
+    shape: List[int]
+    strides: List[int]
+    block_shape: List[int]
+    padding: str = "zero"
+
+    def __post_init__(self):
+        rank = len(self.shape)
+        assert len(self.strides) == rank, f"rank mismatch: {self}"
+        assert len(self.block_shape) == rank, f"rank mismatch: {self}"
+        assert rank > 0, "rank must not be zero"
+        assert rank <= 5, "rank cannot be more than 5"
+        ty = type(self.base)
+        if ty.__name__ not in ("FakeTensor", "FunctionalTensor"):
+            assert self.base.data_ptr() % 16 == 0, "base must be 16-byte aligned"
+        validate_block_shape(self.block_shape)
+        elem_bytes = self.base.dtype.itemsize
+        for stride in self.strides[:-1]:
+            assert (stride * elem_bytes) % 16 == 0, "strides must be 16-byte aligned"
+        assert self.strides[-1] == 1, "Last dimension must be contiguous"
+        assert self.padding == "zero" or self.padding == "nan", "Illegal value for padding"
+        if self.padding == "nan":
+            assert self.base.dtype.is_floating_point, "Padding option `nan` is only supported for floating point tensors"
+
+    @staticmethod
+    def from_tensor(tensor: Any, block_shape: List[int], padding="zero"):
+        return TensorDescriptor(tensor, tensor.shape, tensor.stride(), block_shape, padding)
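To illustrate how the dataclass is meant to be constructed, here is a brief sketch (not part of the packaged file); the matrix shape, dtype, and block shape are arbitrary values chosen to satisfy the __post_init__ checks.

# Illustrative sketch only: build a descriptor for a contiguous float16 matrix.
import torch
from triton.tools.tensor_descriptor import TensorDescriptor

A = torch.randn(512, 256, device="cuda", dtype=torch.float16)

# Shorthand for TensorDescriptor(A, A.shape, A.stride(), [64, 64]).
desc = TensorDescriptor.from_tensor(A, block_shape=[64, 64])

# __post_init__ has already verified: base pointer 16-byte aligned, last stride == 1,
# outer strides 16-byte aligned in bytes (256 elements * 2 bytes here), padding in {"zero", "nan"}.
assert desc.strides[-1] == 1 and desc.padding == "zero"

from_tensor is the convenient path for contiguous tensors; the full constructor is there for cases where shape and strides are supplied explicitly, as create_ragged_descriptor does above.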