triton-windows 3.5.0.post21__cp314-cp314-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of triton-windows might be problematic. Click here for more details.
- triton/_C/libtriton.pyd +0 -0
- triton/__init__.py +82 -0
- triton/_filecheck.py +97 -0
- triton/_internal_testing.py +255 -0
- triton/_utils.py +126 -0
- triton/backends/__init__.py +47 -0
- triton/backends/amd/__init__.py +0 -0
- triton/backends/amd/compiler.py +461 -0
- triton/backends/amd/driver.c +283 -0
- triton/backends/amd/driver.py +724 -0
- triton/backends/amd/lib/asanrtl.bc +0 -0
- triton/backends/amd/lib/ockl.bc +0 -0
- triton/backends/amd/lib/ocml.bc +0 -0
- triton/backends/compiler.py +90 -0
- triton/backends/driver.py +66 -0
- triton/backends/nvidia/__init__.py +0 -0
- triton/backends/nvidia/bin/ptxas.exe +0 -0
- triton/backends/nvidia/compiler.py +533 -0
- triton/backends/nvidia/driver.c +517 -0
- triton/backends/nvidia/driver.py +799 -0
- triton/backends/nvidia/include/cuda.h +26280 -0
- triton/backends/nvidia/lib/libdevice.10.bc +0 -0
- triton/backends/nvidia/lib/x64/cuda.lib +0 -0
- triton/compiler/__init__.py +7 -0
- triton/compiler/code_generator.py +1614 -0
- triton/compiler/compiler.py +509 -0
- triton/compiler/errors.py +51 -0
- triton/compiler/make_launcher.py +0 -0
- triton/errors.py +5 -0
- triton/experimental/__init__.py +0 -0
- triton/experimental/gluon/__init__.py +5 -0
- triton/experimental/gluon/_compiler.py +0 -0
- triton/experimental/gluon/_runtime.py +102 -0
- triton/experimental/gluon/language/__init__.py +119 -0
- triton/experimental/gluon/language/_core.py +490 -0
- triton/experimental/gluon/language/_layouts.py +583 -0
- triton/experimental/gluon/language/_math.py +20 -0
- triton/experimental/gluon/language/_semantic.py +380 -0
- triton/experimental/gluon/language/_standard.py +80 -0
- triton/experimental/gluon/language/amd/__init__.py +4 -0
- triton/experimental/gluon/language/amd/_layouts.py +96 -0
- triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
- triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
- triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
- triton/experimental/gluon/language/extra/__init__.py +3 -0
- triton/experimental/gluon/language/nvidia/__init__.py +4 -0
- triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
- triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
- triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
- triton/experimental/gluon/language/nvidia/blackwell/__init__.py +387 -0
- triton/experimental/gluon/language/nvidia/blackwell/tma.py +52 -0
- triton/experimental/gluon/language/nvidia/hopper/__init__.py +132 -0
- triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +34 -0
- triton/experimental/gluon/language/nvidia/hopper/tma.py +97 -0
- triton/experimental/gluon/nvidia/__init__.py +4 -0
- triton/experimental/gluon/nvidia/blackwell.py +3 -0
- triton/experimental/gluon/nvidia/hopper.py +45 -0
- triton/knobs.py +546 -0
- triton/language/__init__.py +342 -0
- triton/language/core.py +3405 -0
- triton/language/extra/__init__.py +26 -0
- triton/language/extra/cuda/__init__.py +16 -0
- triton/language/extra/cuda/gdc.py +42 -0
- triton/language/extra/cuda/libdevice.py +1629 -0
- triton/language/extra/cuda/utils.py +109 -0
- triton/language/extra/hip/__init__.py +5 -0
- triton/language/extra/hip/libdevice.py +491 -0
- triton/language/extra/hip/utils.py +35 -0
- triton/language/extra/libdevice.py +790 -0
- triton/language/math.py +249 -0
- triton/language/random.py +218 -0
- triton/language/semantic.py +1939 -0
- triton/language/standard.py +534 -0
- triton/language/target_info.py +54 -0
- triton/runtime/__init__.py +23 -0
- triton/runtime/_allocation.py +44 -0
- triton/runtime/_async_compile.py +55 -0
- triton/runtime/autotuner.py +476 -0
- triton/runtime/build.py +168 -0
- triton/runtime/cache.py +317 -0
- triton/runtime/driver.py +38 -0
- triton/runtime/errors.py +36 -0
- triton/runtime/interpreter.py +1414 -0
- triton/runtime/jit.py +1107 -0
- triton/runtime/tcc/include/_mingw.h +168 -0
- triton/runtime/tcc/include/assert.h +62 -0
- triton/runtime/tcc/include/conio.h +409 -0
- triton/runtime/tcc/include/ctype.h +281 -0
- triton/runtime/tcc/include/dir.h +31 -0
- triton/runtime/tcc/include/direct.h +68 -0
- triton/runtime/tcc/include/dirent.h +135 -0
- triton/runtime/tcc/include/dos.h +55 -0
- triton/runtime/tcc/include/errno.h +75 -0
- triton/runtime/tcc/include/excpt.h +123 -0
- triton/runtime/tcc/include/fcntl.h +52 -0
- triton/runtime/tcc/include/fenv.h +108 -0
- triton/runtime/tcc/include/float.h +75 -0
- triton/runtime/tcc/include/inttypes.h +297 -0
- triton/runtime/tcc/include/io.h +418 -0
- triton/runtime/tcc/include/iso646.h +36 -0
- triton/runtime/tcc/include/limits.h +116 -0
- triton/runtime/tcc/include/locale.h +91 -0
- triton/runtime/tcc/include/malloc.h +181 -0
- triton/runtime/tcc/include/math.h +497 -0
- triton/runtime/tcc/include/mem.h +13 -0
- triton/runtime/tcc/include/memory.h +40 -0
- triton/runtime/tcc/include/process.h +176 -0
- triton/runtime/tcc/include/sec_api/conio_s.h +42 -0
- triton/runtime/tcc/include/sec_api/crtdbg_s.h +19 -0
- triton/runtime/tcc/include/sec_api/io_s.h +33 -0
- triton/runtime/tcc/include/sec_api/mbstring_s.h +52 -0
- triton/runtime/tcc/include/sec_api/search_s.h +25 -0
- triton/runtime/tcc/include/sec_api/stdio_s.h +145 -0
- triton/runtime/tcc/include/sec_api/stdlib_s.h +67 -0
- triton/runtime/tcc/include/sec_api/stralign_s.h +30 -0
- triton/runtime/tcc/include/sec_api/string_s.h +41 -0
- triton/runtime/tcc/include/sec_api/sys/timeb_s.h +34 -0
- triton/runtime/tcc/include/sec_api/tchar_s.h +266 -0
- triton/runtime/tcc/include/sec_api/time_s.h +61 -0
- triton/runtime/tcc/include/sec_api/wchar_s.h +128 -0
- triton/runtime/tcc/include/setjmp.h +160 -0
- triton/runtime/tcc/include/share.h +28 -0
- triton/runtime/tcc/include/signal.h +63 -0
- triton/runtime/tcc/include/stdalign.h +16 -0
- triton/runtime/tcc/include/stdarg.h +14 -0
- triton/runtime/tcc/include/stdatomic.h +171 -0
- triton/runtime/tcc/include/stdbool.h +11 -0
- triton/runtime/tcc/include/stddef.h +42 -0
- triton/runtime/tcc/include/stdint.h +212 -0
- triton/runtime/tcc/include/stdio.h +429 -0
- triton/runtime/tcc/include/stdlib.h +591 -0
- triton/runtime/tcc/include/stdnoreturn.h +7 -0
- triton/runtime/tcc/include/string.h +164 -0
- triton/runtime/tcc/include/sys/fcntl.h +13 -0
- triton/runtime/tcc/include/sys/file.h +14 -0
- triton/runtime/tcc/include/sys/locking.h +30 -0
- triton/runtime/tcc/include/sys/stat.h +290 -0
- triton/runtime/tcc/include/sys/time.h +69 -0
- triton/runtime/tcc/include/sys/timeb.h +133 -0
- triton/runtime/tcc/include/sys/types.h +123 -0
- triton/runtime/tcc/include/sys/unistd.h +14 -0
- triton/runtime/tcc/include/sys/utime.h +146 -0
- triton/runtime/tcc/include/tcc/tcc_libm.h +618 -0
- triton/runtime/tcc/include/tccdefs.h +342 -0
- triton/runtime/tcc/include/tcclib.h +80 -0
- triton/runtime/tcc/include/tchar.h +1102 -0
- triton/runtime/tcc/include/tgmath.h +89 -0
- triton/runtime/tcc/include/time.h +287 -0
- triton/runtime/tcc/include/uchar.h +33 -0
- triton/runtime/tcc/include/unistd.h +1 -0
- triton/runtime/tcc/include/vadefs.h +11 -0
- triton/runtime/tcc/include/values.h +4 -0
- triton/runtime/tcc/include/varargs.h +12 -0
- triton/runtime/tcc/include/wchar.h +873 -0
- triton/runtime/tcc/include/wctype.h +172 -0
- triton/runtime/tcc/include/winapi/basetsd.h +149 -0
- triton/runtime/tcc/include/winapi/basetyps.h +85 -0
- triton/runtime/tcc/include/winapi/guiddef.h +156 -0
- triton/runtime/tcc/include/winapi/poppack.h +8 -0
- triton/runtime/tcc/include/winapi/pshpack1.h +8 -0
- triton/runtime/tcc/include/winapi/pshpack2.h +8 -0
- triton/runtime/tcc/include/winapi/pshpack4.h +8 -0
- triton/runtime/tcc/include/winapi/pshpack8.h +8 -0
- triton/runtime/tcc/include/winapi/qos.h +72 -0
- triton/runtime/tcc/include/winapi/shellapi.h +59 -0
- triton/runtime/tcc/include/winapi/winbase.h +2958 -0
- triton/runtime/tcc/include/winapi/wincon.h +309 -0
- triton/runtime/tcc/include/winapi/windef.h +293 -0
- triton/runtime/tcc/include/winapi/windows.h +127 -0
- triton/runtime/tcc/include/winapi/winerror.h +3166 -0
- triton/runtime/tcc/include/winapi/wingdi.h +4080 -0
- triton/runtime/tcc/include/winapi/winnls.h +778 -0
- triton/runtime/tcc/include/winapi/winnt.h +5837 -0
- triton/runtime/tcc/include/winapi/winreg.h +272 -0
- triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
- triton/runtime/tcc/include/winapi/winuser.h +5651 -0
- triton/runtime/tcc/include/winapi/winver.h +160 -0
- triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
- triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
- triton/runtime/tcc/lib/cuda.def +697 -0
- triton/runtime/tcc/lib/gdi32.def +337 -0
- triton/runtime/tcc/lib/kernel32.def +770 -0
- triton/runtime/tcc/lib/libtcc1.a +0 -0
- triton/runtime/tcc/lib/msvcrt.def +1399 -0
- triton/runtime/tcc/lib/python3.def +810 -0
- triton/runtime/tcc/lib/python310.def +1610 -0
- triton/runtime/tcc/lib/python311.def +1633 -0
- triton/runtime/tcc/lib/python312.def +1703 -0
- triton/runtime/tcc/lib/python313.def +1651 -0
- triton/runtime/tcc/lib/python313t.def +1656 -0
- triton/runtime/tcc/lib/python314.def +1800 -0
- triton/runtime/tcc/lib/python314t.def +1809 -0
- triton/runtime/tcc/lib/python39.def +1644 -0
- triton/runtime/tcc/lib/python3t.def +905 -0
- triton/runtime/tcc/lib/user32.def +658 -0
- triton/runtime/tcc/libtcc.dll +0 -0
- triton/runtime/tcc/tcc.exe +0 -0
- triton/testing.py +543 -0
- triton/tools/__init__.py +0 -0
- triton/tools/build_extern.py +365 -0
- triton/tools/compile.py +210 -0
- triton/tools/disasm.py +143 -0
- triton/tools/extra/cuda/compile.c +70 -0
- triton/tools/extra/cuda/compile.h +14 -0
- triton/tools/extra/hip/compile.cpp +66 -0
- triton/tools/extra/hip/compile.h +13 -0
- triton/tools/link.py +322 -0
- triton/tools/mxfp.py +301 -0
- triton/tools/ragged_tma.py +92 -0
- triton/tools/tensor_descriptor.py +34 -0
- triton/windows_utils.py +405 -0
- triton_windows-3.5.0.post21.dist-info/METADATA +46 -0
- triton_windows-3.5.0.post21.dist-info/RECORD +217 -0
- triton_windows-3.5.0.post21.dist-info/WHEEL +5 -0
- triton_windows-3.5.0.post21.dist-info/entry_points.txt +3 -0
- triton_windows-3.5.0.post21.dist-info/licenses/LICENSE +23 -0
- triton_windows-3.5.0.post21.dist-info/top_level.txt +1 -0
triton/language/math.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
from . import core
|
|
2
|
+
from functools import wraps
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
T = core.TypeVar('T')
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _check_dtype(dtypes: List[str]) -> T:
    """
    We're following libdevice's convention to check accepted data types for math functions.
    It is not a good practice to support all data types as accelerators/GPUs don't support
    many float16 and bfloat16 math operations.
    We should let the users know that they are using and invoke explicit cast to convert
    the data type to the supported one.
    """

    def wrapper(fn):

        @wraps(fn)
        def check(*args, **kwargs):
            # Scan positional and keyword arguments alike; only tensors are checked.
            for candidate in (*args, *kwargs.values()):
                if not isinstance(candidate, core.tensor):
                    continue
                if candidate.type.scalar.name not in dtypes:
                    raise ValueError(f"Expected dtype {dtypes} but got {candidate.type.scalar.name}")
            return fn(*args, **kwargs)

        return check

    return wrapper
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _add_math_1arg_docstr(name: str) -> core.Callable[[T], T]:
    # Decorator factory: attaches a Sphinx-style docstring, with `name`
    # interpolated, to a unary math builtin.

    def _decorator(func: T) -> T:
        docstr = """
    Computes the element-wise {name} of :code:`x`.

    :param x: the input values
    :type x: Block
    """
        func.__doc__ = docstr.format(name=name)
        return func

    return _decorator
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _add_math_2arg_docstr(name: str) -> core.Callable[[T], T]:
    # Decorator factory: attaches a Sphinx-style docstring, with `name`
    # interpolated, to a binary math builtin.

    def _decorator(func: T) -> T:
        docstr = """
    Computes the element-wise {name} of :code:`x` and :code:`y`.

    :param x: the input values
    :type x: Block
    :param y: the input values
    :type y: Block
    """
        func.__doc__ = docstr.format(name=name)
        return func

    return _decorator
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _add_math_3arg_docstr(name: str) -> core.Callable[[T], T]:
    # Decorator factory: attaches a Sphinx-style docstring, with `name`
    # interpolated, to a ternary math builtin.

    def _decorator(func: T) -> T:
        docstr = """
    Computes the element-wise {name} of :code:`x`, :code:`y`, and :code:`z`.

    :param x: the input values
    :type x: Block
    :param y: the input values
    :type y: Block
    :param z: the input values
    :type z: Block
    """
        func.__doc__ = docstr.format(name=name)
        return func

    return _decorator
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@core.builtin
@_check_dtype(dtypes=["int32", "int64", "uint32", "uint64"])
@_add_math_2arg_docstr("most significant N bits of the 2N-bit product")
def umulhi(x, y, _semantic=None):
    # Promote scalars/constexprs to tensors before lowering.
    x = _semantic.to_tensor(x)
    y = _semantic.to_tensor(y)
    # Broadcast shapes and unify dtypes so both operands match.
    x, y = core.binary_op_type_legalization(x, y, _semantic)
    return core.tensor(_semantic.builder.create_umulhi(x.handle, y.handle), x.type)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@core.builtin
@_check_dtype(dtypes=["fp32", "fp64"])
@_add_math_1arg_docstr("exponential")
@core._tensor_member_fn
def exp(x, _semantic=None):
    # Promote to a tensor, then lower to the IR exp op; the type is preserved.
    t = _semantic.to_tensor(x)
    return core.tensor(_semantic.builder.create_exp(t.handle), t.type)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@core.builtin
@_check_dtype(dtypes=["fp32", "fp64"])
@_add_math_1arg_docstr("exponential (base 2)")
@core._tensor_member_fn
def exp2(x, _semantic=None):
    # Promote to a tensor, then lower to the IR exp2 op; the type is preserved.
    t = _semantic.to_tensor(x)
    return core.tensor(_semantic.builder.create_exp2(t.handle), t.type)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
@core.builtin
@_check_dtype(dtypes=["fp32", "fp64"])
@_add_math_1arg_docstr("natural logarithm")
@core._tensor_member_fn
def log(x, _semantic=None):
    # Promote to a tensor, then lower to the IR log op; the type is preserved.
    t = _semantic.to_tensor(x)
    return core.tensor(_semantic.builder.create_log(t.handle), t.type)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
@core.builtin
@_check_dtype(dtypes=["fp32", "fp64"])
@_add_math_1arg_docstr("logarithm (base 2)")
@core._tensor_member_fn
def log2(x, _semantic=None):
    # Promote to a tensor, then lower to the IR log2 op; the type is preserved.
    t = _semantic.to_tensor(x)
    return core.tensor(_semantic.builder.create_log2(t.handle), t.type)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
@core.builtin
@_check_dtype(dtypes=["fp32", "fp64"])
@_add_math_1arg_docstr("cosine")
@core._tensor_member_fn
def cos(x, _semantic=None):
    # Promote to a tensor, then lower to the IR cos op; the type is preserved.
    t = _semantic.to_tensor(x)
    return core.tensor(_semantic.builder.create_cos(t.handle), t.type)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
@core.builtin
@_check_dtype(dtypes=["fp32", "fp64"])
@_add_math_1arg_docstr("sine")
@core._tensor_member_fn
def sin(x, _semantic=None):
    # Promote to a tensor, then lower to the IR sin op; the type is preserved.
    t = _semantic.to_tensor(x)
    return core.tensor(_semantic.builder.create_sin(t.handle), t.type)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
@core.builtin
@_check_dtype(dtypes=["fp32", "fp64"])
@_add_math_1arg_docstr("fast square root")
@core._tensor_member_fn
def sqrt(x, _semantic=None):
    # Promote to a tensor, then lower to the fast (approximate) IR sqrt op.
    t = _semantic.to_tensor(x)
    return core.tensor(_semantic.builder.create_sqrt(t.handle), t.type)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
@core.builtin
@_check_dtype(dtypes=["fp32"])
@_add_math_1arg_docstr("precise square root (rounding to nearest wrt the IEEE standard)")
@core._tensor_member_fn
def sqrt_rn(x, _semantic=None):
    # Promote to a tensor, then lower to the IEEE round-to-nearest sqrt op.
    t = _semantic.to_tensor(x)
    return core.tensor(_semantic.builder.create_precise_sqrt(t.handle), t.type)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
@core.builtin
@_check_dtype(dtypes=["fp32", "fp64"])
@_add_math_1arg_docstr("inverse square root")
@core._tensor_member_fn
def rsqrt(x, _semantic=None):
    # Promote to a tensor, then lower to the IR rsqrt op; the type is preserved.
    t = _semantic.to_tensor(x)
    return core.tensor(_semantic.builder.create_rsqrt(t.handle), t.type)
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
@core._tensor_member_fn
@core.builtin
@_add_math_1arg_docstr("absolute value")
def abs(x, _semantic=None):
    x = _semantic.to_tensor(x)
    dtype = x.dtype
    if dtype.is_fp8e4b15():
        # No fabs lowering for fp8e4b15: clear the sign bit by masking with 0x7F.
        mask = core.full(x.shape, 0x7F, core.int8, _semantic=_semantic)
        return core.tensor(_semantic.builder.create_and(x.handle, mask.handle), x.type)
    elif dtype.is_floating():
        return core.tensor(_semantic.builder.create_fabs(x.handle), x.type)
    elif dtype.is_int_signed():
        return core.tensor(_semantic.builder.create_iabs(x.handle), x.type)
    elif dtype.is_int_unsigned():
        return x  # no-op
    else:
        assert False, f"Unexpected dtype {dtype}"
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
@core.builtin
@_add_math_2arg_docstr("fast division")
def fdiv(x, y, ieee_rounding=False, _semantic=None):
    # ieee_rounding may arrive wrapped as a constexpr; unwrap to a plain value.
    ieee_rounding = core._unwrap_if_constexpr(ieee_rounding)
    x = _semantic.to_tensor(x)
    y = _semantic.to_tensor(y)
    # Delegate legalization and lowering to the semantic layer.
    return _semantic.fdiv(x, y, ieee_rounding)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
@core.builtin
@_check_dtype(dtypes=["fp32"])
@_add_math_2arg_docstr("precise division (rounding to nearest wrt the IEEE standard)")
def div_rn(x, y, _semantic=None):
    x = _semantic.to_tensor(x)
    y = _semantic.to_tensor(y)
    # Broadcast shapes and unify dtypes before lowering.
    x, y = core.binary_op_type_legalization(x, y, _semantic)
    return core.tensor(_semantic.builder.create_precise_divf(x.handle, y.handle), x.type)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
@core.builtin
@_check_dtype(dtypes=["fp32", "fp64"])
@_add_math_1arg_docstr("error function")
@core._tensor_member_fn
def erf(x, _semantic=None):
    # Promote to a tensor, then lower to the IR erf op; the type is preserved.
    t = _semantic.to_tensor(x)
    return core.tensor(_semantic.builder.create_erf(t.handle), t.type)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
@core.builtin
@_check_dtype(dtypes=["fp32", "fp64"])
@_add_math_1arg_docstr("floor")
@core._tensor_member_fn
def floor(x, _semantic=None):
    # Promote to a tensor, then lower to the IR floor op; the type is preserved.
    t = _semantic.to_tensor(x)
    return core.tensor(_semantic.builder.create_floor(t.handle), t.type)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
@core.builtin
@_check_dtype(dtypes=["fp32", "fp64"])
@_add_math_1arg_docstr("ceil")
@core._tensor_member_fn
def ceil(x, _semantic=None):
    # Promote to a tensor, then lower to the IR ceil op; the type is preserved.
    t = _semantic.to_tensor(x)
    return core.tensor(_semantic.builder.create_ceil(t.handle), t.type)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
@core.builtin
@_add_math_3arg_docstr("fused multiply-add")
def fma(x, y, z, _semantic=None):
    x = _semantic.to_tensor(x)
    y = _semantic.to_tensor(y)
    z = _semantic.to_tensor(z)
    # Pairwise legalization broadcasts shapes and unifies dtypes across all
    # three operands; the order of the three calls matters for convergence.
    x, y = core.binary_op_type_legalization(x, y, _semantic)
    z, x = core.binary_op_type_legalization(z, x, _semantic)
    z, y = core.binary_op_type_legalization(z, y, _semantic)
    return core.tensor(_semantic.builder.create_fma(x.handle, y.handle, z.handle), x.type)
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
from ..runtime.jit import jit
|
|
2
|
+
from . import core as tl
|
|
3
|
+
from . import math
|
|
4
|
+
|
|
5
|
+
N_ROUNDS_DEFAULT = 10 # Default number of rounds for philox
|
|
6
|
+
|
|
7
|
+
# -------------------
|
|
8
|
+
# randint
|
|
9
|
+
# -------------------
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@jit
def philox_impl(c0, c1, c2, c3, k0, k1, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT):
    """
    Run `n_rounds` rounds of Philox for state (c0, c1, c2, c3) and key (k0, k1).
    """
    # Round/key constants selected at compile time by counter dtype; the 32-bit
    # values match the Philox4x32 reference (Salmon et al., "Parallel Random
    # Numbers: As Easy as 1, 2, 3").
    if c0.dtype == tl.uint32:
        PHILOX_KEY_A: tl.constexpr = 0x9E3779B9
        PHILOX_KEY_B: tl.constexpr = 0xBB67AE85
        PHILOX_ROUND_A: tl.constexpr = 0xD2511F53
        PHILOX_ROUND_B: tl.constexpr = 0xCD9E8D57
    else:
        tl.static_assert(c0.dtype == tl.uint64, "dtype not supported in philox_impl")
        PHILOX_KEY_A: tl.constexpr = 0x9E3779B97F4A7C15
        PHILOX_KEY_B: tl.constexpr = 0xBB67AE8584CAA73B
        PHILOX_ROUND_A: tl.constexpr = 0xD2E7470EE14C6C93
        PHILOX_ROUND_B: tl.constexpr = 0xCA5A826395121157

    # static_range unrolls the loop at compile time (n_rounds is a constexpr).
    for _ in tl.static_range(n_rounds):
        # for _ in range(n_rounds):
        # update random state
        A = PHILOX_ROUND_A
        B = PHILOX_ROUND_B
        _c0, _c2 = c0, c2
        # umulhi gives the high halves of the products; XOR mixes in the key.
        c0 = math.umulhi(B, _c2) ^ c1 ^ k0
        c2 = math.umulhi(A, _c0) ^ c3 ^ k1
        # Low halves: modular wraparound is intended, so overflow checks are off.
        c1 = tl.mul(B, _c2, sanitize_overflow=False)
        c3 = tl.mul(A, _c0, sanitize_overflow=False)
        # raise key
        k0 = tl.add(k0, PHILOX_KEY_A, sanitize_overflow=False)
        k1 = tl.add(k1, PHILOX_KEY_B, sanitize_overflow=False)
    return c0, c1, c2, c3
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@jit
def philox(seed, c0, c1, c2, c3, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT):
    # Normalize the seed to uint64 so it can be split into key halves below.
    seed = tl.to_tensor(seed)
    tl.static_assert(seed.dtype.is_int())
    seed = seed.to(tl.uint64)
    c0 = tl.to_tensor(c0)
    c1 = tl.to_tensor(c1)
    c2 = tl.to_tensor(c2)
    c3 = tl.to_tensor(c3)

    if tl.constexpr(c0.dtype.primitive_bitwidth) == 32:
        # 32-bit Philox: the 64-bit seed supplies both 32-bit key words.
        int_dtype = tl.uint32
        seed_hi = ((seed >> 32) & 0xffffffff).to(tl.uint32)
        seed_lo = (seed & 0xffffffff).to(tl.uint32)
    else:
        tl.static_assert(tl.constexpr(c0.dtype.primitive_bitwidth) == 64, "bitwidth not supported in philox")
        # 64-bit Philox: the whole seed is the low key word; the high word is zero.
        int_dtype = tl.uint64
        seed_hi = tl.full((1, ), 0, dtype=int_dtype)
        seed_lo = seed

    # Reinterpret the counters as unsigned without changing their bits.
    c0 = c0.to(int_dtype, bitcast=True)
    c1 = c1.to(int_dtype, bitcast=True)
    c2 = c2.to(int_dtype, bitcast=True)
    c3 = c3.to(int_dtype, bitcast=True)
    return philox_impl(c0, c1, c2, c3, seed_lo, seed_hi, n_rounds)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@jit
def randint(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT):
    """
    Given a :code:`seed` scalar and an :code:`offset` block, returns a single
    block of random :code:`int32`.

    If you need multiple streams of random numbers,
    using `randint4x` is likely to be faster than calling `randint` 4 times.

    :param seed: The seed for generating random numbers.
    :param offset: The offsets to generate random numbers for.
    """
    # Philox always produces four words per counter; keep the first, drop the rest.
    ret, _, _, _ = randint4x(seed, offset, n_rounds)
    return ret
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@jit
def randint4x(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT):
    """
    Given a :code:`seed` scalar and an :code:`offset` block, returns four
    blocks of random :code:`int32`.

    This is the maximally efficient entry point
    to Triton's Philox pseudo-random number generator.

    :param seed: The seed for generating random numbers.
    :param offset: The offsets to generate random numbers for.
    """
    # _0 = tl.zeros(offset.shape, offset.dtype)

    offset_lo = offset.to(tl.uint32)
    # A zero block with offset's shape (cheaper than materializing tl.zeros).
    _0 = offset_lo * 0

    # For offsets wider than 32 bits, the high half feeds the second counter word.
    if tl.constexpr(offset.dtype.primitive_bitwidth) > 32:
        offset_hi = (offset >> 32).to(tl.uint32)
    else:
        offset_hi = _0

    return philox(seed, offset_lo, offset_hi, _0, _0, n_rounds)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# -------------------
|
|
114
|
+
# rand
|
|
115
|
+
# -------------------
|
|
116
|
+
|
|
117
|
+
# @jit
|
|
118
|
+
# def uint32_to_uniform_float(x):
|
|
119
|
+
# """
|
|
120
|
+
# Numerically stable function to convert a random uint32 into a random float uniformly sampled in [0, 1).
|
|
121
|
+
# """
|
|
122
|
+
# two_to_the_minus_32: tl.constexpr = 2.328306e-10
|
|
123
|
+
# return x * two_to_the_minus_32
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@jit
def uint_to_uniform_float(x):
    """
    Numerically stable function to convert a random uint into a random float uniformly sampled in [0, 1).
    """
    # TODO: fix frontend issues and cleanup
    # conditions can be simplified
    # scale is ((2**23 - 1) / 2**23) * 2**(N_BITS - 1)
    if tl.constexpr(x.dtype == tl.uint32) or tl.constexpr(x.dtype == tl.int32):
        # maximum value such that `MAX_INT * scale < 1.0` (with float rounding)
        x = x.to(tl.int32, bitcast=True)
        scale = 4.6566127342e-10
    else:
        tl.static_assert(tl.constexpr(x.dtype == tl.uint64) or tl.constexpr(x.dtype == tl.int64))
        x = x.to(tl.int64, bitcast=True)
        scale = 1.0842020432385337e-19
    # Fold negative signed values onto the non-negative range, then scale into [0, 1).
    x = tl.where(x < 0, -x - 1, x)
    return x * scale
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
@jit
def rand(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT):
    """
    Given a :code:`seed` scalar and an :code:`offset` block,
    returns a block of random :code:`float32` in :math:`U(0, 1)`.

    :param seed: The seed for generating random numbers.
    :param offset: The offsets to generate random numbers for.
    """
    # Draw raw integers, then map them onto [0, 1).
    source = randint(seed, offset, n_rounds)
    return uint_to_uniform_float(source)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
@jit
def rand4x(seed, offsets, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT):
    """
    Given a :code:`seed` scalar and an :code:`offsets` block,
    returns 4 blocks of random :code:`float32` in :math:`U(0, 1)`.

    :param seed: The seed for generating random numbers.
    :param offsets: The offsets to generate random numbers for.
    """
    # One Philox call yields four integer streams; map each onto [0, 1).
    i1, i2, i3, i4 = randint4x(seed, offsets, n_rounds)
    u1 = uint_to_uniform_float(i1)
    u2 = uint_to_uniform_float(i2)
    u3 = uint_to_uniform_float(i3)
    u4 = uint_to_uniform_float(i4)
    return u1, u2, u3, u4
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
# -------------------
|
|
177
|
+
# randn
|
|
178
|
+
# -------------------
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
@jit
def pair_uniform_to_normal(u1, u2):
    """Box-Muller transform"""
    # Clamp u1 away from zero so log(u1) stays finite.
    u1 = tl.maximum(1.0e-7, u1)
    th = 6.283185307179586 * u2  # 2*pi * u2
    r = math.sqrt(-2.0 * math.log(u1))
    return r * math.cos(th), r * math.sin(th)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
@jit
def randn(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT):
    """
    Given a :code:`seed` scalar and an :code:`offset` block,
    returns a block of random :code:`float32` in :math:`\\mathcal{N}(0, 1)`.

    :param seed: The seed for generating random numbers.
    :param offset: The offsets to generate random numbers for.
    """
    # Box-Muller consumes two uniforms per pair of normals; the second normal
    # (and the other two Philox streams) are discarded here.
    i1, i2, _, _ = randint4x(seed, offset, n_rounds)
    u1 = uint_to_uniform_float(i1)
    u2 = uint_to_uniform_float(i2)
    n1, _ = pair_uniform_to_normal(u1, u2)
    return n1
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
@jit
def randn4x(seed, offset, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT):
    """
    Given a :code:`seed` scalar and an :code:`offset` block,
    returns 4 blocks of random :code:`float32` in :math:`\\mathcal{N}(0, 1)`.

    :param seed: The seed for generating random numbers.
    :param offset: The offsets to generate random numbers for.
    """
    # Four uniforms give two Box-Muller pairs, i.e. four normals total.
    u1, u2, u3, u4 = rand4x(seed, offset, n_rounds)
    n1, n2 = pair_uniform_to_normal(u1, u2)
    n3, n4 = pair_uniform_to_normal(u3, u4)
    return n1, n2, n3, n4
|