flash-hog 0.1.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,42 @@
+ Metadata-Version: 2.3
+ Name: flash-hog
+ Version: 0.1.0
+ Summary: Add your description here
+ Requires-Dist: absl-py>=2.4.0
+ Requires-Dist: chex>=0.1.91
+ Requires-Dist: einops>=0.8.1
+ Requires-Dist: equinox>=0.13.2
+ Requires-Dist: jax[cuda13]>=0.8.0
+ Requires-Dist: nvidia-cutlass-dsl>=4.2.1
+ Requires-Dist: pytest>=8.4.2
+ Requires-Dist: ruff>=0.14.2
+ Requires-Dist: torch>=2.9.0
+ Requires-Dist: ty>=0.0.1a24
+ Requires-Python: >=3.12, <3.14
+ Description-Content-Type: text/markdown
+
+ # Flash Hog
+ <p align="center">
+ <img src="assets/logo.png" alt="Flash Hog Logo" width="256" />
+ </p>
+
+ This repo contains the code for Flash Higher-Order Gradients, a.k.a. Flash Hog.
+ This kernel achieves around a 3.7x speedup over an XLA-optimized kernel, with linear rather than quadratic memory scaling.
+
+ <p align="center">
+ <img src="assets/speedup.png" alt="Hog Speedup" width="512"/>
+ </p>
+
+ ## Installation
+ TODO
+
+ ## Method
+ Flash Hog performs four recomputation passes so that it never needs atomics and never materializes an intermediate tensor of shape `(N_Q, N_K)`.
+ Concretely, the first three passes tile thread-wise across Q: once to compute `dd`, once for `b`, and once for both `dQ'` and `ddO`.
+ A final pass tiles over K, producing `dK'` and `dV'`.
+ The equations we implement are the following:
+
+
+ <p align="center">
+ <img src="assets/handwritten_equations.png" alt="Equations" width="512"/>
+ </p>
@@ -0,0 +1,25 @@
+ # Flash Hog
+ <p align="center">
+ <img src="assets/logo.png" alt="Flash Hog Logo" width="256" />
+ </p>
+
+ This repo contains the code for Flash Higher-Order Gradients, a.k.a. Flash Hog.
+ This kernel achieves around a 3.7x speedup over an XLA-optimized kernel, with linear rather than quadratic memory scaling.
+
+ <p align="center">
+ <img src="assets/speedup.png" alt="Hog Speedup" width="512"/>
+ </p>
+
+ ## Installation
+ TODO
+
+ ## Method
+ Flash Hog performs four recomputation passes so that it never needs atomics and never materializes an intermediate tensor of shape `(N_Q, N_K)`.
+ Concretely, the first three passes tile thread-wise across Q: once to compute `dd`, once for `b`, and once for both `dQ'` and `ddO`.
+ A final pass tiles over K, producing `dK'` and `dV'`.
+ The equations we implement are the following:
+
+
+ <p align="center">
+ <img src="assets/handwritten_equations.png" alt="Equations" width="512"/>
+ </p>
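
The `(N_Q, N_K)` point above is the standard flash-attention tiling argument: scores are recomputed one K/V tile at a time, so only `O(N_Q × tile)` scratch memory is ever live. Below is a minimal pure-JAX sketch of that tiling idea for the forward pass only; it is not the Flash Hog kernel and not part of this package, just an illustration of the recompute-per-tile strategy that the four second-order passes apply.

```python
import jax
import jax.numpy as jnp


def tiled_attention(q, k, v, tile_k=128):
    """Attention computed one K/V tile at a time: scratch memory is O(n_q * tile_k),
    and the full (n_q, n_k) score matrix is never materialized."""
    scale = q.shape[-1] ** -0.5
    n_q, n_k = q.shape[0], k.shape[0]
    k_tiles = k.reshape(n_k // tile_k, tile_k, -1)
    v_tiles = v.reshape(n_k // tile_k, tile_k, -1)

    def step(carry, kv_tile):
        m, l, acc = carry                      # running max, normalizer, weighted sum
        k_t, v_t = kv_tile
        s = (q @ k_t.T) * scale                # scores for this tile only: (n_q, tile_k)
        m_new = jnp.maximum(m, s.max(axis=-1))
        alpha = jnp.exp(m - m_new)             # rescale the partial results so far
        p = jnp.exp(s - m_new[:, None])
        l_new = l * alpha + p.sum(axis=-1)
        acc_new = acc * alpha[:, None] + p @ v_t
        return (m_new, l_new, acc_new), None

    init = (
        jnp.full((n_q,), -jnp.inf, dtype=q.dtype),
        jnp.zeros((n_q,), dtype=q.dtype),
        jnp.zeros((n_q, v.shape[-1]), dtype=q.dtype),
    )
    (_, l, acc), _ = jax.lax.scan(step, init, (k_tiles, v_tiles))
    return acc / l[:, None]


kq, kk, kv = jax.random.split(jax.random.PRNGKey(0), 3)
q = jax.random.normal(kq, (256, 64))
k = jax.random.normal(kk, (512, 64))
v = jax.random.normal(kv, (512, 64))
ref = jax.nn.softmax((q @ k.T) * 64**-0.5, axis=-1) @ v
print(jnp.allclose(tiled_attention(q, k, v), ref, atol=1e-5))  # True
```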
@@ -0,0 +1,3 @@
+ from fhog.triton_bwdbwd import flash_bwdbwd
+
+ __all__ = ["flash_bwdbwd"]
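
For orientation, here is a call sketch for the exported `flash_bwdbwd`, mirroring the Triton benchmark script later in this diff. The argument order `(q, k, v, o, do, ddq, ddk, ddv, l, scale)`, the leading batch dimension, and the return order `(dq2, dk2, dv2, ddo)` are inferred from that script rather than from documented API, and the random `o` and `l` below are stand-ins for the real forward output and per-row log-sum-exp stats.

```python
import torch

from fhog import flash_bwdbwd

nq, nkv, d = 1024, 1024, 64
dev, dt = "cuda", torch.float16

q = torch.randn(1, nq, d, device=dev, dtype=dt)     # queries, with a leading batch dim
k = torch.randn(1, nkv, d, device=dev, dtype=dt)
v = torch.randn(1, nkv, d, device=dev, dtype=dt)
o = torch.randn(1, nq, d, device=dev, dtype=dt)     # forward output (random stand-in)
do = torch.randn(1, nq, d, device=dev, dtype=dt)    # upstream gradient w.r.t. the output
ddq = torch.randn(1, nq, d, device=dev, dtype=dt)   # cotangents of the first-order grads
ddk = torch.randn(1, nkv, d, device=dev, dtype=dt)
ddv = torch.randn(1, nkv, d, device=dev, dtype=dt)
l = torch.randn(1, nq, device=dev, dtype=dt)        # per-row log-sum-exp stats (stand-in)

# Second-order backward: new gradients w.r.t. Q, K, V plus the cotangent for dO.
dq2, dk2, dv2, ddo = flash_bwdbwd(q, k, v, o, do, ddq, ddk, ddv, l, 1 / d**0.5)
print(dq2.shape, dk2.shape, dv2.shape, ddo.shape)
```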
@@ -0,0 +1,110 @@
+ from pathlib import Path
+
+ import jax.numpy as jnp
+ import jax.random as jrandom
+ import torch
+ import triton
+
+ from fhog.jax_refs.jax_impl import attn_bwd_bwd
+ from fhog.triton_bwdbwd import flash_bwdbwd
+
+ N_REPS = 5000
+ N_WARMUP = 300
+
+
+ def full_benchmark(nq, nkv):
+     d_in = 64
+     d_out = 64
+
+     q = torch.randn((nq, d_in), device="cuda", dtype=torch.float16)
+     k = torch.randn((nkv, d_in), device="cuda", dtype=torch.float16)
+     v = torch.randn((nkv, d_out), device="cuda", dtype=torch.float16)
+     o = torch.randn((nq, d_out), device="cuda", dtype=torch.float16)
+     l = torch.randn((nq,), device="cuda", dtype=torch.float16)
+     do = torch.randn((nq, d_out), device="cuda", dtype=torch.float16)
+     ddq = torch.randn((nq, d_in), device="cuda", dtype=torch.float16)
+     ddk = torch.randn((nkv, d_in), device="cuda", dtype=torch.float16)
+     ddv = torch.randn((nkv, d_out), device="cuda", dtype=torch.float16)
+
+     def jax_benchmark():
+         # nq, d_in = q.shape
+         # nkv, _ = k.shape
+         # _, d_out = v.shape
+
+         scale = jnp.array(1.0 / jnp.sqrt(d_in), dtype=jnp.float32)
+
+         # assert nq == 128
+         # assert nkv == 256
+         # assert d_in == 64
+         # assert d_out == 64
+
+         def do_run_jax():
+             # attn_bwd_bwd_stats(
+             attn_bwd_bwd(
+                 # stats=jnp.asarray(l),
+                 q=jnp.asarray(q),
+                 k=jnp.asarray(k),
+                 v=jnp.asarray(v),
+                 # o=jnp.asarray(o),
+                 do=jnp.asarray(do),
+                 ddq=jnp.asarray(ddq),
+                 ddk=jnp.asarray(ddk),
+                 ddv=jnp.asarray(ddv),
+                 scale=jnp.asarray(scale),
+                 is_causal=False,
+                 sliding_window_length=jnp.inf,
+             )
+
+         results = triton.testing.do_bench(do_run_jax, rep=N_REPS, warmup=N_WARMUP)
+         return results
+
+     def triton_benchmark():
+         def do_run_triton():
+             triton_dq2, triton_dk2, triton_dv2, triton_ddo = flash_bwdbwd(
+                 q.unsqueeze(0),
+                 k.unsqueeze(0),
+                 v.unsqueeze(0),
+                 o.unsqueeze(0),
+                 do.unsqueeze(0),
+                 ddq.unsqueeze(0),
+                 ddk.unsqueeze(0),
+                 ddv.unsqueeze(0),
+                 l.unsqueeze(0),
+                 1 / d_in**0.5,
+             )
+
+         results = triton.testing.do_bench(do_run_triton, rep=N_REPS, warmup=N_WARMUP)
+         return results
+
+     try:
+         jax_time = jax_benchmark()
+     except Exception as e:
+         print(f"JAX benchmark failed for nq={nq}, nkv={nkv} with error: {e}")
+         jax_time = float("inf")
+     triton_time = triton_benchmark()
+     print(f"{nq}: {jax_time=} {triton_time=} => speedup: {jax_time / triton_time:.2f}x")
+     return jax_time, triton_time
+
+
+ if __name__ == "__main__":
+     # nq = 512
+     # nkv = 1024
+     # for siz in
+     for size in [
+         # 128,
+         # 512,
+         # 1024,
+         # 2048,
+         # 4096,
+         8192,
+         16384,
+         32768,
+         65536,
+         131072,
+         262144,
+         524288,  # 2^19
+         1048576,  # 2^20
+         2097152,
+     ]:
+         # for size in [128, 512]:
+         full_benchmark(size, size)
@@ -0,0 +1,71 @@
+ """
+ Implementations for the functions used in fhog.jax.attention.
+ """
+
+ from functools import partial
+
+ import jax
+ from jax._src.cudnn.fused_attention_stablehlo import MaskType
+ from jax._src.cudnn.fused_attention_stablehlo import dot_product_attention as cuda_dot_product_attention
+ from jax.tree_util import Partial
+
+ from fhog.pallas_bwdbwd import TuningConfig, flash_bwdbwd0
+
+
+ def dot_product_attention_fwd(query, key, value, mask_type: MaskType, scale: float):
+     """
+     Forward pass, no saving.
+     Only needs to return the output.
+     """
+     return cuda_dot_product_attention(query, key, value, mask_type=mask_type, scale=scale)
+
+
+ def dot_product_attention_fwd_rule(query, key, value, mask_type: MaskType, scale: float):
+     """
+     Forward pass, saving stats, Q, K, V and O.
+     """
+     out, vjp_fun = jax.vjp(partial(cuda_dot_product_attention, mask_type=mask_type, scale=scale), query, key, value)
+     residual = vjp_fun
+     return out, residual
+
+
+ def dot_product_attention_bwd_rule(mask_type: MaskType, scale: float, res, g):
+     """
+     Backward pass, no saving
+     """
+     vjp_fun = res
+     # breakpoint()
+     dQ, dK, dV = vjp_fun(g)
+     return dQ, dK, dV
+
+
+ def dot_product_attention_bwd_rule_fwd_rule(mask_type: MaskType, scale: float, res, g):
+     """
+     Backward pass, saving for higher order backward
+     """
+     vjp_fun = res
+     dO = g
+
+     # query, key, value = vjp_fun.args_res
+     # *_, stats, out = vjp_fun.opaque_residuals
+     dQ, dK, dV = vjp_fun(dO)
+     residual = (vjp_fun, dO)
+
+     return (dQ, dK, dV), residual
+
+
+ def dot_product_attention_bwd_rule_bwd_rule(mask_type: MaskType, scale: float, res, g):
+     """
+     Backward pass through the backward pass
+     """
+     vjp_fun, dO = res
+     query, key, value = vjp_fun.args_res
+     *_, stats, out = vjp_fun.opaque_residuals
+     vjp_fun_structure = jax.tree.structure(vjp_fun)
+     # breakpoint()
+
+     ddQ, ddK, ddV = g
+
+     dQ2, dK2, dV2, ddO = flash_bwdbwd0(query, key, value, out, dO, ddQ, ddK, ddV, stats, scale, config=TuningConfig(tile_q=128, tile_k=32, max_concurrent_steps=4))
+     vjp_fun_grad = jax.tree.unflatten(vjp_fun_structure, [dQ2, dK2, dV2, None, None, None])  # TODO: Don't I need new dO in the last argument here?
+     return vjp_fun_grad, ddO
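
These rules are wired together in the interface module that follows by nesting `jax.custom_vjp`: the backward rule is itself a `custom_vjp` function, so differentiating the gradient dispatches to a hand-written second-order rule instead of tracing through the first-order kernel. Below is a minimal standalone sketch of that pattern, using a hypothetical scalar function rather than attention; it is not part of the package.

```python
import jax
import jax.numpy as jnp


@jax.custom_vjp
def f(x):
    return jnp.sin(x)


def f_fwd(x):
    return jnp.sin(x), x  # primal output, residual


@jax.custom_vjp
def f_bwd(x, g):
    # First-order backward rule: d/dx sin(x) = cos(x).
    return (jnp.cos(x) * g,)


def f_bwd_fwd(x, g):
    return (jnp.cos(x) * g,), (x, g)


def f_bwd_bwd(res, gg):
    # Second-order rule: gradients of cos(x) * g w.r.t. x and g.
    x, g = res
    (ggx,) = gg
    return (-jnp.sin(x) * g * ggx, jnp.cos(x) * ggx)


f_bwd.defvjp(f_bwd_fwd, f_bwd_bwd)
f.defvjp(f_fwd, f_bwd)

x = jnp.array(0.3)
print(jax.grad(f)(x))            # cos(0.3), via f_bwd
print(jax.grad(jax.grad(f))(x))  # -sin(0.3), via f_bwd_bwd
```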
@@ -0,0 +1,107 @@
+ """
+ Main interface for Jax attention with support for higher order memory-efficient backward on GPU.
+ """
+
+ from functools import partial
+
+ import jax
+ import jax.numpy as jnp
+ from jax import Array
+ from jax._src.cudnn.fused_attention_stablehlo import MaskType
+ from jaxtyping import Bool, Float, Int
+
+ import fhog.jax._attention_impl as attn_impl
+
+ # Reimplementation of much of jax._src.cudnn.fused_attention_stablehlo as used in jax.nn.dot_product_attention
+
+
+ @partial(jax.custom_vjp, nondiff_argnames=["mask_type", "scale"])
+ def dot_product_attention(
+     query: Float[Array, "T N H"],
+     key: Float[Array, "S N H"],
+     value: Float[Array, "S N H"],
+     mask_type: MaskType = MaskType.NO_MASK,
+     scale: float | None = None,
+ ):
+     """
+     Dimensions:
+         T: Query length
+         S: Key/value length
+         N: Number of attention heads
+         H: Head dimension
+
+     """
+     if scale is None:
+         scale = query.shape[-1] ** -0.5
+     dtype = query.dtype
+     assert dtype == key.dtype == value.dtype
+     return attn_impl.dot_product_attention_fwd(query, key, value, mask_type=mask_type, scale=scale)
+
+
+ @partial(jax.custom_vjp, nondiff_argnames=["mask_type", "scale"])
+ def _dot_product_attention_fwd(query, key, value, mask_type: MaskType, scale: float):
+     """
+     Forward pass, saving for regular backward.
+     """
+     print("Running _dot_product_attention_fwd")
+     out, res = attn_impl.dot_product_attention_fwd_rule(query, key, value, mask_type=mask_type, scale=scale)
+     return out, res
+
+
+ def _dot_product_attention_fwd_fwd(query, key, value, mask_type: MaskType, scale: float):
+     """
+     Run the forward pass, saving in expectation of a regular backward pass and a higher order backward pass.
+     """
+     print("Running _dot_product_attention_fwd_fwd")
+     out, res = attn_impl.dot_product_attention_fwd_rule(query, key, value, mask_type=mask_type, scale=scale)
+     return (out, res), res
+
+
+ def _dot_product_attention_fwd_bwd(mask_type: MaskType, scale: float, res, g):
+     """
+     Backward through the saving forward pass.
+     """
+     print("Running _dot_product_attention_fwd_bwd")
+     # breakpoint()
+     dO, dvjp_fun = g
+     dQ2, dK2, dV2 = dvjp_fun.args_res
+     # *_, stats, out = dvjp_fun.opaque_residuals  # TODO: Do I need dO from here?
+
+     dQ, dK, dV = attn_impl.dot_product_attention_bwd_rule(mask_type=mask_type, scale=scale, res=res, g=dO)
+     return dQ + dQ2, dK + dK2, dV + dV2
+
+
+ _dot_product_attention_fwd.defvjp(_dot_product_attention_fwd_fwd, _dot_product_attention_fwd_bwd)
+
+
+ @partial(jax.custom_vjp, nondiff_argnames=["mask_type", "scale"])
+ def _dot_product_attention_bwd(mask_type: MaskType, scale: float, res, g):
+     """
+     Regular backward pass.
+     """
+     print("Running _dot_product_attention_bwd")
+     grads = attn_impl.dot_product_attention_bwd_rule(mask_type=mask_type, scale=scale, res=res, g=g)
+     return grads
+
+
+ def _dot_product_attention_bwd_fwd(mask_type: MaskType, scale: float, res, g):
+     """
+     Backward pass, saving for higher order backward.
+     """
+     print("Running _dot_product_attention_bwd_fwd")
+     out, res = attn_impl.dot_product_attention_bwd_rule_fwd_rule(mask_type=mask_type, scale=scale, res=res, g=g)
+     return out, res
+
+
+ def _dot_product_attention_bwd_bwd(mask_type: MaskType, scale: float, res, g):
+     """
+     Backward pass through the backward pass.
+     """
+     print("Running _dot_product_attention_bwd_bwd")
+     grads = attn_impl.dot_product_attention_bwd_rule_bwd_rule(mask_type=mask_type, scale=scale, res=res, g=g)
+     return grads
+
+
+ _dot_product_attention_bwd.defvjp(_dot_product_attention_bwd_fwd, _dot_product_attention_bwd_bwd)
+
+ dot_product_attention.defvjp(_dot_product_attention_fwd, _dot_product_attention_bwd)
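
Putting the pieces together, here is a hedged usage sketch (not from the package) that differentiates twice through `dot_product_attention`, so the second derivative dispatches to `_dot_product_attention_bwd_bwd` and hence the fused second-order kernel. The module path `fhog.jax.attention` is assumed from the `_attention_impl` docstring, shapes follow the `[T, N, H]` annotations above, and running it needs a CUDA GPU with cuDNN flash-attention support and half-precision inputs.

```python
import jax
import jax.numpy as jnp
import jax.random as jrandom

from fhog.jax.attention import dot_product_attention  # path assumed from the docstring

T, S, N, H = 128, 128, 8, 64
kq, kk, kv = jrandom.split(jrandom.PRNGKey(0), 3)
q = jrandom.normal(kq, (T, N, H), dtype=jnp.float16)
k = jrandom.normal(kk, (S, N, H), dtype=jnp.float16)
v = jrandom.normal(kv, (S, N, H), dtype=jnp.float16)


def loss(q, k, v):
    # Scalar function of the attention output (first-order use).
    return dot_product_attention(q, k, v).astype(jnp.float32).sum()


def grad_penalty(q, k, v):
    # Scalar function of the first-order gradient; differentiating this
    # goes backward through the backward pass.
    dq = jax.grad(loss, argnums=0)(q, k, v)
    return (dq.astype(jnp.float32) ** 2).sum()


dq = jax.grad(loss, argnums=0)(q, k, v)           # regular backward
ddq = jax.grad(grad_penalty, argnums=0)(q, k, v)  # backward-of-backward
print(dq.shape, ddq.shape)
```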
@@ -0,0 +1,89 @@
+ print("Loading libraries...")
+
+ import numpy as np
+
+ import jax.numpy as jnp
+ import jax.random as jrandom
+ import torch
+
+ import fhog.jax_refs.jax_impl as jax_impl
+ import fhog.jax_refs.torch_impl as torch_impl
+
+ print("Imported libraries.")
+
+
+ def compare_jax_and_torch(
+     *, stats, q, k, v, o, do, ddq, ddk, ddv, scale=1.0, is_causal=False
+ ):
+     jax_out = jax_impl.attn_bwd_bwd_stats(
+         stats=stats,
+         q=q,
+         k=k,
+         v=v,
+         o=o,
+         do=do,
+         ddq=ddq,
+         ddk=ddk,
+         ddv=ddv,
+         scale=scale,
+         is_causal=is_causal,
+     )
+     torch_out = torch_impl.attn_bwd_bwd(
+         stats=torch.tensor(np.asarray(stats), dtype=torch.float32),
+         q=torch.tensor(np.asarray(q), dtype=torch.float32),
+         k=torch.tensor(np.asarray(k), dtype=torch.float32),
+         v=torch.tensor(np.asarray(v), dtype=torch.float32),
+         o=torch.tensor(np.asarray(o), dtype=torch.float32),
+         do=torch.tensor(np.asarray(do), dtype=torch.float32),
+         ddq=torch.tensor(np.asarray(ddq), dtype=torch.float32),
+         ddk=torch.tensor(np.asarray(ddk), dtype=torch.float32),
+         ddv=torch.tensor(np.asarray(ddv), dtype=torch.float32),
+         scale=torch.tensor(np.asarray(scale), dtype=torch.float32),
+         is_causal=is_causal,
+     )
+     return jax_out, torch_out
+
+
+ if __name__ == "__main__":
+     key1, key2, key3, key4, key5, key6, key7, key8, key9 = jrandom.split(
+         jrandom.PRNGKey(42), 9
+     )
+     nq = 128
+     nkv = 256
+     d_in = 64
+     d_out = 64
+     dtype = jnp.float32
+     scale = jnp.array(1.0 / jnp.sqrt(d_in), dtype=jnp.float32)
+     q = jrandom.normal(key1, (nq, d_in), dtype=dtype)
+     k = jrandom.normal(key2, (nkv, d_in), dtype=dtype)
+     v = jrandom.normal(key3, (nkv, d_out), dtype=dtype)
+     o = jrandom.normal(key8, (nq, d_out), dtype=dtype)
+     do = jrandom.normal(key4, (nq, d_out), dtype=dtype)
+     ddq = jrandom.normal(key5, (nq, d_in), dtype=dtype)
+     ddk = jrandom.normal(key6, (nkv, d_in), dtype=dtype)
+     ddv = jrandom.normal(key7, (nkv, d_out), dtype=dtype)
+     stats = jrandom.normal(key9, (nq,), dtype=dtype)
+     is_causal = False
+
+     jax_out, torch_out = compare_jax_and_torch(
+         stats=stats,
+         q=q,
+         k=k,
+         v=v,
+         o=o,
+         do=do,
+         ddq=ddq,
+         ddk=ddk,
+         ddv=ddv,
+         scale=scale,
+         is_causal=is_causal,
+     )
+     print(jax_out)
+     print(torch_out)
+
+     for tensor1, tensor2 in zip(jax_out, torch_out):
+         if jnp.allclose(tensor1, jnp.asarray(tensor2.numpy())):
+             print("success")
+         else:
+             print(torch.std(torch.tensor(np.asarray(tensor1)) - tensor2))
+             print("failure")