PyPI - fastvideo-kernel - Versions diffs - 0.3.0__tar.gz → 0.3.2__tar.gz - Mend

fastvideo-kernel 0.3.0tar.gz → 0.3.2tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (91) hide show

{fastvideo_kernel-0.3.0 → fastvideo_kernel-0.3.2}/CMakeLists.txt RENAMED Viewed

@@ -1,6 +1,13 @@
 cmake_minimum_required(VERSION 3.26 FATAL_ERROR)
 project(fastvideo-kernel LANGUAGES CXX)
+# Capture any caller-provided -DCMAKE_CUDA_ARCHITECTURES *before* enable_language(CUDA)
+# auto-populates it with CMake's built-in default (an old arch, e.g. sm_75 on CUDA 13).
+# torch's cmake actually ignores CMAKE_CUDA_ARCHITECTURES (it drives arch selection via
+# TORCH_CUDA_ARCH_LIST), so we only use this captured value to honor an explicit pin by
+# translating it into TORCH_CUDA_ARCH_LIST below.
+set(_FASTVIDEO_USER_CUDA_ARCH "${CMAKE_CUDA_ARCHITECTURES}")
 # Prefer environment variable (used by CI or uv pip install git+repo_addr) if CMake var is not explicitly set.
 if(NOT DEFINED GPU_BACKEND AND DEFINED ENV{GPU_BACKEND})
     set(GPU_BACKEND "$ENV{GPU_BACKEND}")
@@ -19,6 +26,71 @@ endif()
 # Find Python and Torch
 find_package(Python COMPONENTS Interpreter Development.Module REQUIRED)
+# ---------------------------------------------------------------------------
+# Resolve the target CUDA architecture, BEFORE find_package(Torch) below.
+#
+# torch's cmake (Caffe2 public/cuda.cmake) takes over arch selection: it emits
+# the real -gencode flags from TORCH_CUDA_ARCH_LIST and forces
+# CMAKE_CUDA_ARCHITECTURES to OFF. So the *effective* arch is whatever
+# TORCH_CUDA_ARCH_LIST is when find_package(Torch) runs. build.sh exports it;
+# standards-based builds (pip / uv pip install, sdist) don't, and torch then
+# auto-detects an arch that does not match the GPU -- the kernels build but fail
+# at runtime ("no kernel image is available for execution on the device").
+# Resolve it here when absent (mirrors build.sh): honor a pinned
+# CMAKE_CUDA_ARCHITECTURES if given, else probe the visible GPU with torch.
+# ---------------------------------------------------------------------------
+if(NOT "${GPU_BACKEND}" STREQUAL "ROCM")
+    # Resolution order: environment TORCH_CUDA_ARCH_LIST, then the cmake variable of
+    # the same name, then an explicit -DCMAKE_CUDA_ARCHITECTURES pin, then a probe of
+    # the visible GPU. An exported-but-empty environment variable counts as absent;
+    # otherwise it would short-circuit everything below and torch would auto-detect.
+    if(NOT "$ENV{TORCH_CUDA_ARCH_LIST}" STREQUAL "")
+        message(STATUS "CUDA arch: TORCH_CUDA_ARCH_LIST=$ENV{TORCH_CUDA_ARCH_LIST} (from environment)")
+    elseif(TORCH_CUDA_ARCH_LIST)
+        set(ENV{TORCH_CUDA_ARCH_LIST} "${TORCH_CUDA_ARCH_LIST}")
+        message(STATUS "CUDA arch: TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} (from cmake)")
+    else()
+        set(_FV_ARCH_LIST "")
+        set(_fv_pinned_archs "${_FASTVIDEO_USER_CUDA_ARCH}")
+        if("${_fv_pinned_archs}" STREQUAL "native")
+            # "native" means "the GPU of the build machine", which is what the probe
+            # below determines, so treat it as "no explicit pin".
+            message(STATUS "CUDA arch: CMAKE_CUDA_ARCHITECTURES=native, probing the visible GPU")
+            set(_fv_pinned_archs "")
+        endif()
+        if(_fv_pinned_archs)
+            # Caller pinned -DCMAKE_CUDA_ARCHITECTURES (which torch ignores); translate it
+            # to the TORCH_CUDA_ARCH_LIST spelling: "121" -> "12.1", "90a" -> "9.0a".
+            foreach(_fv_arch IN LISTS _fv_pinned_archs)
+                # CMake allows a "-real" (SASS only) or "-virtual" (PTX only) suffix.
+                # "-real" matches what a bare TORCH_CUDA_ARCH_LIST entry emits; the
+                # closest spelling for "-virtual" is "+PTX" (SASS plus PTX).
+                set(_fv_base "${_fv_arch}")
+                set(_fv_ptx "")
+                if("${_fv_arch}" MATCHES "^(.+)-virtual$")
+                    set(_fv_base "${CMAKE_MATCH_1}")
+                    set(_fv_ptx "+PTX")
+                elseif("${_fv_arch}" MATCHES "^(.+)-real$")
+                    set(_fv_base "${CMAKE_MATCH_1}")
+                endif()
+                # Only "<major><minor digit>[a|f]" can be translated. Reject the rest
+                # (e.g. "all", "all-major") instead of emitting a mangled arch name.
+                if(NOT "${_fv_base}" MATCHES "^([0-9]+)([0-9])([af]?)$")
+                    message(FATAL_ERROR
+                        "fastvideo-kernel: cannot translate CMAKE_CUDA_ARCHITECTURES entry "
+                        "'${_fv_arch}' into a TORCH_CUDA_ARCH_LIST value.\n"
+                        "Pass a numeric arch (e.g. 90, 90a, 121) or set TORCH_CUDA_ARCH_LIST directly.")
+                endif()
+                # Groups: major digits, the single minor digit, optional a/f suffix.
+                list(APPEND _FV_ARCH_LIST "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}${CMAKE_MATCH_3}${_fv_ptx}")
+            endforeach()
+            message(STATUS "CUDA arch: TORCH_CUDA_ARCH_LIST=${_FV_ARCH_LIST} (from -DCMAKE_CUDA_ARCHITECTURES=${_FASTVIDEO_USER_CUDA_ARCH})")
+        else()
+            # Best-effort probe of the visible GPU (mirrors build.sh detect_with_torch).
+            # A failing probe (no torch, no GPU) is not an error here; it just leaves
+            # the list empty so the FATAL_ERROR below explains the options.
+            execute_process(
+                COMMAND "${Python_EXECUTABLE}" -c "import torch; assert torch.cuda.is_available(); mj, mn = torch.cuda.get_device_capability(0); print(f'{mj}.{mn}a' if (mj, mn) in ((9, 0), (12, 0)) else f'{mj}.{mn}')"
+                OUTPUT_VARIABLE _FV_ARCH_LIST
+                OUTPUT_STRIP_TRAILING_WHITESPACE
+                RESULT_VARIABLE _fv_detect_rc
+                ERROR_QUIET
+            )
+            if(_fv_detect_rc EQUAL 0 AND _FV_ARCH_LIST)
+                message(STATUS "CUDA arch: TORCH_CUDA_ARCH_LIST=${_FV_ARCH_LIST} (detected via torch, live GPU)")
+            else()
+                set(_FV_ARCH_LIST "")
+            endif()
+        endif()
+        if(_FV_ARCH_LIST)
+            # Set both the cmake variable and the environment variable so the value is
+            # in place when find_package(Torch) runs.
+            set(TORCH_CUDA_ARCH_LIST "${_FV_ARCH_LIST}")
+            set(ENV{TORCH_CUDA_ARCH_LIST} "${_FV_ARCH_LIST}")
+        else()
+            message(FATAL_ERROR
+                "fastvideo-kernel: could not determine the target CUDA architecture.\n"
+                "Refusing to let torch auto-detect an arch that may not run on this GPU. "
+                "Fix with one of:\n"
+                "  - set TORCH_CUDA_ARCH_LIST (e.g. 12.1, or 9.0a for Hopper), or\n"
+                "  - pass -DCMAKE_CUDA_ARCHITECTURES=<arch> (e.g. 121), or\n"
+                "  - build where the target GPU is visible to torch.\n"
+                "Note: 'pip/uv pip install' builds under build isolation, which hides the "
+                "GPU; set TORCH_CUDA_ARCH_LIST or add --no-build-isolation. "
+                "fastvideo-kernel/build.sh sets all of this for you.")
+        endif()
+    endif()
+endif()
 # Robustly find Torch include paths using Python
 execute_process(
     COMMAND "${Python_EXECUTABLE}" -c "import torch; from torch.utils.cpp_extension import include_paths; print(';'.join(include_paths()))"
@@ -191,6 +263,11 @@ set(CUDA_FLAGS
     "--expt-relaxed-constexpr"
     "-Xcompiler=-fno-strict-aliasing"
     "-Xcompiler=-fPIC"
+    # ARM/aarch64 defaults `char` to unsigned, but ThunderKittens headers assume the
+    # x86 signed-char behavior (else base_types.cuh hits "narrowing conversion from
+    # char to signed char"). Force signed char so TK compiles on Grace Hopper; this
+    # is a no-op on x86_64, where char is already signed.
+    "-Xcompiler=-fsigned-char"
     "-DTORCH_COMPILE"
     "-Xnvlink=--verbose"
     "-Xptxas=--verbose"
@@ -349,4 +426,3 @@ if(ENABLE_ATTN_QAT_INFER)
     install(TARGETS fp4attn_cuda LIBRARY DESTINATION .)
     install(TARGETS fp4quant_cuda LIBRARY DESTINATION .)
 endif()

{fastvideo_kernel-0.3.0 → fastvideo_kernel-0.3.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: fastvideo-kernel
-Version: 0.3.0
+Version: 0.3.2
 Summary: Unified CUDA kernels for FastVideo
 Author-Email: Hao AI Lab <contact@haoailab.com>
 License:                                  Apache License
@@ -239,13 +239,13 @@ fully usable without it).
 The symbols the fastpath needs (`flash_attn.cute.block_sparsity.BlockSparseTensorsTorch`,
 `flash_attn.cute.interface._flash_attn_fwd`) are provided upstream by
 [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention). Pin to
-commit `c19cd20e`: the wrapper targets that revision's `_flash_attn_fwd` signature
-(`m_block_size` / `n_block_size`); later upstream revisions reshaped it into a
-`tile_mn` tuple and are not drop-in compatible.
+commit `940cd9680f3315f2f06b43ab5bea2c2cf2d96806`, the revision FastVideo pins as
+the `flash-attn-4` source in the repo-root `pyproject.toml`; other revisions may
+have an incompatible `_flash_attn_fwd` signature.
 ```bash
 pip install "nvidia-cutlass-dsl>=4.5.0" torchvision
-pip install "git+https://github.com/Dao-AILab/flash-attention.git@c19cd20e#subdirectory=flash_attn/cute"
+pip install "git+https://github.com/Dao-AILab/flash-attention.git@940cd9680f3315f2f06b43ab5bea2c2cf2d96806#subdirectory=flash_attn/cute"
 ```
 The CuTe kernel JIT-compiles on first use. Verified on Blackwell (sm_100) against

{fastvideo_kernel-0.3.0 → fastvideo_kernel-0.3.2}/README.md RENAMED Viewed

@@ -39,13 +39,13 @@ fully usable without it).
 The symbols the fastpath needs (`flash_attn.cute.block_sparsity.BlockSparseTensorsTorch`,
 `flash_attn.cute.interface._flash_attn_fwd`) are provided upstream by
 [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention). Pin to
-commit `c19cd20e`: the wrapper targets that revision's `_flash_attn_fwd` signature
-(`m_block_size` / `n_block_size`); later upstream revisions reshaped it into a
-`tile_mn` tuple and are not drop-in compatible.
+commit `940cd9680f3315f2f06b43ab5bea2c2cf2d96806`, the revision FastVideo pins as
+the `flash-attn-4` source in the repo-root `pyproject.toml`; other revisions may
+have an incompatible `_flash_attn_fwd` signature.
 ```bash
 pip install "nvidia-cutlass-dsl>=4.5.0" torchvision
-pip install "git+https://github.com/Dao-AILab/flash-attention.git@c19cd20e#subdirectory=flash_attn/cute"
+pip install "git+https://github.com/Dao-AILab/flash-attention.git@940cd9680f3315f2f06b43ab5bea2c2cf2d96806#subdirectory=flash_attn/cute"
 ```
 The CuTe kernel JIT-compiles on first use. Verified on Blackwell (sm_100) against

{fastvideo_kernel-0.3.0 → fastvideo_kernel-0.3.2}/build.sh RENAMED Viewed

@@ -39,8 +39,13 @@ if [[ -n "${CONDA_PREFIX:-}" ]]; then
     unset _need_clean _host_arch
 fi
-# Ensure submodules are initialized if needed (tk)
-git submodule update --init --recursive
+# Ensure only the kernel's required headers are initialized. A repository-wide
+# update also clones the unrelated VBench evaluation submodule. Skip outside a
+# git checkout (e.g. Docker contexts that exclude .git), where the submodule
+# contents must already be present.
+if git rev-parse --git-dir >/dev/null 2>&1; then
+    git submodule update --init --recursive include/cutlass include/tk
+fi
 # Install build dependencies
 uv pip install scikit-build-core cmake ninja

{fastvideo_kernel-0.3.0 → fastvideo_kernel-0.3.2}/csrc/turbodiffusion/gemm/gemm.cu RENAMED Viewed

@@ -25,12 +25,16 @@
 #include "gemm/launch.hpp"
 void int8_gemm(
-  at::Tensor const& A, at::Tensor const& A_S,
-  at::Tensor const& B, at::Tensor const& B_S,
+  at::Tensor const& A, at::Tensor const& A_S,
+  at::Tensor const& B, at::Tensor const& B_S,
   torch::Tensor& C
 ) {
+  // The kernel dereferences raw pointers; a CPU tensor here (e.g. an Int8Linear
+  // never moved to CUDA) would otherwise fail as an illegal memory access.
+  TORCH_CHECK(A.is_cuda() && A_S.is_cuda() && B.is_cuda() && B_S.is_cuda() && C.is_cuda(),
+              "int8_gemm: all tensors must be on CUDA (move Int8Linear to CUDA before forward)");
   static constexpr int swizzle_dir = 1;
   static constexpr int swizzle_size_log = 5;

fastvideo_kernel-0.3.2/dist/fastvideo_kernel-0.3.2-cp312-cp312-manylinux_2_34_aarch64.manylinux_2_35_aarch64.whl ADDED Viewed

Binary file

fastvideo_kernel-0.3.0/dist/fastvideo_kernel-0.3.0-cp312-cp312-manylinux_2_34_x86_64.manylinux_2_35_x86_64.whl → fastvideo_kernel-0.3.2/dist/fastvideo_kernel-0.3.2-cp312-cp312-manylinux_2_34_x86_64.manylinux_2_35_x86_64.whl RENAMED Viewed

Binary file

{fastvideo_kernel-0.3.0 → fastvideo_kernel-0.3.2}/pyproject.toml RENAMED Viewed

@@ -9,7 +9,7 @@ build-backend = "scikit_build_core.build"
 [project]
 name = "fastvideo-kernel"
-version = "0.3.0"
+version = "0.3.2"
 description = "Unified CUDA kernels for FastVideo"
 readme = "README.md"
 requires-python = ">=3.10"

{fastvideo_kernel-0.3.0 → fastvideo_kernel-0.3.2}/python/fastvideo_kernel/__init__.py RENAMED Viewed

@@ -29,6 +29,15 @@ from fastvideo_kernel.block_sparse_attn_varlen import (
     block_sparse_attn_varlen,
 )
+from fastvideo_kernel.vsa_utils import (
+    VSA_TILE_SIZE,
+    get_tile_partition_indices,
+    get_reverse_tile_partition_indices,
+    construct_variable_block_sizes,
+    get_non_pad_index,
+    build_vsa_metadata,
+)
 __all__ = [
     "sliding_tile_attention",
     "video_sparse_attn",
@@ -44,5 +53,11 @@ __all__ = [
     "FastLayerNorm",
     "int8_linear",
     "int8_quant",
+    "VSA_TILE_SIZE",
+    "get_tile_partition_indices",
+    "get_reverse_tile_partition_indices",
+    "construct_variable_block_sizes",
+    "get_non_pad_index",
+    "build_vsa_metadata",
     "__version__",
 ]

{fastvideo_kernel-0.3.0 → fastvideo_kernel-0.3.2}/python/fastvideo_kernel/block_sparse_attn_cute_fwd.py RENAMED Viewed

@@ -46,8 +46,7 @@ def _load_fa4_cute():
     return BlockSparseTensorsTorch, _flash_attn_fwd
-# FA4 BSA fwd uses (m_block_size, n_block_size); m_block_size=128 is the
-# Q-side tile, kv_block_size comes from the caller's VSA logical KV block.
+# Q-side tile size; kv_block_size comes from the caller's VSA logical KV block.
 _M_BLOCK_SIZE_DEFAULT = 128
@@ -182,18 +181,18 @@ def _cute_forward(
         block_size=(q_sparse_block_size, kv_block_size),
     )
+    # _flash_attn_fwd returns (out, lse, p, row_max); keep the first two.
     out, lse = _flash_attn_fwd(
         q_bshd,
         k_bshd,
         v_bshd,
-        m_block_size=_M_BLOCK_SIZE_DEFAULT,
-        n_block_size=kv_block_size,
+        tile_mn=(_M_BLOCK_SIZE_DEFAULT, kv_block_size),
         mask_mod=_build_vbs_mask_mod(kv_block_size),
         block_sparse_tensors=sparse_tensors,
         aux_tensors=[variable_block_sizes],
         causal=False,
         return_lse=True,
-    )
+    )[:2]
     return out, lse

fastvideo_kernel-0.3.2/python/fastvideo_kernel/version.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "0.3.2"

fastvideo_kernel-0.3.2/python/fastvideo_kernel/vsa_utils.py ADDED Viewed

@@ -0,0 +1,160 @@
+"""VSA metadata utilities — standalone, no fastvideo framework dependency.
+Provides the tile-partition index helpers and variable-block-size
+computations that are needed to call `video_sparse_attn` or
+`block_sparse_attn_from_indices` without depending on the full
+fastvideo package.
+"""
+from __future__ import annotations
+import functools
+import math
+import torch
+VSA_TILE_SIZE = (4, 4, 4)
+_SUPPORTED_VSA_BLOCK_VOLUMES = (64, 256)
+def _canonicalize_device(device: torch.device | str) -> torch.device:
+    """Resolve an indexless CUDA device before it is used as a cache key."""
+    device = torch.device(device)
+    if device.type == "cuda" and device.index is None:
+        return torch.device("cuda", torch.cuda.current_device())
+    return device
+@functools.lru_cache(maxsize=10)
+def get_tile_partition_indices(
+    dit_seq_shape: tuple[int, int, int],
+    tile_size: tuple[int, int, int],
+    device: torch.device,
+) -> torch.LongTensor:
+    """Map raster-order token indices to tile-contiguous order.
+    Groups spatially adjacent tokens into (ts_t x ts_h x ts_w) tiles
+    so that each tile's tokens are contiguous in the output.
+    """
+    T, H, W = dit_seq_shape
+    ts, hs, ws = tile_size
+    indices = torch.arange(T * H * W, device=device, dtype=torch.long).reshape(T, H, W)
+    ls = []
+    for t in range(math.ceil(T / ts)):
+        for h in range(math.ceil(H / hs)):
+            for w in range(math.ceil(W / ws)):
+                ls.append(indices[
+                    t * ts:min(t * ts + ts, T),
+                    h * hs:min(h * hs + hs, H),
+                    w * ws:min(w * ws + ws, W),
+                ].flatten())
+    return torch.cat(ls, dim=0)
+@functools.lru_cache(maxsize=10)
+def get_reverse_tile_partition_indices(
+    dit_seq_shape: tuple[int, int, int],
+    tile_size: tuple[int, int, int],
+    device: torch.device,
+) -> torch.LongTensor:
+    """Inverse of get_tile_partition_indices: tile order back to raster."""
+    return torch.argsort(get_tile_partition_indices(dit_seq_shape, tile_size, device))
+@functools.lru_cache(maxsize=10)
+def construct_variable_block_sizes(
+    dit_seq_shape: tuple[int, int, int],
+    num_tiles: tuple[int, int, int],
+    device: torch.device,
+    tile_size: tuple[int, int, int] = VSA_TILE_SIZE,
+) -> torch.LongTensor:
+    """Compute the number of valid tokens in each tile.
+    Tiles at the boundary of each dimension may contain fewer tokens
+    when the video shape is not evenly divisible by tile_size.
+    """
+    t, h, w = dit_seq_shape
+    ts_t, ts_h, ts_w = tile_size
+    n_t, n_h, n_w = num_tiles
+    def _sizes(dim_len: int, tile: int, n: int) -> torch.LongTensor:
+        sizes = torch.full((n,), tile, dtype=torch.int, device=device)
+        remainder = dim_len - (n - 1) * tile
+        sizes[-1] = remainder if remainder > 0 else tile
+        return sizes
+    t_sizes = _sizes(t, ts_t, n_t)
+    h_sizes = _sizes(h, ts_h, n_h)
+    w_sizes = _sizes(w, ts_w, n_w)
+    return (
+        t_sizes[:, None, None]
+        * h_sizes[None, :, None]
+        * w_sizes[None, None, :]
+    ).reshape(-1)
+def get_non_pad_index(
+    variable_block_sizes: torch.LongTensor,
+    max_block_size: int,
+) -> torch.LongTensor:
+    """Find positions of real tokens within a block-padded layout.
+    Each block occupies max_block_size slots. This returns the flat
+    indices of the valid (non-padding) positions.
+    """
+    n_win = variable_block_sizes.shape[0]
+    device = variable_block_sizes.device
+    starts_pad = torch.arange(n_win, device=device) * max_block_size
+    index_pad = starts_pad[:, None] + torch.arange(max_block_size, device=device)[None, :]
+    index_mask = torch.arange(max_block_size, device=device)[None, :] < variable_block_sizes[:, None]
+    return index_pad[index_mask]
+def build_vsa_metadata(
+    dit_seq_shape: tuple[int, int, int],
+    tile_size: tuple[int, int, int] = VSA_TILE_SIZE,
+    device: torch.device | str = "cuda",
+) -> dict:
+    """Build all VSA metadata from a video latent shape in one call.
+    Args:
+        dit_seq_shape: (T, H, W) — temporal frames, spatial height, width.
+        tile_size: (ts_t, ts_h, ts_w) — tokens per tile in each dimension.
+            The resulting tile volume must be supported by the VSA kernels.
+        device: Target device for index tensors.
+    Returns:
+        Dict with keys: tile_partition_indices, reverse_tile_partition_indices,
+        variable_block_sizes, non_pad_index, num_tiles, max_block_size.
+    """
+    device = _canonicalize_device(device)
+    T, H, W = dit_seq_shape
+    ts_t, ts_h, ts_w = tile_size
+    max_block_size = math.prod(tile_size)
+    if max_block_size not in _SUPPORTED_VSA_BLOCK_VOLUMES:
+        raise ValueError(
+            f"Unsupported VSA tile volume {max_block_size} for tile_size={tile_size}; "
+            f"supported volumes are {_SUPPORTED_VSA_BLOCK_VOLUMES}."
+        )
+    num_tiles = (
+        math.ceil(T / ts_t),
+        math.ceil(H / ts_h),
+        math.ceil(W / ts_w),
+    )
+    tile_indices = get_tile_partition_indices(dit_seq_shape, tile_size, device)
+    reverse_tile_indices = get_reverse_tile_partition_indices(dit_seq_shape, tile_size, device)
+    vbs = construct_variable_block_sizes(dit_seq_shape, num_tiles, device, tile_size)
+    npi = get_non_pad_index(vbs, max_block_size)
+    return {
+        "tile_partition_indices": tile_indices,
+        "reverse_tile_partition_indices": reverse_tile_indices,
+        "variable_block_sizes": vbs,
+        "non_pad_index": npi,
+        "num_tiles": num_tiles,
+        "max_block_size": max_block_size,
+    }

{fastvideo_kernel-0.3.0 → fastvideo_kernel-0.3.2}/tests/test_attn_qat_infer.py RENAMED Viewed

@@ -18,10 +18,15 @@ import os
 import sys
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+import pytest
 import torch
 import torch.nn.functional as F
 from torch.nn.attention import SDPBackend, sdpa_kernel
+# The FP4 extensions are only compiled under the sm_120a (Blackwell) arch
+# gate; on other GPUs the api import below would die at collection time.
+pytest.importorskip("fp4attn_cuda", reason="ATTN_QAT_INFER FP4 kernels require a sm_120a build")
 from attn_qat_infer.api import sageattn_blackwell
 DEVICE = torch.device("cuda")

fastvideo_kernel-0.3.2/tests/test_vsa_utils.py ADDED Viewed

@@ -0,0 +1,276 @@
+"""Tests for vsa_utils — standalone VSA metadata utilities.
+These tests use CPU index computation except for an optional multi-GPU
+regression covering cache isolation between CUDA devices.
+"""
+import math
+import pytest
+import torch
+from fastvideo_kernel.vsa_utils import (
+    VSA_TILE_SIZE,
+    _canonicalize_device,
+    get_tile_partition_indices,
+    get_reverse_tile_partition_indices,
+    construct_variable_block_sizes,
+    get_non_pad_index,
+    build_vsa_metadata,
+)
+class TestDeviceCanonicalization:
+    def test_unindexed_cuda_resolves_current_device(self, monkeypatch):
+        monkeypatch.setattr(torch.cuda, "current_device", lambda: 3)
+        assert _canonicalize_device("cuda") == torch.device("cuda:3")
+        assert _canonicalize_device(torch.device("cuda")) == torch.device("cuda:3")
+    @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="requires two CUDA devices")
+    def test_metadata_cache_isolated_between_cuda_devices(self):
+        shape = (7, 8, 8)
+        tensor_keys = (
+            "tile_partition_indices",
+            "reverse_tile_partition_indices",
+            "variable_block_sizes",
+            "non_pad_index",
+        )
+        with torch.cuda.device(0):
+            metadata_0 = build_vsa_metadata(shape, device="cuda")
+        with torch.cuda.device(1):
+            metadata_1 = build_vsa_metadata(shape, device="cuda")
+        for key in tensor_keys:
+            assert metadata_0[key].device == torch.device("cuda:0")
+            assert metadata_1[key].device == torch.device("cuda:1")
+class TestGetTilePartitionIndices:
+    @pytest.mark.parametrize("dit_seq_shape,tile_size", [
+        ((8, 16, 16), (4, 4, 4)),
+        ((4, 8, 8), (4, 4, 4)),
+        ((9, 10, 7), (4, 4, 4)),
+    ])
+    def test_is_valid_permutation(self, dit_seq_shape, tile_size):
+        """Output must be a permutation of 0..N-1."""
+        device = torch.device("cpu")
+        idx = get_tile_partition_indices(dit_seq_shape, tile_size, device)
+        n = math.prod(dit_seq_shape)
+        assert idx.shape == (n,)
+        assert idx.dtype == torch.long
+        assert set(idx.tolist()) == set(range(n))
+    def test_exact_values_small(self):
+        """Manually verify a small case: (2,2,2) with tile (2,2,2) = 1 tile."""
+        device = torch.device("cpu")
+        idx = get_tile_partition_indices((2, 2, 2), (2, 2, 2), device)
+        assert idx.tolist() == list(range(8))
+    def test_non_divisible_shape(self):
+        """When shape doesn't divide evenly by tile_size, all tokens still covered."""
+        device = torch.device("cpu")
+        shape = (5, 7, 3)
+        idx = get_tile_partition_indices(shape, (4, 4, 4), device)
+        assert idx.shape == (5 * 7 * 3,)
+        assert set(idx.tolist()) == set(range(5 * 7 * 3))
+class TestGetReverseTilePartitionIndices:
+    @pytest.mark.parametrize("dit_seq_shape", [
+        (8, 16, 16),
+        (9, 10, 7),
+    ])
+    def test_inverse_of_forward(self, dit_seq_shape):
+        """reverse[forward[i]] == i for all i."""
+        device = torch.device("cpu")
+        tile_size = (4, 4, 4)
+        fwd = get_tile_partition_indices(dit_seq_shape, tile_size, device)
+        rev = get_reverse_tile_partition_indices(dit_seq_shape, tile_size, device)
+        n = math.prod(dit_seq_shape)
+        identity = torch.arange(n, device=device)
+        assert torch.equal(rev[fwd], identity)
+        assert torch.equal(fwd[rev], identity)
+class TestConstructVariableBlockSizes:
+    def test_sum_equals_total_tokens(self):
+        """Sum of block sizes must equal T*H*W."""
+        device = torch.device("cpu")
+        shape = (8, 16, 16)
+        tile_size = (4, 4, 4)
+        num_tiles = tuple(math.ceil(s / t) for s, t in zip(shape, tile_size))
+        vbs = construct_variable_block_sizes(shape, num_tiles, device, tile_size)
+        assert vbs.sum().item() == math.prod(shape)
+    def test_max_block_size(self):
+        """No block can exceed tile volume."""
+        device = torch.device("cpu")
+        shape = (9, 10, 7)
+        tile_size = (4, 4, 4)
+        num_tiles = tuple(math.ceil(s / t) for s, t in zip(shape, tile_size))
+        vbs = construct_variable_block_sizes(shape, num_tiles, device, tile_size)
+        assert vbs.max().item() <= math.prod(tile_size)
+    def test_num_blocks(self):
+        """Number of blocks = product of num_tiles."""
+        device = torch.device("cpu")
+        shape = (8, 16, 16)
+        tile_size = (4, 4, 4)
+        num_tiles = tuple(math.ceil(s / t) for s, t in zip(shape, tile_size))
+        vbs = construct_variable_block_sizes(shape, num_tiles, device, tile_size)
+        assert vbs.shape[0] == math.prod(num_tiles)
+    def test_exact_divisible(self):
+        """When perfectly divisible, all blocks have the same size."""
+        device = torch.device("cpu")
+        shape = (8, 8, 8)
+        tile_size = (4, 4, 4)
+        num_tiles = (2, 2, 2)
+        vbs = construct_variable_block_sizes(shape, num_tiles, device, tile_size)
+        assert (vbs == 64).all()
+    def test_non_divisible_last_tile_smaller(self):
+        """When not divisible, at least one block is smaller than max."""
+        device = torch.device("cpu")
+        shape = (9, 8, 8)
+        tile_size = (4, 4, 4)
+        num_tiles = (3, 2, 2)
+        vbs = construct_variable_block_sizes(shape, num_tiles, device, tile_size)
+        assert vbs.min().item() < math.prod(tile_size)
+    def test_custom_tile_size(self):
+        """tile_size parameter overrides default VSA_TILE_SIZE."""
+        device = torch.device("cpu")
+        shape = (6, 8, 16)
+        tile_size = (2, 4, 8)
+        num_tiles = (3, 2, 2)
+        vbs = construct_variable_block_sizes(shape, num_tiles, device, tile_size)
+        assert vbs.sum().item() == math.prod(shape)
+        assert (vbs == 64).all()
+class TestGetNonPadIndex:
+    def test_length_equals_sum_block_sizes(self):
+        """Output length must equal sum of variable_block_sizes."""
+        vbs = torch.tensor([32, 48, 64], dtype=torch.long)
+        idx = get_non_pad_index(vbs, 64)
+        assert idx.shape[0] == 32 + 48 + 64
+    def test_indices_in_valid_range(self):
+        """All indices must be in [0, num_blocks * max_block_size)."""
+        vbs = torch.tensor([32, 48], dtype=torch.long)
+        idx = get_non_pad_index(vbs, 64)
+        assert idx.min().item() >= 0
+        assert idx.max().item() < 2 * 64
+    def test_block_boundary_alignment(self):
+        """First token of block i starts at i * max_block_size."""
+        vbs = torch.tensor([20, 40], dtype=torch.long)
+        idx = get_non_pad_index(vbs, 64)
+        assert idx[0].item() == 0
+        assert idx[20].item() == 64
+    def test_full_blocks(self):
+        """When all blocks are full, output is just 0..N-1."""
+        vbs = torch.tensor([64, 64], dtype=torch.long)
+        idx = get_non_pad_index(vbs, 64)
+        assert torch.equal(idx, torch.arange(128))
+class TestBuildVsaMetadata:
+    def test_all_keys_present(self):
+        """build_vsa_metadata returns all expected keys."""
+        meta = build_vsa_metadata((8, 16, 16), device="cpu")
+        expected_keys = {
+            "tile_partition_indices", "reverse_tile_partition_indices",
+            "variable_block_sizes", "non_pad_index",
+            "num_tiles", "max_block_size",
+        }
+        assert set(meta.keys()) == expected_keys
+    def test_types(self):
+        """Return types are correct."""
+        meta = build_vsa_metadata((8, 16, 16), device="cpu")
+        assert isinstance(meta["tile_partition_indices"], torch.Tensor)
+        assert isinstance(meta["reverse_tile_partition_indices"], torch.Tensor)
+        assert isinstance(meta["variable_block_sizes"], torch.Tensor)
+        assert isinstance(meta["non_pad_index"], torch.Tensor)
+        assert isinstance(meta["num_tiles"], tuple)
+        assert isinstance(meta["max_block_size"], int)
+    def test_num_tiles_correct(self):
+        meta = build_vsa_metadata((9, 10, 7), tile_size=(4, 4, 4), device="cpu")
+        assert meta["num_tiles"] == (3, 3, 2)
+        assert meta["max_block_size"] == 64
+    @pytest.mark.parametrize("tile_size,expected_num_tiles,expected_block_size", [
+        ((2, 4, 8), (3, 2, 2), 64),
+        ((4, 8, 8), (2, 1, 2), 256),
+    ])
+    def test_supported_custom_tile_size(self, tile_size, expected_num_tiles, expected_block_size):
+        meta = build_vsa_metadata((6, 8, 16), tile_size=tile_size, device="cpu")
+        assert meta["num_tiles"] == expected_num_tiles
+        assert meta["max_block_size"] == expected_block_size
+    def test_unsupported_tile_volume(self):
+        with pytest.raises(ValueError, match="Unsupported VSA tile volume 27"):
+            build_vsa_metadata((6, 6, 6), tile_size=(3, 3, 3), device="cpu")
+    def test_consistency(self):
+        """All components are internally consistent."""
+        shape = (8, 16, 16)
+        meta = build_vsa_metadata(shape, device="cpu")
+        n = math.prod(shape)
+        assert meta["tile_partition_indices"].shape == (n,)
+        assert meta["reverse_tile_partition_indices"].shape == (n,)
+        assert meta["variable_block_sizes"].sum().item() == n
+        assert meta["non_pad_index"].shape[0] == n
+class TestConsistencyWithFramework:
+    """Verify vsa_utils matches the framework-level functions exactly.
+    Only runs if fastvideo is importable (skip otherwise).
+    """
+    @pytest.fixture(autouse=True)
+    def _skip_if_no_framework(self):
+        try:
+            from fastvideo.attention.backends.video_sparse_attn import (
+                get_tile_partition_indices as fw_get_tile,
+            )
+        except ImportError:
+            pytest.skip("fastvideo framework not installed")
+    @pytest.mark.parametrize("shape", [(8, 16, 16), (9, 10, 7)])
+    def test_tile_indices_match(self, shape):
+        from fastvideo.attention.backends.video_sparse_attn import (
+            get_tile_partition_indices as fw_get_tile,
+        )
+        device = torch.device("cpu")
+        tile_size = (4, 4, 4)
+        ours = get_tile_partition_indices(shape, tile_size, device)
+        theirs = fw_get_tile(shape, tile_size, device)
+        assert torch.equal(ours, theirs)
+    @pytest.mark.parametrize("shape", [(8, 16, 16), (9, 10, 7)])
+    def test_variable_block_sizes_match(self, shape):
+        from fastvideo.attention.backends.video_sparse_attn import (
+            construct_variable_block_sizes as fw_construct_vbs,
+        )
+        device = torch.device("cpu")
+        tile_size = (4, 4, 4)
+        num_tiles = tuple(math.ceil(s / t) for s, t in zip(shape, tile_size))
+        ours = construct_variable_block_sizes(shape, num_tiles, device, tile_size)
+        theirs = fw_construct_vbs(shape, num_tiles, device)
+        assert torch.equal(ours, theirs)
+if __name__ == "__main__":
+    pytest.main([__file__, "-v", "-s"])