PyPI - fastvideo-kernel - Versions diffs - 0.3.1__tar.gz → 0.3.2__tar.gz - Mend

fastvideo-kernel 0.3.1tar.gz → 0.3.2tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (91) hide show

{fastvideo_kernel-0.3.1 → fastvideo_kernel-0.3.2}/CMakeLists.txt RENAMED Viewed

@@ -1,6 +1,13 @@
 cmake_minimum_required(VERSION 3.26 FATAL_ERROR)
 project(fastvideo-kernel LANGUAGES CXX)
+# Capture any caller-provided -DCMAKE_CUDA_ARCHITECTURES *before* enable_language(CUDA)
+# auto-populates it with CMake's built-in default (an old arch, e.g. sm_75 on CUDA 13).
+# torch's cmake actually ignores CMAKE_CUDA_ARCHITECTURES (it drives arch selection via
+# TORCH_CUDA_ARCH_LIST), so we only use this captured value to honor an explicit pin by
+# translating it into TORCH_CUDA_ARCH_LIST below.
+set(_FASTVIDEO_USER_CUDA_ARCH "${CMAKE_CUDA_ARCHITECTURES}")
 # Prefer environment variable (used by CI or uv pip install git+repo_addr) if CMake var is not explicitly set.
 if(NOT DEFINED GPU_BACKEND AND DEFINED ENV{GPU_BACKEND})
     set(GPU_BACKEND "$ENV{GPU_BACKEND}")
@@ -19,6 +26,71 @@ endif()
 # Find Python and Torch
 find_package(Python COMPONENTS Interpreter Development.Module REQUIRED)
+# ---------------------------------------------------------------------------
+# Resolve the target CUDA architecture, BEFORE find_package(Torch) below.
+#
+# torch's cmake (Caffe2 public/cuda.cmake) takes over arch selection: it emits
+# the real -gencode flags from TORCH_CUDA_ARCH_LIST and forces
+# CMAKE_CUDA_ARCHITECTURES to OFF. So the *effective* arch is whatever
+# TORCH_CUDA_ARCH_LIST is when find_package(Torch) runs. build.sh exports it;
+# standards-based builds (pip / uv pip install, sdist) don't, and torch then
+# auto-detects an arch that does not match the GPU -- the kernels build but fail
+# at runtime ("no kernel image is available for execution on the device").
+# Resolve it here when absent (mirrors build.sh): honor a pinned
+# CMAKE_CUDA_ARCHITECTURES if given, else probe the visible GPU with torch.
+# ---------------------------------------------------------------------------
+if(NOT GPU_BACKEND STREQUAL "ROCM")
+    # Resolution order (first match wins):
+    #   1. TORCH_CUDA_ARCH_LIST exported in the environment -> used untouched
+    #   2. -DTORCH_CUDA_ARCH_LIST=... on the cmake command line -> copied into the environment
+    #   3. -DCMAKE_CUDA_ARCHITECTURES=... (captured above)     -> translated to torch spelling
+    #   4. live GPU probe through torch
+    #   5. otherwise: hard error, never a silent guess
+    if(DEFINED ENV{TORCH_CUDA_ARCH_LIST})
+        message(STATUS "CUDA arch: TORCH_CUDA_ARCH_LIST=$ENV{TORCH_CUDA_ARCH_LIST} (from environment)")
+    elseif(TORCH_CUDA_ARCH_LIST)
+        # Mirror the cmake variable into this process's environment as well.
+        set(ENV{TORCH_CUDA_ARCH_LIST} "${TORCH_CUDA_ARCH_LIST}")
+        message(STATUS "CUDA arch: TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} (from cmake)")
+    else()
+        set(_FV_ARCH_LIST "")
+        if(_FASTVIDEO_USER_CUDA_ARCH)
+            # Caller pinned -DCMAKE_CUDA_ARCHITECTURES (which torch ignores); translate it
+            # to the TORCH_CUDA_ARCH_LIST spelling: "121" -> "12.1", "90a" -> "9.0a".
+            # CMake also accepts "-real"/"-virtual" qualifiers and the keywords
+            # "native"/"all"/"all-major"; the qualifiers are dropped and anything that is
+            # not <digits>[a|f] is skipped with a warning instead of being mangled
+            # (a blind "dot before the last character" would turn "native" into "nativ.e").
+            foreach(_fv_arch IN LISTS _FASTVIDEO_USER_CUDA_ARCH)
+                string(REGEX REPLACE "-(real|virtual)$" "" _fv_base "${_fv_arch}")
+                if("${_fv_base}" MATCHES "^([0-9]+)([0-9])([af]?)$")
+                    # CMAKE_MATCH_1 = major digits, _2 = minor digit, _3 = optional a/f suffix.
+                    list(APPEND _FV_ARCH_LIST "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}${CMAKE_MATCH_3}")
+                else()
+                    message(WARNING
+                        "fastvideo-kernel: ignoring CMAKE_CUDA_ARCHITECTURES entry '${_fv_arch}': "
+                        "it cannot be translated to a TORCH_CUDA_ARCH_LIST value.")
+                endif()
+            endforeach()
+            if(_FV_ARCH_LIST)
+                # "90-real;90-virtual" collapses to a single "9.0".
+                list(REMOVE_DUPLICATES _FV_ARCH_LIST)
+                message(STATUS "CUDA arch: TORCH_CUDA_ARCH_LIST=${_FV_ARCH_LIST} (from -DCMAKE_CUDA_ARCHITECTURES=${_FASTVIDEO_USER_CUDA_ARCH})")
+            endif()
+        endif()
+        if(NOT _FV_ARCH_LIST)
+            # No usable pin (none given, or only untranslatable entries such as "native"):
+            # best-effort probe of the visible GPU (mirrors build.sh detect_with_torch).
+            # The Python snippet prints the capability of device 0 and appends "a" for
+            # (9, 0) and (12, 0); a failed import/assert yields a non-zero exit code.
+            execute_process(
+                COMMAND "${Python_EXECUTABLE}" -c "import torch; assert torch.cuda.is_available(); mj, mn = torch.cuda.get_device_capability(0); print(f'{mj}.{mn}a' if (mj, mn) in ((9, 0), (12, 0)) else f'{mj}.{mn}')"
+                OUTPUT_VARIABLE _FV_ARCH_LIST
+                OUTPUT_STRIP_TRAILING_WHITESPACE
+                RESULT_VARIABLE _fv_detect_rc
+                ERROR_QUIET
+            )
+            if(_fv_detect_rc EQUAL 0 AND _FV_ARCH_LIST)
+                message(STATUS "CUDA arch: TORCH_CUDA_ARCH_LIST=${_FV_ARCH_LIST} (detected via torch, live GPU)")
+            else()
+                # Discard partial output from a failed probe.
+                set(_FV_ARCH_LIST "")
+            endif()
+        endif()
+        if(_FV_ARCH_LIST)
+            # Publish both as a cmake variable and in the environment.
+            set(TORCH_CUDA_ARCH_LIST "${_FV_ARCH_LIST}")
+            set(ENV{TORCH_CUDA_ARCH_LIST} "${_FV_ARCH_LIST}")
+        else()
+            message(FATAL_ERROR
+                "fastvideo-kernel: could not determine the target CUDA architecture.\n"
+                "Refusing to let torch auto-detect an arch that may not run on this GPU. "
+                "Fix with one of:\n"
+                "  - set TORCH_CUDA_ARCH_LIST (e.g. 12.1, or 9.0a for Hopper), or\n"
+                "  - pass -DCMAKE_CUDA_ARCHITECTURES=<arch> (e.g. 121), or\n"
+                "  - build where the target GPU is visible to torch.\n"
+                "Note: 'pip/uv pip install' builds under build isolation, which hides the "
+                "GPU; set TORCH_CUDA_ARCH_LIST or add --no-build-isolation. "
+                "fastvideo-kernel/build.sh sets all of this for you.")
+        endif()
+    endif()
+endif()
 # Robustly find Torch include paths using Python
 execute_process(
     COMMAND "${Python_EXECUTABLE}" -c "import torch; from torch.utils.cpp_extension import include_paths; print(';'.join(include_paths()))"
@@ -354,4 +426,3 @@ if(ENABLE_ATTN_QAT_INFER)
     install(TARGETS fp4attn_cuda LIBRARY DESTINATION .)
     install(TARGETS fp4quant_cuda LIBRARY DESTINATION .)
 endif()

{fastvideo_kernel-0.3.1 → fastvideo_kernel-0.3.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: fastvideo-kernel
-Version: 0.3.1
+Version: 0.3.2
 Summary: Unified CUDA kernels for FastVideo
 Author-Email: Hao AI Lab <contact@haoailab.com>
 License:                                  Apache License
@@ -239,13 +239,13 @@ fully usable without it).
 The symbols the fastpath needs (`flash_attn.cute.block_sparsity.BlockSparseTensorsTorch`,
 `flash_attn.cute.interface._flash_attn_fwd`) are provided upstream by
 [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention). Pin to
-commit `c19cd20e`: the wrapper targets that revision's `_flash_attn_fwd` signature
-(`m_block_size` / `n_block_size`); later upstream revisions reshaped it into a
-`tile_mn` tuple and are not drop-in compatible.
+commit `940cd9680f3315f2f06b43ab5bea2c2cf2d96806`, the revision FastVideo pins as
+the `flash-attn-4` source in the repo-root `pyproject.toml`; other revisions may
+have an incompatible `_flash_attn_fwd` signature.
 ```bash
 pip install "nvidia-cutlass-dsl>=4.5.0" torchvision
-pip install "git+https://github.com/Dao-AILab/flash-attention.git@c19cd20e#subdirectory=flash_attn/cute"
+pip install "git+https://github.com/Dao-AILab/flash-attention.git@940cd9680f3315f2f06b43ab5bea2c2cf2d96806#subdirectory=flash_attn/cute"
 ```
 The CuTe kernel JIT-compiles on first use. Verified on Blackwell (sm_100) against

{fastvideo_kernel-0.3.1 → fastvideo_kernel-0.3.2}/README.md RENAMED Viewed

@@ -39,13 +39,13 @@ fully usable without it).
 The symbols the fastpath needs (`flash_attn.cute.block_sparsity.BlockSparseTensorsTorch`,
 `flash_attn.cute.interface._flash_attn_fwd`) are provided upstream by
 [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention). Pin to
-commit `c19cd20e`: the wrapper targets that revision's `_flash_attn_fwd` signature
-(`m_block_size` / `n_block_size`); later upstream revisions reshaped it into a
-`tile_mn` tuple and are not drop-in compatible.
+commit `940cd9680f3315f2f06b43ab5bea2c2cf2d96806`, the revision FastVideo pins as
+the `flash-attn-4` source in the repo-root `pyproject.toml`; other revisions may
+have an incompatible `_flash_attn_fwd` signature.
 ```bash
 pip install "nvidia-cutlass-dsl>=4.5.0" torchvision
-pip install "git+https://github.com/Dao-AILab/flash-attention.git@c19cd20e#subdirectory=flash_attn/cute"
+pip install "git+https://github.com/Dao-AILab/flash-attention.git@940cd9680f3315f2f06b43ab5bea2c2cf2d96806#subdirectory=flash_attn/cute"
 ```
 The CuTe kernel JIT-compiles on first use. Verified on Blackwell (sm_100) against

{fastvideo_kernel-0.3.1 → fastvideo_kernel-0.3.2}/build.sh RENAMED Viewed

@@ -39,8 +39,13 @@ if [[ -n "${CONDA_PREFIX:-}" ]]; then
     unset _need_clean _host_arch
 fi
-# Ensure submodules are initialized if needed (tk)
-git submodule update --init --recursive
+# Ensure only the kernel's required headers are initialized. A repository-wide
+# update also clones the unrelated VBench evaluation submodule. Skip outside a
+# git checkout (e.g. Docker contexts that exclude .git), where the submodule
+# contents must already be present.
+if git rev-parse --git-dir >/dev/null 2>&1; then
+    git submodule update --init --recursive include/cutlass include/tk
+fi
 # Install build dependencies
 uv pip install scikit-build-core cmake ninja

{fastvideo_kernel-0.3.1 → fastvideo_kernel-0.3.2}/csrc/turbodiffusion/gemm/gemm.cu RENAMED Viewed

@@ -25,12 +25,16 @@
 #include "gemm/launch.hpp"
 void int8_gemm(
-  at::Tensor const& A, at::Tensor const& A_S,
-  at::Tensor const& B, at::Tensor const& B_S,
+  at::Tensor const& A, at::Tensor const& A_S,
+  at::Tensor const& B, at::Tensor const& B_S,
   torch::Tensor& C
 ) {
+  // The kernel dereferences raw pointers; a CPU tensor here (e.g. an Int8Linear
+  // never moved to CUDA) would otherwise fail as an illegal memory access.
+  TORCH_CHECK(A.is_cuda() && A_S.is_cuda() && B.is_cuda() && B_S.is_cuda() && C.is_cuda(),
+              "int8_gemm: all tensors must be on CUDA (move Int8Linear to CUDA before forward)");
   static constexpr int swizzle_dir = 1;
   static constexpr int swizzle_size_log = 5;

fastvideo_kernel-0.3.1/dist/fastvideo_kernel-0.3.1-cp312-cp312-manylinux_2_34_aarch64.manylinux_2_35_aarch64.whl → fastvideo_kernel-0.3.2/dist/fastvideo_kernel-0.3.2-cp312-cp312-manylinux_2_34_aarch64.manylinux_2_35_aarch64.whl RENAMED Viewed

Binary file

fastvideo_kernel-0.3.1/dist/fastvideo_kernel-0.3.1-cp312-cp312-manylinux_2_34_x86_64.manylinux_2_35_x86_64.whl → fastvideo_kernel-0.3.2/dist/fastvideo_kernel-0.3.2-cp312-cp312-manylinux_2_34_x86_64.manylinux_2_35_x86_64.whl RENAMED Viewed

Binary file

{fastvideo_kernel-0.3.1 → fastvideo_kernel-0.3.2}/pyproject.toml RENAMED Viewed

@@ -9,7 +9,7 @@ build-backend = "scikit_build_core.build"
 [project]
 name = "fastvideo-kernel"
-version = "0.3.1"
+version = "0.3.2"
 description = "Unified CUDA kernels for FastVideo"
 readme = "README.md"
 requires-python = ">=3.10"

{fastvideo_kernel-0.3.1 → fastvideo_kernel-0.3.2}/python/fastvideo_kernel/block_sparse_attn_cute_fwd.py RENAMED Viewed

@@ -46,8 +46,7 @@ def _load_fa4_cute():
     return BlockSparseTensorsTorch, _flash_attn_fwd
-# FA4 BSA fwd uses (m_block_size, n_block_size); m_block_size=128 is the
-# Q-side tile, kv_block_size comes from the caller's VSA logical KV block.
+# Q-side tile size; kv_block_size comes from the caller's VSA logical KV block.
 _M_BLOCK_SIZE_DEFAULT = 128
@@ -182,18 +181,18 @@ def _cute_forward(
         block_size=(q_sparse_block_size, kv_block_size),
     )
+    # _flash_attn_fwd returns (out, lse, p, row_max); keep the first two.
     out, lse = _flash_attn_fwd(
         q_bshd,
         k_bshd,
         v_bshd,
-        m_block_size=_M_BLOCK_SIZE_DEFAULT,
-        n_block_size=kv_block_size,
+        tile_mn=(_M_BLOCK_SIZE_DEFAULT, kv_block_size),
         mask_mod=_build_vbs_mask_mod(kv_block_size),
         block_sparse_tensors=sparse_tensors,
         aux_tensors=[variable_block_sizes],
         causal=False,
         return_lse=True,
-    )
+    )[:2]
     return out, lse

fastvideo_kernel-0.3.2/python/fastvideo_kernel/version.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "0.3.2"

{fastvideo_kernel-0.3.1 → fastvideo_kernel-0.3.2}/tests/test_attn_qat_infer.py RENAMED Viewed

@@ -18,10 +18,15 @@ import os
 import sys
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+import pytest
 import torch
 import torch.nn.functional as F
 from torch.nn.attention import SDPBackend, sdpa_kernel
+# The FP4 extensions are only compiled under the sm_120a (Blackwell) arch
+# gate; on other GPUs the api import below would die at collection time.
+pytest.importorskip("fp4attn_cuda", reason="ATTN_QAT_INFER FP4 kernels require a sm_120a build")
 from attn_qat_infer.api import sageattn_blackwell
 DEVICE = torch.device("cuda")