PyPI - fastvideo-kernel - Versions diffs - 0.2.6__tar.gz → 0.3.0__tar.gz - Mend

fastvideo-kernel 0.2.6tar.gz → 0.3.0tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (94) hide show

fastvideo_kernel-0.3.0/CMakeLists.txt ADDED Viewed

@@ -0,0 +1,352 @@
+cmake_minimum_required(VERSION 3.26 FATAL_ERROR)
+project(fastvideo-kernel LANGUAGES CXX)
+# GPU backend selection. An explicit -DGPU_BACKEND=... wins; otherwise fall back
+# to the GPU_BACKEND environment variable (used by CI or uv pip install git+repo_addr).
+if(DEFINED ENV{GPU_BACKEND} AND NOT DEFINED GPU_BACKEND)
+    set(GPU_BACKEND "$ENV{GPU_BACKEND}")
+endif()
+if(NOT "${GPU_BACKEND}" STREQUAL "ROCM")
+    # Every value other than ROCM selects the CUDA toolchain.
+    enable_language(CUDA)
+    # Ensure CUDA toolkit targets (CUDA::cudart, CUDA::cuda_driver, etc.) are available.
+    find_package(CUDAToolkit REQUIRED)
+else()
+    enable_language(HIP)
+endif()
+# Import common utils if needed, but we keep it simple for now
+# Find Python and Torch
+find_package(Python COMPONENTS Interpreter Development.Module REQUIRED)
+# Find Torch package (still useful for libraries). This runs before the
+# Python-derived include paths are appended because find_package(Torch) may
+# define TORCH_INCLUDE_DIRS itself, which would discard anything appended earlier.
+find_package(Torch REQUIRED)
+# Robustly find Torch include paths using Python
+execute_process(
+    COMMAND "${Python_EXECUTABLE}" -c "import torch; from torch.utils.cpp_extension import include_paths; print(';'.join(include_paths()))"
+    RESULT_VARIABLE TORCH_INCLUDE_PATHS_RESULT
+    OUTPUT_VARIABLE TORCH_INCLUDE_PATHS
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+if(NOT "${TORCH_INCLUDE_PATHS_RESULT}" STREQUAL "0")
+    # Best-effort probe: report the failure instead of silently using whatever
+    # partial output was captured, then continue with the package-provided paths.
+    message(WARNING
+        "Could not query Torch include paths via ${Python_EXECUTABLE} "
+        "(result: ${TORCH_INCLUDE_PATHS_RESULT}); relying on find_package(Torch) only.")
+    set(TORCH_INCLUDE_PATHS "")
+endif()
+list(APPEND TORCH_INCLUDE_DIRS ${TORCH_INCLUDE_PATHS})
+# Include directories
+# Directory-scoped: every target defined later in this file inherits these
+# search paths (project headers under include/, the cutlass and tk trees below
+# it, the csrc sources, and the Torch include paths gathered in TORCH_INCLUDE_DIRS).
+include_directories(
+    ${CMAKE_SOURCE_DIR}/include
+    ${CMAKE_SOURCE_DIR}/include/cutlass/include
+    ${CMAKE_SOURCE_DIR}/include/tk/include
+    ${CMAKE_SOURCE_DIR}/include/tk/prototype
+    ${CMAKE_SOURCE_DIR}/csrc
+    ${CMAKE_SOURCE_DIR}/csrc/turbodiffusion
+    ${TORCH_INCLUDE_DIRS}
+)
+# ---------------------------
+# ThunderKittens (TK) toggles
+# ---------------------------
+# AUTO: enable TK only when we can confidently target Hopper (sm_90a).
+# ON:   force-enable TK kernels (intended for release wheels/images; does NOT require a GPU).
+# OFF:  never build TK kernels.
+set(FASTVIDEO_KERNEL_BUILD_TK "AUTO" CACHE STRING "Build ThunderKittens kernels: AUTO/ON/OFF")
+set_property(CACHE FASTVIDEO_KERNEL_BUILD_TK PROPERTY STRINGS AUTO ON OFF)
+# attn_qat_infer toggle (AUTO/ON/OFF). The deprecated
+# FASTVIDEO_KERNEL_BUILD_MODIFIED_SAGE3 only seeds the default while the new
+# option has no cache entry yet; an existing cache entry for
+# FASTVIDEO_KERNEL_BUILD_ATTN_QAT_INFER is left untouched by the set() below.
+set(_FASTVIDEO_KERNEL_BUILD_ATTN_QAT_INFER_DEFAULT "AUTO")
+if(DEFINED FASTVIDEO_KERNEL_BUILD_MODIFIED_SAGE3 AND NOT DEFINED CACHE{FASTVIDEO_KERNEL_BUILD_ATTN_QAT_INFER})
+    set(_FASTVIDEO_KERNEL_BUILD_ATTN_QAT_INFER_DEFAULT "${FASTVIDEO_KERNEL_BUILD_MODIFIED_SAGE3}")
+endif()
+set(FASTVIDEO_KERNEL_BUILD_ATTN_QAT_INFER "${_FASTVIDEO_KERNEL_BUILD_ATTN_QAT_INFER_DEFAULT}" CACHE STRING
+    "Build attn_qat_infer Blackwell inference kernels: AUTO/ON/OFF")
+set_property(CACHE FASTVIDEO_KERNEL_BUILD_ATTN_QAT_INFER PROPERTY STRINGS AUTO ON OFF)
+# Warn whenever the old option name is still being passed, regardless of
+# whether its value ended up being used.
+if(DEFINED FASTVIDEO_KERNEL_BUILD_MODIFIED_SAGE3)
+    message(DEPRECATION
+        "FASTVIDEO_KERNEL_BUILD_MODIFIED_SAGE3 is deprecated. "
+        "Use FASTVIDEO_KERNEL_BUILD_ATTN_QAT_INFER instead.")
+endif()
+# Prefer environment variable (used by CI) if CMake var is not explicitly set.
+if(NOT DEFINED TORCH_CUDA_ARCH_LIST AND DEFINED ENV{TORCH_CUDA_ARCH_LIST})
+    set(TORCH_CUDA_ARCH_LIST "$ENV{TORCH_CUDA_ARCH_LIST}")
+endif()
+# Echo the resolved inputs so the configure log shows what drove the decisions below.
+message(STATUS "TORCH_CUDA_ARCH_LIST (cmake/env): ${TORCH_CUDA_ARCH_LIST}")
+message(STATUS "FASTVIDEO_KERNEL_BUILD_TK: ${FASTVIDEO_KERNEL_BUILD_TK}")
+message(STATUS "FASTVIDEO_KERNEL_BUILD_ATTN_QAT_INFER: ${FASTVIDEO_KERNEL_BUILD_ATTN_QAT_INFER}")
+# Resolve FASTVIDEO_KERNEL_BUILD_TK (ON/OFF/AUTO) into the boolean ENABLE_TK_KERNELS.
+set(ENABLE_TK_KERNELS OFF)
+if(FASTVIDEO_KERNEL_BUILD_TK STREQUAL "ON")
+    set(ENABLE_TK_KERNELS ON)
+elseif(FASTVIDEO_KERNEL_BUILD_TK STREQUAL "OFF")
+    set(ENABLE_TK_KERNELS OFF)
+else()
+    # AUTO: detect Hopper if possible.
+    if(TORCH_CUDA_ARCH_LIST)
+        # Accept common spellings: 9.0a, 90a, sm_90a.
+        string(REGEX MATCH "(^|[; ,])((9\\.0a)|(90a)|(sm_90a))([; ,]|$)" _HAS_90A "${TORCH_CUDA_ARCH_LIST}")
+        if(_HAS_90A)
+            set(ENABLE_TK_KERNELS ON)
+        endif()
+    else()
+        # Best-effort local detection (works when a CUDA device is visible).
+        # Require compute-capability major == 9 exactly: the TK kernels are
+        # built for sm_90a, so a newer (e.g. Blackwell) device must not switch
+        # them on. Any probe failure leaves the output empty and TK disabled.
+        execute_process(
+            COMMAND "${Python_EXECUTABLE}" -c "import torch; print('1' if (torch.cuda.is_available() and torch.version.cuda and torch.cuda.get_device_capability()[0] == 9) else '0')"
+            OUTPUT_VARIABLE _LOCAL_HAS_HOPPER
+            OUTPUT_STRIP_TRAILING_WHITESPACE
+            ERROR_QUIET
+        )
+        if(_LOCAL_HAS_HOPPER STREQUAL "1")
+            set(ENABLE_TK_KERNELS ON)
+        endif()
+    endif()
+endif()
+if(ENABLE_TK_KERNELS)
+    message(STATUS "ThunderKittens kernels: ENABLED")
+else()
+    message(STATUS "ThunderKittens kernels: DISABLED (will use Triton fallbacks at runtime)")
+endif()
+# Resolve FASTVIDEO_KERNEL_BUILD_ATTN_QAT_INFER (ON/OFF/AUTO) into
+# ENABLE_ATTN_QAT_INFER. A ROCm build always leaves it OFF.
+set(ENABLE_ATTN_QAT_INFER OFF)
+if(NOT GPU_BACKEND STREQUAL "ROCM")
+    # Stage 1: is the build being asked for the Blackwell FP4 kernels?
+    set(_attn_qat_requested OFF)
+    if(FASTVIDEO_KERNEL_BUILD_ATTN_QAT_INFER STREQUAL "ON")
+        set(_attn_qat_requested ON)
+    elseif(FASTVIDEO_KERNEL_BUILD_ATTN_QAT_INFER STREQUAL "AUTO")
+        if(NOT TORCH_CUDA_ARCH_LIST)
+            # No explicit arch list: probe the locally visible device instead.
+            execute_process(
+                COMMAND "${Python_EXECUTABLE}" -c
+                "import torch; print('1' if (torch.cuda.is_available() and torch.version.cuda and torch.cuda.get_device_capability()[0] >= 12) else '0')"
+                OUTPUT_VARIABLE _probe_blackwell
+                OUTPUT_STRIP_TRAILING_WHITESPACE
+                ERROR_QUIET
+            )
+            if(_probe_blackwell STREQUAL "1")
+                set(_attn_qat_requested ON)
+            endif()
+        elseif(TORCH_CUDA_ARCH_LIST MATCHES "(^|[; ,])((12\\.0a)|(120a)|(sm_120a))([; ,]|$)")
+            # Explicit arch list naming 12.0a / 120a / sm_120a.
+            set(_attn_qat_requested ON)
+        endif()
+    endif()
+    # Stage 2: a request is only honoured with a new enough CUDA toolkit.
+    if(_attn_qat_requested AND CUDAToolkit_VERSION VERSION_LESS 12.8)
+        message(WARNING
+            "attn_qat_infer kernels require CUDA Toolkit 12.8+. "
+            "Skipping because CUDAToolkit_VERSION=${CUDAToolkit_VERSION}.")
+    elseif(_attn_qat_requested)
+        set(ENABLE_ATTN_QAT_INFER ON)
+    endif()
+    if(NOT ENABLE_ATTN_QAT_INFER)
+        message(STATUS
+            "attn_qat_infer kernels: DISABLED "
+            "(requires CUDA 12.8+ and Blackwell sm_120a)")
+    else()
+        message(STATUS "attn_qat_infer kernels: ENABLED")
+    endif()
+else()
+    message(STATUS "attn_qat_infer kernels: DISABLED (ROCm build)")
+endif()
+# Always try to build the extension if CUDA is available, but conditionally add sources/flags
+set(BUILD_CXX_KERNELS ON)
+# ---------------------------------------------------------------------------
+# Per-arch split for the Blackwell FP4 (attn_qat_infer) build
+# ---------------------------------------------------------------------------
+# The FP4 kernels are sm_120a-only (they emit `cvt.e2m1x2` etc.), while the main
+# extension (Hopper-only TK + generic turbodiffusion) targets the full arch list.
+# find_package(Torch) injects ONE global -gencode list into CMAKE_CUDA_FLAGS that
+# forces every target onto every arch, so the FP4 sources also get the sm_90a pass
+# and ptxas rejects their Blackwell instructions. Strip that global list and drive
+# arch per target via CUDA_ARCHITECTURES instead (the fp4* targets pin 120a below;
+# the main extension gets the full list). Only do this for the FP4 build with an
+# explicit arch list, so the cu126 / local autodetect paths stay untouched.
+if(ENABLE_ATTN_QAT_INFER AND TORCH_CUDA_ARCH_LIST)
+    message(STATUS "[per-arch] CMAKE_CUDA_FLAGS before strip: ${CMAKE_CUDA_FLAGS}")
+    string(REGEX REPLACE "-gencode[ =]+arch=[^ ]+" "" CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
+    string(REGEX REPLACE " +" " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
+    message(STATUS "[per-arch] CMAKE_CUDA_FLAGS after strip:  ${CMAKE_CUDA_FLAGS}")
+    # Convert TORCH_CUDA_ARCH_LIST ("9.0a;12.0a") to CMake form ("90a;120a").
+    set(FASTVIDEO_MAIN_CUDA_ARCHS "${TORCH_CUDA_ARCH_LIST}")
+    # The detection regexes accept space- and comma-separated lists as well, so
+    # normalise every separator run to ';' and drop empty entries; otherwise
+    # "9.0a 12.0a" would become the single invalid architecture "90a 120a".
+    string(REGEX REPLACE "[; ,]+" ";" FASTVIDEO_MAIN_CUDA_ARCHS "${FASTVIDEO_MAIN_CUDA_ARCHS}")
+    list(REMOVE_ITEM FASTVIDEO_MAIN_CUDA_ARCHS "")
+    # A bare CUDA_ARCHITECTURES entry already emits PTX next to the SASS, so the
+    # Torch-style "+PTX" suffix is dropped rather than passed through.
+    string(REPLACE "+PTX" "" FASTVIDEO_MAIN_CUDA_ARCHS "${FASTVIDEO_MAIN_CUDA_ARCHS}")
+    string(REPLACE "sm_" "" FASTVIDEO_MAIN_CUDA_ARCHS "${FASTVIDEO_MAIN_CUDA_ARCHS}")
+    string(REPLACE "." "" FASTVIDEO_MAIN_CUDA_ARCHS "${FASTVIDEO_MAIN_CUDA_ARCHS}")
+    message(STATUS "[per-arch] main extension archs=${FASTVIDEO_MAIN_CUDA_ARCHS}, fp4* archs=120a")
+endif()
+# Compiler flags
+# Options collected here are later attached to the main extension target for
+# CUDA sources only (via a COMPILE_LANGUAGE:CUDA generator expression).
+set(CUDA_FLAGS
+    "-DNDEBUG"
+    "-O3"
+    "-std=c++20"
+    "--use_fast_math"
+    "--expt-extended-lambda"
+    "--expt-relaxed-constexpr"
+    "-Xcompiler=-fno-strict-aliasing"
+    "-Xcompiler=-fPIC"
+    "-DTORCH_COMPILE"
+    "-Xnvlink=--verbose"
+    "-Xptxas=--verbose"
+    "-Xptxas=--warn-on-spills"
+)
+# If TK is enabled, ensure we target Hopper. This is required even on GPU-less builders (CI).
+if(ENABLE_TK_KERNELS)
+    # Only seed the architecture when none is defined or it is empty; an
+    # existing non-empty CMAKE_CUDA_ARCHITECTURES is left as is.
+    if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "")
+        set(CMAKE_CUDA_ARCHITECTURES "90a" CACHE STRING "CUDA architectures" FORCE)
+    endif()
+    # Preprocessor define passed to the CUDA sources when TK is built.
+    list(APPEND CUDA_FLAGS "-DKITTENS_HOPPER")
+    message(STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}")
+endif()
+# Main extension module: fastvideo_kernel_ops (turbodiffusion kernels plus,
+# when enabled, the ThunderKittens attention kernels).
+if(BUILD_CXX_KERNELS)
+    # Source files
+    set(EXTENSION_SOURCES
+        csrc/common_extension.cpp
+        csrc/turbodiffusion/gemm/gemm.cu
+        csrc/turbodiffusion/norm/rmsnorm.cu
+        csrc/turbodiffusion/norm/layernorm.cu
+        csrc/turbodiffusion/quant/quant.cu
+    )
+    # Conditionally add TK kernels
+    if(ENABLE_TK_KERNELS)
+        list(APPEND EXTENSION_SOURCES
+            csrc/attention/st_attn_h100.cu
+            csrc/attention/block_sparse_h100.cu
+        )
+    endif()
+    # Combined FastVideo Extension
+    # Using name 'fastvideo_kernel_ops' to distinguish from the python package namespace
+    Python_add_library(fastvideo_kernel_ops MODULE USE_SABI ${SKBUILD_SABI_VERSION} WITH_SOABI
+        ${EXTENSION_SOURCES}
+    )
+    # When the per-arch split is active (FP4 build), torch's global gencode was
+    # stripped above, so set this target's arch explicitly. TK is guarded down to
+    # sm_90a in source; the turbodiffusion kernels are generic, so the main
+    # extension targets the full list. (No-FP4 builds keep torch's global gencode.)
+    if(ENABLE_ATTN_QAT_INFER AND FASTVIDEO_MAIN_CUDA_ARCHS)
+        set_target_properties(fastvideo_kernel_ops PROPERTIES
+            CUDA_ARCHITECTURES "${FASTVIDEO_MAIN_CUDA_ARCHS}")
+    endif()
+    # Build compile definitions list
+    set(COMPILE_DEFS TORCH_EXTENSION_NAME=fastvideo_kernel_ops)
+    if(ENABLE_TK_KERNELS)
+        list(APPEND COMPILE_DEFS TK_COMPILE_ST_ATTN TK_COMPILE_BLOCK_SPARSE)
+    endif()
+    target_compile_definitions(fastvideo_kernel_ops PRIVATE ${COMPILE_DEFS})
+    # Quoted so the ';'-separated flag list stays inside one generator
+    # expression instead of being split into separate command arguments.
+    target_compile_options(fastvideo_kernel_ops PRIVATE
+        "$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>"
+    )
+    # Link against Torch libraries to avoid undefined symbols at import time
+    # (e.g., torch::autograd vtables) when loading the extension module.
+    target_link_libraries(fastvideo_kernel_ops PRIVATE ${TORCH_LIBRARIES})
+    # Also link against libtorch_python to satisfy Python-binding symbols
+    # (e.g., torch::PyWarningHandler) required by torch/extension.h.
+    # The probe prints the first libtorch_python* file found in torch's lib
+    # directory, or an empty string when there is none.
+    execute_process(
+        COMMAND "${Python_EXECUTABLE}" -c "import torch; from pathlib import Path; p=Path(torch.__file__).parent/'lib'; m=sorted(p.glob('libtorch_python*')); print(str(m[0]) if m else '')"
+        OUTPUT_VARIABLE TORCH_PYTHON_LIBRARY_PATH
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+    )
+    if(TORCH_PYTHON_LIBRARY_PATH)
+        message(STATUS "TORCH_PYTHON_LIBRARY_PATH: ${TORCH_PYTHON_LIBRARY_PATH}")
+        target_link_libraries(fastvideo_kernel_ops PRIVATE "${TORCH_PYTHON_LIBRARY_PATH}")
+    else()
+        message(WARNING "Could not locate libtorch_python; fastvideo_kernel_ops may fail to import.")
+    endif()
+    # Link CUDA runtime + driver explicitly (fixes missing symbols like cuGetErrorString at import time)
+    if(NOT GPU_BACKEND STREQUAL "ROCM")
+        target_link_libraries(fastvideo_kernel_ops PRIVATE CUDA::cudart CUDA::cuda_driver)
+    endif()
+    # We install it to fastvideo_kernel/_C so we can load it to register the ops
+    install(TARGETS fastvideo_kernel_ops LIBRARY DESTINATION fastvideo_kernel/_C)
+endif()
+# Blackwell FP4 (attn_qat_infer) extension modules: fp4attn_cuda and fp4quant_cuda.
+if(ENABLE_ATTN_QAT_INFER)
+    set(ATTN_QAT_INFER_DIR ${CMAKE_SOURCE_DIR}/attn_qat_infer)
+    set(ATTN_QAT_INFER_INCLUDE_DIRS
+        ${ATTN_QAT_INFER_DIR}
+        ${CMAKE_SOURCE_DIR}/include/cutlass/include
+        ${CMAKE_SOURCE_DIR}/include/cutlass/tools/util/include
+        ${TORCH_INCLUDE_DIRS}
+    )
+    set(ATTN_QAT_INFER_CUDA_FLAGS
+        "-O3"
+        "-std=c++17"
+        "-U__CUDA_NO_HALF_OPERATORS__"
+        "-U__CUDA_NO_HALF_CONVERSIONS__"
+        "-U__CUDA_NO_BFLOAT16_OPERATORS__"
+        "-U__CUDA_NO_BFLOAT16_CONVERSIONS__"
+        "-U__CUDA_NO_BFLOAT162_OPERATORS__"
+        "-U__CUDA_NO_BFLOAT162_CONVERSIONS__"
+        "--expt-relaxed-constexpr"
+        "--expt-extended-lambda"
+        "--use_fast_math"
+        "--ptxas-options=--verbose,--warn-on-local-memory-usage"
+        "-lineinfo"
+        "-DCUTLASS_DEBUG_TRACE_LEVEL=0"
+        "-DNDEBUG"
+        "-DQBLKSIZE=128"
+        "-DKBLKSIZE=128"
+        "-DCTA256"
+        "-DDQINRMEM"
+    )
+    # fastvideo_add_fp4_module(<module_name> <module_source>)
+    # Defines one sm_120a-only Python extension module built from a single CUDA
+    # source, using the shared attn_qat_infer include dirs and flags, and
+    # installs it at the top level of the install tree.
+    # Example: fastvideo_add_fp4_module(fp4attn_cuda attn_qat_infer/blackwell/api.cu)
+    function(fastvideo_add_fp4_module module_name module_source)
+        Python_add_library(${module_name} MODULE WITH_SOABI
+            ${module_source}
+        )
+        target_include_directories(${module_name} PRIVATE ${ATTN_QAT_INFER_INCLUDE_DIRS})
+        target_compile_definitions(${module_name} PRIVATE TORCH_EXTENSION_NAME=${module_name})
+        # Quoted so each generator expression reaches CMake as one argument;
+        # unquoted, the space / list separators would split it apart.
+        target_compile_options(${module_name} PRIVATE
+            "$<$<COMPILE_LANGUAGE:CXX>:-O3;-std=c++17>"
+            "$<$<COMPILE_LANGUAGE:CUDA>:${ATTN_QAT_INFER_CUDA_FLAGS}>"
+        )
+        set_target_properties(${module_name} PROPERTIES
+            CUDA_ARCHITECTURES "120a"
+            CXX_STANDARD 17
+            CUDA_STANDARD 17
+        )
+        target_link_libraries(${module_name} PRIVATE ${TORCH_LIBRARIES} CUDA::cudart CUDA::cuda_driver)
+        # Empty when libtorch_python could not be located earlier in this file.
+        if(TORCH_PYTHON_LIBRARY_PATH)
+            target_link_libraries(${module_name} PRIVATE "${TORCH_PYTHON_LIBRARY_PATH}")
+        endif()
+        install(TARGETS ${module_name} LIBRARY DESTINATION .)
+    endfunction()
+    fastvideo_add_fp4_module(fp4attn_cuda attn_qat_infer/blackwell/api.cu)
+    fastvideo_add_fp4_module(fp4quant_cuda attn_qat_infer/quantization/fp4_quantization_4d.cu)
+endif()

{fastvideo_kernel-0.2.6 → fastvideo_kernel-0.3.0}/MANIFEST.in RENAMED Viewed

@@ -2,5 +2,6 @@ include LICENSE
 include README.md
 include pyproject.toml
 recursive-include python/fastvideo_kernel *.py
+recursive-include attn_qat_infer *.py *.cu *.cuh *.cpp *.h
 recursive-include csrc *.cu *.cuh *.cpp *.h
 recursive-include include/tk *.cu *.cuh *.cpp *.h *.src

{fastvideo_kernel-0.2.6 → fastvideo_kernel-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: fastvideo-kernel
-Version: 0.2.6
+Version: 0.3.0
 Summary: Unified CUDA kernels for FastVideo
 Author-Email: Hao AI Lab <contact@haoailab.com>
 License:                                  Apache License
@@ -195,7 +195,7 @@ Classifier: Environment :: GPU :: NVIDIA CUDA
 Project-URL: Homepage, https://github.com/hao-ai-lab/FastVideo
 Requires-Python: >=3.10
 Requires-Dist: torch>=2.5.0
-Requires-Dist: triton>=2.0.0
+Requires-Dist: triton>=2.0.0; sys_platform == "linux"
 Description-Content-Type: text/markdown
 # FastVideo Kernel
@@ -207,6 +207,13 @@ CUDA kernels for FastVideo video generation.
 ### Standard Installation (Local Development)
 This will automatically detect your GPU architecture. If an NVIDIA Hopper (H100/sm_90a) GPU is detected, ThunderKittens kernels will be enabled. Otherwise, they will be skipped, and the package will use Triton fallbacks at runtime.
+Before installation, set CUDA toolchain paths:
+```bash
+export CUDA_HOME=/usr/local/cuda
+export CUDACXX=$CUDA_HOME/bin/nvcc
+```
 ```bash
 git submodule update --init --recursive
 cd fastvideo-kernel
@@ -221,6 +228,29 @@ cd fastvideo-kernel
 ./build.sh --rocm
 ```
+### Optional: FA4 CuTe block-sparse backend (VSA-256 fastpath)
+The VSA-256 fastpath (tile volume 256, on NVIDIA Blackwell / sm_100) routes to the
+FlashAttention-4 CuTe-DSL block-sparse kernel exposed as `flash_attn.cute`. This is
+an **optional** dependency: it is imported lazily, and `video_sparse_attn`
+transparently falls back to the Triton backend when it is absent (so the package is
+fully usable without it).
+The symbols the fastpath needs (`flash_attn.cute.block_sparsity.BlockSparseTensorsTorch`,
+`flash_attn.cute.interface._flash_attn_fwd`) are provided upstream by
+[Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention). Pin to
+commit `c19cd20e`: the wrapper targets that revision's `_flash_attn_fwd` signature
+(`m_block_size` / `n_block_size`); later upstream revisions reshaped it into a
+`tile_mn` tuple and are not drop-in compatible.
+```bash
+pip install "nvidia-cutlass-dsl>=4.5.0" torchvision
+pip install "git+https://github.com/Dao-AILab/flash-attention.git@c19cd20e#subdirectory=flash_attn/cute"
+```
+The CuTe kernel JIT-compiles on first use. Verified on Blackwell (sm_100) against
+`tests/test_vsa256_forward*.py`.
 ## Usage
 ### Sliding Tile Attention (STA) & Video Sparse Attention (VSA)
@@ -262,6 +292,8 @@ This package also includes kernels from [TurboDiffusion](https://github.com/thu-
   - Any CUDA GPU for Triton-based fallbacks.
 - **Build**:
   - CUDA Toolkit 12.3+
+  - `CUDA_HOME` must be set (for example, `/usr/local/cuda`)
+  - `CUDACXX` must be set (for example, `$CUDA_HOME/bin/nvcc`)
   - C++20 compatible compiler (GCC 10+, Clang 11+)
 ## Acknowledgement

{fastvideo_kernel-0.2.6 → fastvideo_kernel-0.3.0}/README.md RENAMED Viewed

@@ -7,6 +7,13 @@ CUDA kernels for FastVideo video generation.
 ### Standard Installation (Local Development)
 This will automatically detect your GPU architecture. If an NVIDIA Hopper (H100/sm_90a) GPU is detected, ThunderKittens kernels will be enabled. Otherwise, they will be skipped, and the package will use Triton fallbacks at runtime.
+Before installation, set CUDA toolchain paths:
+```bash
+export CUDA_HOME=/usr/local/cuda
+export CUDACXX=$CUDA_HOME/bin/nvcc
+```
 ```bash
 git submodule update --init --recursive
 cd fastvideo-kernel
@@ -21,6 +28,29 @@ cd fastvideo-kernel
 ./build.sh --rocm
 ```
+### Optional: FA4 CuTe block-sparse backend (VSA-256 fastpath)
+The VSA-256 fastpath (tile volume 256, on NVIDIA Blackwell / sm_100) routes to the
+FlashAttention-4 CuTe-DSL block-sparse kernel exposed as `flash_attn.cute`. This is
+an **optional** dependency: it is imported lazily, and `video_sparse_attn`
+transparently falls back to the Triton backend when it is absent (so the package is
+fully usable without it).
+The symbols the fastpath needs (`flash_attn.cute.block_sparsity.BlockSparseTensorsTorch`,
+`flash_attn.cute.interface._flash_attn_fwd`) are provided upstream by
+[Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention). Pin to
+commit `c19cd20e`: the wrapper targets that revision's `_flash_attn_fwd` signature
+(`m_block_size` / `n_block_size`); later upstream revisions reshaped it into a
+`tile_mn` tuple and are not drop-in compatible.
+```bash
+pip install "nvidia-cutlass-dsl>=4.5.0" torchvision
+pip install "git+https://github.com/Dao-AILab/flash-attention.git@c19cd20e#subdirectory=flash_attn/cute"
+```
+The CuTe kernel JIT-compiles on first use. Verified on Blackwell (sm_100) against
+`tests/test_vsa256_forward*.py`.
 ## Usage
 ### Sliding Tile Attention (STA) & Video Sparse Attention (VSA)
@@ -62,6 +92,8 @@ This package also includes kernels from [TurboDiffusion](https://github.com/thu-
   - Any CUDA GPU for Triton-based fallbacks.
 - **Build**:
   - CUDA Toolkit 12.3+
+  - `CUDA_HOME` must be set (for example, `/usr/local/cuda`)
+  - `CUDACXX` must be set (for example, `$CUDA_HOME/bin/nvcc`)
   - C++20 compatible compiler (GCC 10+, Clang 11+)
 ## Acknowledgement

fastvideo_kernel-0.3.0/attn_qat_infer/__init__.py ADDED Viewed

@@ -0,0 +1,16 @@
+"""
+Copyright (c) 2025 by SageAttention team.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from .api import sageattn_blackwell

fastvideo_kernel-0.3.0/attn_qat_infer/api.py ADDED Viewed

@@ -0,0 +1,189 @@
+# Modified from the original SageAttention3 code
+"""
+Copyright (c) 2025 by SageAttention team.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import torch
+import triton
+import triton.language as tl
+import torch.nn.functional as F
+from typing import Tuple
+from torch.nn.functional import scaled_dot_product_attention as sdpa
+import fp4attn_cuda
+import fp4quant_cuda
+# Centralized block size configuration for sageattn_blackwell kernels
+# These should match the values in fastvideo/attention/backends/sageattn/blackwell/block_config.h
+BLOCK_M = 128  # Block size for M dimension (query sequence length)
+BLOCK_N = 128  # Block size for N dimension (key/value sequence length)
+# Triton kernel: for one (batch, head, group) program, load GROUP_SIZE rows of
+# q, compute their mean over the row axis, write the mean-subtracted rows to
+# q_out and the mean vector to qm_out.
+# B, H and L are accepted but not read inside the kernel body.
+@triton.jit
+def group_mean_kernel(
+    q_ptr,
+    q_out_ptr,
+    qm_out_ptr,
+    B, H, L, D: tl.constexpr,
+    stride_qb, stride_qh, stride_ql, stride_qd,
+    stride_qmb, stride_qmh, stride_qml, stride_qmd,
+    GROUP_SIZE: tl.constexpr
+):
+    # Program ids index (batch, head, group) respectively.
+    pid_b = tl.program_id(0)
+    pid_h = tl.program_id(1)
+    pid_group = tl.program_id(2)
+    # Row indices covered by this group.
+    group_start = pid_group * GROUP_SIZE
+    offsets = group_start + tl.arange(0, GROUP_SIZE)
+    # Offsets of a GROUP_SIZE x D tile; the same offsets are reused for q_out,
+    # so q_out is addressed with q's strides.
+    q_offsets = pid_b * stride_qb + pid_h * stride_qh + offsets[:, None] * stride_ql + tl.arange(0, D)[None, :] * stride_qd
+    # Unmasked load: every row index in the group is read as is.
+    q_group = tl.load(q_ptr + q_offsets)
+    # Mean over the GROUP_SIZE rows, one value per column.
+    qm_group = tl.sum(q_group, axis=0) / GROUP_SIZE
+    q_group = q_group - qm_group
+    tl.store(q_out_ptr + q_offsets, q_group)
+    # One mean vector of length D per group.
+    qm_offset = pid_b * stride_qmb + pid_h * stride_qmh + pid_group * stride_qml + tl.arange(0, D) * stride_qmd
+    tl.store(qm_out_ptr + qm_offset, qm_group)
+def triton_group_mean(q: torch.Tensor):
+    B, H, L, D = q.shape
+    GROUP_SIZE = BLOCK_M
+    num_groups = L // GROUP_SIZE
+    q_out = torch.empty_like(q)  # [B, H, L, D]
+    qm = torch.empty(B, H, num_groups, D, device=q.device, dtype=q.dtype)
+    grid = (B, H, num_groups)
+    group_mean_kernel[grid](
+        q, q_out, qm,
+        B, H, L, D,
+        q.stride(0), q.stride(1), q.stride(2), q.stride(3),
+        qm.stride(0), qm.stride(1), qm.stride(2), qm.stride(3),
+        GROUP_SIZE=GROUP_SIZE
+    )
+    return q_out, qm
+def preprocess_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, per_block_mean: bool = True, enable_smoothing_q: bool = False, enable_smoothing_k: bool = False):
+    def pad_to_block_size(x):
+        L = x.size(2)
+        pad_len = (BLOCK_M - L % BLOCK_M) % BLOCK_M
+        if pad_len == 0:
+            return x.contiguous()
+        return F.pad(x, (0, 0, 0, pad_len), value=0).contiguous()
+    if enable_smoothing_k:
+        k -= k.mean(dim=-2, keepdim=True)
+    q, k, v = map(lambda x: pad_to_block_size(x), [q, k, v])
+    if per_block_mean and enable_smoothing_q:
+        q, qm = triton_group_mean(q)
+    elif enable_smoothing_q:
+        qm = q.mean(dim=-2, keepdim=True)
+        q = q - qm
+    if enable_smoothing_q:
+        delta_s = torch.matmul(qm, k.transpose(-2, -1)).to(torch.float32).contiguous()
+    else:  # used to disable q smoothing
+        B, H, L, D = q.shape
+        delta_s = torch.zeros((B, H, L // BLOCK_M, k.shape[2]), device=q.device, dtype=torch.float32)
+    return q, k, v, delta_s
+def scale_and_quant_fp4(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.ndim == 4
+    B, H, N, D = x.shape
+    packed_fp4 = torch.empty((B, H, N, D // 2), device=x.device, dtype=torch.uint8)
+    fp8_scale = torch.empty((B, H, N, D // 16), device=x.device, dtype=torch.float8_e4m3fn)
+    fp4quant_cuda.scaled_fp4_quant(x, packed_fp4, fp8_scale, 1)
+    return packed_fp4, fp8_scale
+def scale_and_quant_fp4_permute(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.ndim == 4
+    B, H, N, D = x.shape
+    packed_fp4 = torch.empty((B, H, N, D // 2), device=x.device, dtype=torch.uint8)
+    fp8_scale = torch.empty((B, H, N, D // 16), device=x.device, dtype=torch.float8_e4m3fn)
+    fp4quant_cuda.scaled_fp4_quant_permute(x, packed_fp4, fp8_scale, 1)
+    return packed_fp4, fp8_scale
+def scale_and_quant_fp4_transpose(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.ndim == 4
+    B, H, N, D = x.shape
+    packed_fp4 = torch.empty((B, H, D, N // 2), device=x.device, dtype=torch.uint8)
+    fp8_scale = torch.empty((B, H, D, N // 16), device=x.device, dtype=torch.float8_e4m3fn)
+    fp4quant_cuda.scaled_fp4_quant_trans(x, packed_fp4, fp8_scale, 1)
+    return packed_fp4, fp8_scale
+def blockscaled_fp4_attn(qlist: Tuple,
+                         klist: Tuple,
+                         vlist: Tuple,
+                         delta_s: torch.Tensor,
+                         KL: int,
+                         is_causal: bool = False,
+                         per_block_mean: bool = True,
+                         is_bf16: bool = True,
+                         single_level_p_quant: bool = False,
+                         sm_scale: float | None = None
+                        ):
+    softmax_scale = sm_scale if sm_scale is not None else (qlist[0].shape[-1] * 2) ** (-0.5)
+    return fp4attn_cuda.fwd(qlist[0], klist[0], vlist[0], qlist[1], klist[1], vlist[1], delta_s, KL, None, softmax_scale, is_causal, per_block_mean, is_bf16, single_level_p_quant)
+def sageattn_blackwell(q, k, v, attn_mask = None, is_causal = False, per_block_mean = True, single_level_p_quant = True, sm_scale: float | None = None, **kwargs):
+    """
+    SageAttention3 Blackwell kernel for FP4 attention.
+    Args:
+        q: Query tensor [B, H, L, D]
+        k: Key tensor [B, H, L, D]
+        v: Value tensor [B, H, L, D]
+        attn_mask: Attention mask (not used)
+        is_causal: Whether to use causal masking
+        per_block_mean: Whether to use per-block mean for Q smoothing
+        single_level_p_quant: If True, use single-level quantization: s_P2, P̂_2 = φ(P̃) directly
+                              (standard per-block FP4 quantization like V, no s_P1).
+                              If False (default), use two-level quantization:
+                              s_P1 = rowmax(P̃)/(448×6), then s_P2, P̂_2 = φ(P̃/s_P1).
+        sm_scale: Softmax scale to pass through to the CUDA kernel. If None,
+                  defaults to the kernel's 1/sqrt(D) scale.
+        **kwargs: Additional arguments (ignored)
+    Returns:
+        Output tensor [B, H, L, D]
+    """
+    if q.size(-1) >= 256:
+        print(f"Unsupported Headdim {q.size(-1)}")
+        return sdpa(q, k, v, is_causal = is_causal)
+    QL = q.size(2)
+    KL = k.size(2)
+    is_bf16 = q.dtype == torch.bfloat16
+    q, k, v, delta_s = preprocess_qkv(q, k, v, per_block_mean)
+    qlist_from_cuda = scale_and_quant_fp4(q)
+    klist_from_cuda = scale_and_quant_fp4_permute(k)
+    vlist_from_cuda = scale_and_quant_fp4_transpose(v)
+    o_fp4 = blockscaled_fp4_attn(
+    qlist_from_cuda,
+    klist_from_cuda,
+    vlist_from_cuda,
+    delta_s,
+    KL,
+    is_causal,
+    per_block_mean,
+    is_bf16,
+    single_level_p_quant,
+    sm_scale
+    )[0][:, :, :QL, :].contiguous()
+    return o_fp4

fastvideo_kernel-0.3.0/attn_qat_infer/blackwell/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "3.0.0.b1"

fastvideo-kernel 0.2.6__tar.gz → 0.3.0__tar.gz

fastvideo-kernel 0.2.6tar.gz → 0.3.0tar.gz