cache-dit 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cache-dit might be problematic. Click here for more details.
- {cache_dit-0.2.0 → cache_dit-0.2.2}/PKG-INFO +88 -36
- {cache_dit-0.2.0 → cache_dit-0.2.2}/README.md +87 -35
- cache_dit-0.2.2/assets/TEXTURE_DBCACHE_F1B0_R0.08.png +0 -0
- cache_dit-0.2.2/assets/TEXTURE_DBCACHE_F8B12_R0.12.png +0 -0
- cache_dit-0.2.2/assets/TEXTURE_DBCACHE_F8B16_R0.2.png +0 -0
- cache_dit-0.2.2/assets/TEXTURE_DBCACHE_F8B20_R0.2.png +0 -0
- cache_dit-0.2.2/assets/TEXTURE_DBCACHE_F8B8_R0.12.png +0 -0
- cache_dit-0.2.2/assets/TEXTURE_NONE_R0.08.png +0 -0
- cache_dit-0.2.2/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.12_S14_T12.85s.png +0 -0
- cache_dit-0.2.2/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.15_S17_T10.27s.png +0 -0
- cache_dit-0.2.2/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.12_S14_T12.86s.png +0 -0
- cache_dit-0.2.2/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T10.28s.png +0 -0
- cache_dit-0.2.2/assets/U0_C1_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T8.48s.png +0 -0
- cache_dit-0.2.2/assets/cache-dit-v1.png +0 -0
- cache_dit-0.2.2/assets/dbcache-fnbn-v1.png +0 -0
- cache_dit-0.2.2/assets/dbcache-v1.png +0 -0
- cache_dit-0.2.2/assets/dbprune-v1.png +0 -0
- cache_dit-0.2.2/assets/fbcache-v1.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/bench/bench.py +47 -8
- {cache_dit-0.2.0 → cache_dit-0.2.2}/examples/run_hunyuan_video.py +1 -1
- {cache_dit-0.2.0 → cache_dit-0.2.2}/examples/run_wan.py +11 -2
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/_version.py +2 -2
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dual_block_cache/cache_context.py +322 -69
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/__init__.py +0 -2
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/cogvideox.py +0 -2
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/flux.py +0 -2
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/hunyuan_video.py +0 -1
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/mochi.py +0 -1
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/wan.py +0 -2
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/__init__.py +0 -2
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/cogvideox.py +0 -2
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/flux.py +0 -2
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/hunyuan_video.py +0 -1
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/mochi.py +0 -2
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/wan.py +1 -3
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/first_block_cache/cache_context.py +3 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/taylorseer.py +30 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit.egg-info/PKG-INFO +88 -36
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit.egg-info/SOURCES.txt +23 -1
- cache_dit-0.2.2/tests/.gitignore +167 -0
- cache_dit-0.2.2/tests/README.md +9 -0
- cache_dit-0.2.2/tests/taylorseer_approximation_order_2.png +0 -0
- cache_dit-0.2.2/tests/taylorseer_approximation_order_4.png +0 -0
- cache_dit-0.2.2/tests/taylorseer_approximation_test.png +0 -0
- cache_dit-0.2.2/tests/test_taylorseer.py +81 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/.github/workflows/issue.yml +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/.gitignore +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/.pre-commit-config.yaml +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/CONTRIBUTE.md +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/LICENSE +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/MANIFEST.in +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBCACHE_F12B12S4_R0.2_S16.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBCACHE_F12B16S4_R0.08_S6.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBCACHE_F16B16S2_R0.2_S14.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBCACHE_F16B16S4_R0.2_S13.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBCACHE_F1B0S1_R0.08_S11.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBCACHE_F1B0S1_R0.2_S19.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBCACHE_F8B0S2_R0.12_S12.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBCACHE_F8B16S1_R0.2_S18.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBCACHE_F8B8S1_R0.08_S9.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBCACHE_F8B8S1_R0.12_S12.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBCACHE_F8B8S1_R0.15_S15.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBCache.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.07_P52.3_T12.53s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.08_P52.4_T12.52s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.09_P59.2_T10.81s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.12_P59.5_T10.76s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.12_P63.0_T9.90s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.1_P62.8_T9.95s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.3_P63.1_T9.79s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/NONE_R0.08_S0.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U0_C1_DBPRUNE_F1B0_R0.03_P24.0_T16.25s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U0_C1_DBPRUNE_F1B0_R0.045_P38.2_T13.41s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U0_C1_DBPRUNE_F1B0_R0.04_P34.6_T14.12s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U0_C1_DBPRUNE_F1B0_R0.055_P45.1_T12.00s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U0_C1_DBPRUNE_F1B0_R0.05_P41.6_T12.70s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U0_C1_DBPRUNE_F1B0_R0.2_P59.5_T8.86s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U0_C1_DBPRUNE_F8B8_R0.08_P23.1_T16.14s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U0_C1_NONE_R0.08_S0_T20.43s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.03_P27.3_T6.62s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.03_P27.3_T6.63s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.045_P38.2_T5.81s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.045_P38.2_T5.82s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.04_P34.6_T6.06s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.04_P34.6_T6.07s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.04_P34.6_T6.08s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.055_P45.1_T5.27s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.055_P45.1_T5.28s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.2_P59.5_T3.95s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.2_P59.5_T3.96s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U4_C1_NONE_R0.08_S0_T7.78s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/U4_C1_NONE_R0.08_S0_T7.79s.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/assets/cache-dit.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/bench/.gitignore +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/docs/.gitignore +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/examples/.gitignore +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/examples/README.md +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/examples/data/cup.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/examples/data/cup_mask.png +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/examples/requirements.txt +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/examples/run_cogvideox.py +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/examples/run_flux.py +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/examples/run_flux_fill.py +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/examples/run_mochi.py +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/pyproject.toml +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/pytest.ini +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/requirements.txt +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/setup.cfg +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/setup.py +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/__init__.py +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/__init__.py +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dual_block_cache/__init__.py +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dynamic_block_prune/__init__.py +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dynamic_block_prune/prune_context.py +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/first_block_cache/__init__.py +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/__init__.py +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/cogvideox.py +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/flux.py +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/hunyuan_video.py +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/mochi.py +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/wan.py +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/cache_factory/utils.py +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/logger.py +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit/primitives.py +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit.egg-info/dependency_links.txt +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit.egg-info/requires.txt +0 -0
- {cache_dit-0.2.0 → cache_dit-0.2.2}/src/cache_dit.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cache_dit
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: 🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration Toolbox for Diffusion Transformers
|
|
5
5
|
Author: DefTruth, vipshop.com, etc.
|
|
6
6
|
Maintainer: DefTruth, vipshop.com, etc
|
|
@@ -37,40 +37,31 @@ Dynamic: requires-python
|
|
|
37
37
|
<p align="center">
|
|
38
38
|
<h2>🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration <br>Toolbox for Diffusion Transformers</h2>
|
|
39
39
|
</p>
|
|
40
|
-
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/cache-dit.png >
|
|
40
|
+
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/cache-dit-v1.png >
|
|
41
41
|
<div align='center'>
|
|
42
42
|
<img src=https://img.shields.io/badge/Language-Python-brightgreen.svg >
|
|
43
43
|
<img src=https://img.shields.io/badge/PRs-welcome-9cf.svg >
|
|
44
44
|
<img src=https://img.shields.io/badge/PyPI-pass-brightgreen.svg >
|
|
45
45
|
<img src=https://static.pepy.tech/badge/cache-dit >
|
|
46
46
|
<img src=https://img.shields.io/badge/Python-3.10|3.11|3.12-9cf.svg >
|
|
47
|
-
<img src=https://img.shields.io/badge/Release-v0.2.
|
|
47
|
+
<img src=https://img.shields.io/badge/Release-v0.2.2-brightgreen.svg >
|
|
48
48
|
</div>
|
|
49
49
|
<p align="center">
|
|
50
|
-
DeepCache is for UNet not DiT. Most DiT cache speedups are complex and not training-free. CacheDiT <br>
|
|
51
|
-
</p>
|
|
52
|
-
<p align="center">
|
|
53
|
-
<h4> 🔥Supported Models🔥</h4>
|
|
54
|
-
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀FLUX.1</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
55
|
-
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀Mochi</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
56
|
-
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀CogVideoX</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
57
|
-
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀CogVideoX1.5</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
58
|
-
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀Wan2.1</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
59
|
-
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀HunyuanVideo</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
50
|
+
DeepCache is for UNet not DiT. Most DiT cache speedups are complex and not training-free. CacheDiT offers <br>a set of training-free cache accelerators for DiT: <b>🔥<a href="#dbcache">DBCache</a>, <a href="#dbprune">DBPrune</a>, <a href="#taylorseer">TaylorSeer</a>, <a href="#fbcache">FBCache</a></b>, etc🔥
|
|
60
51
|
</p>
|
|
61
52
|
</div>
|
|
62
53
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
<
|
|
66
|
-
|
|
67
|
-
|
|
54
|
+
<div align="center">
|
|
55
|
+
<p align="center">
|
|
56
|
+
<b>♥️ Please consider leaving a ⭐️ Star to support us ~ ♥️</b>
|
|
57
|
+
</p>
|
|
58
|
+
</div>
|
|
68
59
|
|
|
69
60
|
## 🤗 Introduction
|
|
70
61
|
|
|
71
62
|
<div align="center">
|
|
72
63
|
<p align="center">
|
|
73
|
-
<h3>🔥
|
|
64
|
+
<h3>🔥DBCache: Dual Block Caching for Diffusion Transformers</h3>
|
|
74
65
|
</p>
|
|
75
66
|
</div>
|
|
76
67
|
|
|
@@ -86,9 +77,9 @@ The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi
|
|
|
86
77
|
|:---:|:---:|:---:|:---:|:---:|:---:|
|
|
87
78
|
|24.85s|15.59s|8.58s|15.41s|15.11s|17.74s|
|
|
88
79
|
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.08_S11.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.2_S19.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F8B8S1_R0.15_S15.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F12B12S4_R0.2_S16.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F16B16S4_R0.2_S13.png width=105px>|
|
|
89
|
-
|**Baseline(L20x1)**|**F1B0 (0.08)**|**F8B8 (0.12)**|**F8B12 (0.
|
|
80
|
+
|**Baseline(L20x1)**|**F1B0 (0.08)**|**F8B8 (0.12)**|**F8B12 (0.12)**|**F8B16 (0.20)**|**F8B20 (0.20)**|
|
|
90
81
|
|27.85s|6.04s|5.88s|5.77s|6.01s|6.20s|
|
|
91
|
-
|<img src=https://github.com/
|
|
82
|
+
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_NONE_R0.08.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F1B0_R0.08.png width=105px> |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B8_R0.12.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B12_R0.12.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B16_R0.2.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B20_R0.2.png width=105px>|
|
|
92
83
|
|
|
93
84
|
<div align="center">
|
|
94
85
|
<p align="center">
|
|
@@ -100,7 +91,7 @@ These case studies demonstrate that even with relatively high thresholds (such a
|
|
|
100
91
|
|
|
101
92
|
<div align="center">
|
|
102
93
|
<p align="center">
|
|
103
|
-
<h3>🔥
|
|
94
|
+
<h3>🔥DBPrune: Dynamic Block Prune with Residual Caching</h3>
|
|
104
95
|
</p>
|
|
105
96
|
</div>
|
|
106
97
|
|
|
@@ -119,11 +110,11 @@ These case studies demonstrate that even with relatively high thresholds (such a
|
|
|
119
110
|
|
|
120
111
|
<div align="center">
|
|
121
112
|
<p align="center">
|
|
122
|
-
<h3>🔥
|
|
113
|
+
<h3>🔥Context Parallelism and Torch Compile</h3>
|
|
123
114
|
</p>
|
|
124
115
|
</div>
|
|
125
116
|
|
|
126
|
-
Moreover, **CacheDiT** are **plug-and-play** solutions that works hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can easily tap into its **Context Parallelism** features for distributed inference.
|
|
117
|
+
Moreover, **CacheDiT** is a **plug-and-play** solution that works hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can easily tap into its **Context Parallelism** features for distributed inference. CacheDiT is also designed to be compatible with **torch.compile**, so you can combine the two to achieve even better performance.
|
|
127
118
|
|
|
128
119
|
<div align="center">
|
|
129
120
|
<p align="center">
|
|
@@ -137,12 +128,6 @@ Moreover, **CacheDiT** are **plug-and-play** solutions that works hand-in-hand w
|
|
|
137
128
|
|+L20x4:7.75s|6.62s|6.03s|5.81s|5.24s|3.93s|
|
|
138
129
|
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_NONE_R0.08_S0_T20.43s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.03_P24.0_T16.25s.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.04_P34.6_T14.12s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.045_P38.2_T13.41s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.055_P45.1_T12.00s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.2_P59.5_T8.86s.png width=105px>|
|
|
139
130
|
|
|
140
|
-
<div align="center">
|
|
141
|
-
<p align="center">
|
|
142
|
-
<b>♥️ Please consider to leave a ⭐️ Star to support us ~ ♥️</b>
|
|
143
|
-
</p>
|
|
144
|
-
</div>
|
|
145
|
-
|
|
146
131
|
## ©️Citations
|
|
147
132
|
|
|
148
133
|
```BibTeX
|
|
@@ -155,12 +140,20 @@ Moreover, **CacheDiT** are **plug-and-play** solutions that works hand-in-hand w
|
|
|
155
140
|
}
|
|
156
141
|
```
|
|
157
142
|
|
|
143
|
+
## 👋Reference
|
|
144
|
+
|
|
145
|
+
<div id="reference"></div>
|
|
146
|
+
|
|
147
|
+
The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi/ParaAttention/tree/main/src/para_attn/first_block_cache). Special thanks to their excellent work!
|
|
148
|
+
|
|
158
149
|
## 📖Contents
|
|
159
150
|
|
|
160
151
|
<div id="contents"></div>
|
|
161
152
|
|
|
162
153
|
- [⚙️Installation](#️installation)
|
|
154
|
+
- [🔥Supported Models](#supported)
|
|
163
155
|
- [⚡️Dual Block Cache](#dbcache)
|
|
156
|
+
- [🔥Hybrid TaylorSeer](#taylorseer)
|
|
164
157
|
- [🎉First Block Cache](#fbcache)
|
|
165
158
|
- [⚡️Dynamic Block Prune](#dbprune)
|
|
166
159
|
- [🎉Context Parallelism](#context-parallelism)
|
|
@@ -183,16 +176,31 @@ Or you can install the latest develop version from GitHub:
|
|
|
183
176
|
pip3 install git+https://github.com/vipshop/cache-dit.git
|
|
184
177
|
```
|
|
185
178
|
|
|
179
|
+
## 🔥Supported Models
|
|
180
|
+
|
|
181
|
+
<div id="supported"></div>
|
|
182
|
+
|
|
183
|
+
- [🚀FLUX.1](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
184
|
+
- [🚀Mochi](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
185
|
+
- [🚀CogVideoX](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
186
|
+
- [🚀CogVideoX1.5](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
187
|
+
- [🚀Wan2.1](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
188
|
+
- [🚀HunyuanVideo](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
189
|
+
|
|
190
|
+
|
|
186
191
|
## ⚡️DBCache: Dual Block Cache
|
|
187
192
|
|
|
188
193
|
<div id="dbcache"></div>
|
|
189
194
|
|
|
190
|
-

|
|
191
196
|
|
|
192
197
|
**DBCache** provides configurable parameters for custom optimization, enabling a balanced trade-off between performance and precision:
|
|
193
198
|
|
|
194
199
|
- **Fn**: Specifies that DBCache uses the **first n** Transformer blocks to fit the information at time step t, enabling the calculation of a more stable L1 diff and delivering more accurate information to subsequent blocks.
|
|
195
200
|
- **Bn**: Further fuses approximate information in the **last n** Transformer blocks to enhance prediction accuracy. These blocks act as an auto-scaler for approximate hidden states that use residual cache.
|
|
201
|
+
|
|
202
|
+

|
|
203
|
+
|
|
196
204
|
- **warmup_steps**: (default: 0) DBCache does not apply the caching strategy when the number of running steps is less than or equal to this value, ensuring the model sufficiently learns basic features during warmup.
|
|
197
205
|
- **max_cached_steps**: (default: -1) DBCache disables the caching strategy when the previous cached steps exceed this value to prevent precision degradation.
|
|
198
206
|
- **residual_diff_threshold**: The value of residual diff threshold, a higher value leads to faster performance at the cost of lower precision.
|
|
@@ -248,11 +256,50 @@ cache_options = {
|
|
|
248
256
|
|24.85s|15.59s|8.58s|15.41s|15.11s|17.74s|
|
|
249
257
|
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.08_S11.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.2_S19.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F8B8S1_R0.15_S15.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F12B12S4_R0.2_S16.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F16B16S4_R0.2_S13.png width=105px>|
|
|
250
258
|
|
|
259
|
+
## 🔥Hybrid TaylorSeer
|
|
260
|
+
|
|
261
|
+
<div id="taylorseer"></div>
|
|
262
|
+
|
|
263
|
+
We have supported the [TaylorSeers: From Reusing to Forecasting: Accelerating Diffusion Models with TaylorSeers](https://arxiv.org/pdf/2503.06923) algorithm to further improve the precision of DBCache in cases where the cached steps are large, namely, **Hybrid TaylorSeer + DBCache**. At timesteps with significant intervals, the feature similarity in diffusion models decreases substantially, significantly harming the generation quality.
|
|
264
|
+
|
|
265
|
+
$$
|
|
266
|
+
\mathcal{F}\_{\text {pred }, m}\left(x_{t-k}^l\right)=\mathcal{F}\left(x_t^l\right)+\sum_{i=1}^m \frac{\Delta^i \mathcal{F}\left(x_t^l\right)}{i!\cdot N^i}(-k)^i
|
|
267
|
+
$$
|
|
268
|
+
|
|
269
|
+
**TaylorSeer** employs a differential method to approximate the higher-order derivatives of features and predict features in future timesteps with Taylor series expansion. The TaylorSeer implemented in CacheDiT supports both hidden states and residual cache types. That is, $\mathcal{F}\_{\text {pred }, m}\left(x_{t-k}^l\right)$ can be either a residual cache or a hidden-state cache.
|
|
270
|
+
|
|
271
|
+
```python
|
|
272
|
+
cache_options = {
|
|
273
|
+
# TaylorSeer options
|
|
274
|
+
"enable_taylorseer": True,
|
|
275
|
+
"enable_encoder_taylorseer": True,
|
|
276
|
+
# TaylorSeer cache type can be hidden_states or residual.
|
|
277
|
+
"taylorseer_cache_type": "residual",
|
|
278
|
+
# Higher values of n_derivatives will lead to longer
|
|
279
|
+
# computation time but may improve precision significantly.
|
|
280
|
+
"taylorseer_kwargs": {
|
|
281
|
+
"n_derivatives": 2, # default is 2.
|
|
282
|
+
},
|
|
283
|
+
"warmup_steps": 3, # n_derivatives + 1
|
|
284
|
+
"residual_diff_threshold": 0.12,
|
|
285
|
+
}
|
|
286
|
+
```
|
|
287
|
+
<div align="center">
|
|
288
|
+
<p align="center">
|
|
289
|
+
<b>DBCache F1B0 + TaylorSeer</b>, L20x1, Steps: 28, <br>"A cat holding a sign that says hello world with complex background"
|
|
290
|
+
</p>
|
|
291
|
+
</div>
|
|
292
|
+
|
|
293
|
+
|Baseline(L20x1)|F1B0 (0.12)|+TaylorSeer|F1B0 (0.15)|+TaylorSeer|+compile|
|
|
294
|
+
|:---:|:---:|:---:|:---:|:---:|:---:|
|
|
295
|
+
|24.85s|12.85s|12.86s|10.27s|10.28s|8.48s|
|
|
296
|
+
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.12_S14_T12.85s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.12_S14_T12.86s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.15_S17_T10.27s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T10.28s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T8.48s.png width=105px>|
|
|
297
|
+
|
|
251
298
|
## 🎉FBCache: First Block Cache
|
|
252
299
|
|
|
253
300
|
<div id="fbcache"></div>
|
|
254
301
|
|
|
255
|
-

|
|
256
303
|
|
|
257
304
|
**DBCache** is a more general cache algorithm than **FBCache**. When Fn=1 and Bn=0, DBCache behaves identically to FBCache. Therefore, you can either use the original FBCache implementation directly or configure **DBCache** with **F1B0** settings to achieve the same functionality.
|
|
258
305
|
|
|
@@ -286,7 +333,7 @@ apply_cache_on_pipe(pipe, **cache_options)
|
|
|
286
333
|
|
|
287
334
|
<div id="dbprune"></div>
|
|
288
335
|
|
|
289
|
-

|
|
290
337
|
|
|
291
338
|
We have further implemented a new **Dynamic Block Prune** algorithm based on **Residual Caching** for Diffusion Transformers, which is referred to as **DBPrune**. DBPrune caches each block's hidden states and residuals, then dynamically prunes blocks during inference by computing the L1 distance between previous hidden states. When a block is pruned, its output is approximated using the cached residuals. DBPrune is currently in the experimental phase, and we kindly invite you to stay tuned for upcoming updates.
|
|
292
339
|
|
|
@@ -340,6 +387,9 @@ cache_options = {
|
|
|
340
387
|
apply_cache_on_pipe(pipe, **cache_options)
|
|
341
388
|
```
|
|
342
389
|
|
|
390
|
+
> [!Important]
|
|
391
|
+
> Please note that for GPUs with lower VRAM, DBPrune may not be suitable for use on video DiTs, as it caches the hidden states and residuals of each block, leading to higher GPU memory requirements. In such cases, please use DBCache, which only caches the hidden states and residuals of 2 blocks.
|
|
392
|
+
|
|
343
393
|
<div align="center">
|
|
344
394
|
<p align="center">
|
|
345
395
|
DBPrune, <b> L20x1 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
|
|
@@ -370,7 +420,7 @@ from para_attn.context_parallel import init_context_parallel_mesh
|
|
|
370
420
|
from para_attn.context_parallel.diffusers_adapters import parallelize_pipe
|
|
371
421
|
from cache_dit.cache_factory import apply_cache_on_pipe, CacheType
|
|
372
422
|
|
|
373
|
-
|
|
423
|
+
# Init distributed process group
|
|
374
424
|
dist.init_process_group()
|
|
375
425
|
torch.cuda.set_device(dist.get_rank())
|
|
376
426
|
|
|
@@ -417,14 +467,16 @@ torch._dynamo.config.recompile_limit = 96 # default is 8
|
|
|
417
467
|
torch._dynamo.config.accumulated_recompile_limit = 2048 # default is 256
|
|
418
468
|
```
|
|
419
469
|
|
|
470
|
+
Please check [bench.py](./bench/bench.py) for more details.
|
|
471
|
+
|
|
420
472
|
## 👋Contribute
|
|
421
473
|
<div id="contribute"></div>
|
|
422
474
|
|
|
423
|
-
How to contribute? Star ⭐️ this repo to support us or check [CONTRIBUTE.md](
|
|
475
|
+
How to contribute? Star ⭐️ this repo to support us or check [CONTRIBUTE.md](https://github.com/vipshop/cache-dit/raw/main/CONTRIBUTE.md).
|
|
424
476
|
|
|
425
477
|
## ©️License
|
|
426
478
|
|
|
427
479
|
<div id="license"></div>
|
|
428
480
|
|
|
429
481
|
|
|
430
|
-
We have followed the original License from [ParaAttention](https://github.com/chengzeyi/ParaAttention), please check [LICENSE](
|
|
482
|
+
We have followed the original License from [ParaAttention](https://github.com/chengzeyi/ParaAttention), please check [LICENSE](https://github.com/vipshop/cache-dit/raw/main/LICENSE) for more details.
|
|
@@ -2,40 +2,31 @@
|
|
|
2
2
|
<p align="center">
|
|
3
3
|
<h2>🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration <br>Toolbox for Diffusion Transformers</h2>
|
|
4
4
|
</p>
|
|
5
|
-
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/cache-dit.png >
|
|
5
|
+
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/cache-dit-v1.png >
|
|
6
6
|
<div align='center'>
|
|
7
7
|
<img src=https://img.shields.io/badge/Language-Python-brightgreen.svg >
|
|
8
8
|
<img src=https://img.shields.io/badge/PRs-welcome-9cf.svg >
|
|
9
9
|
<img src=https://img.shields.io/badge/PyPI-pass-brightgreen.svg >
|
|
10
10
|
<img src=https://static.pepy.tech/badge/cache-dit >
|
|
11
11
|
<img src=https://img.shields.io/badge/Python-3.10|3.11|3.12-9cf.svg >
|
|
12
|
-
<img src=https://img.shields.io/badge/Release-v0.2.
|
|
12
|
+
<img src=https://img.shields.io/badge/Release-v0.2.2-brightgreen.svg >
|
|
13
13
|
</div>
|
|
14
14
|
<p align="center">
|
|
15
|
-
DeepCache is for UNet not DiT. Most DiT cache speedups are complex and not training-free. CacheDiT <br>
|
|
16
|
-
</p>
|
|
17
|
-
<p align="center">
|
|
18
|
-
<h4> 🔥Supported Models🔥</h4>
|
|
19
|
-
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀FLUX.1</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
20
|
-
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀Mochi</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
21
|
-
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀CogVideoX</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
22
|
-
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀CogVideoX1.5</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
23
|
-
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀Wan2.1</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
24
|
-
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀HunyuanVideo</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
15
|
+
DeepCache is for UNet not DiT. Most DiT cache speedups are complex and not training-free. CacheDiT offers <br>a set of training-free cache accelerators for DiT: <b>🔥<a href="#dbcache">DBCache</a>, <a href="#dbprune">DBPrune</a>, <a href="#taylorseer">TaylorSeer</a>, <a href="#fbcache">FBCache</a></b>, etc🔥
|
|
25
16
|
</p>
|
|
26
17
|
</div>
|
|
27
18
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
<
|
|
31
|
-
|
|
32
|
-
|
|
19
|
+
<div align="center">
|
|
20
|
+
<p align="center">
|
|
21
|
+
<b>♥️ Please consider leaving a ⭐️ Star to support us ~ ♥️</b>
|
|
22
|
+
</p>
|
|
23
|
+
</div>
|
|
33
24
|
|
|
34
25
|
## 🤗 Introduction
|
|
35
26
|
|
|
36
27
|
<div align="center">
|
|
37
28
|
<p align="center">
|
|
38
|
-
<h3>🔥
|
|
29
|
+
<h3>🔥DBCache: Dual Block Caching for Diffusion Transformers</h3>
|
|
39
30
|
</p>
|
|
40
31
|
</div>
|
|
41
32
|
|
|
@@ -51,9 +42,9 @@ The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi
|
|
|
51
42
|
|:---:|:---:|:---:|:---:|:---:|:---:|
|
|
52
43
|
|24.85s|15.59s|8.58s|15.41s|15.11s|17.74s|
|
|
53
44
|
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.08_S11.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.2_S19.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F8B8S1_R0.15_S15.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F12B12S4_R0.2_S16.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F16B16S4_R0.2_S13.png width=105px>|
|
|
54
|
-
|**Baseline(L20x1)**|**F1B0 (0.08)**|**F8B8 (0.12)**|**F8B12 (0.
|
|
45
|
+
|**Baseline(L20x1)**|**F1B0 (0.08)**|**F8B8 (0.12)**|**F8B12 (0.12)**|**F8B16 (0.20)**|**F8B20 (0.20)**|
|
|
55
46
|
|27.85s|6.04s|5.88s|5.77s|6.01s|6.20s|
|
|
56
|
-
|<img src=https://github.com/
|
|
47
|
+
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_NONE_R0.08.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F1B0_R0.08.png width=105px> |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B8_R0.12.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B12_R0.12.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B16_R0.2.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B20_R0.2.png width=105px>|
|
|
57
48
|
|
|
58
49
|
<div align="center">
|
|
59
50
|
<p align="center">
|
|
@@ -65,7 +56,7 @@ These case studies demonstrate that even with relatively high thresholds (such a
|
|
|
65
56
|
|
|
66
57
|
<div align="center">
|
|
67
58
|
<p align="center">
|
|
68
|
-
<h3>🔥
|
|
59
|
+
<h3>🔥DBPrune: Dynamic Block Prune with Residual Caching</h3>
|
|
69
60
|
</p>
|
|
70
61
|
</div>
|
|
71
62
|
|
|
@@ -84,11 +75,11 @@ These case studies demonstrate that even with relatively high thresholds (such a
|
|
|
84
75
|
|
|
85
76
|
<div align="center">
|
|
86
77
|
<p align="center">
|
|
87
|
-
<h3>🔥
|
|
78
|
+
<h3>🔥Context Parallelism and Torch Compile</h3>
|
|
88
79
|
</p>
|
|
89
80
|
</div>
|
|
90
81
|
|
|
91
|
-
Moreover, **CacheDiT** are **plug-and-play** solutions that works hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can easily tap into its **Context Parallelism** features for distributed inference.
|
|
82
|
+
Moreover, **CacheDiT** is a **plug-and-play** solution that works hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can easily tap into its **Context Parallelism** features for distributed inference. CacheDiT is also designed to be compatible with **torch.compile**, so you can combine the two to achieve even better performance.
|
|
92
83
|
|
|
93
84
|
<div align="center">
|
|
94
85
|
<p align="center">
|
|
@@ -102,12 +93,6 @@ Moreover, **CacheDiT** are **plug-and-play** solutions that works hand-in-hand w
|
|
|
102
93
|
|+L20x4:7.75s|6.62s|6.03s|5.81s|5.24s|3.93s|
|
|
103
94
|
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_NONE_R0.08_S0_T20.43s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.03_P24.0_T16.25s.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.04_P34.6_T14.12s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.045_P38.2_T13.41s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.055_P45.1_T12.00s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.2_P59.5_T8.86s.png width=105px>|
|
|
104
95
|
|
|
105
|
-
<div align="center">
|
|
106
|
-
<p align="center">
|
|
107
|
-
<b>♥️ Please consider to leave a ⭐️ Star to support us ~ ♥️</b>
|
|
108
|
-
</p>
|
|
109
|
-
</div>
|
|
110
|
-
|
|
111
96
|
## ©️Citations
|
|
112
97
|
|
|
113
98
|
```BibTeX
|
|
@@ -120,12 +105,20 @@ Moreover, **CacheDiT** are **plug-and-play** solutions that works hand-in-hand w
|
|
|
120
105
|
}
|
|
121
106
|
```
|
|
122
107
|
|
|
108
|
+
## 👋Reference
|
|
109
|
+
|
|
110
|
+
<div id="reference"></div>
|
|
111
|
+
|
|
112
|
+
The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi/ParaAttention/tree/main/src/para_attn/first_block_cache). Special thanks to their excellent work!
|
|
113
|
+
|
|
123
114
|
## 📖Contents
|
|
124
115
|
|
|
125
116
|
<div id="contents"></div>
|
|
126
117
|
|
|
127
118
|
- [⚙️Installation](#️installation)
|
|
119
|
+
- [🔥Supported Models](#supported)
|
|
128
120
|
- [⚡️Dual Block Cache](#dbcache)
|
|
121
|
+
- [🔥Hybrid TaylorSeer](#taylorseer)
|
|
129
122
|
- [🎉First Block Cache](#fbcache)
|
|
130
123
|
- [⚡️Dynamic Block Prune](#dbprune)
|
|
131
124
|
- [🎉Context Parallelism](#context-parallelism)
|
|
@@ -148,16 +141,31 @@ Or you can install the latest develop version from GitHub:
|
|
|
148
141
|
pip3 install git+https://github.com/vipshop/cache-dit.git
|
|
149
142
|
```
|
|
150
143
|
|
|
144
|
+
## 🔥Supported Models
|
|
145
|
+
|
|
146
|
+
<div id="supported"></div>
|
|
147
|
+
|
|
148
|
+
- [🚀FLUX.1](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
149
|
+
- [🚀Mochi](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
150
|
+
- [🚀CogVideoX](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
151
|
+
- [🚀CogVideoX1.5](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
152
|
+
- [🚀Wan2.1](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
153
|
+
- [🚀HunyuanVideo](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
154
|
+
|
|
155
|
+
|
|
151
156
|
## ⚡️DBCache: Dual Block Cache
|
|
152
157
|
|
|
153
158
|
<div id="dbcache"></div>
|
|
154
159
|
|
|
155
|
-

|
|
156
161
|
|
|
157
162
|
**DBCache** provides configurable parameters for custom optimization, enabling a balanced trade-off between performance and precision:
|
|
158
163
|
|
|
159
164
|
- **Fn**: Specifies that DBCache uses the **first n** Transformer blocks to fit the information at time step t, enabling the calculation of a more stable L1 diff and delivering more accurate information to subsequent blocks.
|
|
160
165
|
- **Bn**: Further fuses approximate information in the **last n** Transformer blocks to enhance prediction accuracy. These blocks act as an auto-scaler for approximate hidden states that use residual cache.
|
|
166
|
+
|
|
167
|
+

|
|
168
|
+
|
|
161
169
|
- **warmup_steps**: (default: 0) DBCache does not apply the caching strategy when the number of running steps is less than or equal to this value, ensuring the model sufficiently learns basic features during warmup.
|
|
162
170
|
- **max_cached_steps**: (default: -1) DBCache disables the caching strategy when the previous cached steps exceed this value to prevent precision degradation.
|
|
163
171
|
- **residual_diff_threshold**: The value of residual diff threshold, a higher value leads to faster performance at the cost of lower precision.
|
|
@@ -213,11 +221,50 @@ cache_options = {
|
|
|
213
221
|
|24.85s|15.59s|8.58s|15.41s|15.11s|17.74s|
|
|
214
222
|
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.08_S11.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.2_S19.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F8B8S1_R0.15_S15.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F12B12S4_R0.2_S16.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F16B16S4_R0.2_S13.png width=105px>|
|
|
215
223
|
|
|
224
|
+
## 🔥Hybrid TaylorSeer
|
|
225
|
+
|
|
226
|
+
<div id="taylorseer"></div>
|
|
227
|
+
|
|
228
|
+
We have supported the [TaylorSeers: From Reusing to Forecasting: Accelerating Diffusion Models with TaylorSeers](https://arxiv.org/pdf/2503.06923) algorithm to further improve the precision of DBCache in cases where the cached steps are large, namely, **Hybrid TaylorSeer + DBCache**. At timesteps with significant intervals, the feature similarity in diffusion models decreases substantially, significantly harming the generation quality.
|
|
229
|
+
|
|
230
|
+
$$
|
|
231
|
+
\mathcal{F}\_{\text {pred }, m}\left(x_{t-k}^l\right)=\mathcal{F}\left(x_t^l\right)+\sum_{i=1}^m \frac{\Delta^i \mathcal{F}\left(x_t^l\right)}{i!\cdot N^i}(-k)^i
|
|
232
|
+
$$
|
|
233
|
+
|
|
234
|
+
**TaylorSeer** employs a differential method to approximate the higher-order derivatives of features and predict features in future timesteps with Taylor series expansion. The TaylorSeer implemented in CacheDiT supports both hidden states and residual cache types. That is, $\mathcal{F}\_{\text {pred }, m}\left(x_{t-k}^l\right)$ can be either a residual cache or a hidden-state cache.
|
|
235
|
+
|
|
236
|
+
```python
|
|
237
|
+
cache_options = {
|
|
238
|
+
# TaylorSeer options
|
|
239
|
+
"enable_taylorseer": True,
|
|
240
|
+
"enable_encoder_taylorseer": True,
|
|
241
|
+
# TaylorSeer cache type can be hidden_states or residual.
|
|
242
|
+
"taylorseer_cache_type": "residual",
|
|
243
|
+
# Higher values of n_derivatives will lead to longer
|
|
244
|
+
# computation time but may improve precision significantly.
|
|
245
|
+
"taylorseer_kwargs": {
|
|
246
|
+
"n_derivatives": 2, # default is 2.
|
|
247
|
+
},
|
|
248
|
+
"warmup_steps": 3, # n_derivatives + 1
|
|
249
|
+
"residual_diff_threshold": 0.12,
|
|
250
|
+
}
|
|
251
|
+
```
|
|
252
|
+
<div align="center">
|
|
253
|
+
<p align="center">
|
|
254
|
+
<b>DBCache F1B0 + TaylorSeer</b>, L20x1, Steps: 28, <br>"A cat holding a sign that says hello world with complex background"
|
|
255
|
+
</p>
|
|
256
|
+
</div>
|
|
257
|
+
|
|
258
|
+
|Baseline(L20x1)|F1B0 (0.12)|+TaylorSeer|F1B0 (0.15)|+TaylorSeer|+compile|
|
|
259
|
+
|:---:|:---:|:---:|:---:|:---:|:---:|
|
|
260
|
+
|24.85s|12.85s|12.86s|10.27s|10.28s|8.48s|
|
|
261
|
+
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.12_S14_T12.85s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.12_S14_T12.86s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.15_S17_T10.27s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T10.28s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T8.48s.png width=105px>|
|
|
262
|
+
|
|
216
263
|
## 🎉FBCache: First Block Cache
|
|
217
264
|
|
|
218
265
|
<div id="fbcache"></div>
|
|
219
266
|
|
|
220
|
-

|
|
221
268
|
|
|
222
269
|
**DBCache** is a more general cache algorithm than **FBCache**. When Fn=1 and Bn=0, DBCache behaves identically to FBCache. Therefore, you can either use the original FBCache implementation directly or configure **DBCache** with **F1B0** settings to achieve the same functionality.
|
|
223
270
|
|
|
@@ -251,7 +298,7 @@ apply_cache_on_pipe(pipe, **cache_options)
|
|
|
251
298
|
|
|
252
299
|
<div id="dbprune"></div>
|
|
253
300
|
|
|
254
|
-

|
|
255
302
|
|
|
256
303
|
We have further implemented a new **Dynamic Block Prune** algorithm based on **Residual Caching** for Diffusion Transformers, which is referred to as **DBPrune**. DBPrune caches each block's hidden states and residuals, then dynamically prunes blocks during inference by computing the L1 distance between previous hidden states. When a block is pruned, its output is approximated using the cached residuals. DBPrune is currently in the experimental phase, and we kindly invite you to stay tuned for upcoming updates.
|
|
257
304
|
|
|
@@ -305,6 +352,9 @@ cache_options = {
|
|
|
305
352
|
apply_cache_on_pipe(pipe, **cache_options)
|
|
306
353
|
```
|
|
307
354
|
|
|
355
|
+
> [!Important]
|
|
356
|
+
> Please note that for GPUs with lower VRAM, DBPrune may not be suitable for use on video DiTs, as it caches the hidden states and residuals of each block, leading to higher GPU memory requirements. In such cases, please use DBCache, which only caches the hidden states and residuals of 2 blocks.
|
|
357
|
+
|
|
308
358
|
<div align="center">
|
|
309
359
|
<p align="center">
|
|
310
360
|
DBPrune, <b> L20x1 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
|
|
@@ -335,7 +385,7 @@ from para_attn.context_parallel import init_context_parallel_mesh
|
|
|
335
385
|
from para_attn.context_parallel.diffusers_adapters import parallelize_pipe
|
|
336
386
|
from cache_dit.cache_factory import apply_cache_on_pipe, CacheType
|
|
337
387
|
|
|
338
|
-
|
|
388
|
+
# Init distributed process group
|
|
339
389
|
dist.init_process_group()
|
|
340
390
|
torch.cuda.set_device(dist.get_rank())
|
|
341
391
|
|
|
@@ -382,14 +432,16 @@ torch._dynamo.config.recompile_limit = 96 # default is 8
|
|
|
382
432
|
torch._dynamo.config.accumulated_recompile_limit = 2048 # default is 256
|
|
383
433
|
```
|
|
384
434
|
|
|
435
|
+
Please check [bench.py](./bench/bench.py) for more details.
|
|
436
|
+
|
|
385
437
|
## 👋Contribute
|
|
386
438
|
<div id="contribute"></div>
|
|
387
439
|
|
|
388
|
-
How to contribute? Star ⭐️ this repo to support us or check [CONTRIBUTE.md](
|
|
440
|
+
How to contribute? Star ⭐️ this repo to support us or check [CONTRIBUTE.md](https://github.com/vipshop/cache-dit/raw/main/CONTRIBUTE.md).
|
|
389
441
|
|
|
390
442
|
## ©️License
|
|
391
443
|
|
|
392
444
|
<div id="license"></div>
|
|
393
445
|
|
|
394
446
|
|
|
395
|
-
We have followed the original License from [ParaAttention](https://github.com/chengzeyi/ParaAttention), please check [LICENSE](
|
|
447
|
+
We have followed the original License from [ParaAttention](https://github.com/chengzeyi/ParaAttention). Please check [LICENSE](https://github.com/vipshop/cache-dit/raw/main/LICENSE) for more details.
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|