cache-dit 0.2.3__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cache-dit might be problematic. Click here for more details.
- {cache_dit-0.2.3 → cache_dit-0.2.5}/PKG-INFO +34 -7
- {cache_dit-0.2.3 → cache_dit-0.2.5}/README.md +33 -6
- {cache_dit-0.2.3 → cache_dit-0.2.5}/examples/.gitignore +1 -0
- cache_dit-0.2.5/examples/README.md +65 -0
- cache_dit-0.2.5/examples/data/flf2v_input_first_frame.png +0 -0
- cache_dit-0.2.5/examples/data/flf2v_input_last_frame.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/examples/requirements.txt +1 -1
- cache_dit-0.2.5/examples/run_cogvideox.py +142 -0
- cache_dit-0.2.5/examples/run_flux.py +96 -0
- cache_dit-0.2.5/examples/run_flux_fill.py +100 -0
- cache_dit-0.2.5/examples/run_hunyuan_video.py +145 -0
- cache_dit-0.2.5/examples/run_mochi.py +101 -0
- cache_dit-0.2.5/examples/run_wan.py +140 -0
- cache_dit-0.2.5/examples/run_wan_flf2v.py +191 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/_version.py +2 -2
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/dual_block_cache/cache_context.py +356 -66
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/wan.py +1 -1
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/first_block_cache/cache_context.py +2 -2
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit.egg-info/PKG-INFO +34 -7
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit.egg-info/SOURCES.txt +3 -0
- cache_dit-0.2.3/examples/README.md +0 -45
- cache_dit-0.2.3/examples/run_cogvideox.py +0 -72
- cache_dit-0.2.3/examples/run_flux.py +0 -27
- cache_dit-0.2.3/examples/run_flux_fill.py +0 -32
- cache_dit-0.2.3/examples/run_hunyuan_video.py +0 -75
- cache_dit-0.2.3/examples/run_mochi.py +0 -32
- cache_dit-0.2.3/examples/run_wan.py +0 -63
- {cache_dit-0.2.3 → cache_dit-0.2.5}/.github/workflows/issue.yml +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/.gitignore +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/.pre-commit-config.yaml +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/CONTRIBUTE.md +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/LICENSE +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/MANIFEST.in +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBCACHE_F12B12S4_R0.2_S16.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBCACHE_F12B16S4_R0.08_S6.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBCACHE_F16B16S2_R0.2_S14.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBCACHE_F16B16S4_R0.2_S13.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBCACHE_F1B0S1_R0.08_S11.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBCACHE_F1B0S1_R0.2_S19.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBCACHE_F8B0S2_R0.12_S12.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBCACHE_F8B16S1_R0.2_S18.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBCACHE_F8B8S1_R0.08_S9.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBCACHE_F8B8S1_R0.12_S12.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBCACHE_F8B8S1_R0.15_S15.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBPRUNE_F1B0_R0.07_P52.3_T12.53s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBPRUNE_F1B0_R0.08_P52.4_T12.52s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBPRUNE_F1B0_R0.09_P59.2_T10.81s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBPRUNE_F1B0_R0.12_P59.5_T10.76s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBPRUNE_F1B0_R0.12_P63.0_T9.90s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBPRUNE_F1B0_R0.1_P62.8_T9.95s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/DBPRUNE_F1B0_R0.3_P63.1_T9.79s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/NONE_R0.08_S0.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/TEXTURE_DBCACHE_F1B0_R0.08.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/TEXTURE_DBCACHE_F8B12_R0.12.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/TEXTURE_DBCACHE_F8B16_R0.2.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/TEXTURE_DBCACHE_F8B20_R0.2.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/TEXTURE_DBCACHE_F8B8_R0.12.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/TEXTURE_NONE_R0.08.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.12_S14_T12.85s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.15_S17_T10.27s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.12_S14_T12.86s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T10.28s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U0_C1_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T8.48s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U0_C1_DBPRUNE_F1B0_R0.03_P24.0_T16.25s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U0_C1_DBPRUNE_F1B0_R0.045_P38.2_T13.41s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U0_C1_DBPRUNE_F1B0_R0.04_P34.6_T14.12s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U0_C1_DBPRUNE_F1B0_R0.055_P45.1_T12.00s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U0_C1_DBPRUNE_F1B0_R0.05_P41.6_T12.70s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U0_C1_DBPRUNE_F1B0_R0.2_P59.5_T8.86s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U0_C1_DBPRUNE_F8B8_R0.08_P23.1_T16.14s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U0_C1_NONE_R0.08_S0_T20.43s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U4_C1_DBPRUNE_F1B0_R0.03_P27.3_T6.62s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U4_C1_DBPRUNE_F1B0_R0.03_P27.3_T6.63s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U4_C1_DBPRUNE_F1B0_R0.045_P38.2_T5.81s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U4_C1_DBPRUNE_F1B0_R0.045_P38.2_T5.82s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U4_C1_DBPRUNE_F1B0_R0.04_P34.6_T6.06s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U4_C1_DBPRUNE_F1B0_R0.04_P34.6_T6.07s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U4_C1_DBPRUNE_F1B0_R0.04_P34.6_T6.08s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U4_C1_DBPRUNE_F1B0_R0.055_P45.1_T5.27s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U4_C1_DBPRUNE_F1B0_R0.055_P45.1_T5.28s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U4_C1_DBPRUNE_F1B0_R0.2_P59.5_T3.95s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U4_C1_DBPRUNE_F1B0_R0.2_P59.5_T3.96s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U4_C1_NONE_R0.08_S0_T7.78s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/U4_C1_NONE_R0.08_S0_T7.79s.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/cache-dit-v1.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/dbcache-fnbn-v1.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/dbcache-v1.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/dbprune-v1.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/assets/fbcache-v1.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/bench/.gitignore +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/bench/bench.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/docs/.gitignore +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/examples/data/cup.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/examples/data/cup_mask.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/pyproject.toml +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/pytest.ini +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/requirements.txt +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/setup.cfg +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/setup.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/__init__.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/__init__.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/dual_block_cache/__init__.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/__init__.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/cogvideox.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/flux.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/hunyuan_video.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/mochi.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/dynamic_block_prune/__init__.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/__init__.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/cogvideox.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/flux.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/hunyuan_video.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/mochi.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/wan.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/dynamic_block_prune/prune_context.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/first_block_cache/__init__.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/__init__.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/cogvideox.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/flux.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/hunyuan_video.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/mochi.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/wan.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/taylorseer.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/cache_factory/utils.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/compile/__init__.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/compile/utils.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/custom_ops/__init__.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/custom_ops/triton_taylorseer.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/logger.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit/primitives.py +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit.egg-info/dependency_links.txt +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit.egg-info/requires.txt +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/src/cache_dit.egg-info/top_level.txt +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/tests/.gitignore +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/tests/README.md +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/tests/taylorseer_approximation_order_2.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/tests/taylorseer_approximation_order_4.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/tests/taylorseer_approximation_test.png +0 -0
- {cache_dit-0.2.3 → cache_dit-0.2.5}/tests/test_taylorseer.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cache_dit
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.5
|
|
4
4
|
Summary: 🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration Toolbox for Diffusion Transformers
|
|
5
5
|
Author: DefTruth, vipshop.com, etc.
|
|
6
6
|
Maintainer: DefTruth, vipshop.com, etc
|
|
@@ -44,7 +44,7 @@ Dynamic: requires-python
|
|
|
44
44
|
<img src=https://img.shields.io/badge/PyPI-pass-brightgreen.svg >
|
|
45
45
|
<img src=https://static.pepy.tech/badge/cache-dit >
|
|
46
46
|
<img src=https://img.shields.io/badge/Python-3.10|3.11|3.12-9cf.svg >
|
|
47
|
-
<img src=https://img.shields.io/badge/Release-v0.2
|
|
47
|
+
<img src=https://img.shields.io/badge/Release-v0.2-brightgreen.svg >
|
|
48
48
|
</div>
|
|
49
49
|
<p align="center">
|
|
50
50
|
DeepCache is for UNet not DiT. Most DiT cache speedups are complex and not training-free. CacheDiT offers <br>a set of training-free cache accelerators for DiT: <b>🔥<a href="#dbcache">DBCache</a>, <a href="#dbprune">DBPrune</a>, <a href="#taylorseer">TaylorSeer</a>, <a href="#fbcache">FBCache</a></b>, etc🔥
|
|
@@ -154,6 +154,7 @@ The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi
|
|
|
154
154
|
- [🔥Supported Models](#supported)
|
|
155
155
|
- [⚡️Dual Block Cache](#dbcache)
|
|
156
156
|
- [🔥Hybrid TaylorSeer](#taylorseer)
|
|
157
|
+
- [⚡️Hybrid Cache CFG](#cfg)
|
|
157
158
|
- [🎉First Block Cache](#fbcache)
|
|
158
159
|
- [⚡️Dynamic Block Prune](#dbprune)
|
|
159
160
|
- [🎉Context Parallelism](#context-parallelism)
|
|
@@ -168,7 +169,7 @@ The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi
|
|
|
168
169
|
You can install the stable release of `cache-dit` from PyPI:
|
|
169
170
|
|
|
170
171
|
```bash
|
|
171
|
-
pip3 install cache-dit
|
|
172
|
+
pip3 install -U cache-dit
|
|
172
173
|
```
|
|
173
174
|
Or you can install the latest develop version from GitHub:
|
|
174
175
|
|
|
@@ -180,11 +181,13 @@ pip3 install git+https://github.com/vipshop/cache-dit.git
|
|
|
180
181
|
|
|
181
182
|
<div id="supported"></div>
|
|
182
183
|
|
|
183
|
-
- [🚀FLUX.1](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
184
|
-
- [🚀
|
|
184
|
+
- [🚀FLUX.1-dev](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
185
|
+
- [🚀FLUX.1-Fill-dev](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
186
|
+
- [🚀mochi-1-preview](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
185
187
|
- [🚀CogVideoX](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
186
188
|
- [🚀CogVideoX1.5](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
187
|
-
- [🚀Wan2.1](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
189
|
+
- [🚀Wan2.1-T2V](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
190
|
+
- [🚀Wan2.1-FLF2V](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
188
191
|
- [🚀HunyuanVideo](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
189
192
|
|
|
190
193
|
|
|
@@ -280,7 +283,7 @@ cache_options = {
|
|
|
280
283
|
"taylorseer_kwargs": {
|
|
281
284
|
"n_derivatives": 2, # default is 2.
|
|
282
285
|
},
|
|
283
|
-
"warmup_steps": 3, # n_derivatives + 1
|
|
286
|
+
"warmup_steps": 3, # prefer: >= n_derivatives + 1
|
|
284
287
|
"residual_diff_threshold": 0.12,
|
|
285
288
|
}
|
|
286
289
|
```
|
|
@@ -299,6 +302,30 @@ cache_options = {
|
|
|
299
302
|
|24.85s|12.85s|12.86s|10.27s|10.28s|8.48s|
|
|
300
303
|
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.12_S14_T12.85s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.12_S14_T12.86s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.15_S17_T10.27s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T10.28s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T8.48s.png width=105px>|
|
|
301
304
|
|
|
305
|
+
## ⚡️Hybrid Cache CFG
|
|
306
|
+
|
|
307
|
+
<div id="cfg"></div>
|
|
308
|
+
|
|
309
|
+
CacheDiT supports caching for **CFG (classifier-free guidance)**. For models that fuse CFG and non-CFG into a single forward step, or models that do not include CFG in the forward step, please set the `do_separate_classifier_free_guidance` param to **False (default)**. Otherwise, set it to True. For example:
|
|
310
|
+
|
|
311
|
+
```python
|
|
312
|
+
cache_options = {
|
|
313
|
+
# CFG: classifier free guidance or not
|
|
314
|
+
# For model that fused CFG and non-CFG into single forward step,
|
|
315
|
+
# should set do_separate_classifier_free_guidance as False.
|
|
316
|
+
# For example, set it as True for Wan 2.1 and set it as False
|
|
317
|
+
# for FLUX.1, HunyuanVideo, CogVideoX, Mochi.
|
|
318
|
+
"do_separate_classifier_free_guidance": True, # Wan 2.1
|
|
319
|
+
# Compute cfg forward first or not, default False, namely,
|
|
320
|
+
# 0, 2, 4, ..., -> non-CFG step; 1, 3, 5, ... -> CFG step.
|
|
321
|
+
"cfg_compute_first": False,
|
|
322
|
+
# Compute separate diff values for CFG and non-CFG step,
|
|
323
|
+
# default True. If False, we will use the computed diff from
|
|
324
|
+
# current non-CFG transformer step for current CFG step.
|
|
325
|
+
"cfg_diff_compute_separate": True,
|
|
326
|
+
}
|
|
327
|
+
```
|
|
328
|
+
|
|
302
329
|
## 🎉FBCache: First Block Cache
|
|
303
330
|
|
|
304
331
|
<div id="fbcache"></div>
|
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
<img src=https://img.shields.io/badge/PyPI-pass-brightgreen.svg >
|
|
10
10
|
<img src=https://static.pepy.tech/badge/cache-dit >
|
|
11
11
|
<img src=https://img.shields.io/badge/Python-3.10|3.11|3.12-9cf.svg >
|
|
12
|
-
<img src=https://img.shields.io/badge/Release-v0.2
|
|
12
|
+
<img src=https://img.shields.io/badge/Release-v0.2-brightgreen.svg >
|
|
13
13
|
</div>
|
|
14
14
|
<p align="center">
|
|
15
15
|
DeepCache is for UNet not DiT. Most DiT cache speedups are complex and not training-free. CacheDiT offers <br>a set of training-free cache accelerators for DiT: <b>🔥<a href="#dbcache">DBCache</a>, <a href="#dbprune">DBPrune</a>, <a href="#taylorseer">TaylorSeer</a>, <a href="#fbcache">FBCache</a></b>, etc🔥
|
|
@@ -119,6 +119,7 @@ The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi
|
|
|
119
119
|
- [🔥Supported Models](#supported)
|
|
120
120
|
- [⚡️Dual Block Cache](#dbcache)
|
|
121
121
|
- [🔥Hybrid TaylorSeer](#taylorseer)
|
|
122
|
+
- [⚡️Hybrid Cache CFG](#cfg)
|
|
122
123
|
- [🎉First Block Cache](#fbcache)
|
|
123
124
|
- [⚡️Dynamic Block Prune](#dbprune)
|
|
124
125
|
- [🎉Context Parallelism](#context-parallelism)
|
|
@@ -133,7 +134,7 @@ The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi
|
|
|
133
134
|
You can install the stable release of `cache-dit` from PyPI:
|
|
134
135
|
|
|
135
136
|
```bash
|
|
136
|
-
pip3 install cache-dit
|
|
137
|
+
pip3 install -U cache-dit
|
|
137
138
|
```
|
|
138
139
|
Or you can install the latest develop version from GitHub:
|
|
139
140
|
|
|
@@ -145,11 +146,13 @@ pip3 install git+https://github.com/vipshop/cache-dit.git
|
|
|
145
146
|
|
|
146
147
|
<div id="supported"></div>
|
|
147
148
|
|
|
148
|
-
- [🚀FLUX.1](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
149
|
-
- [🚀
|
|
149
|
+
- [🚀FLUX.1-dev](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
150
|
+
- [🚀FLUX.1-Fill-dev](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
151
|
+
- [🚀mochi-1-preview](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
150
152
|
- [🚀CogVideoX](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
151
153
|
- [🚀CogVideoX1.5](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
152
|
-
- [🚀Wan2.1](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
154
|
+
- [🚀Wan2.1-T2V](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
155
|
+
- [🚀Wan2.1-FLF2V](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
153
156
|
- [🚀HunyuanVideo](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
154
157
|
|
|
155
158
|
|
|
@@ -245,7 +248,7 @@ cache_options = {
|
|
|
245
248
|
"taylorseer_kwargs": {
|
|
246
249
|
"n_derivatives": 2, # default is 2.
|
|
247
250
|
},
|
|
248
|
-
"warmup_steps": 3, # n_derivatives + 1
|
|
251
|
+
"warmup_steps": 3, # prefer: >= n_derivatives + 1
|
|
249
252
|
"residual_diff_threshold": 0.12,
|
|
250
253
|
}
|
|
251
254
|
```
|
|
@@ -264,6 +267,30 @@ cache_options = {
|
|
|
264
267
|
|24.85s|12.85s|12.86s|10.27s|10.28s|8.48s|
|
|
265
268
|
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.12_S14_T12.85s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.12_S14_T12.86s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.15_S17_T10.27s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T10.28s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T8.48s.png width=105px>|
|
|
266
269
|
|
|
270
|
+
## ⚡️Hybrid Cache CFG
|
|
271
|
+
|
|
272
|
+
<div id="cfg"></div>
|
|
273
|
+
|
|
274
|
+
CacheDiT supports caching for **CFG (classifier-free guidance)**. For models that fuse CFG and non-CFG into a single forward step, or models that do not include CFG in the forward step, please set the `do_separate_classifier_free_guidance` param to **False (default)**. Otherwise, set it to True. For example:
|
|
275
|
+
|
|
276
|
+
```python
|
|
277
|
+
cache_options = {
|
|
278
|
+
# CFG: classifier free guidance or not
|
|
279
|
+
# For model that fused CFG and non-CFG into single forward step,
|
|
280
|
+
# should set do_separate_classifier_free_guidance as False.
|
|
281
|
+
# For example, set it as True for Wan 2.1 and set it as False
|
|
282
|
+
# for FLUX.1, HunyuanVideo, CogVideoX, Mochi.
|
|
283
|
+
"do_separate_classifier_free_guidance": True, # Wan 2.1
|
|
284
|
+
# Compute cfg forward first or not, default False, namely,
|
|
285
|
+
# 0, 2, 4, ..., -> non-CFG step; 1, 3, 5, ... -> CFG step.
|
|
286
|
+
"cfg_compute_first": False,
|
|
287
|
+
# Compute separate diff values for CFG and non-CFG step,
|
|
288
|
+
# default True. If False, we will use the computed diff from
|
|
289
|
+
# current non-CFG transformer step for current CFG step.
|
|
290
|
+
"cfg_diff_compute_separate": True,
|
|
291
|
+
}
|
|
292
|
+
```
|
|
293
|
+
|
|
267
294
|
## 🎉FBCache: First Block Cache
|
|
268
295
|
|
|
269
296
|
<div id="fbcache"></div>
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# Examples for CacheDiT
|
|
2
|
+
|
|
3
|
+
## Install requirements
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip3 install -r requirements.txt
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
## Run examples
|
|
10
|
+
|
|
11
|
+
- FLUX.1-dev
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
python3 run_flux.py # baseline
|
|
15
|
+
python3 run_flux.py --cache --Fn 8 --Bn 8
|
|
16
|
+
python3 run_flux.py --cache --Fn 8 --Bn 0 --taylorseer
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
- FLUX.1-Fill-dev
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
python3 run_flux_fill.py # baseline
|
|
23
|
+
python3 run_flux_fill.py --cache --Fn 8 --Bn 8
|
|
24
|
+
python3 run_flux_fill.py --cache --Fn 8 --Bn 0 --taylorseer
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
- CogVideoX
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
python3 run_cogvideox.py # baseline
|
|
31
|
+
python3 run_cogvideox.py --cache --Fn 8 --Bn 8
|
|
32
|
+
python3 run_cogvideox.py --cache --Fn 8 --Bn 0 --taylorseer
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
- Wan2.1 T2V
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
python3 run_wan.py # baseline
|
|
39
|
+
python3 run_wan.py --cache --Fn 8 --Bn 8
|
|
40
|
+
python3 run_wan.py --cache --Fn 8 --Bn 0 --taylorseer
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
- Wan2.1 FLF2V
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
python3 run_wan_flf2v.py # baseline
|
|
47
|
+
python3 run_wan_flf2v.py --cache --Fn 8 --Bn 8
|
|
48
|
+
python3 run_wan_flf2v.py --cache --Fn 8 --Bn 0 --taylorseer
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
- mochi-1-preview
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
python3 run_mochi.py # baseline
|
|
55
|
+
python3 run_mochi.py --cache --Fn 8 --Bn 8
|
|
56
|
+
python3 run_mochi.py --cache --Fn 8 --Bn 0 --taylorseer
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
- HunyuanVideo
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
python3 run_hunyuan_video.py # baseline
|
|
63
|
+
python3 run_hunyuan_video.py --cache --Fn 8 --Bn 8
|
|
64
|
+
python3 run_hunyuan_video.py --cache --Fn 8 --Bn 0 --taylorseer
|
|
65
|
+
```
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import time
|
|
3
|
+
import torch
|
|
4
|
+
import argparse
|
|
5
|
+
from diffusers.utils import export_to_video
|
|
6
|
+
from diffusers import CogVideoXPipeline, AutoencoderKLCogVideoX
|
|
7
|
+
from cache_dit.cache_factory import apply_cache_on_pipe, CacheType
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_args() -> argparse.ArgumentParser:
|
|
11
|
+
parser = argparse.ArgumentParser()
|
|
12
|
+
# General arguments
|
|
13
|
+
parser.add_argument("--cache", action="store_true", default=False)
|
|
14
|
+
parser.add_argument("--taylorseer", action="store_true", default=False)
|
|
15
|
+
parser.add_argument("--taylorseer-order", "--order", type=int, default=2)
|
|
16
|
+
parser.add_argument("--Fn-compute-blocks", "--Fn", type=int, default=1)
|
|
17
|
+
parser.add_argument("--Bn-compute-blocks", "--Bn", type=int, default=0)
|
|
18
|
+
parser.add_argument("--rdt", type=float, default=0.08)
|
|
19
|
+
parser.add_argument("--warmup-steps", type=int, default=0)
|
|
20
|
+
return parser.parse_args()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
args = get_args()
|
|
24
|
+
print(args)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
model_id = os.environ.get("COGVIDEOX_DIR", "THUDM/CogVideoX-5b")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def is_cogvideox_1_5():
|
|
31
|
+
return "CogVideoX1.5" in model_id or "THUDM/CogVideoX1.5" in model_id
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def get_gpu_memory_in_gib():
|
|
35
|
+
if not torch.cuda.is_available():
|
|
36
|
+
return 0
|
|
37
|
+
|
|
38
|
+
try:
|
|
39
|
+
total_memory_bytes = torch.cuda.get_device_properties(
|
|
40
|
+
torch.cuda.current_device(),
|
|
41
|
+
).total_memory
|
|
42
|
+
total_memory_gib = total_memory_bytes / (1024**3)
|
|
43
|
+
return int(total_memory_gib)
|
|
44
|
+
except Exception:
|
|
45
|
+
return 0
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
pipe = CogVideoXPipeline.from_pretrained(
|
|
49
|
+
model_id,
|
|
50
|
+
torch_dtype=torch.bfloat16,
|
|
51
|
+
).to("cuda")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
if args.cache:
|
|
55
|
+
cache_options = {
|
|
56
|
+
"cache_type": CacheType.DBCache,
|
|
57
|
+
"warmup_steps": args.warmup_steps,
|
|
58
|
+
"max_cached_steps": -1, # -1 means no limit
|
|
59
|
+
# Fn=1, Bn=0, means FB Cache, otherwise, Dual Block Cache
|
|
60
|
+
"Fn_compute_blocks": args.Fn_compute_blocks, # Fn, F8, etc.
|
|
61
|
+
"Bn_compute_blocks": args.Bn_compute_blocks, # Bn, B16, etc.
|
|
62
|
+
"residual_diff_threshold": args.rdt,
|
|
63
|
+
# relative token diff threshold, default is 0.0
|
|
64
|
+
"important_condition_threshold": 0.05,
|
|
65
|
+
# CFG: classifier free guidance or not
|
|
66
|
+
# CogVideoX fused CFG and non-CFG into single forward step
|
|
67
|
+
# so, we set do_separate_classifier_free_guidance as False.
|
|
68
|
+
"do_separate_classifier_free_guidance": False,
|
|
69
|
+
"cfg_compute_first": False,
|
|
70
|
+
"enable_taylorseer": args.taylorseer,
|
|
71
|
+
"enable_encoder_taylorseer": args.taylorseer,
|
|
72
|
+
# Taylorseer cache type can be hidden_states or residual
|
|
73
|
+
"taylorseer_cache_type": "hidden_states",
|
|
74
|
+
"taylorseer_kwargs": {
|
|
75
|
+
"n_derivatives": args.taylorseer_order,
|
|
76
|
+
},
|
|
77
|
+
}
|
|
78
|
+
cache_type_str = "DBCACHE"
|
|
79
|
+
cache_type_str = (
|
|
80
|
+
f"{cache_type_str}_F{args.Fn_compute_blocks}"
|
|
81
|
+
f"B{args.Bn_compute_blocks}W{args.warmup_steps}"
|
|
82
|
+
f"T{int(args.taylorseer)}O{args.taylorseer_order}"
|
|
83
|
+
)
|
|
84
|
+
print(f"cache options:\n{cache_options}")
|
|
85
|
+
|
|
86
|
+
apply_cache_on_pipe(pipe, **cache_options)
|
|
87
|
+
else:
|
|
88
|
+
cache_type_str = "NONE"
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
pipe.enable_model_cpu_offload()
|
|
92
|
+
assert isinstance(pipe.vae, AutoencoderKLCogVideoX) # enable type check for IDE
|
|
93
|
+
pipe.vae.enable_slicing()
|
|
94
|
+
pipe.vae.enable_tiling()
|
|
95
|
+
|
|
96
|
+
start = time.time()
|
|
97
|
+
prompt = (
|
|
98
|
+
"A panda, dressed in a small, red jacket and a tiny hat, "
|
|
99
|
+
"sits on a wooden stool in a serene bamboo forest. The "
|
|
100
|
+
"panda's fluffy paws strum a miniature acoustic guitar, "
|
|
101
|
+
"producing soft, melodic tunes. Nearby, a few other pandas "
|
|
102
|
+
"gather, watching curiously and some clapping in rhythm. "
|
|
103
|
+
"Sunlight filters through the tall bamboo, casting a gentle "
|
|
104
|
+
"glow on the scene. The panda's face is expressive, showing "
|
|
105
|
+
"concentration and joy as it plays. The background includes "
|
|
106
|
+
"a small, flowing stream and vibrant green foliage, enhancing "
|
|
107
|
+
"the peaceful and magical atmosphere of this unique musical "
|
|
108
|
+
"performance."
|
|
109
|
+
)
|
|
110
|
+
video = pipe(
|
|
111
|
+
prompt=prompt,
|
|
112
|
+
num_videos_per_prompt=1,
|
|
113
|
+
num_inference_steps=50,
|
|
114
|
+
num_frames=(
|
|
115
|
+
# Avoid OOM for CogVideoX1.5 model on 48GB GPU
|
|
116
|
+
16
|
|
117
|
+
if (is_cogvideox_1_5() and get_gpu_memory_in_gib() < 48)
|
|
118
|
+
else 49
|
|
119
|
+
),
|
|
120
|
+
guidance_scale=6,
|
|
121
|
+
generator=torch.Generator("cpu").manual_seed(0),
|
|
122
|
+
).frames[0]
|
|
123
|
+
end = time.time()
|
|
124
|
+
|
|
125
|
+
if hasattr(pipe.transformer, "_cached_steps"):
|
|
126
|
+
cached_steps = pipe.transformer._cached_steps
|
|
127
|
+
residual_diffs = pipe.transformer._residual_diffs
|
|
128
|
+
print(f"Cache Steps: {len(cached_steps)}, {cached_steps}")
|
|
129
|
+
print(f"Residual Diffs: {len(residual_diffs)}, {residual_diffs}")
|
|
130
|
+
if hasattr(pipe.transformer, "_cfg_cached_steps"):
|
|
131
|
+
cfg_cached_steps = pipe.transformer._cfg_cached_steps
|
|
132
|
+
cfg_residual_diffs = pipe.transformer._cfg_residual_diffs
|
|
133
|
+
print(f"CFG Cache Steps: {len(cfg_cached_steps)}, {cfg_cached_steps} ")
|
|
134
|
+
print(
|
|
135
|
+
f"CFG Residual Diffs: {len(cfg_residual_diffs)}, {cfg_residual_diffs}"
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
time_cost = end - start
|
|
139
|
+
save_path = f"cogvideox.{cache_type_str}.mp4"
|
|
140
|
+
print(f"Time cost: {time_cost:.2f}s")
|
|
141
|
+
print(f"Saving video to {save_path}")
|
|
142
|
+
export_to_video(video, save_path, fps=8)
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import time
|
|
3
|
+
import torch
|
|
4
|
+
import argparse
|
|
5
|
+
from diffusers import FluxPipeline
|
|
6
|
+
from cache_dit.cache_factory import apply_cache_on_pipe, CacheType
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def get_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the FLUX example script.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes ``argparse`` read the process arguments, which
            is what the script did before this parameter existed.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args``. The earlier
        annotation named ``argparse.ArgumentParser``, which is the parser
        type, not the type of the parsed result.
    """
    parser = argparse.ArgumentParser()
    # General arguments
    parser.add_argument("--cache", action="store_true", default=False)
    parser.add_argument("--taylorseer", action="store_true", default=False)
    parser.add_argument("--taylorseer-order", "--order", type=int, default=2)
    parser.add_argument("--Fn-compute-blocks", "--Fn", type=int, default=1)
    parser.add_argument("--Bn-compute-blocks", "--Bn", type=int, default=0)
    parser.add_argument("--rdt", type=float, default=0.08)
    parser.add_argument("--warmup-steps", type=int, default=0)
    return parser.parse_args(argv)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Parse the command-line options and echo them so a run is self-describing.
args = get_args()
print(args)

# The FLUX_DIR environment variable, when set, overrides the default
# model identifier passed to from_pretrained.
model_id = os.environ.get("FLUX_DIR", "black-forest-labs/FLUX.1-dev")
pipe = FluxPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
pipe = pipe.to("cuda")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
if not args.cache:
    # Caching was not requested: the pipeline is left as loaded.
    cache_type_str = "NONE"
else:
    cache_options = dict(
        cache_type=CacheType.DBCache,
        warmup_steps=args.warmup_steps,
        max_cached_steps=-1,  # -1 means no limit
        # Fn=1 with Bn=0 means FB Cache; otherwise, Dual Block Cache.
        Fn_compute_blocks=args.Fn_compute_blocks,  # Fn, F8, etc.
        Bn_compute_blocks=args.Bn_compute_blocks,  # Bn, B16, etc.
        residual_diff_threshold=args.rdt,
        # CFG: classifier-free guidance or not.
        # FLUX.1 dev does not have CFG, so
        # do_separate_classifier_free_guidance is set to False.
        do_separate_classifier_free_guidance=False,
        cfg_compute_first=False,
        enable_taylorseer=args.taylorseer,
        enable_encoder_taylorseer=args.taylorseer,
        # TaylorSeer cache type can be hidden_states or residual.
        taylorseer_cache_type="residual",
        taylorseer_kwargs={"n_derivatives": args.taylorseer_order},
    )
    # Tag later used in the output file name; it encodes the cache settings.
    cache_type_str = (
        f"DBCACHE_F{args.Fn_compute_blocks}"
        f"B{args.Bn_compute_blocks}W{args.warmup_steps}"
        f"T{int(args.taylorseer)}O{args.taylorseer_order}"
    )
    print(f"cache options:\n{cache_options}")

    apply_cache_on_pipe(pipe, **cache_options)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# Time the pipeline call (including picking the first image) only.
start = time.time()
result = pipe(
    "A cat holding a sign that says hello world",
    num_inference_steps=28,
    generator=torch.Generator("cpu").manual_seed(0),
)
image = result.images[0]
end = time.time()

# Cache statistics are printed only when the transformer exposes them.
transformer = pipe.transformer
if hasattr(transformer, "_cached_steps"):
    cached_steps = transformer._cached_steps
    residual_diffs = transformer._residual_diffs
    print(f"Cache Steps: {len(cached_steps)}, {cached_steps}")
    print(f"Residual Diffs: {len(residual_diffs)}, {residual_diffs}")
if hasattr(transformer, "_cfg_cached_steps"):
    cfg_cached_steps = transformer._cfg_cached_steps
    cfg_residual_diffs = transformer._cfg_residual_diffs
    print(f"CFG Cache Steps: {len(cfg_cached_steps)}, {cfg_cached_steps} ")
    print(f"CFG Residual Diffs: {len(cfg_residual_diffs)}, {cfg_residual_diffs}")

# Report the elapsed time and save the image under a name that carries
# the cache tag.
time_cost = end - start
save_path = f"flux.{cache_type_str}.png"
print(f"Time cost: {time_cost:.2f}s")
print(f"Saving image to {save_path}")
image.save(save_path)
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import time
|
|
3
|
+
import torch
|
|
4
|
+
import argparse
|
|
5
|
+
from diffusers import FluxFillPipeline
|
|
6
|
+
from diffusers.utils import load_image
|
|
7
|
+
from cache_dit.cache_factory import apply_cache_on_pipe, CacheType
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the FLUX Fill example script.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes ``argparse`` read the process arguments, which
            is what the script did before this parameter existed.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args``. The earlier
        annotation named ``argparse.ArgumentParser``, which is the parser
        type, not the type of the parsed result.
    """
    parser = argparse.ArgumentParser()
    # General arguments
    parser.add_argument("--cache", action="store_true", default=False)
    parser.add_argument("--taylorseer", action="store_true", default=False)
    parser.add_argument("--taylorseer-order", "--order", type=int, default=2)
    parser.add_argument("--Fn-compute-blocks", "--Fn", type=int, default=1)
    parser.add_argument("--Bn-compute-blocks", "--Bn", type=int, default=0)
    parser.add_argument("--rdt", type=float, default=0.08)
    parser.add_argument("--warmup-steps", type=int, default=0)
    return parser.parse_args(argv)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Parse the command-line options and echo them so a run is self-describing.
args = get_args()
print(args)

# The FLUX_FILL_DIR environment variable, when set, overrides the default
# model identifier passed to from_pretrained.
model_id = os.environ.get("FLUX_FILL_DIR", "black-forest-labs/FLUX.1-Fill-dev")
pipe = FluxFillPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
pipe = pipe.to("cuda")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
if not args.cache:
    # Caching was not requested: the pipeline is left as loaded.
    cache_type_str = "NONE"
else:
    cache_options = dict(
        cache_type=CacheType.DBCache,
        warmup_steps=args.warmup_steps,
        max_cached_steps=-1,  # -1 means no limit
        # Fn=1 with Bn=0 means FB Cache; otherwise, Dual Block Cache.
        Fn_compute_blocks=args.Fn_compute_blocks,  # Fn, F8, etc.
        Bn_compute_blocks=args.Bn_compute_blocks,  # Bn, B16, etc.
        residual_diff_threshold=args.rdt,
        # CFG: classifier-free guidance or not.
        # FLUX.1 dev does not have CFG, so
        # do_separate_classifier_free_guidance is set to False.
        do_separate_classifier_free_guidance=False,
        cfg_compute_first=False,
        enable_taylorseer=args.taylorseer,
        enable_encoder_taylorseer=args.taylorseer,
        # TaylorSeer cache type can be hidden_states or residual.
        taylorseer_cache_type="residual",
        taylorseer_kwargs={"n_derivatives": args.taylorseer_order},
    )
    # Tag later used in the output file name; it encodes the cache settings.
    cache_type_str = (
        f"DBCACHE_F{args.Fn_compute_blocks}"
        f"B{args.Bn_compute_blocks}W{args.warmup_steps}"
        f"T{int(args.taylorseer)}O{args.taylorseer_order}"
    )
    print(f"cache options:\n{cache_options}")

    apply_cache_on_pipe(pipe, **cache_options)
|
|
69
|
+
|
|
70
|
+
# The timed region covers loading the two input images as well as the
# pipeline call, exactly as in the single-expression form.
start = time.time()
source_image = load_image("data/cup.png")
mask_image = load_image("data/cup_mask.png")
result = pipe(
    prompt="a white paper cup",
    image=source_image,
    mask_image=mask_image,
    guidance_scale=30,
    num_inference_steps=28,
    max_sequence_length=512,
    generator=torch.Generator("cpu").manual_seed(0),
)
image = result.images[0]
end = time.time()

# Cache statistics are printed only when the transformer exposes them.
transformer = pipe.transformer
if hasattr(transformer, "_cached_steps"):
    cached_steps = transformer._cached_steps
    residual_diffs = transformer._residual_diffs
    print(f"Cache Steps: {len(cached_steps)}, {cached_steps}")
    print(f"Residual Diffs: {len(residual_diffs)}, {residual_diffs}")
if hasattr(transformer, "_cfg_cached_steps"):
    cfg_cached_steps = transformer._cfg_cached_steps
    cfg_residual_diffs = transformer._cfg_residual_diffs
    print(f"CFG Cache Steps: {len(cfg_cached_steps)}, {cfg_cached_steps} ")
    print(f"CFG Residual Diffs: {len(cfg_residual_diffs)}, {cfg_residual_diffs}")

# Report the elapsed time and save the image under a name that carries
# the cache tag.
time_cost = end - start
save_path = f"flux-fill.{cache_type_str}.png"
print(f"Time cost: {time_cost:.2f}s")
print(f"Saving image to {save_path}")
image.save(save_path)
|