cache-dit 0.1.8__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cache_dit-0.1.8 → cache_dit-0.2.1}/PKG-INFO +50 -60
- {cache_dit-0.1.8 → cache_dit-0.2.1}/README.md +49 -59
- {cache_dit-0.1.8 → cache_dit-0.2.1}/examples/.gitignore +1 -0
- cache_dit-0.2.1/examples/README.md +45 -0
- cache_dit-0.2.1/examples/requirements.txt +4 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/examples/run_cogvideox.py +32 -6
- cache_dit-0.2.1/examples/run_hunyuan_video.py +75 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/examples/run_wan.py +19 -5
- {cache_dit-0.1.8 → cache_dit-0.2.1}/setup.py +1 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/_version.py +2 -2
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/dual_block_cache/cache_context.py +46 -29
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/__init__.py +8 -0
- cache_dit-0.2.1/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/hunyuan_video.py +295 -0
- cache_dit-0.2.1/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/wan.py +99 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/__init__.py +12 -4
- cache_dit-0.2.1/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/hunyuan_video.py +295 -0
- cache_dit-0.2.1/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/wan.py +99 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/__init__.py +4 -0
- cache_dit-0.2.1/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/hunyuan_video.py +295 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/wan.py +2 -2
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit.egg-info/PKG-INFO +50 -60
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit.egg-info/SOURCES.txt +8 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/.github/workflows/issue.yml +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/.gitignore +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/.pre-commit-config.yaml +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/CONTRIBUTE.md +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/LICENSE +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/MANIFEST.in +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBCACHE_F12B12S4_R0.2_S16.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBCACHE_F12B16S4_R0.08_S6.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBCACHE_F16B16S2_R0.2_S14.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBCACHE_F16B16S4_R0.2_S13.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBCACHE_F1B0S1_R0.08_S11.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBCACHE_F1B0S1_R0.2_S19.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBCACHE_F8B0S2_R0.12_S12.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBCACHE_F8B16S1_R0.2_S18.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBCACHE_F8B8S1_R0.08_S9.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBCACHE_F8B8S1_R0.12_S12.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBCACHE_F8B8S1_R0.15_S15.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBCache.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBPRUNE_F1B0_R0.07_P52.3_T12.53s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBPRUNE_F1B0_R0.08_P52.4_T12.52s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBPRUNE_F1B0_R0.09_P59.2_T10.81s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBPRUNE_F1B0_R0.12_P59.5_T10.76s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBPRUNE_F1B0_R0.12_P63.0_T9.90s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBPRUNE_F1B0_R0.1_P62.8_T9.95s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/DBPRUNE_F1B0_R0.3_P63.1_T9.79s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/NONE_R0.08_S0.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U0_C1_DBPRUNE_F1B0_R0.03_P24.0_T16.25s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U0_C1_DBPRUNE_F1B0_R0.045_P38.2_T13.41s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U0_C1_DBPRUNE_F1B0_R0.04_P34.6_T14.12s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U0_C1_DBPRUNE_F1B0_R0.055_P45.1_T12.00s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U0_C1_DBPRUNE_F1B0_R0.05_P41.6_T12.70s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U0_C1_DBPRUNE_F1B0_R0.2_P59.5_T8.86s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U0_C1_DBPRUNE_F8B8_R0.08_P23.1_T16.14s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U0_C1_NONE_R0.08_S0_T20.43s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U4_C1_DBPRUNE_F1B0_R0.03_P27.3_T6.62s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U4_C1_DBPRUNE_F1B0_R0.03_P27.3_T6.63s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U4_C1_DBPRUNE_F1B0_R0.045_P38.2_T5.81s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U4_C1_DBPRUNE_F1B0_R0.045_P38.2_T5.82s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U4_C1_DBPRUNE_F1B0_R0.04_P34.6_T6.06s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U4_C1_DBPRUNE_F1B0_R0.04_P34.6_T6.07s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U4_C1_DBPRUNE_F1B0_R0.04_P34.6_T6.08s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U4_C1_DBPRUNE_F1B0_R0.055_P45.1_T5.27s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U4_C1_DBPRUNE_F1B0_R0.055_P45.1_T5.28s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U4_C1_DBPRUNE_F1B0_R0.2_P59.5_T3.95s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U4_C1_DBPRUNE_F1B0_R0.2_P59.5_T3.96s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U4_C1_NONE_R0.08_S0_T7.78s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/U4_C1_NONE_R0.08_S0_T7.79s.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/assets/cache-dit.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/bench/.gitignore +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/bench/bench.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/docs/.gitignore +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/examples/data/cup.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/examples/data/cup_mask.png +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/examples/run_flux.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/examples/run_flux_fill.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/examples/run_mochi.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/pyproject.toml +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/pytest.ini +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/requirements.txt +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/setup.cfg +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/__init__.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/__init__.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/dual_block_cache/__init__.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/cogvideox.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/flux.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/mochi.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/dynamic_block_prune/__init__.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/cogvideox.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/flux.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/mochi.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/dynamic_block_prune/prune_context.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/first_block_cache/__init__.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/first_block_cache/cache_context.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/cogvideox.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/flux.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/mochi.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/taylorseer.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/cache_factory/utils.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/logger.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit/primitives.py +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit.egg-info/dependency_links.txt +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit.egg-info/requires.txt +0 -0
- {cache_dit-0.1.8 → cache_dit-0.2.1}/src/cache_dit.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cache_dit
|
|
3
|
-
Version: 0.1.8
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: 🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration Toolbox for Diffusion Transformers
|
|
5
5
|
Author: DefTruth, vipshop.com, etc.
|
|
6
6
|
Maintainer: DefTruth, vipshop.com, etc
|
|
@@ -44,31 +44,18 @@ Dynamic: requires-python
|
|
|
44
44
|
<img src=https://img.shields.io/badge/PyPI-pass-brightgreen.svg >
|
|
45
45
|
<img src=https://static.pepy.tech/badge/cache-dit >
|
|
46
46
|
<img src=https://img.shields.io/badge/Python-3.10|3.11|3.12-9cf.svg >
|
|
47
|
-
<img src=https://img.shields.io/badge/Release-v0.1.8-brightgreen.svg >
|
|
47
|
+
<img src=https://img.shields.io/badge/Release-v0.2.1-brightgreen.svg >
|
|
48
48
|
</div>
|
|
49
49
|
<p align="center">
|
|
50
50
|
DeepCache is for UNet not DiT. Most DiT cache speedups are complex and not training-free. CacheDiT <br>offers a set of training-free cache accelerators for DiT: 🔥DBCache, DBPrune, FBCache, etc🔥
|
|
51
51
|
</p>
|
|
52
|
-
<p align="center">
|
|
53
|
-
<h3> 🔥Supported Models🔥</h2>
|
|
54
|
-
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀FLUX.1</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
55
|
-
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀CogVideoX</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
56
|
-
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀Mochi</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
57
|
-
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀Wan2.1</b>: 🔜DBCache, 🔜DBPrune, ✔️FBCache🔥</a> <br> <br>
|
|
58
|
-
<b>♥️ Please consider to leave a ⭐️ Star to support us ~ ♥️</b>
|
|
59
|
-
</p>
|
|
60
52
|
</div>
|
|
61
53
|
|
|
54
|
+
## 👋 Highlight
|
|
62
55
|
|
|
63
|
-
|
|
64
|
-
## 🎉Supported Models
|
|
65
|
-
<div id="supported"></div>
|
|
66
|
-
- [🚀FLUX.1](https://github.com/vipshop/cache-dit/raw/main/examples): *✔️DBCache, ✔️DBPrune, ✔️FBCache*
|
|
67
|
-
- [🚀CogVideoX](https://github.com/vipshop/cache-dit/raw/main/examples): *✔️DBCache, ✔️DBPrune, ✔️FBCache*
|
|
68
|
-
- [🚀Mochi](https://github.com/vipshop/cache-dit/raw/main/examples): *✔️DBCache, ✔️DBPrune, ✔️FBCache*
|
|
69
|
-
- [🚀Wan2.1**](https://github.com/vipshop/cache-dit/raw/main/examples): *🔜DBCache, 🔜DBPrune, ✔️FBCache*
|
|
70
|
-
-->
|
|
56
|
+
<div id="reference"></div>
|
|
71
57
|
|
|
58
|
+
The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi/ParaAttention/tree/main/src/para_attn/first_block_cache). Special thanks to their excellent work! The **FBCache** support for Mochi, FLUX.1, CogVideoX, Wan2.1, and HunyuanVideo is directly adapted from the original [FBCache](https://github.com/chengzeyi/ParaAttention/tree/main/src/para_attn/first_block_cache).
|
|
72
59
|
|
|
73
60
|
## 🤗 Introduction
|
|
74
61
|
|
|
@@ -110,6 +97,12 @@ These case studies demonstrate that even with relatively high thresholds (such a
|
|
|
110
97
|
|
|
111
98
|
**DBPrune**: We have further implemented a new **Dynamic Block Prune** algorithm based on **Residual Caching** for Diffusion Transformers, referred to as DBPrune. DBPrune caches each block's hidden states and residuals, then **dynamically prunes** blocks during inference by computing the L1 distance between previous hidden states. When a block is pruned, its output is approximated using the cached residuals.
|
|
112
99
|
|
|
100
|
+
<div align="center">
|
|
101
|
+
<p align="center">
|
|
102
|
+
DBPrune, <b> L20x1 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
|
|
103
|
+
</p>
|
|
104
|
+
</div>
|
|
105
|
+
|
|
113
106
|
|Baseline(L20x1)|Pruned(24%)|Pruned(35%)|Pruned(38%)|Pruned(45%)|Pruned(60%)|
|
|
114
107
|
|:---:|:---:|:---:|:---:|:---:|:---:|
|
|
115
108
|
|24.85s|19.43s|16.82s|15.95s|14.24s|10.66s|
|
|
@@ -117,11 +110,11 @@ These case studies demonstrate that even with relatively high thresholds (such a
|
|
|
117
110
|
|
|
118
111
|
<div align="center">
|
|
119
112
|
<p align="center">
|
|
120
|
-
|
|
113
|
+
<h3>🔥 Context Parallelism and Torch Compile</h3>
|
|
121
114
|
</p>
|
|
122
|
-
</div>
|
|
115
|
+
</div>
|
|
123
116
|
|
|
124
|
-
**CacheDiT** are **plug-and-play** solutions that works hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can easily tap into its **Context Parallelism** features for distributed inference.
|
|
117
|
+
Moreover, **CacheDiT** is a **plug-and-play** solution that works hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can easily tap into its **Context Parallelism** features for distributed inference. In addition, CacheDiT is designed to be compatible with **torch.compile**. You can easily combine CacheDiT with torch.compile to achieve even better performance.
|
|
125
118
|
|
|
126
119
|
<div align="center">
|
|
127
120
|
<p align="center">
|
|
@@ -131,11 +124,16 @@ These case studies demonstrate that even with relatively high thresholds (such a
|
|
|
131
124
|
|
|
132
125
|
|Baseline|Pruned(24%)|Pruned(35%)|Pruned(38%)|Pruned(45%)|Pruned(60%)|
|
|
133
126
|
|:---:|:---:|:---:|:---:|:---:|:---:|
|
|
134
|
-
|+
|
|
135
|
-
|+compile:20.43s|16.25s|14.12s|13.41s|12s|8.86s|
|
|
127
|
+
|+compile:20.43s|16.25s|14.12s|13.41s|12.00s|8.86s|
|
|
136
128
|
|+L20x4:7.75s|6.62s|6.03s|5.81s|5.24s|3.93s|
|
|
137
129
|
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_NONE_R0.08_S0_T20.43s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.03_P24.0_T16.25s.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.04_P34.6_T14.12s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.045_P38.2_T13.41s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.055_P45.1_T12.00s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.2_P59.5_T8.86s.png width=105px>|
|
|
138
130
|
|
|
131
|
+
<div align="center">
|
|
132
|
+
<p align="center">
|
|
133
|
+
<b>♥️ Please consider leaving a ⭐️ Star to support us ~ ♥️</b>
|
|
134
|
+
</p>
|
|
135
|
+
</div>
|
|
136
|
+
|
|
139
137
|
## ©️Citations
|
|
140
138
|
|
|
141
139
|
```BibTeX
|
|
@@ -148,17 +146,12 @@ These case studies demonstrate that even with relatively high thresholds (such a
|
|
|
148
146
|
}
|
|
149
147
|
```
|
|
150
148
|
|
|
151
|
-
## 👋Reference
|
|
152
|
-
|
|
153
|
-
<div id="reference"></div>
|
|
154
|
-
|
|
155
|
-
The **CacheDiT** codebase was adapted from FBCache's implementation at the [ParaAttention](https://github.com/chengzeyi/ParaAttention/tree/main/src/para_attn/first_block_cache). We would like to express our sincere gratitude for this excellent work!
|
|
156
|
-
|
|
157
149
|
## 📖Contents
|
|
158
150
|
|
|
159
151
|
<div id="contents"></div>
|
|
160
152
|
|
|
161
153
|
- [⚙️Installation](#️installation)
|
|
154
|
+
- [🔥Supported Models](#supported)
|
|
162
155
|
- [⚡️Dual Block Cache](#dbcache)
|
|
163
156
|
- [🎉First Block Cache](#fbcache)
|
|
164
157
|
- [⚡️Dynamic Block Prune](#dbprune)
|
|
@@ -182,6 +175,30 @@ Or you can install the latest develop version from GitHub:
|
|
|
182
175
|
pip3 install git+https://github.com/vipshop/cache-dit.git
|
|
183
176
|
```
|
|
184
177
|
|
|
178
|
+
## 🔥Supported Models
|
|
179
|
+
|
|
180
|
+
<div id="supported"></div>
|
|
181
|
+
|
|
182
|
+
- [🚀FLUX.1](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
183
|
+
- [🚀Mochi](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
184
|
+
- [🚀CogVideoX](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
185
|
+
- [🚀CogVideoX1.5](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
186
|
+
- [🚀Wan2.1](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
187
|
+
- [🚀HunyuanVideo](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
<!--
|
|
191
|
+
<p align="center">
|
|
192
|
+
<h4> 🔥Supported Models🔥</h4>
|
|
193
|
+
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀FLUX.1</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
194
|
+
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀Mochi</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
195
|
+
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀CogVideoX</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
196
|
+
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀CogVideoX1.5</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
197
|
+
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀Wan2.1</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
198
|
+
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀HunyuanVideo</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
199
|
+
</p>
|
|
200
|
+
-->
|
|
201
|
+
|
|
185
202
|
## ⚡️DBCache: Dual Block Cache
|
|
186
203
|
|
|
187
204
|
<div id="dbcache"></div>
|
|
@@ -339,6 +356,9 @@ cache_options = {
|
|
|
339
356
|
apply_cache_on_pipe(pipe, **cache_options)
|
|
340
357
|
```
|
|
341
358
|
|
|
359
|
+
> [!Important]
|
|
360
|
+
> Please note that for GPUs with lower VRAM, DBPrune may not be suitable for use on video DiTs, as it caches the hidden states and residuals of each block, leading to higher GPU memory requirements. In such cases, please use DBCache, which only caches the hidden states and residuals of 2 blocks.
|
|
361
|
+
|
|
342
362
|
<div align="center">
|
|
343
363
|
<p align="center">
|
|
344
364
|
DBPrune, <b> L20x1 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
|
|
@@ -396,26 +416,12 @@ Then, run the python test script with `torchrun`:
|
|
|
396
416
|
```bash
|
|
397
417
|
torchrun --nproc_per_node=4 parallel_cache.py
|
|
398
418
|
```
|
|
399
|
-
<!--
|
|
400
|
-
|
|
401
|
-
<div align="center">
|
|
402
|
-
<p align="center">
|
|
403
|
-
DBPrune, <b> L20x4 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
|
|
404
|
-
</p>
|
|
405
|
-
</div>
|
|
406
|
-
|
|
407
|
-
|Baseline|Pruned(24%)|Pruned(35%)|Pruned(38%)|Pruned(45%)|Pruned(60%)|
|
|
408
|
-
|:---:|:---:|:---:|:---:|:---:|:---:|
|
|
409
|
-
|+L20x1:24.85s|19.43s|16.82s|15.95s|14.24s|10.66s|
|
|
410
|
-
|+L20x4:8.54s|7.20s|6.61s|6.09s|5.54s|4.22s|
|
|
411
|
-
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png width=105px>|
|
|
412
|
-
-->
|
|
413
419
|
|
|
414
420
|
## 🔥Torch Compile
|
|
415
421
|
|
|
416
422
|
<div id="compile"></div>
|
|
417
423
|
|
|
418
|
-
**CacheDiT**
|
|
424
|
+
**CacheDiT** is designed to be compatible with **torch.compile**. You can easily combine CacheDiT with torch.compile to achieve even better performance. For example:
|
|
419
425
|
|
|
420
426
|
```python
|
|
421
427
|
apply_cache_on_pipe(
|
|
@@ -430,22 +436,6 @@ torch._dynamo.config.recompile_limit = 96 # default is 8
|
|
|
430
436
|
torch._dynamo.config.accumulated_recompile_limit = 2048 # default is 256
|
|
431
437
|
```
|
|
432
438
|
|
|
433
|
-
<!--
|
|
434
|
-
|
|
435
|
-
<div align="center">
|
|
436
|
-
<p align="center">
|
|
437
|
-
DBPrune + <b>torch.compile</b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
|
|
438
|
-
</p>
|
|
439
|
-
</div>
|
|
440
|
-
|
|
441
|
-
|Baseline|Pruned(24%)|Pruned(35%)|Pruned(38%)|Pruned(45%)|Pruned(60%)|
|
|
442
|
-
|:---:|:---:|:---:|:---:|:---:|:---:|
|
|
443
|
-
|+L20x1:24.8s|19.4s|16.8s|15.9s|14.2s|10.6s|
|
|
444
|
-
|+compile:20.4s|16.5s|14.1s|13.4s|12s|8.8s|
|
|
445
|
-
|+L20x4:7.7s|6.6s|6.0s|5.8s|5.2s|3.9s|
|
|
446
|
-
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_NONE_R0.08_S0_T20.43s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.03_P24.0_T16.25s.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.04_P34.6_T14.12s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.045_P38.2_T13.41s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.055_P45.1_T12.00s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.2_P59.5_T8.86s.png width=105px>|
|
|
447
|
-
-->
|
|
448
|
-
|
|
449
439
|
## 👋Contribute
|
|
450
440
|
<div id="contribute"></div>
|
|
451
441
|
|
|
@@ -9,31 +9,18 @@
|
|
|
9
9
|
<img src=https://img.shields.io/badge/PyPI-pass-brightgreen.svg >
|
|
10
10
|
<img src=https://static.pepy.tech/badge/cache-dit >
|
|
11
11
|
<img src=https://img.shields.io/badge/Python-3.10|3.11|3.12-9cf.svg >
|
|
12
|
-
<img src=https://img.shields.io/badge/Release-v0.1.8-brightgreen.svg >
|
|
12
|
+
<img src=https://img.shields.io/badge/Release-v0.2.1-brightgreen.svg >
|
|
13
13
|
</div>
|
|
14
14
|
<p align="center">
|
|
15
15
|
DeepCache is for UNet not DiT. Most DiT cache speedups are complex and not training-free. CacheDiT <br>offers a set of training-free cache accelerators for DiT: 🔥DBCache, DBPrune, FBCache, etc🔥
|
|
16
16
|
</p>
|
|
17
|
-
<p align="center">
|
|
18
|
-
<h3> 🔥Supported Models🔥</h2>
|
|
19
|
-
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀FLUX.1</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
20
|
-
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀CogVideoX</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
21
|
-
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀Mochi</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
22
|
-
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀Wan2.1</b>: 🔜DBCache, 🔜DBPrune, ✔️FBCache🔥</a> <br> <br>
|
|
23
|
-
<b>♥️ Please consider to leave a ⭐️ Star to support us ~ ♥️</b>
|
|
24
|
-
</p>
|
|
25
17
|
</div>
|
|
26
18
|
|
|
19
|
+
## 👋 Highlight
|
|
27
20
|
|
|
28
|
-
|
|
29
|
-
## 🎉Supported Models
|
|
30
|
-
<div id="supported"></div>
|
|
31
|
-
- [🚀FLUX.1](https://github.com/vipshop/cache-dit/raw/main/examples): *✔️DBCache, ✔️DBPrune, ✔️FBCache*
|
|
32
|
-
- [🚀CogVideoX](https://github.com/vipshop/cache-dit/raw/main/examples): *✔️DBCache, ✔️DBPrune, ✔️FBCache*
|
|
33
|
-
- [🚀Mochi](https://github.com/vipshop/cache-dit/raw/main/examples): *✔️DBCache, ✔️DBPrune, ✔️FBCache*
|
|
34
|
-
- [🚀Wan2.1**](https://github.com/vipshop/cache-dit/raw/main/examples): *🔜DBCache, 🔜DBPrune, ✔️FBCache*
|
|
35
|
-
-->
|
|
21
|
+
<div id="reference"></div>
|
|
36
22
|
|
|
23
|
+
The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi/ParaAttention/tree/main/src/para_attn/first_block_cache). Special thanks to their excellent work! The **FBCache** support for Mochi, FLUX.1, CogVideoX, Wan2.1, and HunyuanVideo is directly adapted from the original [FBCache](https://github.com/chengzeyi/ParaAttention/tree/main/src/para_attn/first_block_cache).
|
|
37
24
|
|
|
38
25
|
## 🤗 Introduction
|
|
39
26
|
|
|
@@ -75,6 +62,12 @@ These case studies demonstrate that even with relatively high thresholds (such a
|
|
|
75
62
|
|
|
76
63
|
**DBPrune**: We have further implemented a new **Dynamic Block Prune** algorithm based on **Residual Caching** for Diffusion Transformers, referred to as DBPrune. DBPrune caches each block's hidden states and residuals, then **dynamically prunes** blocks during inference by computing the L1 distance between previous hidden states. When a block is pruned, its output is approximated using the cached residuals.
|
|
77
64
|
|
|
65
|
+
<div align="center">
|
|
66
|
+
<p align="center">
|
|
67
|
+
DBPrune, <b> L20x1 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
|
|
68
|
+
</p>
|
|
69
|
+
</div>
|
|
70
|
+
|
|
78
71
|
|Baseline(L20x1)|Pruned(24%)|Pruned(35%)|Pruned(38%)|Pruned(45%)|Pruned(60%)|
|
|
79
72
|
|:---:|:---:|:---:|:---:|:---:|:---:|
|
|
80
73
|
|24.85s|19.43s|16.82s|15.95s|14.24s|10.66s|
|
|
@@ -82,11 +75,11 @@ These case studies demonstrate that even with relatively high thresholds (such a
|
|
|
82
75
|
|
|
83
76
|
<div align="center">
|
|
84
77
|
<p align="center">
|
|
85
|
-
|
|
78
|
+
<h3>🔥 Context Parallelism and Torch Compile</h3>
|
|
86
79
|
</p>
|
|
87
|
-
</div>
|
|
80
|
+
</div>
|
|
88
81
|
|
|
89
|
-
**CacheDiT** are **plug-and-play** solutions that works hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can easily tap into its **Context Parallelism** features for distributed inference.
|
|
82
|
+
Moreover, **CacheDiT** is a **plug-and-play** solution that works hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can easily tap into its **Context Parallelism** features for distributed inference. In addition, CacheDiT is designed to be compatible with **torch.compile**. You can easily combine CacheDiT with torch.compile to achieve even better performance.
|
|
90
83
|
|
|
91
84
|
<div align="center">
|
|
92
85
|
<p align="center">
|
|
@@ -96,11 +89,16 @@ These case studies demonstrate that even with relatively high thresholds (such a
|
|
|
96
89
|
|
|
97
90
|
|Baseline|Pruned(24%)|Pruned(35%)|Pruned(38%)|Pruned(45%)|Pruned(60%)|
|
|
98
91
|
|:---:|:---:|:---:|:---:|:---:|:---:|
|
|
99
|
-
|+
|
|
100
|
-
|+compile:20.43s|16.25s|14.12s|13.41s|12s|8.86s|
|
|
92
|
+
|+compile:20.43s|16.25s|14.12s|13.41s|12.00s|8.86s|
|
|
101
93
|
|+L20x4:7.75s|6.62s|6.03s|5.81s|5.24s|3.93s|
|
|
102
94
|
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_NONE_R0.08_S0_T20.43s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.03_P24.0_T16.25s.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.04_P34.6_T14.12s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.045_P38.2_T13.41s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.055_P45.1_T12.00s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.2_P59.5_T8.86s.png width=105px>|
|
|
103
95
|
|
|
96
|
+
<div align="center">
|
|
97
|
+
<p align="center">
|
|
98
|
+
<b>♥️ Please consider leaving a ⭐️ Star to support us ~ ♥️</b>
|
|
99
|
+
</p>
|
|
100
|
+
</div>
|
|
101
|
+
|
|
104
102
|
## ©️Citations
|
|
105
103
|
|
|
106
104
|
```BibTeX
|
|
@@ -113,17 +111,12 @@ These case studies demonstrate that even with relatively high thresholds (such a
|
|
|
113
111
|
}
|
|
114
112
|
```
|
|
115
113
|
|
|
116
|
-
## 👋Reference
|
|
117
|
-
|
|
118
|
-
<div id="reference"></div>
|
|
119
|
-
|
|
120
|
-
The **CacheDiT** codebase was adapted from FBCache's implementation at the [ParaAttention](https://github.com/chengzeyi/ParaAttention/tree/main/src/para_attn/first_block_cache). We would like to express our sincere gratitude for this excellent work!
|
|
121
|
-
|
|
122
114
|
## 📖Contents
|
|
123
115
|
|
|
124
116
|
<div id="contents"></div>
|
|
125
117
|
|
|
126
118
|
- [⚙️Installation](#️installation)
|
|
119
|
+
- [🔥Supported Models](#supported)
|
|
127
120
|
- [⚡️Dual Block Cache](#dbcache)
|
|
128
121
|
- [🎉First Block Cache](#fbcache)
|
|
129
122
|
- [⚡️Dynamic Block Prune](#dbprune)
|
|
@@ -147,6 +140,30 @@ Or you can install the latest develop version from GitHub:
|
|
|
147
140
|
pip3 install git+https://github.com/vipshop/cache-dit.git
|
|
148
141
|
```
|
|
149
142
|
|
|
143
|
+
## 🔥Supported Models
|
|
144
|
+
|
|
145
|
+
<div id="supported"></div>
|
|
146
|
+
|
|
147
|
+
- [🚀FLUX.1](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
148
|
+
- [🚀Mochi](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
149
|
+
- [🚀CogVideoX](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
150
|
+
- [🚀CogVideoX1.5](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
151
|
+
- [🚀Wan2.1](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
152
|
+
- [🚀HunyuanVideo](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
<!--
|
|
156
|
+
<p align="center">
|
|
157
|
+
<h4> 🔥Supported Models🔥</h4>
|
|
158
|
+
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀FLUX.1</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
159
|
+
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀Mochi</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
160
|
+
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀CogVideoX</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
161
|
+
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀CogVideoX1.5</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
162
|
+
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀Wan2.1</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
163
|
+
<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀HunyuanVideo</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
|
|
164
|
+
</p>
|
|
165
|
+
-->
|
|
166
|
+
|
|
150
167
|
## ⚡️DBCache: Dual Block Cache
|
|
151
168
|
|
|
152
169
|
<div id="dbcache"></div>
|
|
@@ -304,6 +321,9 @@ cache_options = {
|
|
|
304
321
|
apply_cache_on_pipe(pipe, **cache_options)
|
|
305
322
|
```
|
|
306
323
|
|
|
324
|
+
> [!Important]
|
|
325
|
+
> Please note that for GPUs with lower VRAM, DBPrune may not be suitable for use on video DiTs, as it caches the hidden states and residuals of each block, leading to higher GPU memory requirements. In such cases, please use DBCache, which only caches the hidden states and residuals of 2 blocks.
|
|
326
|
+
|
|
307
327
|
<div align="center">
|
|
308
328
|
<p align="center">
|
|
309
329
|
DBPrune, <b> L20x1 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
|
|
@@ -361,26 +381,12 @@ Then, run the python test script with `torchrun`:
|
|
|
361
381
|
```bash
|
|
362
382
|
torchrun --nproc_per_node=4 parallel_cache.py
|
|
363
383
|
```
|
|
364
|
-
<!--
|
|
365
|
-
|
|
366
|
-
<div align="center">
|
|
367
|
-
<p align="center">
|
|
368
|
-
DBPrune, <b> L20x4 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
|
|
369
|
-
</p>
|
|
370
|
-
</div>
|
|
371
|
-
|
|
372
|
-
|Baseline|Pruned(24%)|Pruned(35%)|Pruned(38%)|Pruned(45%)|Pruned(60%)|
|
|
373
|
-
|:---:|:---:|:---:|:---:|:---:|:---:|
|
|
374
|
-
|+L20x1:24.85s|19.43s|16.82s|15.95s|14.24s|10.66s|
|
|
375
|
-
|+L20x4:8.54s|7.20s|6.61s|6.09s|5.54s|4.22s|
|
|
376
|
-
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png width=105px>|
|
|
377
|
-
-->
|
|
378
384
|
|
|
379
385
|
## 🔥Torch Compile
|
|
380
386
|
|
|
381
387
|
<div id="compile"></div>
|
|
382
388
|
|
|
383
|
-
**CacheDiT**
|
|
389
|
+
**CacheDiT** is designed to be compatible with **torch.compile**. You can combine CacheDiT with torch.compile to achieve even better performance. For example:
|
|
384
390
|
|
|
385
391
|
```python
|
|
386
392
|
apply_cache_on_pipe(
|
|
@@ -395,22 +401,6 @@ torch._dynamo.config.recompile_limit = 96 # default is 8
|
|
|
395
401
|
torch._dynamo.config.accumulated_recompile_limit = 2048 # default is 256
|
|
396
402
|
```
|
|
397
403
|
|
|
398
|
-
<!--
|
|
399
|
-
|
|
400
|
-
<div align="center">
|
|
401
|
-
<p align="center">
|
|
402
|
-
DBPrune + <b>torch.compile</b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
|
|
403
|
-
</p>
|
|
404
|
-
</div>
|
|
405
|
-
|
|
406
|
-
|Baseline|Pruned(24%)|Pruned(35%)|Pruned(38%)|Pruned(45%)|Pruned(60%)|
|
|
407
|
-
|:---:|:---:|:---:|:---:|:---:|:---:|
|
|
408
|
-
|+L20x1:24.8s|19.4s|16.8s|15.9s|14.2s|10.6s|
|
|
409
|
-
|+compile:20.4s|16.5s|14.1s|13.4s|12s|8.8s|
|
|
410
|
-
|+L20x4:7.7s|6.6s|6.0s|5.8s|5.2s|3.9s|
|
|
411
|
-
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_NONE_R0.08_S0_T20.43s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.03_P24.0_T16.25s.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.04_P34.6_T14.12s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.045_P38.2_T13.41s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.055_P45.1_T12.00s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.2_P59.5_T8.86s.png width=105px>|
|
|
412
|
-
-->
|
|
413
|
-
|
|
414
404
|
## 👋Contribute
|
|
415
405
|
<div id="contribute"></div>
|
|
416
406
|
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# Examples for CacheDiT
|
|
2
|
+
|
|
3
|
+
## Install requirements
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip3 install -r requirements.txt
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
## Run examples
|
|
10
|
+
|
|
11
|
+
- FLUX.1-dev
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
python3 run_flux.py
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
- FLUX.1-Fill-dev
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
python3 run_flux_fill.py
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
- CogVideoX
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
python3 run_cogvideox.py
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
- Wan2.1
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
python3 run_wan.py
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
- Mochi
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
python3 run_mochi.py
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
- HunyuanVideo
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
python3 run_hunyuan_video.py
|
|
45
|
+
```
|
|
@@ -1,14 +1,33 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import torch
|
|
3
|
-
from diffusers import CogVideoXPipeline
|
|
4
3
|
from diffusers.utils import export_to_video
|
|
4
|
+
from diffusers import CogVideoXPipeline, AutoencoderKLCogVideoX
|
|
5
5
|
from cache_dit.cache_factory import apply_cache_on_pipe, CacheType
|
|
6
6
|
|
|
7
|
+
|
|
8
|
+
model_id = os.environ.get("COGVIDEOX_DIR", "THUDM/CogVideoX-5b")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def is_cogvideox_1_5(model_path=None):
    """Return True when a model identifier refers to a CogVideoX1.5 checkpoint.

    Args:
        model_path: Hub id or local directory to inspect. When omitted, the
            module-level ``model_id`` is used.

    Returns:
        bool: True if the identifier contains ``"CogVideoX1.5"``.
    """
    name = model_id if model_path is None else model_path
    # "THUDM/CogVideoX1.5" already contains "CogVideoX1.5", so one substring
    # check covers both the hub id and a local directory name.
    return "CogVideoX1.5" in name
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_gpu_memory_in_gib(device=None):
    """Return the total memory of a CUDA device in whole GiB.

    Args:
        device: Device to query, in any form accepted by
            ``torch.cuda.get_device_properties``. Defaults to the current
            CUDA device.

    Returns:
        int: Total device memory rounded down to whole GiB, or 0 when CUDA is
        unavailable or the device query fails.
    """
    if not torch.cuda.is_available():
        return 0

    try:
        if device is None:
            device = torch.cuda.current_device()
        total_memory_bytes = torch.cuda.get_device_properties(
            device,
        ).total_memory
    except Exception:
        # Deliberate best-effort: callers only use the value to pick
        # memory-saving settings, so a failed query is reported as 0 GiB.
        return 0

    # Floor division keeps the result an exact integer (no float round-trip).
    return total_memory_bytes // (1024**3)
|
|
27
|
+
|
|
28
|
+
|
|
7
29
|
pipe = CogVideoXPipeline.from_pretrained(
|
|
8
|
-
|
|
9
|
-
"COGVIDEOX_DIR",
|
|
10
|
-
"THUDM/CogVideoX-5b",
|
|
11
|
-
),
|
|
30
|
+
model_id,
|
|
12
31
|
torch_dtype=torch.bfloat16,
|
|
13
32
|
).to("cuda")
|
|
14
33
|
|
|
@@ -17,6 +36,8 @@ cache_options = CacheType.default_options(CacheType.DBCache)
|
|
|
17
36
|
|
|
18
37
|
apply_cache_on_pipe(pipe, **cache_options)
|
|
19
38
|
|
|
39
|
+
pipe.enable_model_cpu_offload()
|
|
40
|
+
assert isinstance(pipe.vae, AutoencoderKLCogVideoX) # enable type check for IDE
|
|
20
41
|
pipe.vae.enable_slicing()
|
|
21
42
|
pipe.vae.enable_tiling()
|
|
22
43
|
|
|
@@ -37,7 +58,12 @@ video = pipe(
|
|
|
37
58
|
prompt=prompt,
|
|
38
59
|
num_videos_per_prompt=1,
|
|
39
60
|
num_inference_steps=50,
|
|
40
|
-
num_frames=
|
|
61
|
+
num_frames=(
|
|
62
|
+
# Avoid OOM for CogVideoX1.5 model on 48GB GPU
|
|
63
|
+
16
|
|
64
|
+
if (is_cogvideox_1_5() and get_gpu_memory_in_gib() < 48)
|
|
65
|
+
else 49
|
|
66
|
+
),
|
|
41
67
|
guidance_scale=6,
|
|
42
68
|
generator=torch.Generator("cuda").manual_seed(0),
|
|
43
69
|
).frames[0]
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# Adapted from: https://github.com/chengzeyi/ParaAttention/blob/main/first_block_cache_examples/run_hunyuan_video.py
|
|
2
|
+
import os
|
|
3
|
+
import torch
|
|
4
|
+
from diffusers.utils import export_to_video
|
|
5
|
+
from diffusers import (
|
|
6
|
+
HunyuanVideoPipeline,
|
|
7
|
+
HunyuanVideoTransformer3DModel,
|
|
8
|
+
AutoencoderKLHunyuanVideo,
|
|
9
|
+
)
|
|
10
|
+
from cache_dit.cache_factory import apply_cache_on_pipe, CacheType
|
|
11
|
+
|
|
12
|
+
model_id = os.environ.get("HUNYUAN_DIR", "tencent/HunyuanVideo")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_gpu_memory_in_gib(device=None):
    """Return the total memory of a CUDA device in whole GiB.

    Args:
        device: Device to query, in any form accepted by
            ``torch.cuda.get_device_properties``. Defaults to the current
            CUDA device.

    Returns:
        int: Total device memory rounded down to whole GiB, or 0 when CUDA is
        unavailable or the device query fails.
    """
    if not torch.cuda.is_available():
        return 0

    try:
        if device is None:
            device = torch.cuda.current_device()
        total_memory_bytes = torch.cuda.get_device_properties(
            device,
        ).total_memory
    except Exception:
        # Deliberate best-effort: callers only use the value to pick
        # memory-saving settings, so a failed query is reported as 0 GiB.
        return 0

    # Floor division keeps the result an exact integer (no float round-trip).
    return total_memory_bytes // (1024**3)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
|
|
30
|
+
model_id,
|
|
31
|
+
subfolder="transformer",
|
|
32
|
+
torch_dtype=torch.bfloat16,
|
|
33
|
+
revision="refs/pr/18",
|
|
34
|
+
)
|
|
35
|
+
pipe = HunyuanVideoPipeline.from_pretrained(
|
|
36
|
+
model_id,
|
|
37
|
+
transformer=transformer,
|
|
38
|
+
torch_dtype=torch.float16,
|
|
39
|
+
revision="refs/pr/18",
|
|
40
|
+
).to("cuda")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# Default options, F8B8, good balance between performance and precision
|
|
44
|
+
apply_cache_on_pipe(pipe, **CacheType.default_options(CacheType.DBCache))
|
|
45
|
+
|
|
46
|
+
assert isinstance(
|
|
47
|
+
pipe.vae, AutoencoderKLHunyuanVideo
|
|
48
|
+
) # enable type check for IDE
|
|
49
|
+
|
|
50
|
+
# Enable memory savings
|
|
51
|
+
pipe.enable_model_cpu_offload()
|
|
52
|
+
if get_gpu_memory_in_gib() <= 48:
|
|
53
|
+
pipe.vae.enable_tiling(
|
|
54
|
+
# Make it runnable on GPUs with 48GB memory
|
|
55
|
+
tile_sample_min_height=128,
|
|
56
|
+
tile_sample_stride_height=96,
|
|
57
|
+
tile_sample_min_width=128,
|
|
58
|
+
tile_sample_stride_width=96,
|
|
59
|
+
tile_sample_min_num_frames=32,
|
|
60
|
+
tile_sample_stride_num_frames=24,
|
|
61
|
+
)
|
|
62
|
+
else:
|
|
63
|
+
pipe.vae.enable_tiling()
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
output = pipe(
|
|
67
|
+
prompt="A cat walks on the grass, realistic",
|
|
68
|
+
height=720,
|
|
69
|
+
width=1280,
|
|
70
|
+
num_frames=129,
|
|
71
|
+
num_inference_steps=30,
|
|
72
|
+
).frames[0]
|
|
73
|
+
|
|
74
|
+
print("Saving video to hunyuan_video.mp4")
|
|
75
|
+
export_to_video(output, "hunyuan_video.mp4", fps=15)
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import torch
|
|
3
|
-
|
|
3
|
+
import diffusers
|
|
4
|
+
from diffusers import WanPipeline, AutoencoderKLWan
|
|
4
5
|
from diffusers.utils import export_to_video
|
|
5
6
|
from diffusers.schedulers.scheduling_unipc_multistep import (
|
|
6
7
|
UniPCMultistepScheduler,
|
|
@@ -27,11 +28,24 @@ if hasattr(pipe, "scheduler") and pipe.scheduler is not None:
|
|
|
27
28
|
|
|
28
29
|
pipe.to("cuda")
|
|
29
30
|
|
|
30
|
-
|
|
31
|
+
# Default options, F8B8, good balance between performance and precision
|
|
32
|
+
apply_cache_on_pipe(pipe, **CacheType.default_options(CacheType.DBCache))
|
|
31
33
|
|
|
32
34
|
# Enable memory savings
|
|
33
35
|
pipe.enable_model_cpu_offload()
|
|
34
|
-
|
|
36
|
+
|
|
37
|
+
# Wan currently requires installing diffusers from source
|
|
38
|
+
assert isinstance(pipe.vae, AutoencoderKLWan) # enable type check for IDE
|
|
39
|
+
# Compare the (major, minor) release numerically. A plain string comparison
# orders versions lexicographically, so "0.4.0" would wrongly pass and
# "0.100.0" would wrongly fail the ">= 0.34.0.dev0" check.
diffusers_release = tuple(
    int(part) for part in diffusers.__version__.split(".")[:2]
)
if diffusers_release >= (0, 34):
    # VAE tiling and slicing reduce peak memory during decoding.
    pipe.vae.enable_tiling()
    pipe.vae.enable_slicing()
else:
    print(
        "Wan pipeline requires diffusers version >= 0.34.0.dev0 "
        "for vae tiling and slicing, please install diffusers "
        "from source."
    )
|
|
48
|
+
|
|
35
49
|
|
|
36
50
|
video = pipe(
|
|
37
51
|
prompt=(
|
|
@@ -39,8 +53,8 @@ video = pipe(
|
|
|
39
53
|
"flying past in the background, hyperrealistic"
|
|
40
54
|
),
|
|
41
55
|
negative_prompt="",
|
|
42
|
-
height=
|
|
43
|
-
width=
|
|
56
|
+
height=height,
|
|
57
|
+
width=width,
|
|
44
58
|
num_frames=81,
|
|
45
59
|
num_inference_steps=30,
|
|
46
60
|
).frames[0]
|