cache-dit 0.1.1.dev2__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cache_dit-0.1.2/PKG-INFO +354 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/README.md +29 -30
- cache_dit-0.1.2/assets/cache-dit.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/pyproject.toml +4 -3
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/setup.cfg +0 -3
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/setup.py +1 -1
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/_version.py +2 -2
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/__init__.py +3 -1
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/cogvideox.py +1 -1
- cache_dit-0.1.2/src/cache_dit.egg-info/PKG-INFO +354 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit.egg-info/SOURCES.txt +1 -0
- cache_dit-0.1.1.dev2/PKG-INFO +0 -31
- cache_dit-0.1.1.dev2/src/cache_dit.egg-info/PKG-INFO +0 -31
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/.github/workflows/issue.yml +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/.gitignore +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/.pre-commit-config.yaml +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/CONTRIBUTE.md +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/LICENSE +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/MANIFEST.in +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBCACHE_F12B12S4_R0.2_S16.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBCACHE_F12B16S4_R0.08_S6.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBCACHE_F16B16S2_R0.2_S14.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBCACHE_F16B16S4_R0.2_S13.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBCACHE_F1B0S1_R0.08_S11.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBCACHE_F1B0S1_R0.2_S19.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBCACHE_F8B0S2_R0.12_S12.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBCACHE_F8B16S1_R0.2_S18.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBCACHE_F8B8S1_R0.08_S9.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBCACHE_F8B8S1_R0.12_S12.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBCACHE_F8B8S1_R0.15_S15.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBCache.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBPRUNE_F1B0_R0.07_P52.3_T12.53s.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBPRUNE_F1B0_R0.08_P52.4_T12.52s.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBPRUNE_F1B0_R0.09_P59.2_T10.81s.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBPRUNE_F1B0_R0.12_P59.5_T10.76s.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBPRUNE_F1B0_R0.12_P63.0_T9.90s.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBPRUNE_F1B0_R0.1_P62.8_T9.95s.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/DBPRUNE_F1B0_R0.3_P63.1_T9.79s.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/assets/NONE_R0.08_S0.png +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/bench/.gitignore +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/bench/bench.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/docs/.gitignore +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/examples/.gitignore +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/examples/run_flux.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/pytest.ini +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/requirements.txt +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/__init__.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/dual_block_cache/__init__.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/dual_block_cache/cache_context.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/__init__.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/flux.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/mochi.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/dynamic_block_prune/__init__.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/__init__.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/cogvideox.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/flux.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/mochi.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/dynamic_block_prune/prune_context.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/first_block_cache/__init__.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/first_block_cache/cache_context.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/__init__.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/cogvideox.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/flux.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/mochi.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/wan.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/taylorseer.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/utils.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/logger.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/primitives.py +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit.egg-info/dependency_links.txt +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit.egg-info/requires.txt +0 -0
- {cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit.egg-info/top_level.txt +0 -0
cache_dit-0.1.2/PKG-INFO
ADDED
@@ -0,0 +1,354 @@
Metadata-Version: 2.4
Name: cache_dit
Version: 0.1.2
Summary: 🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration Toolbox for Diffusion Transformers
Author: DefTruth, vipshop.com, etc.
Maintainer: DefTruth, vipshop.com, etc
Project-URL: Repository, https://github.com/vipshop/cache-dit.git
Project-URL: Homepage, https://github.com/vipshop/cache-dit.git
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: packaging
Requires-Dist: torch
Requires-Dist: transformers
Requires-Dist: diffusers
Provides-Extra: all
Provides-Extra: dev
Requires-Dist: pre-commit; extra == "dev"
Requires-Dist: pytest<8.0.0,>=7.0.0; extra == "dev"
Requires-Dist: pytest-html; extra == "dev"
Requires-Dist: expecttest; extra == "dev"
Requires-Dist: hypothesis; extra == "dev"
Requires-Dist: transformers; extra == "dev"
Requires-Dist: diffusers; extra == "dev"
Requires-Dist: accelerate; extra == "dev"
Requires-Dist: peft; extra == "dev"
Requires-Dist: protobuf; extra == "dev"
Requires-Dist: sentencepiece; extra == "dev"
Requires-Dist: opencv-python-headless; extra == "dev"
Requires-Dist: ftfy; extra == "dev"
Dynamic: license-file
Dynamic: provides-extra
Dynamic: requires-dist
Dynamic: requires-python

<div align="center">
<p align="center">
<h3>🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration <br>Toolbox for Diffusion Transformers</h3>
</p>
<img src=https://github.com/vipshop/cache-dit/raw/dev/assets/cache-dit.png >
<div align='center'>
<img src=https://img.shields.io/badge/Language-Python-brightgreen.svg >
<img src=https://img.shields.io/badge/PRs-welcome-9cf.svg >
<img src=https://img.shields.io/badge/PyPI-pass-brightgreen.svg >
<img src=https://img.shields.io/badge/Python-3.10|3.11|3.12-9cf.svg >
<img src=https://img.shields.io/badge/Release-v0.1.2-brightgreen.svg >
</div>
<p align="center">
DeepCache is for UNet, not DiT. Most DiT cache speedups are complex and not training-free. CacheDiT provides <br>a series of training-free, UNet-style cache accelerators for DiT: DBCache, DBPrune, FBCache, etc.
</p>
</div>

## 🤗 Introduction

<div align="center">
<p align="center">
<h3>DBCache: Dual Block Caching for Diffusion Transformers</h3>
</p>
</div>

**DBCache**: **Dual Block Caching** for Diffusion Transformers. We have enhanced `FBCache` into a more general and customizable cache algorithm, namely `DBCache`, enabling fully `UNet-style` cache acceleration for DiT models. Different configurations of compute blocks (**F8B12**, etc.) can be customized in DBCache, and it remains entirely **training-free**. DBCache strikes a practical **balance** between performance and precision!

<div align="center">
<p align="center">
DBCache, <b> L20x1 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
</p>
</div>

|Baseline(L20x1)|F1B0 (0.08)|F1B0 (0.20)|F8B8 (0.15)|F12B12 (0.20)|F16B16 (0.20)|
|:---:|:---:|:---:|:---:|:---:|:---:|
|24.85s|15.59s|8.58s|15.41s|15.11s|17.74s|
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.08_S11.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.2_S19.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F8B8S1_R0.15_S15.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F12B12S4_R0.2_S16.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F16B16S4_R0.2_S13.png width=105px>|
|**Baseline(L20x1)**|**F1B0 (0.08)**|**F8B8 (0.12)**|**F8B12 (0.20)**|**F8B16 (0.20)**|**F8B20 (0.20)**|
|27.85s|6.04s|5.88s|5.77s|6.01s|6.20s|
|<img src=https://github.com/user-attachments/assets/70ea57f4-d8f2-415b-8a96-d8315974a5e6 width=105px>|<img src=https://github.com/user-attachments/assets/fc0e1a67-19cc-44aa-bf50-04696e7978a0 width=105px> |<img src=https://github.com/user-attachments/assets/d1434896-628c-436b-95ad-43c085a8629e width=105px>|<img src=https://github.com/user-attachments/assets/aaa42cd2-57de-4c4e-8bfb-913018a8251d width=105px>|<img src=https://github.com/user-attachments/assets/dc0ba2a4-ef7c-436d-8a39-67055deab92f width=105px>|<img src=https://github.com/user-attachments/assets/aede466f-61ed-4256-8df0-fecf8020c5ca width=105px>|

<div align="center">
<p align="center">
DBCache, <b> L20x4 </b>, Steps: 20, a case showing the texture recovery ability of DBCache
</p>
</div>

These case studies demonstrate that even with relatively high thresholds (such as 0.12, 0.15, or 0.2) under the DBCache **F12B12** or **F8B16** configurations, the detailed texture of the kitten's fur, the colored cloth, and the clarity of text can still be preserved. This suggests that users can leverage DBCache to effectively balance performance and precision in their workflows!

<div align="center">
<p align="center">
<h3>DBPrune: Dynamic Block Prune with Residual Caching</h3>
</p>
</div>

**DBPrune**: We have further implemented a new **Dynamic Block Prune** algorithm based on **Residual Caching** for Diffusion Transformers, referred to as DBPrune. DBPrune caches each block's hidden states and residuals, then **dynamically prunes** blocks during inference by computing the L1 distance between the current and previous hidden states. When a block is pruned, its output is approximated using the cached residuals.

|Baseline(L20x1)|Pruned(24%)|Pruned(35%)|Pruned(38%)|Pruned(45%)|Pruned(60%)|
|:---:|:---:|:---:|:---:|:---:|:---:|
|24.85s|19.43s|16.82s|15.95s|14.24s|10.66s|
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png width=105px>|

<div align="center">
<p align="center">
DBPrune, <b> L20x1 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
</p>
</div>

Moreover, both DBCache and DBPrune are **plug-and-play** solutions that work hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can easily tap into its **Context Parallelism** features for distributed inference.

<p align="center">
♥️ Please consider leaving a ⭐️ Star to support us ~ ♥️
</p>

## ©️Citations

```BibTeX
@misc{CacheDiT@2025,
title={CacheDiT: A Training-free and Easy-to-use Cache Acceleration Toolbox for Diffusion Transformers},
url={https://github.com/vipshop/cache-dit.git},
note={Open-source software available at https://github.com/vipshop/cache-dit.git},
author={vipshop.com},
year={2025}
}
```

## 👋Reference

<div id="reference"></div>

The **CacheDiT** codebase was adapted from the FBCache implementation in [ParaAttention](https://github.com/chengzeyi/ParaAttention/tree/main/src/para_attn/first_block_cache). We would like to express our sincere gratitude for this excellent work!

## 📖Contents

<div id="contents"></div>

- [⚙️Installation](#️installation)
- [⚡️Dual Block Cache](#dbcache)
- [🎉First Block Cache](#fbcache)
- [⚡️Dynamic Block Prune](#dbprune)
- [🎉Context Parallelism](#context-parallelism)
- [⚡️Torch Compile](#compile)
- [🎉Supported Models](#supported)
- [👋Contribute](#contribute)
- [©️License](#license)

## ⚙️Installation

<div id="installation"></div>

You can install the stable release of `cache-dit` from PyPI:

```bash
pip3 install cache-dit
```

Or you can install the latest development version from GitHub:

```bash
pip3 install git+https://github.com/vipshop/cache-dit.git
```

## ⚡️DBCache: Dual Block Cache

<div id="dbcache"></div>



**DBCache** provides configurable parameters for custom optimization, enabling a balanced trade-off between performance and precision:

- **Fn**: Specifies that DBCache uses the **first n** Transformer blocks to fit the information at time step t, enabling the calculation of a more stable L1 diff and delivering more accurate information to subsequent blocks.
- **Bn**: Further fuses approximate information in the **last n** Transformer blocks to enhance prediction accuracy. These blocks act as an auto-scaler for approximate hidden states that use the residual cache.
- **warmup_steps**: (default: 0) DBCache does not apply the caching strategy when the number of running steps is less than or equal to this value, ensuring the model sufficiently learns basic features during warmup.
- **max_cached_steps**: (default: -1) DBCache disables the caching strategy when the running steps exceed this value, to prevent precision degradation.
- **residual_diff_threshold**: The residual diff threshold; a higher value yields faster performance at the cost of lower precision.

For a good balance between performance and precision, DBCache is configured by default with **F8B8**, 8 warmup steps, and unlimited cached steps.

```python
import torch
from diffusers import FluxPipeline
from cache_dit.cache_factory import apply_cache_on_pipe, CacheType

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16,
).to("cuda")

# Default options, F8B8, a good balance between performance and precision
cache_options = CacheType.default_options(CacheType.DBCache)

# Custom options, F8B16, higher precision
cache_options = {
    "cache_type": CacheType.DBCache,
    "warmup_steps": 8,
    "max_cached_steps": 8,  # -1 means no limit
    "Fn_compute_blocks": 8,  # Fn, F8, etc.
    "Bn_compute_blocks": 16,  # Bn, B16, etc.
    "residual_diff_threshold": 0.12,
}

apply_cache_on_pipe(pipe, **cache_options)
```
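For intuition, the decision driven by `residual_diff_threshold` can be sketched as a relative L1 comparison between the hidden states produced by the first `Fn` blocks at the current and previous steps. This is a minimal illustration of the idea only; `should_use_cache` and its arguments are hypothetical names, not the library's internal API:

```python
import torch

def should_use_cache(
    curr: torch.Tensor,   # hidden states after the first Fn blocks, step t
    prev: torch.Tensor,   # cached hidden states from step t - 1
    threshold: float = 0.12,
) -> bool:
    # Relative L1 distance between the two steps; a small value means the
    # remaining blocks can be approximated with the cached residuals.
    diff = (curr - prev).abs().mean() / prev.abs().mean()
    return diff.item() < threshold
```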
Moreover, users configuring higher **Bn** values (e.g., **F8B16**) while aiming to maintain good performance can specify **Bn_compute_blocks_ids** to work with Bn. DBCache will only compute the specified blocks, with the remaining ones estimated using the previous step's residual cache.

```python
# Custom options, F8B16, higher precision with good performance.
cache_options = {
    # 0, 2, 4, ..., 14, 15, etc. [0, 16)
    "Bn_compute_blocks_ids": CacheType.range(0, 16, 2),
    # Skip the other Bn blocks (1, 3, 5, ..., etc.) only if the L1 diff
    # is lower than this value; otherwise, compute them.
    "non_compute_blocks_diff_threshold": 0.08,
}
```

## 🎉FBCache: First Block Cache

<div id="fbcache"></div>



**DBCache** is a more general cache algorithm than **FBCache**. When Fn=1 and Bn=0, DBCache behaves identically to FBCache. Therefore, you can either use the original FBCache implementation directly or configure **DBCache** with **F1B0** settings to achieve the same functionality.

```python
import torch
from diffusers import FluxPipeline
from cache_dit.cache_factory import apply_cache_on_pipe, CacheType

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16,
).to("cuda")

# Using FBCache directly
cache_options = CacheType.default_options(CacheType.FBCache)

# Or using DBCache with F1B0.
# Fn=1, Bn=0 means FBCache; otherwise, Dual Block Cache.
cache_options = {
    "cache_type": CacheType.DBCache,
    "warmup_steps": 8,
    "max_cached_steps": 8,  # -1 means no limit
    "Fn_compute_blocks": 1,  # Fn, F1, etc.
    "Bn_compute_blocks": 0,  # Bn, B0, etc.
    "residual_diff_threshold": 0.12,
}

apply_cache_on_pipe(pipe, **cache_options)
```

## ⚡️DBPrune: Dynamic Block Prune

<div id="dbprune"></div>



We have further implemented a new **Dynamic Block Prune** algorithm based on **Residual Caching** for Diffusion Transformers, referred to as **DBPrune**. DBPrune caches each block's hidden states and residuals, then dynamically prunes blocks during inference by computing the L1 distance between the current and previous hidden states. When a block is pruned, its output is approximated using the cached residuals. DBPrune is currently in the experimental phase, and we kindly invite you to stay tuned for upcoming updates.

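The per-block prune decision described above can be sketched as follows. This is a simplified illustration under the description in this section; the actual logic lives in `prune_context.py` and differs in details, and `maybe_prune_block` is a hypothetical helper:

```python
import torch

def maybe_prune_block(
    block,                          # one transformer block (callable)
    hidden: torch.Tensor,           # block input at step t
    cached_input: torch.Tensor,     # this block's input at step t - 1
    cached_residual: torch.Tensor,  # output - input, cached at step t - 1
    threshold: float,
) -> torch.Tensor:
    # Relative L1 distance between the current and previous inputs.
    diff = (hidden - cached_input).abs().mean() / cached_input.abs().mean()
    if diff < threshold:
        # Prune: approximate this block's output with the cached residual.
        return hidden + cached_residual
    # Otherwise, compute the block normally.
    return block(hidden)
```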
```python
import torch
from diffusers import FluxPipeline
from cache_dit.cache_factory import apply_cache_on_pipe, CacheType

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16,
).to("cuda")

# Using DBPrune
cache_options = CacheType.default_options(CacheType.DBPrune)

apply_cache_on_pipe(pipe, **cache_options)
```

<div align="center">
<p align="center">
DBPrune, <b> L20x1 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
</p>
</div>

|Baseline(L20x1)|Pruned(24%)|Pruned(35%)|Pruned(38%)|Pruned(45%)|Pruned(60%)|
|:---:|:---:|:---:|:---:|:---:|:---:|
|24.85s|19.43s|16.82s|15.95s|14.24s|10.66s|
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png width=105px>|

## 🎉Context Parallelism

<div id="context-parallelism"></div>

**CacheDiT** is **plug-and-play** and works hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can **easily tap into** its **Context Parallelism** features for distributed inference. First, install `para-attn` from PyPI:

```bash
pip3 install para-attn  # or install `para-attn` from sources.
```

Then, you can run **DBCache** with **Context Parallelism** on 4 GPUs:

```python
import torch
import torch.distributed as dist
from diffusers import FluxPipeline
from para_attn.context_parallel import init_context_parallel_mesh
from para_attn.context_parallel.diffusers_adapters import parallelize_pipe
from cache_dit.cache_factory import apply_cache_on_pipe, CacheType

# Initialize torch.distributed before creating the parallel mesh.
dist.init_process_group()

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16,
).to("cuda")

# Context Parallel from ParaAttention
parallelize_pipe(
    pipe, mesh=init_context_parallel_mesh(
        pipe.device.type, max_ulysses_dim_size=4
    )
)

# DBCache with F8B8 from this library
apply_cache_on_pipe(
    pipe, **CacheType.default_options(CacheType.DBCache)
)
```

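Context Parallelism runs on top of `torch.distributed`, so the script must be launched with one process per GPU — for example with `torchrun`, assuming the snippet above is saved as `run_flux.py`:

```bash
torchrun --nproc_per_node=4 run_flux.py
```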
## ⚡️Torch Compile

<div id="compile"></div>

**CacheDiT** is designed to work with `torch.compile`. For example:

```python
apply_cache_on_pipe(
    pipe, **CacheType.default_options(CacheType.DBCache)
)
# Compile the Transformer module
pipe.transformer = torch.compile(pipe.transformer)
```

However, users intending to use **CacheDiT** for DiT models with **dynamic input shapes** should consider increasing the **recompile limit** of `torch._dynamo` to achieve better performance.

```python
torch._dynamo.config.recompile_limit = 96  # default is 8
torch._dynamo.config.accumulated_recompile_limit = 2048  # default is 256
```

Otherwise, the recompile_limit error may be triggered, causing the module to fall back to eager mode.

## 🎉Supported Models

<div id="supported"></div>

- [🚀FLUX.1](https://github.com/vipshop/cache-dit/raw/main/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters)
- [🚀CogVideoX](https://github.com/vipshop/cache-dit/raw/main/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters)
- [🚀Mochi](https://github.com/vipshop/cache-dit/raw/main/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters)

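`apply_cache_on_pipe` dispatches on the pipeline class, so the same options carry over to the other supported pipelines. A sketch for CogVideoX (the model id follows the diffusers docs and is an assumption here):

```python
import torch
from diffusers import CogVideoXPipeline
from cache_dit.cache_factory import apply_cache_on_pipe, CacheType

pipe = CogVideoXPipeline.from_pretrained(
    "THUDM/CogVideoX-5b",
    torch_dtype=torch.bfloat16,
).to("cuda")

# Same DBCache defaults as in the FLUX example above.
apply_cache_on_pipe(pipe, **CacheType.default_options(CacheType.DBCache))
```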
## 👋Contribute

<div id="contribute"></div>

How to contribute? Star this repo or check [CONTRIBUTE.md](./CONTRIBUTE.md).

## ©️License

<div id="license"></div>

We follow the original license from [ParaAttention](https://github.com/chengzeyi/ParaAttention); please check [LICENSE](./LICENSE) for more details.

{cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/README.md
CHANGED
@@ -1,17 +1,17 @@
 <div align="center">
 <p align="center">
-<h3
+<h3>🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration <br>Toolbox for Diffusion Transformers</h3>
 </p>
-
+<img src=https://github.com/vipshop/cache-dit/raw/dev/assets/cache-dit.png >
 <div align='center'>
 <img src=https://img.shields.io/badge/Language-Python-brightgreen.svg >
 <img src=https://img.shields.io/badge/PRs-welcome-9cf.svg >
-<img src=https://img.shields.io/badge/
+<img src=https://img.shields.io/badge/PyPI-pass-brightgreen.svg >
 <img src=https://img.shields.io/badge/Python-3.10|3.11|3.12-9cf.svg >
-<img src=https://img.shields.io/badge/Release-v0.1.
+<img src=https://img.shields.io/badge/Release-v0.1.2-brightgreen.svg >
 </div>
 <p align="center">
-DeepCache
+DeepCache is for UNet, not DiT. Most DiT cache speedups are complex and not training-free. CacheDiT provides <br>a series of training-free, UNet-style cache accelerators for DiT: DBCache, DBPrune, FBCache, etc.
 </p>
 </div>
 
@@ -34,7 +34,7 @@
 |Baseline(L20x1)|F1B0 (0.08)|F1B0 (0.20)|F8B8 (0.15)|F12B12 (0.20)|F16B16 (0.20)|
 |:---:|:---:|:---:|:---:|:---:|:---:|
 |24.85s|15.59s|8.58s|15.41s|15.11s|17.74s|
-|<img src
+|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.08_S11.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.2_S19.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F8B8S1_R0.15_S15.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F12B12S4_R0.2_S16.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F16B16S4_R0.2_S13.png width=105px>|
 |**Baseline(L20x1)**|**F1B0 (0.08)**|**F8B8 (0.12)**|**F8B12 (0.20)**|**F8B16 (0.20)**|**F8B20 (0.20)**|
 |27.85s|6.04s|5.88s|5.77s|6.01s|6.20s|
 |<img src=https://github.com/user-attachments/assets/70ea57f4-d8f2-415b-8a96-d8315974a5e6 width=105px>|<img src=https://github.com/user-attachments/assets/fc0e1a67-19cc-44aa-bf50-04696e7978a0 width=105px> |<img src=https://github.com/user-attachments/assets/d1434896-628c-436b-95ad-43c085a8629e width=105px>|<img src=https://github.com/user-attachments/assets/aaa42cd2-57de-4c4e-8bfb-913018a8251d width=105px>|<img src=https://github.com/user-attachments/assets/dc0ba2a4-ef7c-436d-8a39-67055deab92f width=105px>|<img src=https://github.com/user-attachments/assets/aede466f-61ed-4256-8df0-fecf8020c5ca width=105px>|
@@ -53,12 +53,12 @@ These case studies demonstrate that even with relatively high thresholds (such a
 </p>
 </div>
 
-**DBPrune**:
+**DBPrune**: We have further implemented a new **Dynamic Block Prune** algorithm based on **Residual Caching** for Diffusion Transformers, referred to as DBPrune. DBPrune caches each block's hidden states and residuals, then **dynamically prunes** blocks during inference by computing the L1 distance between the current and previous hidden states. When a block is pruned, its output is approximated using the cached residuals.
 
 |Baseline(L20x1)|Pruned(24%)|Pruned(35%)|Pruned(38%)|Pruned(45%)|Pruned(60%)|
 |:---:|:---:|:---:|:---:|:---:|:---:|
 |24.85s|19.43s|16.82s|15.95s|14.24s|10.66s|
-|<img src
+|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png width=105px>|
 
 <div align="center">
 <p align="center">
@@ -68,13 +68,17 @@ These case studies demonstrate that even with relatively high thresholds (such a
 
 Moreover, both DBCache and DBPrune are **plug-and-play** solutions that work hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can easily tap into its **Context Parallelism** features for distributed inference.
 
+<p align="center">
+♥️ Please consider leaving a ⭐️ Star to support us ~ ♥️
+</p>
+
 ## ©️Citations
 
 ```BibTeX
-@misc{
-title={
-url={https://github.com/vipshop/
-note={Open-source software available at https://github.com/vipshop/
+@misc{CacheDiT@2025,
+title={CacheDiT: A Training-free and Easy-to-use Cache Acceleration Toolbox for Diffusion Transformers},
+url={https://github.com/vipshop/cache-dit.git},
+note={Open-source software available at https://github.com/vipshop/cache-dit.git},
 author={vipshop.com},
 year={2025}
 }
@@ -84,7 +88,7 @@ Moreover, both DBCache and DBPrune are **plug-and-play** solutions that works ha
 
 <div id="reference"></div>
 
-
+The **CacheDiT** codebase was adapted from the FBCache implementation in [ParaAttention](https://github.com/chengzeyi/ParaAttention/tree/main/src/para_attn/first_block_cache). We would like to express our sincere gratitude for this excellent work!
 
 ## 📖Contents
 
@@ -105,20 +109,15 @@ Moreover, both DBCache and DBPrune are **plug-and-play** solutions that works ha
 
 <div id="installation"></div>
 
-You can install `
+You can install the stable release of `cache-dit` from PyPI:
 
 ```bash
-pip3 install
+pip3 install cache-dit
 ```
-
-or just install it from sources:
+Or you can install the latest development version from GitHub:
 
 ```bash
-
-pip3 install 'torch==2.7.0' 'setuptools>=64' 'setuptools_scm>=8'
-
-pip3 install -e '.[dev]' --no-build-isolation # build editable package
-python3 -m build && pip3 install ./dist/cache_dit-*.whl # or build whl first and then install it.
+pip3 install git+https://github.com/vipshop/cache-dit.git
 ```
 
 ## ⚡️DBCache: Dual Block Cache
@@ -214,7 +213,7 @@ apply_cache_on_pipe(pipe, **cache_options)
 
 
 
-We have further implemented a new **Dynamic Block Prune** algorithm based on **Residual Caching** for Diffusion Transformers, which is referred to as **DBPrune**. DBPrune is currently in the experimental phase, and we kindly invite you to stay tuned for upcoming updates.
+We have further implemented a new **Dynamic Block Prune** algorithm based on **Residual Caching** for Diffusion Transformers, referred to as **DBPrune**. DBPrune caches each block's hidden states and residuals, then dynamically prunes blocks during inference by computing the L1 distance between the current and previous hidden states. When a block is pruned, its output is approximated using the cached residuals. DBPrune is currently in the experimental phase, and we kindly invite you to stay tuned for upcoming updates.
 
 ```python
 from diffusers import FluxPipeline
@@ -240,13 +239,13 @@ apply_cache_on_pipe(pipe, **cache_options)
 |Baseline(L20x1)|Pruned(24%)|Pruned(35%)|Pruned(38%)|Pruned(45%)|Pruned(60%)|
 |:---:|:---:|:---:|:---:|:---:|:---:|
 |24.85s|19.43s|16.82s|15.95s|14.24s|10.66s|
-|<img src
+|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png width=105px>|
 
 ## 🎉Context Parallelism
 
 <div id="context-parallelism"></div>
 
-
+**CacheDiT** is **plug-and-play** and works hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can **easily tap into** its **Context Parallelism** features for distributed inference. First, install `para-attn` from PyPI:
 
 ```bash
 pip3 install para-attn # or install `para-attn` from sources.
@@ -282,7 +281,7 @@ apply_cache_on_pipe(
 
 <div id="compile"></div>
 
-**
+**CacheDiT** is designed to work with `torch.compile`. For example:
 
 ```python
 apply_cache_on_pipe(
@@ -291,7 +290,7 @@ apply_cache_on_pipe(
 # Compile the Transformer module
 pipe.transformer = torch.compile(pipe.transformer)
 ```
-However, users intending to use
+However, users intending to use **CacheDiT** for DiT models with **dynamic input shapes** should consider increasing the **recompile limit** of `torch._dynamo` to achieve better performance.
 
 ```python
 torch._dynamo.config.recompile_limit = 96 # default is 8
@@ -303,9 +302,9 @@ Otherwise, the recompile_limit error may be triggered, causing the module to fal
 
 <div id="supported"></div>
 
-- [🚀FLUX.1](
-- [🚀CogVideoX](
-- [🚀Mochi](
+- [🚀FLUX.1](https://github.com/vipshop/cache-dit/raw/main/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters)
+- [🚀CogVideoX](https://github.com/vipshop/cache-dit/raw/main/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters)
+- [🚀Mochi](https://github.com/vipshop/cache-dit/raw/main/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters)
 
 ## 👋Contribute
 <div id="contribute"></div>
cache_dit-0.1.2/assets/cache-dit.png
ADDED
Binary file
{cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/pyproject.toml
CHANGED
@@ -4,13 +4,14 @@ name = "cache_dit"
 dynamic = ["version", "dependencies", "optional-dependencies"]
 requires-python = ">=3.10"
 authors = [{name = "DefTruth, vipshop.com, etc."}]
-description = "
+description = "🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration Toolbox for Diffusion Transformers"
 maintainers = [{name="DefTruth, vipshop.com, etc"}]
+readme = "README.md"
 
 [project.urls]
 
-Repository = "https://github.com/vipshop/
-Homepage = "https://github.com/vipshop/
+Repository = "https://github.com/vipshop/cache-dit.git"
+Homepage = "https://github.com/vipshop/cache-dit.git"
 
 [build-system]
 
{cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/setup.py
CHANGED
@@ -44,7 +44,7 @@ def fetch_requirements():
 
 setup(
     name=PACKAGE_NAME,
-    description="
+    description="🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration Toolbox for Diffusion Transformers",
     author="vipshop.com",
     use_scm_version={
         "write_to": path.join("src", "cache_dit", "_version.py"),
{cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/_version.py
CHANGED
@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '0.1.
-__version_tuple__ = version_tuple = (0, 1,
+__version__ = version = '0.1.2'
+__version_tuple__ = version_tuple = (0, 1, 2)
{cache_dit-0.1.1.dev2 → cache_dit-0.1.2}/src/cache_dit/cache_factory/__init__.py
CHANGED
@@ -160,7 +160,9 @@ def apply_cache_on_pipe(pipe: DiffusionPipeline, *args, **kwargs):
     elif cache_type == CacheType.DBPrune:
         return apply_db_prune_on_pipe(pipe, *args, **kwargs)
     elif cache_type == CacheType.NONE:
-        logger.warning(
+        logger.warning(
+            f"Cache type is {cache_type}, no caching will be applied."
+        )
         return pipe
     else:
         raise ValueError(f"Unknown cache type: {cache_type}")