cache-dit 0.1.1.dev2__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cache-dit might be problematic. Click here for more details.

Files changed (77) hide show
  1. cache_dit-0.1.3/PKG-INFO +356 -0
  2. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/README.md +36 -35
  3. cache_dit-0.1.3/assets/cache-dit.png +0 -0
  4. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/bench/bench.py +2 -1
  5. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/pyproject.toml +4 -3
  6. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/setup.cfg +0 -3
  7. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/setup.py +1 -1
  8. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/_version.py +2 -2
  9. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/__init__.py +3 -1
  10. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/cogvideox.py +1 -1
  11. cache_dit-0.1.3/src/cache_dit.egg-info/PKG-INFO +356 -0
  12. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit.egg-info/SOURCES.txt +1 -0
  13. cache_dit-0.1.1.dev2/PKG-INFO +0 -31
  14. cache_dit-0.1.1.dev2/src/cache_dit.egg-info/PKG-INFO +0 -31
  15. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/.github/workflows/issue.yml +0 -0
  16. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/.gitignore +0 -0
  17. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/.pre-commit-config.yaml +0 -0
  18. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/CONTRIBUTE.md +0 -0
  19. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/LICENSE +0 -0
  20. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/MANIFEST.in +0 -0
  21. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBCACHE_F12B12S4_R0.2_S16.png +0 -0
  22. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBCACHE_F12B16S4_R0.08_S6.png +0 -0
  23. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBCACHE_F16B16S2_R0.2_S14.png +0 -0
  24. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBCACHE_F16B16S4_R0.2_S13.png +0 -0
  25. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBCACHE_F1B0S1_R0.08_S11.png +0 -0
  26. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBCACHE_F1B0S1_R0.2_S19.png +0 -0
  27. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBCACHE_F8B0S2_R0.12_S12.png +0 -0
  28. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBCACHE_F8B16S1_R0.2_S18.png +0 -0
  29. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBCACHE_F8B8S1_R0.08_S9.png +0 -0
  30. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBCACHE_F8B8S1_R0.12_S12.png +0 -0
  31. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBCACHE_F8B8S1_R0.15_S15.png +0 -0
  32. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBCache.png +0 -0
  33. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png +0 -0
  34. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png +0 -0
  35. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png +0 -0
  36. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png +0 -0
  37. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBPRUNE_F1B0_R0.07_P52.3_T12.53s.png +0 -0
  38. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBPRUNE_F1B0_R0.08_P52.4_T12.52s.png +0 -0
  39. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBPRUNE_F1B0_R0.09_P59.2_T10.81s.png +0 -0
  40. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBPRUNE_F1B0_R0.12_P59.5_T10.76s.png +0 -0
  41. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBPRUNE_F1B0_R0.12_P63.0_T9.90s.png +0 -0
  42. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBPRUNE_F1B0_R0.1_P62.8_T9.95s.png +0 -0
  43. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png +0 -0
  44. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/DBPRUNE_F1B0_R0.3_P63.1_T9.79s.png +0 -0
  45. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/assets/NONE_R0.08_S0.png +0 -0
  46. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/bench/.gitignore +0 -0
  47. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/docs/.gitignore +0 -0
  48. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/examples/.gitignore +0 -0
  49. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/examples/run_flux.py +0 -0
  50. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/pytest.ini +0 -0
  51. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/requirements.txt +0 -0
  52. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/__init__.py +0 -0
  53. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/dual_block_cache/__init__.py +0 -0
  54. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/dual_block_cache/cache_context.py +0 -0
  55. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/__init__.py +0 -0
  56. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/flux.py +0 -0
  57. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/mochi.py +0 -0
  58. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/dynamic_block_prune/__init__.py +0 -0
  59. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/__init__.py +0 -0
  60. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/cogvideox.py +0 -0
  61. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/flux.py +0 -0
  62. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/mochi.py +0 -0
  63. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/dynamic_block_prune/prune_context.py +0 -0
  64. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/first_block_cache/__init__.py +0 -0
  65. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/first_block_cache/cache_context.py +0 -0
  66. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/__init__.py +0 -0
  67. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/cogvideox.py +0 -0
  68. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/flux.py +0 -0
  69. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/mochi.py +0 -0
  70. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/wan.py +0 -0
  71. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/taylorseer.py +0 -0
  72. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/cache_factory/utils.py +0 -0
  73. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/logger.py +0 -0
  74. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit/primitives.py +0 -0
  75. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit.egg-info/dependency_links.txt +0 -0
  76. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit.egg-info/requires.txt +0 -0
  77. {cache_dit-0.1.1.dev2 → cache_dit-0.1.3}/src/cache_dit.egg-info/top_level.txt +0 -0
@@ -0,0 +1,356 @@
1
+ Metadata-Version: 2.4
2
+ Name: cache_dit
3
+ Version: 0.1.3
4
+ Summary: 🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration Toolbox for Diffusion Transformers
5
+ Author: DefTruth, vipshop.com, etc.
6
+ Maintainer: DefTruth, vipshop.com, etc
7
+ Project-URL: Repository, https://github.com/vipshop/cache-dit.git
8
+ Project-URL: Homepage, https://github.com/vipshop/cache-dit.git
9
+ Requires-Python: >=3.10
10
+ Description-Content-Type: text/markdown
11
+ License-File: LICENSE
12
+ Requires-Dist: packaging
13
+ Requires-Dist: torch
14
+ Requires-Dist: transformers
15
+ Requires-Dist: diffusers
16
+ Provides-Extra: all
17
+ Provides-Extra: dev
18
+ Requires-Dist: pre-commit; extra == "dev"
19
+ Requires-Dist: pytest<8.0.0,>=7.0.0; extra == "dev"
20
+ Requires-Dist: pytest-html; extra == "dev"
21
+ Requires-Dist: expecttest; extra == "dev"
22
+ Requires-Dist: hypothesis; extra == "dev"
23
+ Requires-Dist: transformers; extra == "dev"
24
+ Requires-Dist: diffusers; extra == "dev"
25
+ Requires-Dist: accelerate; extra == "dev"
26
+ Requires-Dist: peft; extra == "dev"
27
+ Requires-Dist: protobuf; extra == "dev"
28
+ Requires-Dist: sentencepiece; extra == "dev"
29
+ Requires-Dist: opencv-python-headless; extra == "dev"
30
+ Requires-Dist: ftfy; extra == "dev"
31
+ Dynamic: license-file
32
+ Dynamic: provides-extra
33
+ Dynamic: requires-dist
34
+ Dynamic: requires-python
35
+
36
+ <div align="center">
37
+ <p align="center">
38
+ <h3>🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration <br>Toolbox for Diffusion Transformers</h3>
39
+ </p>
40
+ <img src=https://github.com/vipshop/cache-dit/raw/main/assets/cache-dit.png >
41
+ <div align='center'>
42
+ <img src=https://img.shields.io/badge/Language-Python-brightgreen.svg >
43
+ <img src=https://img.shields.io/badge/PRs-welcome-9cf.svg >
44
+ <img src=https://img.shields.io/badge/PyPI-pass-brightgreen.svg >
45
+ <img src=https://static.pepy.tech/badge/cache-dit >
46
+ <img src=https://img.shields.io/badge/Python-3.10|3.11|3.12-9cf.svg >
47
+ <img src=https://img.shields.io/badge/Release-v0.1.3-brightgreen.svg >
48
+ </div>
49
+ <p align="center">
50
+ DeepCache is for UNet not DiT. Most DiT cache speedups are complex and not training-free. CacheDiT provides <br>a series of training-free, UNet-style cache accelerators for DiT: DBCache, DBPrune, FBCache, etc.
51
+ </p>
52
+ </div>
53
+
54
+ ## 🤗 Introduction
55
+
56
+ <div align="center">
57
+ <p align="center">
58
+ <h3>DBCache: Dual Block Caching for Diffusion Transformers</h3>
59
+ </p>
60
+ </div>
61
+
62
+ **DBCache**: **Dual Block Caching** for Diffusion Transformers. We have enhanced `FBCache` into a more general and customizable cache algorithm, namely `DBCache`, enabling it to achieve fully `UNet-style` cache acceleration for DiT models. Different configurations of compute blocks (**F8B12**, etc.) can be customized in DBCache. Moreover, it can be entirely **training**-**free**. DBCache can strike a perfect **balance** between performance and precision!
63
+
64
+ <div align="center">
65
+ <p align="center">
66
+ DBCache, <b> L20x1 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
67
+ </p>
68
+ </div>
69
+
70
+ |Baseline(L20x1)|F1B0 (0.08)|F1B0 (0.20)|F8B8 (0.15)|F12B12 (0.20)|F16B16 (0.20)|
71
+ |:---:|:---:|:---:|:---:|:---:|:---:|
72
+ |24.85s|15.59s|8.58s|15.41s|15.11s|17.74s|
73
+ |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.08_S11.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.2_S19.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F8B8S1_R0.15_S15.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F12B12S4_R0.2_S16.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F16B16S4_R0.2_S13.png width=105px>|
74
+ |**Baseline(L20x1)**|**F1B0 (0.08)**|**F8B8 (0.12)**|**F8B12 (0.20)**|**F8B16 (0.20)**|**F8B20 (0.20)**|
75
+ |27.85s|6.04s|5.88s|5.77s|6.01s|6.20s|
76
+ |<img src=https://github.com/user-attachments/assets/70ea57f4-d8f2-415b-8a96-d8315974a5e6 width=105px>|<img src=https://github.com/user-attachments/assets/fc0e1a67-19cc-44aa-bf50-04696e7978a0 width=105px> |<img src=https://github.com/user-attachments/assets/d1434896-628c-436b-95ad-43c085a8629e width=105px>|<img src=https://github.com/user-attachments/assets/aaa42cd2-57de-4c4e-8bfb-913018a8251d width=105px>|<img src=https://github.com/user-attachments/assets/dc0ba2a4-ef7c-436d-8a39-67055deab92f width=105px>|<img src=https://github.com/user-attachments/assets/aede466f-61ed-4256-8df0-fecf8020c5ca width=105px>|
77
+
78
+ <div align="center">
79
+ <p align="center">
80
+ DBCache, <b> L20x4 </b>, Steps: 20, case to show the texture recovery ability of DBCache
81
+ </p>
82
+ </div>
83
+
84
+ These case studies demonstrate that even with relatively high thresholds (such as 0.12, 0.15, 0.2, etc.) under the DBCache **F12B12** or **F8B16** configuration, the detailed texture of the kitten's fur, colored cloth, and the clarity of text can still be preserved. This suggests that users can leverage DBCache to effectively balance performance and precision in their workflows!
85
+
86
+ <div align="center">
87
+ <p align="center">
88
+ <h3>DBPrune: Dynamic Block Prune with Residual Caching</h3>
89
+ </p>
90
+ </div>
91
+
92
+ **DBPrune**: We have further implemented a new **Dynamic Block Prune** algorithm based on **Residual Caching** for Diffusion Transformers, referred to as DBPrune. DBPrune caches each block's hidden states and residuals, then **dynamically prunes** blocks during inference by computing the L1 distance between previous hidden states. When a block is pruned, its output is approximated using the cached residuals.
93
+
94
+ |Baseline(L20x1)|Pruned(24%)|Pruned(35%)|Pruned(38%)|Pruned(45%)|Pruned(60%)|
95
+ |:---:|:---:|:---:|:---:|:---:|:---:|
96
+ |24.85s|19.43s|16.82s|15.95s|14.24s|10.66s|
97
+ |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png width=105px>|
98
+
99
+ <div align="center">
100
+ <p align="center">
101
+ DBPrune, <b> L20x1 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
102
+ </p>
103
+ </div>
104
+
105
+ Moreover, both DBCache and DBPrune are **plug-and-play** solutions that work hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can easily tap into its **Context Parallelism** features for distributed inference.
106
+
107
+ <p align="center">
108
+ ♥️ Please consider leaving a ⭐️ Star to support us ~ ♥️
109
+ </p>
110
+
111
+ ## ©️Citations
112
+
113
+ ```BibTeX
114
+ @misc{CacheDiT@2025,
115
+ title={CacheDiT: A Training-free and Easy-to-use cache acceleration Toolbox for Diffusion Transformers},
116
+ url={https://github.com/vipshop/cache-dit.git},
117
+ note={Open-source software available at https://github.com/vipshop/cache-dit.git},
118
+ author={vipshop.com},
119
+ year={2025}
120
+ }
121
+ ```
122
+
123
+ ## 👋Reference
124
+
125
+ <div id="reference"></div>
126
+
127
+ The **CacheDiT** codebase was adapted from FBCache's implementation at the [ParaAttention](https://github.com/chengzeyi/ParaAttention/tree/main/src/para_attn/first_block_cache). We would like to express our sincere gratitude for this excellent work!
128
+
129
+ ## 📖Contents
130
+
131
+ <div id="contents"></div>
132
+
133
+ - [⚙️Installation](#️installation)
134
+ - [⚡️Dual Block Cache](#dbcache)
135
+ - [🎉First Block Cache](#fbcache)
136
+ - [⚡️Dynamic Block Prune](#dbprune)
137
+ - [🎉Context Parallelism](#context-parallelism)
138
+ - [⚡️Torch Compile](#compile)
139
+ - [🎉Supported Models](#supported)
140
+ - [👋Contribute](#contribute)
141
+ - [©️License](#license)
142
+
143
+
144
+ ## ⚙️Installation
145
+
146
+ <div id="installation"></div>
147
+
148
+ You can install the stable release of `cache-dit` from PyPI:
149
+
150
+ ```bash
151
+ pip3 install cache-dit
152
+ ```
153
+ Or you can install the latest develop version from GitHub:
154
+
155
+ ```bash
156
+ pip3 install git+https://github.com/vipshop/cache-dit.git
157
+ ```
158
+
159
+ ## ⚡️DBCache: Dual Block Cache
160
+
161
+ <div id="dbcache"></div>
162
+
163
+ ![](https://github.com/user-attachments/assets/c2a382b9-0ccd-46f4-aacc-87857b4a4de8)
164
+
165
+ **DBCache** provides configurable parameters for custom optimization, enabling a balanced trade-off between performance and precision:
166
+
167
+ - **Fn**: Specifies that DBCache uses the **first n** Transformer blocks to fit the information at time step t, enabling the calculation of a more stable L1 diff and delivering more accurate information to subsequent blocks.
168
+ - **Bn**: Further fuses approximate information in the **last n** Transformer blocks to enhance prediction accuracy. These blocks act as an auto-scaler for approximate hidden states that use residual cache.
169
+ - **warmup_steps**: (default: 0) DBCache does not apply the caching strategy when the number of running steps is less than or equal to this value, ensuring the model sufficiently learns basic features during warmup.
170
+ - **max_cached_steps**: (default: -1) DBCache disables the caching strategy when the running steps exceed this value to prevent precision degradation.
171
+ - **residual_diff_threshold**: The value of the residual diff threshold; a higher value leads to faster performance at the cost of lower precision.
172
+
173
+ For a good balance between performance and precision, DBCache is configured by default with **F8B8**, 8 warmup steps, and unlimited cached steps.
174
+
175
+ ```python
176
+ from diffusers import FluxPipeline
177
+ from cache_dit.cache_factory import apply_cache_on_pipe, CacheType
178
+
179
+ pipe = FluxPipeline.from_pretrained(
180
+ "black-forest-labs/FLUX.1-dev",
181
+ torch_dtype=torch.bfloat16,
182
+ ).to("cuda")
183
+
184
+ # Default options, F8B8, good balance between performance and precision
185
+ cache_options = CacheType.default_options(CacheType.DBCache)
186
+
187
+ # Custom options, F8B16, higher precision
188
+ cache_options = {
189
+ "cache_type": CacheType.DBCache,
190
+ "warmup_steps": 8,
191
+ "max_cached_steps": 8, # -1 means no limit
192
+ "Fn_compute_blocks": 8, # Fn, F8, etc.
193
+ "Bn_compute_blocks": 16, # Bn, B16, etc.
194
+ "residual_diff_threshold": 0.12,
195
+ }
196
+
197
+ apply_cache_on_pipe(pipe, **cache_options)
198
+ ```
199
+ Moreover, users configuring higher **Bn** values (e.g., **F8B16**) while aiming to maintain good performance can specify **Bn_compute_blocks_ids** to work with Bn. DBCache will only compute the specified blocks, with the remaining estimated using the previous step's residual cache.
200
+
201
+ ```python
202
+ # Custom options, F8B16, higher precision with good performance.
203
+ cache_options = {
204
+ # 0, 2, 4, ..., 14, 15, etc. [0,16)
205
+ "Bn_compute_blocks_ids": CacheType.range(0, 16, 2),
206
+ # If the L1 difference is below this threshold, skip Bn blocks
207
+ # not in `Bn_compute_blocks_ids`(1, 3,..., etc), Otherwise,
208
+ # compute these blocks.
209
+ "non_compute_blocks_diff_threshold": 0.08,
210
+ }
211
+ ```
212
+
213
+ ## 🎉FBCache: First Block Cache
214
+
215
+ <div id="fbcache"></div>
216
+
217
+ ![](https://github.com/user-attachments/assets/0fb66656-b711-457a-92a7-a830f134272d)
218
+
219
+ **DBCache** is a more general cache algorithm than **FBCache**. When Fn=1 and Bn=0, DBCache behaves identically to FBCache. Therefore, you can either use the original FBCache implementation directly or configure **DBCache** with **F1B0** settings to achieve the same functionality.
220
+
221
+ ```python
222
+ from diffusers import FluxPipeline
223
+ from cache_dit.cache_factory import apply_cache_on_pipe, CacheType
224
+
225
+ pipe = FluxPipeline.from_pretrained(
226
+ "black-forest-labs/FLUX.1-dev",
227
+ torch_dtype=torch.bfloat16,
228
+ ).to("cuda")
229
+
230
+ # Using FBCache directly
231
+ cache_options = CacheType.default_options(CacheType.FBCache)
232
+
233
+ # Or using DBCache with F1B0.
234
+ # Fn=1, Bn=0, means FB Cache, otherwise, Dual Block Cache
235
+ cache_options = {
236
+ "cache_type": CacheType.DBCache,
237
+ "warmup_steps": 8,
238
+ "max_cached_steps": 8, # -1 means no limit
239
+ "Fn_compute_blocks": 1, # Fn, F1, etc.
240
+ "Bn_compute_blocks": 0, # Bn, B0, etc.
241
+ "residual_diff_threshold": 0.12,
242
+ }
243
+
244
+ apply_cache_on_pipe(pipe, **cache_options)
245
+ ```
246
+
247
+ ## ⚡️DBPrune: Dynamic Block Prune
248
+
249
+ <div id="dbprune"></div>
250
+
251
+ ![](https://github.com/user-attachments/assets/932b6360-9533-4352-b176-4c4d84bd4695)
252
+
253
+ We have further implemented a new **Dynamic Block Prune** algorithm based on **Residual Caching** for Diffusion Transformers, which is referred to as **DBPrune**. DBPrune caches each block's hidden states and residuals, then dynamically prunes blocks during inference by computing the L1 distance between previous hidden states. When a block is pruned, its output is approximated using the cached residuals. DBPrune is currently in the experimental phase, and we kindly invite you to stay tuned for upcoming updates.
254
+
255
+ ```python
256
+ from diffusers import FluxPipeline
257
+ from cache_dit.cache_factory import apply_cache_on_pipe, CacheType
258
+
259
+ pipe = FluxPipeline.from_pretrained(
260
+ "black-forest-labs/FLUX.1-dev",
261
+ torch_dtype=torch.bfloat16,
262
+ ).to("cuda")
263
+
264
+ # Using DBPrune
265
+ cache_options = CacheType.default_options(CacheType.DBPrune)
266
+
267
+ apply_cache_on_pipe(pipe, **cache_options)
268
+ ```
269
+
270
+ <div align="center">
271
+ <p align="center">
272
+ DBPrune, <b> L20x1 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
273
+ </p>
274
+ </div>
275
+
276
+ |Baseline(L20x1)|Pruned(24%)|Pruned(35%)|Pruned(38%)|Pruned(45%)|Pruned(60%)|
277
+ |:---:|:---:|:---:|:---:|:---:|:---:|
278
+ |24.85s|19.43s|16.82s|15.95s|14.24s|10.66s|
279
+ |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png width=105px>|
280
+
281
+ ## 🎉Context Parallelism
282
+
283
+ <div id="context-parallelism"></div>
284
+
285
+ **CacheDiT** is a **plug-and-play** solution that works hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can **easily tap into** its **Context Parallelism** features for distributed inference. First, install `para-attn` from PyPI:
286
+
287
+ ```bash
288
+ pip3 install para-attn # or install `para-attn` from sources.
289
+ ```
290
+
291
+ Then, you can run **DBCache** with **Context Parallelism** on 4 GPUs:
292
+
293
+ ```python
294
+ from diffusers import FluxPipeline
295
+ from para_attn.context_parallel import init_context_parallel_mesh
296
+ from para_attn.context_parallel.diffusers_adapters import parallelize_pipe
297
+ from cache_dit.cache_factory import apply_cache_on_pipe, CacheType
298
+
299
+ pipe = FluxPipeline.from_pretrained(
300
+ "black-forest-labs/FLUX.1-dev",
301
+ torch_dtype=torch.bfloat16,
302
+ ).to("cuda")
303
+
304
+ # Context Parallel from ParaAttention
305
+ parallelize_pipe(
306
+ pipe, mesh=init_context_parallel_mesh(
307
+ pipe.device.type, max_ulysses_dim_size=4
308
+ )
309
+ )
310
+
311
+ # DBCache with F8B8 from this library
312
+ apply_cache_on_pipe(
313
+ pipe, **CacheType.default_options(CacheType.DBCache)
314
+ )
315
+ ```
316
+
317
+ ## ⚡️Torch Compile
318
+
319
+ <div id="compile"></div>
320
+
321
+ **CacheDiT** is designed to be compatible with `torch.compile`. For example:
322
+
323
+ ```python
324
+ apply_cache_on_pipe(
325
+ pipe, **CacheType.default_options(CacheType.DBCache)
326
+ )
327
+ # Compile the Transformer module
328
+ pipe.transformer = torch.compile(pipe.transformer)
329
+ ```
330
+ However, users intending to use **CacheDiT** for DiT with **dynamic input shapes** should consider increasing the **recompile** **limit** of `torch._dynamo` to achieve better performance.
331
+
332
+ ```python
333
+ torch._dynamo.config.recompile_limit = 96 # default is 8
334
+ torch._dynamo.config.accumulated_recompile_limit = 2048 # default is 256
335
+ ```
336
+ Otherwise, the recompile_limit error may be triggered, causing the module to fall back to eager mode.
337
+
338
+ ## 🎉Supported Models
339
+
340
+ <div id="supported"></div>
341
+
342
+ - [🚀FLUX.1](https://github.com/vipshop/cache-dit/raw/main/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters)
343
+ - [🚀CogVideoX](https://github.com/vipshop/cache-dit/raw/main/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters)
344
+ - [🚀Mochi](https://github.com/vipshop/cache-dit/raw/main/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters)
345
+
346
+ ## 👋Contribute
347
+ <div id="contribute"></div>
348
+
349
+ How to contribute? Star this repo or check [CONTRIBUTE.md](./CONTRIBUTE.md).
350
+
351
+ ## ©️License
352
+
353
+ <div id="license"></div>
354
+
355
+
356
+ We have followed the original License from [ParaAttention](https://github.com/chengzeyi/ParaAttention), please check [LICENSE](./LICENSE) for more details.
@@ -1,17 +1,18 @@
1
1
  <div align="center">
2
2
  <p align="center">
3
- <h3>⚡️DBCache: A Training-free UNet-style Cache Acceleration for <br>Diffusion Transformers</h2>
3
+ <h3>🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration <br>Toolbox for Diffusion Transformers</h3>
4
4
  </p>
5
- <img src=./assets/DBCache.png >
5
+ <img src=https://github.com/vipshop/cache-dit/raw/main/assets/cache-dit.png >
6
6
  <div align='center'>
7
- <img src=https://img.shields.io/badge/Language-Python-brightgreen.svg >
8
- <img src=https://img.shields.io/badge/PRs-welcome-9cf.svg >
9
- <img src=https://img.shields.io/badge/Build-pass-brightgreen.svg >
10
- <img src=https://img.shields.io/badge/Python-3.10|3.11|3.12-9cf.svg >
11
- <img src=https://img.shields.io/badge/Release-v0.1.0-brightgreen.svg >
7
+ <img src=https://img.shields.io/badge/Language-Python-brightgreen.svg >
8
+ <img src=https://img.shields.io/badge/PRs-welcome-9cf.svg >
9
+ <img src=https://img.shields.io/badge/PyPI-pass-brightgreen.svg >
10
+ <img src=https://static.pepy.tech/badge/cache-dit >
11
+ <img src=https://img.shields.io/badge/Python-3.10|3.11|3.12-9cf.svg >
12
+ <img src=https://img.shields.io/badge/Release-v0.1.3-brightgreen.svg >
12
13
  </div>
13
14
  <p align="center">
14
- DeepCache requires UNet’s U-shape, but DiT lacks it. Most DiT cache accelerators are complex and not training-free. DBCache builds on FBCache to create a training-free, UNet-style cache accelerator for DiT.
15
+ DeepCache is for UNet not DiT. Most DiT cache speedups are complex and not training-free. CacheDiT provides <br>a series of training-free, UNet-style cache accelerators for DiT: DBCache, DBPrune, FBCache, etc.
15
16
  </p>
16
17
  </div>
17
18
 
@@ -34,7 +35,7 @@
34
35
  |Baseline(L20x1)|F1B0 (0.08)|F1B0 (0.20)|F8B8 (0.15)|F12B12 (0.20)|F16B16 (0.20)|
35
36
  |:---:|:---:|:---:|:---:|:---:|:---:|
36
37
  |24.85s|15.59s|8.58s|15.41s|15.11s|17.74s|
37
- |<img src=./assets/NONE_R0.08_S0.png width=105px>|<img src=./assets/DBCACHE_F1B0S1_R0.08_S11.png width=105px> | <img src=./assets/DBCACHE_F1B0S1_R0.2_S19.png width=105px>|<img src=./assets/DBCACHE_F8B8S1_R0.15_S15.png width=105px>|<img src=./assets/DBCACHE_F12B12S4_R0.2_S16.png width=105px>|<img src=./assets/DBCACHE_F16B16S4_R0.2_S13.png width=105px>|
38
+ |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.08_S11.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.2_S19.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F8B8S1_R0.15_S15.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F12B12S4_R0.2_S16.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F16B16S4_R0.2_S13.png width=105px>|
38
39
  |**Baseline(L20x1)**|**F1B0 (0.08)**|**F8B8 (0.12)**|**F8B12 (0.20)**|**F8B16 (0.20)**|**F8B20 (0.20)**|
39
40
  |27.85s|6.04s|5.88s|5.77s|6.01s|6.20s|
40
41
  |<img src=https://github.com/user-attachments/assets/70ea57f4-d8f2-415b-8a96-d8315974a5e6 width=105px>|<img src=https://github.com/user-attachments/assets/fc0e1a67-19cc-44aa-bf50-04696e7978a0 width=105px> |<img src=https://github.com/user-attachments/assets/d1434896-628c-436b-95ad-43c085a8629e width=105px>|<img src=https://github.com/user-attachments/assets/aaa42cd2-57de-4c4e-8bfb-913018a8251d width=105px>|<img src=https://github.com/user-attachments/assets/dc0ba2a4-ef7c-436d-8a39-67055deab92f width=105px>|<img src=https://github.com/user-attachments/assets/aede466f-61ed-4256-8df0-fecf8020c5ca width=105px>|
@@ -53,12 +54,12 @@ These case studies demonstrate that even with relatively high thresholds (such a
53
54
  </p>
54
55
  </div>
55
56
 
56
- **DBPrune**: Dynamic Block Prune algorithm with Residual Caching. We have further implemented a new Dynamic Block Prune algorithm based on residual caching for Diffusion Transformers, referred to as DBPrune. DBPrune is currently in the experimental phase, and we kindly invite you to stay tuned for upcoming updates.
57
+ **DBPrune**: We have further implemented a new **Dynamic Block Prune** algorithm based on **Residual Caching** for Diffusion Transformers, referred to as DBPrune. DBPrune caches each block's hidden states and residuals, then **dynamically prunes** blocks during inference by computing the L1 distance between previous hidden states. When a block is pruned, its output is approximated using the cached residuals.
57
58
 
58
59
  |Baseline(L20x1)|Pruned(24%)|Pruned(35%)|Pruned(38%)|Pruned(45%)|Pruned(60%)|
59
60
  |:---:|:---:|:---:|:---:|:---:|:---:|
60
61
  |24.85s|19.43s|16.82s|15.95s|14.24s|10.66s|
61
- |<img src=./assets/NONE_R0.08_S0.png width=105px>|<img src=./assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png width=105px> | <img src=./assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png width=105px>|<img src=./assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png width=105px>|<img src=./assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png width=105px>|<img src=./assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png width=105px>|
62
+ |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png width=105px>|
62
63
 
63
64
  <div align="center">
64
65
  <p align="center">
@@ -68,13 +69,17 @@ These case studies demonstrate that even with relatively high thresholds (such a
68
69
 
69
70
  Moreover, both DBCache and DBPrune are **plug-and-play** solutions that work hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can easily tap into its **Context Parallelism** features for distributed inference.
70
71
 
72
+ <p align="center">
73
+ ♥️ Please consider leaving a ⭐️ Star to support us ~ ♥️
74
+ </p>
75
+
71
76
  ## ©️Citations
72
77
 
73
78
  ```BibTeX
74
- @misc{DBCache@2025,
75
- title={DBCache: A Training-free UNet-style Cache Acceleration for Diffusion Transformers},
76
- url={https://github.com/vipshop/DBCache.git},
77
- note={Open-source software available at https://github.com/vipshop/DBCache.git},
79
+ @misc{CacheDiT@2025,
80
+ title={CacheDiT: A Training-free and Easy-to-use cache acceleration Toolbox for Diffusion Transformers},
81
+ url={https://github.com/vipshop/cache-dit.git},
82
+ note={Open-source software available at https://github.com/vipshop/cache-dit.git},
78
83
  author={vipshop.com},
79
84
  year={2025}
80
85
  }
@@ -84,7 +89,7 @@ Moreover, both DBCache and DBPrune are **plug-and-play** solutions that works ha
84
89
 
85
90
  <div id="reference"></div>
86
91
 
87
- **DBCache** is built upon **FBCache**. The **DBCache** codebase was adapted from FBCache's implementation at the [ParaAttention](https://github.com/chengzeyi/ParaAttention/tree/main/src/para_attn/first_block_cache). We would like to express our sincere gratitude for this excellent work!
92
+ The **CacheDiT** codebase was adapted from FBCache's implementation at the [ParaAttention](https://github.com/chengzeyi/ParaAttention/tree/main/src/para_attn/first_block_cache). We would like to express our sincere gratitude for this excellent work!
88
93
 
89
94
  ## 📖Contents
90
95
 
@@ -105,20 +110,15 @@ Moreover, both DBCache and DBPrune are **plug-and-play** solutions that works ha
105
110
 
106
111
  <div id="installation"></div>
107
112
 
108
- You can install `DBCache` from GitHub:
113
+ You can install the stable release of `cache-dit` from PyPI:
109
114
 
110
115
  ```bash
111
- pip3 install git+https://github.com/vipshop/DBCache.git
116
+ pip3 install cache-dit
112
117
  ```
113
-
114
- or just install it from sources:
118
+ Or you can install the latest development version from GitHub:
115
119
 
116
120
  ```bash
117
- git clone https://github.com/vipshop/DBCache.git && cd DBCache
118
- pip3 install 'torch==2.7.0' 'setuptools>=64' 'setuptools_scm>=8'
119
-
120
- pip3 install -e '.[dev]' --no-build-isolation # build editable package
121
- python3 -m build && pip3 install ./dist/cache_dit-*.whl # or build whl first and then install it.
121
+ pip3 install git+https://github.com/vipshop/cache-dit.git
122
122
  ```
123
123
 
124
124
  ## ⚡️DBCache: Dual Block Cache
@@ -168,8 +168,9 @@ Moreover, users configuring higher **Bn** values (e.g., **F8B16**) while aiming
168
168
  cache_options = {
169
169
  # 0, 2, 4, ..., 14, 15, etc. [0,16)
170
170
  "Bn_compute_blocks_ids": CacheType.range(0, 16, 2),
171
- # Skip Bn blocks (1, 3, 5 ,..., etc.) only if the L1 diff
172
- # lower than this value, otherwise, compute it.
171
+ # If the L1 difference is below this threshold, skip Bn blocks
172
+ # not in `Bn_compute_blocks_ids` (1, 3, ..., etc.); otherwise,
173
+ # compute these blocks.
173
174
  "non_compute_blocks_diff_threshold": 0.08,
174
175
  }
175
176
  ```
@@ -214,7 +215,7 @@ apply_cache_on_pipe(pipe, **cache_options)
214
215
 
215
216
  ![](https://github.com/user-attachments/assets/932b6360-9533-4352-b176-4c4d84bd4695)
216
217
 
217
- We have further implemented a new **Dynamic Block Prune** algorithm based on **Residual Caching** for Diffusion Transformers, which is referred to as **DBPrune**. DBPrune is currently in the experimental phase, and we kindly invite you to stay tuned for upcoming updates.
218
+ We have further implemented a new **Dynamic Block Prune** algorithm based on **Residual Caching** for Diffusion Transformers, which is referred to as **DBPrune**. DBPrune caches each block's hidden states and residuals, then dynamically prunes blocks during inference by computing the L1 distance between previous hidden states. When a block is pruned, its output is approximated using the cached residuals. DBPrune is currently in the experimental phase, and we kindly invite you to stay tuned for upcoming updates.
218
219
 
219
220
  ```python
220
221
  from diffusers import FluxPipeline
@@ -240,13 +241,13 @@ apply_cache_on_pipe(pipe, **cache_options)
240
241
  |Baseline(L20x1)|Pruned(24%)|Pruned(35%)|Pruned(38%)|Pruned(45%)|Pruned(60%)|
241
242
  |:---:|:---:|:---:|:---:|:---:|:---:|
242
243
  |24.85s|19.43s|16.82s|15.95s|14.24s|10.66s|
243
- |<img src=./assets/NONE_R0.08_S0.png width=105px>|<img src=./assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png width=105px> | <img src=./assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png width=105px>|<img src=./assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png width=105px>|<img src=./assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png width=105px>|<img src=./assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png width=105px>|
244
+ |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png width=105px>|
244
245
 
245
246
  ## 🎉Context Parallelism
246
247
 
247
248
  <div id="context-parallelism"></div>
248
249
 
249
- DBCache and DBPrune are **plug-and-play** solutions that works hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can **easily tap into** its **Context Parallelism** features for distributed inference. Firstly, install `para-attn` from PyPI:
250
+ **CacheDiT** is a **plug-and-play** solution that works hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can **easily tap into** its **Context Parallelism** features for distributed inference. Firstly, install `para-attn` from PyPI:
250
251
 
251
252
  ```bash
252
253
  pip3 install para-attn # or install `para-attn` from sources.
@@ -282,7 +283,7 @@ apply_cache_on_pipe(
282
283
 
283
284
  <div id="compile"></div>
284
285
 
285
- **DBCache** and **DBPrune** are designed to work compatibly with `torch.compile`. For example:
286
+ **CacheDiT** is designed to work compatibly with `torch.compile`. For example:
286
287
 
287
288
  ```python
288
289
  apply_cache_on_pipe(
@@ -291,7 +292,7 @@ apply_cache_on_pipe(
291
292
  # Compile the Transformer module
292
293
  pipe.transformer = torch.compile(pipe.transformer)
293
294
  ```
294
- However, users intending to use DBCache and DBPrune for DiT with **dynamic input shapes** should consider increasing the **recompile** **limit** of `torch._dynamo` to achieve better performance.
295
+ However, users intending to use **CacheDiT** for DiT with **dynamic input shapes** should consider increasing the **recompile** **limit** of `torch._dynamo` to achieve better performance.
295
296
 
296
297
  ```python
297
298
  torch._dynamo.config.recompile_limit = 96 # default is 8
@@ -303,9 +304,9 @@ Otherwise, the recompile_limit error may be triggered, causing the module to fal
303
304
 
304
305
  <div id="supported"></div>
305
306
 
306
- - [🚀FLUX.1](./src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters)
307
- - [🚀CogVideoX](./src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters)
308
- - [🚀Mochi](./src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters)
307
+ - [🚀FLUX.1](https://github.com/vipshop/cache-dit/raw/main/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters)
308
+ - [🚀CogVideoX](https://github.com/vipshop/cache-dit/raw/main/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters)
309
+ - [🚀Mochi](https://github.com/vipshop/cache-dit/raw/main/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters)
309
310
 
310
311
  ## 👋Contribute
311
312
  <div id="contribute"></div>
Binary file
@@ -25,6 +25,7 @@ def get_args() -> argparse.ArgumentParser:
25
25
  parser.add_argument("--Bn-steps", "--BnS", type=int, default=1)
26
26
  parser.add_argument("--warmup-steps", type=int, default=0)
27
27
  parser.add_argument("--max-cached-steps", type=int, default=-1)
28
+ parser.add_argument("--max-pruned-steps", type=int, default=-1)
28
29
  parser.add_argument("--seed", type=int, default=0)
29
30
  parser.add_argument(
30
31
  "--compile",
@@ -79,7 +80,7 @@ def get_cache_options(cache_type: CacheType, args: argparse.Namespace):
79
80
  "Fn_compute_blocks": args.Fn_compute_blocks,
80
81
  "Bn_compute_blocks": args.Bn_compute_blocks,
81
82
  "warmup_steps": args.warmup_steps,
82
- "max_pruned_steps": args.max_cached_steps, # -1 means no limit
83
+ "max_pruned_steps": args.max_pruned_steps, # -1 means no limit
83
84
  # relative token diff threshold, default is 0.0
84
85
  "important_condition_threshold": 0.00,
85
86
  "enable_dynamic_prune_threshold": (
@@ -4,13 +4,14 @@ name = "cache_dit"
4
4
  dynamic = ["version", "dependencies", "optional-dependencies"]
5
5
  requires-python = ">=3.10"
6
6
  authors = [{name = "DefTruth, vipshop.com, etc."}]
7
- description = "⚡️DBCache: A Training-free UNet-style Cache Acceleration for Diffusion Transformers"
7
+ description = "🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration Toolbox for Diffusion Transformers"
8
8
  maintainers = [{name="DefTruth, vipshop.com, etc"}]
9
+ readme = "README.md"
9
10
 
10
11
  [project.urls]
11
12
 
12
- Repository = "https://github.com/vipshop/DBCache.git"
13
- Homepage = "https://github.com/vipshop/DBCache.git"
13
+ Repository = "https://github.com/vipshop/cache-dit.git"
14
+ Homepage = "https://github.com/vipshop/cache-dit.git"
14
15
 
15
16
  [build-system]
16
17
 
@@ -1,9 +1,6 @@
1
1
  [bdist_wheel]
2
2
  universal = 0
3
3
 
4
- [metadata]
5
- license-files = ["LICENSE"]
6
-
7
4
  [pep8]
8
5
  max-line-length = 80
9
6
 
@@ -44,7 +44,7 @@ def fetch_requirements():
44
44
 
45
45
  setup(
46
46
  name=PACKAGE_NAME,
47
- description="⚡️DBCache: A Training-free UNet-style Cache Acceleration for Diffusion Transformers",
47
+ description="🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration Toolbox for Diffusion Transformers",
48
48
  author="vipshop.com",
49
49
  use_scm_version={
50
50
  "write_to": path.join("src", "cache_dit", "_version.py"),
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '0.1.1.dev2'
21
- __version_tuple__ = version_tuple = (0, 1, 1, 'dev2')
20
+ __version__ = version = '0.1.3'
21
+ __version_tuple__ = version_tuple = (0, 1, 3)
@@ -160,7 +160,9 @@ def apply_cache_on_pipe(pipe: DiffusionPipeline, *args, **kwargs):
160
160
  elif cache_type == CacheType.DBPrune:
161
161
  return apply_db_prune_on_pipe(pipe, *args, **kwargs)
162
162
  elif cache_type == CacheType.NONE:
163
- logger.warning("Cache type is NONE, no caching will be applied.")
163
+ logger.warning(
164
+ f"Cache type is {cache_type}, no caching will be applied."
165
+ )
164
166
  return pipe
165
167
  else:
166
168
  raise ValueError(f"Unknown cache type: {cache_type}")
@@ -49,7 +49,7 @@ def apply_db_cache_on_transformer(
49
49
  return transformer
50
50
 
51
51
 
52
- def apply_cache_on_pipe(
52
+ def apply_db_cache_on_pipe(
53
53
  pipe: DiffusionPipeline,
54
54
  *,
55
55
  shallow_patch: bool = False,