cache-dit 0.2.1__tar.gz → 0.2.2__tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Files changed (132)
  1. {cache_dit-0.2.1 → cache_dit-0.2.2}/PKG-INFO +72 -39
  2. {cache_dit-0.2.1 → cache_dit-0.2.2}/README.md +71 -38
  3. cache_dit-0.2.2/assets/TEXTURE_DBCACHE_F1B0_R0.08.png +0 -0
  4. cache_dit-0.2.2/assets/TEXTURE_DBCACHE_F8B12_R0.12.png +0 -0
  5. cache_dit-0.2.2/assets/TEXTURE_DBCACHE_F8B16_R0.2.png +0 -0
  6. cache_dit-0.2.2/assets/TEXTURE_DBCACHE_F8B20_R0.2.png +0 -0
  7. cache_dit-0.2.2/assets/TEXTURE_DBCACHE_F8B8_R0.12.png +0 -0
  8. cache_dit-0.2.2/assets/TEXTURE_NONE_R0.08.png +0 -0
  9. cache_dit-0.2.2/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.12_S14_T12.85s.png +0 -0
  10. cache_dit-0.2.2/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.15_S17_T10.27s.png +0 -0
  11. cache_dit-0.2.2/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.12_S14_T12.86s.png +0 -0
  12. cache_dit-0.2.2/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T10.28s.png +0 -0
  13. cache_dit-0.2.2/assets/U0_C1_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T8.48s.png +0 -0
  14. cache_dit-0.2.2/assets/cache-dit-v1.png +0 -0
  15. cache_dit-0.2.2/assets/dbcache-fnbn-v1.png +0 -0
  16. cache_dit-0.2.2/assets/dbcache-v1.png +0 -0
  17. cache_dit-0.2.2/assets/dbprune-v1.png +0 -0
  18. cache_dit-0.2.2/assets/fbcache-v1.png +0 -0
  19. {cache_dit-0.2.1 → cache_dit-0.2.2}/bench/bench.py +47 -8
  20. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/_version.py +2 -2
  21. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dual_block_cache/cache_context.py +282 -46
  22. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/__init__.py +0 -2
  23. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/cogvideox.py +0 -2
  24. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/flux.py +0 -2
  25. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/hunyuan_video.py +0 -1
  26. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/mochi.py +0 -1
  27. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/wan.py +0 -2
  28. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/__init__.py +0 -2
  29. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/cogvideox.py +0 -2
  30. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/flux.py +0 -2
  31. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/hunyuan_video.py +0 -1
  32. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/mochi.py +0 -2
  33. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/wan.py +0 -2
  34. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/first_block_cache/cache_context.py +3 -0
  35. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/taylorseer.py +30 -0
  36. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit.egg-info/PKG-INFO +72 -39
  37. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit.egg-info/SOURCES.txt +23 -1
  38. cache_dit-0.2.2/tests/.gitignore +167 -0
  39. cache_dit-0.2.2/tests/README.md +9 -0
  40. cache_dit-0.2.2/tests/taylorseer_approximation_order_2.png +0 -0
  41. cache_dit-0.2.2/tests/taylorseer_approximation_order_4.png +0 -0
  42. cache_dit-0.2.2/tests/taylorseer_approximation_test.png +0 -0
  43. cache_dit-0.2.2/tests/test_taylorseer.py +81 -0
  44. {cache_dit-0.2.1 → cache_dit-0.2.2}/.github/workflows/issue.yml +0 -0
  45. {cache_dit-0.2.1 → cache_dit-0.2.2}/.gitignore +0 -0
  46. {cache_dit-0.2.1 → cache_dit-0.2.2}/.pre-commit-config.yaml +0 -0
  47. {cache_dit-0.2.1 → cache_dit-0.2.2}/CONTRIBUTE.md +0 -0
  48. {cache_dit-0.2.1 → cache_dit-0.2.2}/LICENSE +0 -0
  49. {cache_dit-0.2.1 → cache_dit-0.2.2}/MANIFEST.in +0 -0
  50. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBCACHE_F12B12S4_R0.2_S16.png +0 -0
  51. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBCACHE_F12B16S4_R0.08_S6.png +0 -0
  52. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBCACHE_F16B16S2_R0.2_S14.png +0 -0
  53. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBCACHE_F16B16S4_R0.2_S13.png +0 -0
  54. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBCACHE_F1B0S1_R0.08_S11.png +0 -0
  55. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBCACHE_F1B0S1_R0.2_S19.png +0 -0
  56. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBCACHE_F8B0S2_R0.12_S12.png +0 -0
  57. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBCACHE_F8B16S1_R0.2_S18.png +0 -0
  58. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBCACHE_F8B8S1_R0.08_S9.png +0 -0
  59. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBCACHE_F8B8S1_R0.12_S12.png +0 -0
  60. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBCACHE_F8B8S1_R0.15_S15.png +0 -0
  61. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBCache.png +0 -0
  62. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png +0 -0
  63. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png +0 -0
  64. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png +0 -0
  65. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png +0 -0
  66. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.07_P52.3_T12.53s.png +0 -0
  67. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.08_P52.4_T12.52s.png +0 -0
  68. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.09_P59.2_T10.81s.png +0 -0
  69. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.12_P59.5_T10.76s.png +0 -0
  70. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.12_P63.0_T9.90s.png +0 -0
  71. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.1_P62.8_T9.95s.png +0 -0
  72. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png +0 -0
  73. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/DBPRUNE_F1B0_R0.3_P63.1_T9.79s.png +0 -0
  74. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/NONE_R0.08_S0.png +0 -0
  75. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U0_C1_DBPRUNE_F1B0_R0.03_P24.0_T16.25s.png +0 -0
  76. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U0_C1_DBPRUNE_F1B0_R0.045_P38.2_T13.41s.png +0 -0
  77. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U0_C1_DBPRUNE_F1B0_R0.04_P34.6_T14.12s.png +0 -0
  78. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U0_C1_DBPRUNE_F1B0_R0.055_P45.1_T12.00s.png +0 -0
  79. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U0_C1_DBPRUNE_F1B0_R0.05_P41.6_T12.70s.png +0 -0
  80. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U0_C1_DBPRUNE_F1B0_R0.2_P59.5_T8.86s.png +0 -0
  81. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U0_C1_DBPRUNE_F8B8_R0.08_P23.1_T16.14s.png +0 -0
  82. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U0_C1_NONE_R0.08_S0_T20.43s.png +0 -0
  83. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.03_P27.3_T6.62s.png +0 -0
  84. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.03_P27.3_T6.63s.png +0 -0
  85. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.045_P38.2_T5.81s.png +0 -0
  86. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.045_P38.2_T5.82s.png +0 -0
  87. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.04_P34.6_T6.06s.png +0 -0
  88. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.04_P34.6_T6.07s.png +0 -0
  89. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.04_P34.6_T6.08s.png +0 -0
  90. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.055_P45.1_T5.27s.png +0 -0
  91. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.055_P45.1_T5.28s.png +0 -0
  92. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.2_P59.5_T3.95s.png +0 -0
  93. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U4_C1_DBPRUNE_F1B0_R0.2_P59.5_T3.96s.png +0 -0
  94. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U4_C1_NONE_R0.08_S0_T7.78s.png +0 -0
  95. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/U4_C1_NONE_R0.08_S0_T7.79s.png +0 -0
  96. {cache_dit-0.2.1 → cache_dit-0.2.2}/assets/cache-dit.png +0 -0
  97. {cache_dit-0.2.1 → cache_dit-0.2.2}/bench/.gitignore +0 -0
  98. {cache_dit-0.2.1 → cache_dit-0.2.2}/docs/.gitignore +0 -0
  99. {cache_dit-0.2.1 → cache_dit-0.2.2}/examples/.gitignore +0 -0
  100. {cache_dit-0.2.1 → cache_dit-0.2.2}/examples/README.md +0 -0
  101. {cache_dit-0.2.1 → cache_dit-0.2.2}/examples/data/cup.png +0 -0
  102. {cache_dit-0.2.1 → cache_dit-0.2.2}/examples/data/cup_mask.png +0 -0
  103. {cache_dit-0.2.1 → cache_dit-0.2.2}/examples/requirements.txt +0 -0
  104. {cache_dit-0.2.1 → cache_dit-0.2.2}/examples/run_cogvideox.py +0 -0
  105. {cache_dit-0.2.1 → cache_dit-0.2.2}/examples/run_flux.py +0 -0
  106. {cache_dit-0.2.1 → cache_dit-0.2.2}/examples/run_flux_fill.py +0 -0
  107. {cache_dit-0.2.1 → cache_dit-0.2.2}/examples/run_hunyuan_video.py +0 -0
  108. {cache_dit-0.2.1 → cache_dit-0.2.2}/examples/run_mochi.py +0 -0
  109. {cache_dit-0.2.1 → cache_dit-0.2.2}/examples/run_wan.py +0 -0
  110. {cache_dit-0.2.1 → cache_dit-0.2.2}/pyproject.toml +0 -0
  111. {cache_dit-0.2.1 → cache_dit-0.2.2}/pytest.ini +0 -0
  112. {cache_dit-0.2.1 → cache_dit-0.2.2}/requirements.txt +0 -0
  113. {cache_dit-0.2.1 → cache_dit-0.2.2}/setup.cfg +0 -0
  114. {cache_dit-0.2.1 → cache_dit-0.2.2}/setup.py +0 -0
  115. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/__init__.py +0 -0
  116. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/__init__.py +0 -0
  117. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dual_block_cache/__init__.py +0 -0
  118. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dynamic_block_prune/__init__.py +0 -0
  119. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/dynamic_block_prune/prune_context.py +0 -0
  120. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/first_block_cache/__init__.py +0 -0
  121. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/__init__.py +0 -0
  122. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/cogvideox.py +0 -0
  123. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/flux.py +0 -0
  124. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/hunyuan_video.py +0 -0
  125. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/mochi.py +0 -0
  126. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/wan.py +0 -0
  127. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/cache_factory/utils.py +0 -0
  128. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/logger.py +0 -0
  129. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit/primitives.py +0 -0
  130. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit.egg-info/dependency_links.txt +0 -0
  131. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit.egg-info/requires.txt +0 -0
  132. {cache_dit-0.2.1 → cache_dit-0.2.2}/src/cache_dit.egg-info/top_level.txt +0 -0
{cache_dit-0.2.1 → cache_dit-0.2.2}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: cache_dit
- Version: 0.2.1
+ Version: 0.2.2
  Summary: 🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration Toolbox for Diffusion Transformers
  Author: DefTruth, vipshop.com, etc.
  Maintainer: DefTruth, vipshop.com, etc
@@ -37,31 +37,31 @@ Dynamic: requires-python
  <p align="center">
  <h2>🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration <br>Toolbox for Diffusion Transformers</h2>
  </p>
- <img src=https://github.com/vipshop/cache-dit/raw/main/assets/cache-dit.png >
+ <img src=https://github.com/vipshop/cache-dit/raw/main/assets/cache-dit-v1.png >
  <div align='center'>
  <img src=https://img.shields.io/badge/Language-Python-brightgreen.svg >
  <img src=https://img.shields.io/badge/PRs-welcome-9cf.svg >
  <img src=https://img.shields.io/badge/PyPI-pass-brightgreen.svg >
  <img src=https://static.pepy.tech/badge/cache-dit >
  <img src=https://img.shields.io/badge/Python-3.10|3.11|3.12-9cf.svg >
- <img src=https://img.shields.io/badge/Release-v0.2.1-brightgreen.svg >
+ <img src=https://img.shields.io/badge/Release-v0.2.2-brightgreen.svg >
  </div>
  <p align="center">
- DeepCache is for UNet not DiT. Most DiT cache speedups are complex and not training-free. CacheDiT <br>offers a set of training-free cache accelerators for DiT: 🔥DBCache, DBPrune, FBCache, etc🔥
+ DeepCache is for UNet, not DiT. Most DiT cache speedups are complex and not training-free. CacheDiT offers <br>a set of training-free cache accelerators for DiT: <b>🔥<a href="#dbcache">DBCache</a>, <a href="#dbprune">DBPrune</a>, <a href="#taylorseer">TaylorSeer</a>, <a href="#fbcache">FBCache</a></b>, etc🔥
  </p>
  </div>

- ## 👋 Highlight
-
- <div id="reference"></div>
-
- The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi/ParaAttention/tree/main/src/para_attn/first_block_cache). Special thanks to their excellent work! The **FBCache** support for Mochi, FLUX.1, CogVideoX, Wan2.1, and HunyuanVideo is directly adapted from the original [FBCache](https://github.com/chengzeyi/ParaAttention/tree/main/src/para_attn/first_block_cache).
+ <div align="center">
+ <p align="center">
+ <b>♥️ Please consider leaving a ⭐️ Star to support us ~ ♥️</b>
+ </p>
+ </div>

  ## 🤗 Introduction

  <div align="center">
  <p align="center">
- <h3>🔥 DBCache: Dual Block Caching for Diffusion Transformers</h3>
+ <h3>🔥DBCache: Dual Block Caching for Diffusion Transformers</h3>
  </p>
  </div>

@@ -77,9 +77,9 @@ The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi
  |:---:|:---:|:---:|:---:|:---:|:---:|
  |24.85s|15.59s|8.58s|15.41s|15.11s|17.74s|
  |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.08_S11.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.2_S19.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F8B8S1_R0.15_S15.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F12B12S4_R0.2_S16.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F16B16S4_R0.2_S13.png width=105px>|
- |**Baseline(L20x1)**|**F1B0 (0.08)**|**F8B8 (0.12)**|**F8B12 (0.20)**|**F8B16 (0.20)**|**F8B20 (0.20)**|
+ |**Baseline(L20x1)**|**F1B0 (0.08)**|**F8B8 (0.12)**|**F8B12 (0.12)**|**F8B16 (0.20)**|**F8B20 (0.20)**|
  |27.85s|6.04s|5.88s|5.77s|6.01s|6.20s|
- |<img src=https://github.com/user-attachments/assets/70ea57f4-d8f2-415b-8a96-d8315974a5e6 width=105px>|<img src=https://github.com/user-attachments/assets/fc0e1a67-19cc-44aa-bf50-04696e7978a0 width=105px> |<img src=https://github.com/user-attachments/assets/d1434896-628c-436b-95ad-43c085a8629e width=105px>|<img src=https://github.com/user-attachments/assets/aaa42cd2-57de-4c4e-8bfb-913018a8251d width=105px>|<img src=https://github.com/user-attachments/assets/dc0ba2a4-ef7c-436d-8a39-67055deab92f width=105px>|<img src=https://github.com/user-attachments/assets/aede466f-61ed-4256-8df0-fecf8020c5ca width=105px>|
+ |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_NONE_R0.08.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F1B0_R0.08.png width=105px> |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B8_R0.12.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B12_R0.12.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B16_R0.2.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B20_R0.2.png width=105px>|

  <div align="center">
  <p align="center">
@@ -91,7 +91,7 @@ These case studies demonstrate that even with relatively high thresholds (such a

  <div align="center">
  <p align="center">
- <h3>🔥 DBPrune: Dynamic Block Prune with Residual Caching</h3>
+ <h3>🔥DBPrune: Dynamic Block Prune with Residual Caching</h3>
  </p>
  </div>

@@ -110,11 +110,11 @@ These case studies demonstrate that even with relatively high thresholds (such a

  <div align="center">
  <p align="center">
- <h3>🔥 Context Parallelism and Torch Compile</h3>
+ <h3>🔥Context Parallelism and Torch Compile</h3>
  </p>
  </div>

- Moreover, **CacheDiT** are **plug-and-play** solutions that works hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can easily tap into its **Context Parallelism** features for distributed inference. By the way, CacheDiT is designed to work compatibly with **torch.compile.** You can easily use CacheDiT with torch.compile to further achieve a better performance.
+ Moreover, **CacheDiT** is a **plug-and-play** solution that works hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can easily tap into its **Context Parallelism** features for distributed inference. CacheDiT is also designed to be compatible with **torch.compile**; you can combine the two for further performance gains.

  <div align="center">
  <p align="center">
@@ -128,12 +128,6 @@ Moreover, **CacheDiT** are **plug-and-play** solutions that works hand-in-hand w
  |+L20x4:7.75s|6.62s|6.03s|5.81s|5.24s|3.93s|
  |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_NONE_R0.08_S0_T20.43s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.03_P24.0_T16.25s.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.04_P34.6_T14.12s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.045_P38.2_T13.41s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.055_P45.1_T12.00s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.2_P59.5_T8.86s.png width=105px>|

- <div align="center">
- <p align="center">
- <b>♥️ Please consider to leave a ⭐️ Star to support us ~ ♥️</b>
- </p>
- </div>
-
  ## ©️Citations

  ```BibTeX
@@ -146,6 +140,12 @@ Moreover, **CacheDiT** are **plug-and-play** solutions that works hand-in-hand w
  }
  ```

+ ## 👋Reference
+
+ <div id="reference"></div>
+
+ The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi/ParaAttention/tree/main/src/para_attn/first_block_cache). Special thanks to their excellent work!
+
  ## 📖Contents

  <div id="contents"></div>
@@ -153,6 +153,7 @@ Moreover, **CacheDiT** are **plug-and-play** solutions that works hand-in-hand w
  - [⚙️Installation](#️installation)
  - [🔥Supported Models](#supported)
  - [⚡️Dual Block Cache](#dbcache)
+ - [🔥Hybrid TaylorSeer](#taylorseer)
  - [🎉First Block Cache](#fbcache)
  - [⚡️Dynamic Block Prune](#dbprune)
  - [🎉Context Parallelism](#context-parallelism)
@@ -187,28 +188,19 @@ pip3 install git+https://github.com/vipshop/cache-dit.git
  - [🚀HunyuanVideo](https://github.com/vipshop/cache-dit/raw/main/examples)


- <!--
- <p align="center">
- <h4> 🔥Supported Models🔥</h4>
- <a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀FLUX.1</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
- <a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀Mochi</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
- <a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀CogVideoX</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
- <a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀CogVideoX1.5</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
- <a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀Wan2.1</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
- <a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀HunyuanVideo</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
- </p>
- -->
-
  ## ⚡️DBCache: Dual Block Cache

  <div id="dbcache"></div>

- ![](https://github.com/user-attachments/assets/c2a382b9-0ccd-46f4-aacc-87857b4a4de8)
+ ![](https://github.com/vipshop/cache-dit/raw/main/assets/dbcache-v1.png)

  **DBCache** provides configurable parameters for custom optimization, enabling a balanced trade-off between performance and precision:

  - **Fn**: Specifies that DBCache uses the **first n** Transformer blocks to fit the information at time step t, enabling the calculation of a more stable L1 diff and delivering more accurate information to subsequent blocks.
  - **Bn**: Further fuses approximate information in the **last n** Transformer blocks to enhance prediction accuracy. These blocks act as an auto-scaler for approximate hidden states that use residual cache.
+
+ ![](https://github.com/vipshop/cache-dit/raw/main/assets/dbcache-fnbn-v1.png)
+
  - **warmup_steps**: (default: 0) DBCache does not apply the caching strategy when the number of running steps is less than or equal to this value, ensuring the model sufficiently learns basic features during warmup.
  - **max_cached_steps**: (default: -1) DBCache disables the caching strategy when the previous cached steps exceed this value to prevent precision degradation.
  - **residual_diff_threshold**: The residual diff threshold; a higher value leads to faster performance at the cost of lower precision (see the configuration sketch below).
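Together, these options form the `cache_options` dict consumed by `apply_cache_on_pipe`. Below is a minimal F8B8 sketch; the option keys are exactly the ones listed above, while the pipeline class, model id, and step count are our illustrative assumptions, not prescribed by this diff:

```python
import torch
from diffusers import FluxPipeline
from cache_dit.cache_factory import apply_cache_on_pipe, CacheType

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

# DBCache F8B8: fit the L1 diff on the first 8 blocks,
# refine approximate hidden states with the last 8.
cache_options = {
    "cache_type": CacheType.DBCache,
    "warmup_steps": 0,                # no warmup (default)
    "max_cached_steps": -1,           # -1 means no limit
    "Fn_compute_blocks": 8,           # Fn
    "Bn_compute_blocks": 8,           # Bn
    "residual_diff_threshold": 0.12,  # higher -> faster, lower precision
}
apply_cache_on_pipe(pipe, **cache_options)

image = pipe(
    "A cat holding a sign that says hello world", num_inference_steps=28
).images[0]
image.save("dbcache_f8b8.png")
```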
@@ -264,11 +256,50 @@ cache_options = {
  |24.85s|15.59s|8.58s|15.41s|15.11s|17.74s|
  |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.08_S11.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.2_S19.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F8B8S1_R0.15_S15.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F12B12S4_R0.2_S16.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F16B16S4_R0.2_S13.png width=105px>|

+ ## 🔥Hybrid TaylorSeer
+
+ <div id="taylorseer"></div>
+
+ We now support the [TaylorSeers: From Reusing to Forecasting: Accelerating Diffusion Models with TaylorSeers](https://arxiv.org/pdf/2503.06923) algorithm, which further improves the precision of DBCache when the number of cached steps is large, namely **Hybrid TaylorSeer + DBCache**. At timesteps with significant intervals, the feature similarity in diffusion models decreases substantially, which harms generation quality.
+
+ $$
+ \mathcal{F}\_{\text {pred }, m}\left(x_{t-k}^l\right)=\mathcal{F}\left(x_t^l\right)+\sum_{i=1}^m \frac{\Delta^i \mathcal{F}\left(x_t^l\right)}{i!\cdot N^i}(-k)^i
+ $$
+
+ **TaylorSeer** employs a differential method to approximate the higher-order derivatives of features and predicts features at future timesteps via Taylor series expansion. The TaylorSeer implemented in CacheDiT supports both hidden-states and residual cache types; that is, $\mathcal{F}\_{\text {pred }, m}\left(x_{t-k}^l\right)$ can be a residual cache or a hidden-states cache.
+
+ ```python
+ cache_options = {
+     # TaylorSeer options
+     "enable_taylorseer": True,
+     "enable_encoder_taylorseer": True,
+     # The TaylorSeer cache type can be hidden_states or residual.
+     "taylorseer_cache_type": "residual",
+     # Higher values of n_derivatives will lead to longer
+     # computation time but may improve precision significantly.
+     "taylorseer_kwargs": {
+         "n_derivatives": 2,  # default is 2.
+     },
+     "warmup_steps": 3,  # n_derivatives + 1
+     "residual_diff_threshold": 0.12,
+ }
+ ```
+ <div align="center">
+ <p align="center">
+ <b>DBCache F1B0 + TaylorSeer</b>, L20x1, Steps: 28, <br>"A cat holding a sign that says hello world with complex background"
+ </p>
+ </div>
+
+ |Baseline(L20x1)|F1B0 (0.12)|+TaylorSeer|F1B0 (0.15)|+TaylorSeer|+compile|
+ |:---:|:---:|:---:|:---:|:---:|:---:|
+ |24.85s|12.85s|12.86s|10.27s|10.28s|8.48s|
+ |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.12_S14_T12.85s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.12_S14_T12.86s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.15_S17_T10.27s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T10.28s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T8.48s.png width=105px>|
+
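To make the Taylor-expansion rule concrete, here is a small self-contained NumPy sketch of the prediction formula above, written for this page rather than taken from `src/cache_dit/cache_factory/taylorseer.py` (the function name and conventions are ours): it builds finite differences from the last m + 1 fully computed features, spaced N steps apart, and extrapolates k steps ahead.

```python
import numpy as np
from math import factorial

def taylorseer_predict(feats, N, k, m=2):
    """Order-m Taylor extrapolation of a cached feature (residual or hidden state).

    feats: the last m + 1 fully computed features, oldest first,
           sampled every N inference steps.
    k:     how many steps past the newest feature to predict.
    Backward-difference form of F_pred = F(x_t) + sum_i Δ^i F / (i! N^i) (-k)^i.
    """
    cur = [np.asarray(f, dtype=np.float64) for f in feats[-(m + 1):]]
    diffs = [cur[-1]]  # Δ^0 F at the newest step
    for _ in range(m):
        cur = [b - a for a, b in zip(cur[:-1], cur[1:])]  # next-order differences
        diffs.append(cur[-1])
    return sum(d * (k / N) ** i / factorial(i) for i, d in enumerate(diffs))

# Toy check: with m >= 1 the prediction is exact for linearly drifting features.
hist = [5.0 * n + 2.0 for n in (0, 3, 6)]  # f(n) = 5n + 2, sampled at N = 3
print(taylorseer_predict(hist, N=3, k=2))  # 42.0 == f(8)
```

In the actual cache, `taylorseer_cache_type` chooses whether the features are residuals or hidden states; the arithmetic is the same either way.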
  ## 🎉FBCache: First Block Cache

  <div id="fbcache"></div>

- ![](https://github.com/user-attachments/assets/0fb66656-b711-457a-92a7-a830f134272d)
+ ![](https://github.com/vipshop/cache-dit/raw/main/assets/fbcache-v1.png)

  **DBCache** is a more general cache algorithm than **FBCache**. When Fn=1 and Bn=0, DBCache behaves identically to FBCache. Therefore, you can either use the original FBCache implementation directly or configure **DBCache** with **F1B0** settings to achieve the same functionality.
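Since **F1B0** collapses DBCache to FBCache, the equivalence is purely a configuration choice. A short sketch, reusing the `pipe` built in the DBCache example above:

```python
from cache_dit.cache_factory import apply_cache_on_pipe, CacheType

# FBCache-equivalent behavior expressed as DBCache F1B0: only the first
# block fits the L1 diff, with no trailing refinement blocks.
fbcache_like_options = {
    "cache_type": CacheType.DBCache,
    "Fn_compute_blocks": 1,  # F1
    "Bn_compute_blocks": 0,  # B0
    "residual_diff_threshold": 0.08,
}
apply_cache_on_pipe(pipe, **fbcache_like_options)  # `pipe` as created earlier
```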
 
@@ -302,7 +333,7 @@ apply_cache_on_pipe(pipe, **cache_options)

  <div id="dbprune"></div>

- ![](https://github.com/user-attachments/assets/932b6360-9533-4352-b176-4c4d84bd4695)
+ ![](https://github.com/vipshop/cache-dit/raw/main/assets/dbprune-v1.png)

  We have further implemented a new **Dynamic Block Prune** algorithm based on **Residual Caching** for Diffusion Transformers, referred to as **DBPrune**. DBPrune caches each block's hidden states and residuals, then dynamically prunes blocks during inference by computing the L1 distance between the current and previous hidden states. When a block is pruned, its output is approximated using the cached residuals. DBPrune is currently experimental; stay tuned for upcoming updates.
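Only two DBPrune-specific keys are visible in this diff (in the bench.py hunks further down), so a configuration sketch stays deliberately small; the remaining knobs are left at their defaults, and the threshold value here is illustrative:

```python
from cache_dit.cache_factory import apply_cache_on_pipe, CacheType

# Minimal DBPrune setup; only these two keys appear in the bench.py diff below.
prune_options = {
    "cache_type": CacheType.DBPrune,
    "residual_diff_threshold": 0.05,  # higher -> more blocks pruned, faster
}
apply_cache_on_pipe(pipe, **prune_options)  # `pipe`: a supported DiT pipeline
```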
 
@@ -389,7 +420,7 @@ from para_attn.context_parallel import init_context_parallel_mesh
  from para_attn.context_parallel.diffusers_adapters import parallelize_pipe
  from cache_dit.cache_factory import apply_cache_on_pipe, CacheType

- # Init distributed process group
+ # Init distributed process group
  dist.init_process_group()
  torch.cuda.set_device(dist.get_rank())
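Pieced together from the imports and calls in this hunk, a combined context-parallel + cache setup might look like the sketch below. The mesh construction arguments and the FLUX.1 model id are our assumptions (see ParaAttention's docs for the authoritative API), and the script is meant to be launched with `torchrun`:

```python
import torch
import torch.distributed as dist
from diffusers import FluxPipeline
from para_attn.context_parallel import init_context_parallel_mesh
from para_attn.context_parallel.diffusers_adapters import parallelize_pipe
from cache_dit.cache_factory import apply_cache_on_pipe, CacheType

# Init distributed process group (e.g. torchrun --nproc_per_node=4 run.py)
dist.init_process_group()
torch.cuda.set_device(dist.get_rank())

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

# Context Parallelism from ParaAttention, then CacheDiT caching on top.
parallelize_pipe(pipe, mesh=init_context_parallel_mesh(pipe.device.type))
apply_cache_on_pipe(
    pipe, cache_type=CacheType.DBCache, residual_diff_threshold=0.12
)

image = pipe("A cat holding a sign that says hello world").images[0]
if dist.get_rank() == 0:
    image.save("flux_parallel_cached.png")
dist.destroy_process_group()
```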
 
@@ -436,14 +467,16 @@ torch._dynamo.config.recompile_limit = 96 # default is 8
  torch._dynamo.config.accumulated_recompile_limit = 2048 # default is 256
  ```

+ Please check [bench.py](./bench/bench.py) for more details.
+
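The per-block compilation pattern that these recompile limits support appears later in this diff's bench.py hunks; stitched together, the torch.compile side looks roughly like this (again assuming the FLUX.1 `pipe` from the sketches above):

```python
import torch

# Raise the recompile limits first: compiling blocks one by one produces
# many distinct graphs, which quickly exceeds the defaults.
torch._dynamo.config.recompile_limit = 96                 # default is 8
torch._dynamo.config.accumulated_recompile_limit = 2048   # default is 256

# Compile transformer blocks instead of the whole FluxTransformer2DModel
# to keep higher precision, mirroring bench.py below.
for module in pipe.transformer.transformer_blocks:
    module.compile()
for module in pipe.transformer.single_transformer_blocks:
    module.compile()
```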
  ## 👋Contribute
  <div id="contribute"></div>

- How to contribute? Star ⭐️ this repo to support us or check [CONTRIBUTE.md](./CONTRIBUTE.md).
+ How to contribute? Star ⭐️ this repo to support us or check [CONTRIBUTE.md](https://github.com/vipshop/cache-dit/raw/main/CONTRIBUTE.md).

  ## ©️License

  <div id="license"></div>


- We have followed the original License from [ParaAttention](https://github.com/chengzeyi/ParaAttention), please check [LICENSE](./LICENSE) for more details.
+ We have followed the original license from [ParaAttention](https://github.com/chengzeyi/ParaAttention); please check [LICENSE](https://github.com/vipshop/cache-dit/raw/main/LICENSE) for more details.
{cache_dit-0.2.1 → cache_dit-0.2.2}/README.md

(The README.md hunks duplicate the README portion of the PKG-INFO diff above verbatim, since PKG-INFO embeds the README.)
{cache_dit-0.2.1 → cache_dit-0.2.2}/bench/bench.py

@@ -19,6 +19,11 @@ def get_args() -> argparse.ArgumentParser:
      parser.add_argument("--seed", type=int, default=0)
      parser.add_argument("--cache", type=str, default=None)
      parser.add_argument("--alter", action="store_true", default=False)
+     parser.add_argument("--taylorseer", action="store_true", default=False)
+     parser.add_argument("--taylorseer-order", "--order", type=int, default=2)
+     parser.add_argument(
+         "--encoder-taylorseer", action="store_true", default=False
+     )
      parser.add_argument("--l1-diff", action="store_true", default=False)
      parser.add_argument("--rdt", type=float, default=0.08)
      parser.add_argument("--Fn-compute-blocks", "--Fn", type=int, default=1)
@@ -41,21 +46,25 @@ def get_cache_options(cache_type: CacheType, args: argparse.Namespace):
              "warmup_steps": args.warmup_steps,
              "max_cached_steps": args.max_cached_steps,
              "residual_diff_threshold": args.rdt,
+             # TaylorSeer options
+             "enable_taylorseer": args.taylorseer,
          }
      elif cache_type == CacheType.DBCache:
          cache_options = {
              "cache_type": CacheType.DBCache,
-             "warmup_steps": args.warmup_steps,
+             "warmup_steps": (
+                 # TaylorSeer needs at least order + 1 warmup steps
+                 max(args.warmup_steps, args.taylorseer_order + 1)
+                 if (args.taylorseer or args.encoder_taylorseer)
+                 else args.warmup_steps
+             ),
              "max_cached_steps": args.max_cached_steps,  # -1 means no limit
              # Fn=1, Bn=0, means FB Cache, otherwise, Dual Block Cache
              "Fn_compute_blocks": args.Fn_compute_blocks,  # Fn, F8, etc.
              "Bn_compute_blocks": args.Bn_compute_blocks,  # Bn, B16, etc.
              "max_Fn_compute_blocks": 19,
              "max_Bn_compute_blocks": 38,
-             # WARN: DON'T set len(Fn_compute_blocks_ids) > 0 NOW, still have
-             # some precision issues. 0, 1, 2, ..., 7, etc.
-             "Fn_compute_blocks_ids": [],
-             # NOTE: Only skip the specific Bn blocks in cache steps.
+             # Skip the specific Bn blocks in cache steps.
              # 0, 2, 4, ..., 14, 15, etc.
              "Bn_compute_blocks_ids": CacheType.range(
                  0, args.Bn_compute_blocks, args.Bn_steps
@@ -70,8 +79,19 @@ def get_cache_options(cache_type: CacheType, args: argparse.Namespace):
              ),
              # relative token diff threshold, default is 0.0
              "important_condition_threshold": 0.00,
+             # TaylorSeer options
+             "enable_taylorseer": args.taylorseer,
+             "enable_encoder_taylorseer": args.encoder_taylorseer,
+             # The TaylorSeer cache type can be hidden_states or residual
+             "taylorseer_cache_type": "residual",
+             "taylorseer_kwargs": {
+                 "n_derivatives": args.taylorseer_order,
+             },
          }
      elif cache_type == CacheType.DBPrune:
+         assert (
+             args.taylorseer is False and args.encoder_taylorseer is False
+         ), "DBPrune does not support TaylorSeer yet."
          cache_options = {
              "cache_type": CacheType.DBPrune,
              "residual_diff_threshold": args.rdt,
@@ -101,11 +121,18 @@ def get_cache_options(cache_type: CacheType, args: argparse.Namespace):
          cache_type_str = (
              f"{cache_type_str}_F{args.Fn_compute_blocks}"
              f"B{args.Bn_compute_blocks}S{args.Bn_steps}"
+             f"W{args.warmup_steps}T{int(args.taylorseer)}"
+             f"ET{int(args.encoder_taylorseer)}"
+             f"O{args.taylorseer_order}"
          )
      elif cache_type == CacheType.DBPrune:
          cache_type_str = (
              f"{cache_type_str}_F{args.Fn_compute_blocks}"
-             f"B{args.Bn_compute_blocks}"
+             f"B{args.Bn_compute_blocks}W{args.warmup_steps}"
+         )
+     elif cache_type == CacheType.FBCache:
+         cache_type_str = (
+             f"{cache_type_str}_W{args.warmup_steps}" f"T{int(args.taylorseer)}"
          )
      return cache_options, cache_type_str
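These tag strings are what name the benchmark assets listed at the top of this diff (for example `U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.12_S14_T12.85s.png`). A quick reconstruction of that prefix with hypothetical `args` (note the newly added `O{order}` suffix did not yet exist when those assets were generated):

```python
from types import SimpleNamespace

args = SimpleNamespace(
    Fn_compute_blocks=1, Bn_compute_blocks=0, Bn_steps=1,
    warmup_steps=0, taylorseer=True, encoder_taylorseer=True,
)
tag = (
    f"DBCACHE_F{args.Fn_compute_blocks}"
    f"B{args.Bn_compute_blocks}S{args.Bn_steps}"
    f"W{args.warmup_steps}T{int(args.taylorseer)}"
    f"ET{int(args.encoder_taylorseer)}"
)
print(tag)  # DBCACHE_F1B0S1W0T1ET1 -- the prefix of the new asset filenames
```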
 
@@ -174,8 +201,20 @@ def main():
              "Only compile transformer blocks not the whole model "
              "for FluxTransformer2DModel to keep higher precision."
          )
-         for module in pipe.transformer.transformer_blocks:
-             module.compile()
+         if args.taylorseer_order <= 2 or (
+             not args.taylorseer and not args.encoder_taylorseer
+         ):
+             # NOTE: Compiling the transformer blocks seems to cause
+             # precision issues when TaylorSeer is used with order > 2,
+             # so only compile them for order <= 2.
+             for module in pipe.transformer.transformer_blocks:
+                 module.compile()
+         else:
+             logger.warning(
+                 "Compiling transformer_blocks with TaylorSeer order > 2 "
+                 "may cause precision issues. Skipping compilation of "
+                 "transformer_blocks."
+             )
          for module in pipe.transformer.single_transformer_blocks:
              module.compile()
      else: