cache-dit 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. {cache_dit-0.2.2 → cache_dit-0.2.4}/PKG-INFO +20 -2
  2. {cache_dit-0.2.2 → cache_dit-0.2.4}/README.md +19 -1
  3. {cache_dit-0.2.2 → cache_dit-0.2.4}/bench/bench.py +15 -16
  4. cache_dit-0.2.4/examples/README.md +57 -0
  5. cache_dit-0.2.4/examples/run_cogvideox.py +142 -0
  6. cache_dit-0.2.4/examples/run_flux.py +96 -0
  7. cache_dit-0.2.4/examples/run_flux_fill.py +100 -0
  8. cache_dit-0.2.4/examples/run_hunyuan_video.py +145 -0
  9. cache_dit-0.2.4/examples/run_mochi.py +101 -0
  10. cache_dit-0.2.4/examples/run_wan.py +134 -0
  11. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/_version.py +2 -2
  12. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dual_block_cache/cache_context.py +225 -51
  13. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/wan.py +2 -2
  14. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dynamic_block_prune/prune_context.py +23 -23
  15. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/first_block_cache/cache_context.py +0 -11
  16. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/taylorseer.py +1 -2
  17. cache_dit-0.2.4/src/cache_dit/compile/__init__.py +1 -0
  18. cache_dit-0.2.4/src/cache_dit/compile/utils.py +94 -0
  19. cache_dit-0.2.4/src/cache_dit/custom_ops/__init__.py +0 -0
  20. cache_dit-0.2.4/src/cache_dit/custom_ops/triton_taylorseer.py +0 -0
  21. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/logger.py +28 -0
  22. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit.egg-info/PKG-INFO +20 -2
  23. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit.egg-info/SOURCES.txt +4 -2
  24. cache_dit-0.2.2/assets/DBCache.png +0 -0
  25. cache_dit-0.2.2/assets/cache-dit.png +0 -0
  26. cache_dit-0.2.2/examples/README.md +0 -45
  27. cache_dit-0.2.2/examples/run_cogvideox.py +0 -72
  28. cache_dit-0.2.2/examples/run_flux.py +0 -27
  29. cache_dit-0.2.2/examples/run_flux_fill.py +0 -32
  30. cache_dit-0.2.2/examples/run_hunyuan_video.py +0 -75
  31. cache_dit-0.2.2/examples/run_mochi.py +0 -32
  32. cache_dit-0.2.2/examples/run_wan.py +0 -63
  33. {cache_dit-0.2.2 → cache_dit-0.2.4}/.github/workflows/issue.yml +0 -0
  34. {cache_dit-0.2.2 → cache_dit-0.2.4}/.gitignore +0 -0
  35. {cache_dit-0.2.2 → cache_dit-0.2.4}/.pre-commit-config.yaml +0 -0
  36. {cache_dit-0.2.2 → cache_dit-0.2.4}/CONTRIBUTE.md +0 -0
  37. {cache_dit-0.2.2 → cache_dit-0.2.4}/LICENSE +0 -0
  38. {cache_dit-0.2.2 → cache_dit-0.2.4}/MANIFEST.in +0 -0
  39. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F12B12S4_R0.2_S16.png +0 -0
  40. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F12B16S4_R0.08_S6.png +0 -0
  41. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F16B16S2_R0.2_S14.png +0 -0
  42. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F16B16S4_R0.2_S13.png +0 -0
  43. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F1B0S1_R0.08_S11.png +0 -0
  44. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F1B0S1_R0.2_S19.png +0 -0
  45. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F8B0S2_R0.12_S12.png +0 -0
  46. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F8B16S1_R0.2_S18.png +0 -0
  47. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F8B8S1_R0.08_S9.png +0 -0
  48. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F8B8S1_R0.12_S12.png +0 -0
  49. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F8B8S1_R0.15_S15.png +0 -0
  50. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png +0 -0
  51. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png +0 -0
  52. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png +0 -0
  53. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png +0 -0
  54. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.07_P52.3_T12.53s.png +0 -0
  55. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.08_P52.4_T12.52s.png +0 -0
  56. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.09_P59.2_T10.81s.png +0 -0
  57. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.12_P59.5_T10.76s.png +0 -0
  58. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.12_P63.0_T9.90s.png +0 -0
  59. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.1_P62.8_T9.95s.png +0 -0
  60. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png +0 -0
  61. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.3_P63.1_T9.79s.png +0 -0
  62. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/NONE_R0.08_S0.png +0 -0
  63. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/TEXTURE_DBCACHE_F1B0_R0.08.png +0 -0
  64. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/TEXTURE_DBCACHE_F8B12_R0.12.png +0 -0
  65. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/TEXTURE_DBCACHE_F8B16_R0.2.png +0 -0
  66. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/TEXTURE_DBCACHE_F8B20_R0.2.png +0 -0
  67. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/TEXTURE_DBCACHE_F8B8_R0.12.png +0 -0
  68. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/TEXTURE_NONE_R0.08.png +0 -0
  69. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.12_S14_T12.85s.png +0 -0
  70. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.15_S17_T10.27s.png +0 -0
  71. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.12_S14_T12.86s.png +0 -0
  72. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T10.28s.png +0 -0
  73. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C1_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T8.48s.png +0 -0
  74. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C1_DBPRUNE_F1B0_R0.03_P24.0_T16.25s.png +0 -0
  75. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C1_DBPRUNE_F1B0_R0.045_P38.2_T13.41s.png +0 -0
  76. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C1_DBPRUNE_F1B0_R0.04_P34.6_T14.12s.png +0 -0
  77. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C1_DBPRUNE_F1B0_R0.055_P45.1_T12.00s.png +0 -0
  78. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C1_DBPRUNE_F1B0_R0.05_P41.6_T12.70s.png +0 -0
  79. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C1_DBPRUNE_F1B0_R0.2_P59.5_T8.86s.png +0 -0
  80. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C1_DBPRUNE_F8B8_R0.08_P23.1_T16.14s.png +0 -0
  81. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C1_NONE_R0.08_S0_T20.43s.png +0 -0
  82. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.03_P27.3_T6.62s.png +0 -0
  83. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.03_P27.3_T6.63s.png +0 -0
  84. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.045_P38.2_T5.81s.png +0 -0
  85. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.045_P38.2_T5.82s.png +0 -0
  86. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.04_P34.6_T6.06s.png +0 -0
  87. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.04_P34.6_T6.07s.png +0 -0
  88. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.04_P34.6_T6.08s.png +0 -0
  89. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.055_P45.1_T5.27s.png +0 -0
  90. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.055_P45.1_T5.28s.png +0 -0
  91. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.2_P59.5_T3.95s.png +0 -0
  92. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.2_P59.5_T3.96s.png +0 -0
  93. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_NONE_R0.08_S0_T7.78s.png +0 -0
  94. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_NONE_R0.08_S0_T7.79s.png +0 -0
  95. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/cache-dit-v1.png +0 -0
  96. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/dbcache-fnbn-v1.png +0 -0
  97. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/dbcache-v1.png +0 -0
  98. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/dbprune-v1.png +0 -0
  99. {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/fbcache-v1.png +0 -0
  100. {cache_dit-0.2.2 → cache_dit-0.2.4}/bench/.gitignore +0 -0
  101. {cache_dit-0.2.2 → cache_dit-0.2.4}/docs/.gitignore +0 -0
  102. {cache_dit-0.2.2 → cache_dit-0.2.4}/examples/.gitignore +0 -0
  103. {cache_dit-0.2.2 → cache_dit-0.2.4}/examples/data/cup.png +0 -0
  104. {cache_dit-0.2.2 → cache_dit-0.2.4}/examples/data/cup_mask.png +0 -0
  105. {cache_dit-0.2.2 → cache_dit-0.2.4}/examples/requirements.txt +0 -0
  106. {cache_dit-0.2.2 → cache_dit-0.2.4}/pyproject.toml +0 -0
  107. {cache_dit-0.2.2 → cache_dit-0.2.4}/pytest.ini +0 -0
  108. {cache_dit-0.2.2 → cache_dit-0.2.4}/requirements.txt +0 -0
  109. {cache_dit-0.2.2 → cache_dit-0.2.4}/setup.cfg +0 -0
  110. {cache_dit-0.2.2 → cache_dit-0.2.4}/setup.py +0 -0
  111. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/__init__.py +0 -0
  112. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/__init__.py +0 -0
  113. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dual_block_cache/__init__.py +0 -0
  114. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/__init__.py +0 -0
  115. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/cogvideox.py +0 -0
  116. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/flux.py +0 -0
  117. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/hunyuan_video.py +0 -0
  118. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/mochi.py +0 -0
  119. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dynamic_block_prune/__init__.py +0 -0
  120. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/__init__.py +0 -0
  121. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/cogvideox.py +0 -0
  122. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/flux.py +0 -0
  123. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/hunyuan_video.py +0 -0
  124. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/mochi.py +0 -0
  125. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/wan.py +0 -0
  126. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/first_block_cache/__init__.py +0 -0
  127. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/__init__.py +0 -0
  128. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/cogvideox.py +0 -0
  129. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/flux.py +0 -0
  130. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/hunyuan_video.py +0 -0
  131. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/mochi.py +0 -0
  132. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/wan.py +0 -0
  133. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/utils.py +0 -0
  134. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/primitives.py +0 -0
  135. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit.egg-info/dependency_links.txt +0 -0
  136. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit.egg-info/requires.txt +0 -0
  137. {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit.egg-info/top_level.txt +0 -0
  138. {cache_dit-0.2.2 → cache_dit-0.2.4}/tests/.gitignore +0 -0
  139. {cache_dit-0.2.2 → cache_dit-0.2.4}/tests/README.md +0 -0
  140. {cache_dit-0.2.2 → cache_dit-0.2.4}/tests/taylorseer_approximation_order_2.png +0 -0
  141. {cache_dit-0.2.2 → cache_dit-0.2.4}/tests/taylorseer_approximation_order_4.png +0 -0
  142. {cache_dit-0.2.2 → cache_dit-0.2.4}/tests/taylorseer_approximation_test.png +0 -0
  143. {cache_dit-0.2.2 → cache_dit-0.2.4}/tests/test_taylorseer.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cache_dit
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: 🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration Toolbox for Diffusion Transformers
5
5
  Author: DefTruth, vipshop.com, etc.
6
6
  Maintainer: DefTruth, vipshop.com, etc
@@ -154,6 +154,7 @@ The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi
154
154
  - [🔥Supported Models](#supported)
155
155
  - [⚡️Dual Block Cache](#dbcache)
156
156
  - [🔥Hybrid TaylorSeer](#taylorseer)
157
+ - [⚡️Hybrid Cache CFG](#cfg)
157
158
  - [🎉First Block Cache](#fbcache)
158
159
  - [⚡️Dynamic Block Prune](#dbprune)
159
160
  - [🎉Context Parallelism](#context-parallelism)
@@ -283,7 +284,11 @@ cache_options = {
283
284
  "warmup_steps": 3, # n_derivatives + 1
284
285
  "residual_diff_threshold": 0.12,
285
286
  }
286
- ```
287
+ ```
288
+
289
+ > [!Important]
290
+ > Please note that if you have used TaylorSeer as the calibrator for approximate hidden states, the **Bn** param of DBCache can be set to **0**. In essence, DBCache's Bn also acts as a calibrator, so you can choose either Bn > 0 or TaylorSeer. We recommend using the configuration scheme of **TaylorSeer** + **DBCache FnB0**.
291
+
287
292
  <div align="center">
288
293
  <p align="center">
289
294
  <b>DBCache F1B0 + TaylorSeer</b>, L20x1, Steps: 28, <br>"A cat holding a sign that says hello world with complex background"
@@ -295,6 +300,19 @@ cache_options = {
295
300
  |24.85s|12.85s|12.86s|10.27s|10.28s|8.48s|
296
301
  |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.12_S14_T12.85s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.12_S14_T12.86s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.15_S17_T10.27s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T10.28s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T8.48s.png width=105px>|
297
302
 
303
+ ## ⚡️Hybrid Cache CFG
304
+
305
+ <div id="cfg"></div>
306
+
307
+ CacheDiT supports caching for CFG (classifier-free guidance). For models that fuse CFG and non-CFG into a single forward step, or models that do not include CFG (classifier-free guidance) in the forward step, please set the `do_separate_classifier_free_guidance` param to False. Otherwise, set it to True. Wan 2.1: True. FLUX.1, HunyuanVideo, CogVideoX, Mochi: False.
308
+
309
+ ```python
310
+ cache_options = {
311
+ "do_separate_classifier_free_guidance": True, # Wan 2.1
312
+ "cfg_compute_first": False,
313
+ }
314
+ ```
315
+
298
316
  ## 🎉FBCache: First Block Cache
299
317
 
300
318
  <div id="fbcache"></div>
@@ -119,6 +119,7 @@ The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi
119
119
  - [🔥Supported Models](#supported)
120
120
  - [⚡️Dual Block Cache](#dbcache)
121
121
  - [🔥Hybrid TaylorSeer](#taylorseer)
122
+ - [⚡️Hybrid Cache CFG](#cfg)
122
123
  - [🎉First Block Cache](#fbcache)
123
124
  - [⚡️Dynamic Block Prune](#dbprune)
124
125
  - [🎉Context Parallelism](#context-parallelism)
@@ -248,7 +249,11 @@ cache_options = {
248
249
  "warmup_steps": 3, # n_derivatives + 1
249
250
  "residual_diff_threshold": 0.12,
250
251
  }
251
- ```
252
+ ```
253
+
254
+ > [!Important]
255
+ > Please note that if you have used TaylorSeer as the calibrator for approximate hidden states, the **Bn** param of DBCache can be set to **0**. In essence, DBCache's Bn also acts as a calibrator, so you can choose either Bn > 0 or TaylorSeer. We recommend using the configuration scheme of **TaylorSeer** + **DBCache FnB0**.
256
+
252
257
  <div align="center">
253
258
  <p align="center">
254
259
  <b>DBCache F1B0 + TaylorSeer</b>, L20x1, Steps: 28, <br>"A cat holding a sign that says hello world with complex background"
@@ -260,6 +265,19 @@ cache_options = {
260
265
  |24.85s|12.85s|12.86s|10.27s|10.28s|8.48s|
261
266
  |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.12_S14_T12.85s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.12_S14_T12.86s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.15_S17_T10.27s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T10.28s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T8.48s.png width=105px>|
262
267
 
268
+ ## ⚡️Hybrid Cache CFG
269
+
270
+ <div id="cfg"></div>
271
+
272
+ CacheDiT supports caching for CFG (classifier-free guidance). For models that fuse CFG and non-CFG into a single forward step, or models that do not include CFG (classifier-free guidance) in the forward step, please set the `do_separate_classifier_free_guidance` param to False. Otherwise, set it to True. Wan 2.1: True. FLUX.1, HunyuanVideo, CogVideoX, Mochi: False.
273
+
274
+ ```python
275
+ cache_options = {
276
+ "do_separate_classifier_free_guidance": True, # Wan 2.1
277
+ "cfg_compute_first": False,
278
+ }
279
+ ```
280
+
263
281
  ## 🎉FBCache: First Block Cache
264
282
 
265
283
  <div id="fbcache"></div>
@@ -21,9 +21,6 @@ def get_args() -> argparse.ArgumentParser:
21
21
  parser.add_argument("--alter", action="store_true", default=False)
22
22
  parser.add_argument("--taylorseer", action="store_true", default=False)
23
23
  parser.add_argument("--taylorseer-order", "--order", type=int, default=2)
24
- parser.add_argument(
25
- "--encoder-taylorseer", action="store_true", default=False
26
- )
27
24
  parser.add_argument("--l1-diff", action="store_true", default=False)
28
25
  parser.add_argument("--rdt", type=float, default=0.08)
29
26
  parser.add_argument("--Fn-compute-blocks", "--Fn", type=int, default=1)
@@ -32,9 +29,15 @@ def get_args() -> argparse.ArgumentParser:
32
29
  parser.add_argument("--warmup-steps", type=int, default=0)
33
30
  parser.add_argument("--max-cached-steps", type=int, default=-1)
34
31
  parser.add_argument("--max-pruned-steps", type=int, default=-1)
32
+ parser.add_argument("--gen-device", type=str, default="cuda")
35
33
  parser.add_argument("--ulysses", type=int, default=None)
36
34
  parser.add_argument("--compile", action="store_true", default=False)
37
- parser.add_argument("--gen-device", type=str, default="cuda")
35
+ parser.add_argument(
36
+ "--force-compile-all",
37
+ "--compile-all",
38
+ action="store_true",
39
+ default=False,
40
+ )
38
41
  return parser.parse_args()
39
42
 
40
43
 
@@ -52,12 +55,7 @@ def get_cache_options(cache_type: CacheType, args: argparse.Namespace):
52
55
  elif cache_type == CacheType.DBCache:
53
56
  cache_options = {
54
57
  "cache_type": CacheType.DBCache,
55
- "warmup_steps": (
56
- # TaylorSeer needs at least order + 1 warmup steps
57
- max(args.warmup_steps, args.taylorseer_order + 1)
58
- if (args.taylorseer or args.encoder_taylorseer)
59
- else args.warmup_steps
60
- ),
58
+ "warmup_steps": args.warmup_steps,
61
59
  "max_cached_steps": args.max_cached_steps, # -1 means no limit
62
60
  # Fn=1, Bn=0, means FB Cache, otherwise, Dual Block Cache
63
61
  "Fn_compute_blocks": args.Fn_compute_blocks, # Fn, F8, etc.
@@ -81,7 +79,7 @@ def get_cache_options(cache_type: CacheType, args: argparse.Namespace):
81
79
  "important_condition_threshold": 0.00,
82
80
  # TaylorSeer options
83
81
  "enable_taylorseer": args.taylorseer,
84
- "enable_encoder_taylorseer": args.encoder_taylorseer,
82
+ "enable_encoder_taylorseer": args.taylorseer,
85
83
  # TaylorSeer cache type can be hidden_states or residual
86
84
  "taylorseer_cache_type": "residual",
87
85
  "taylorseer_kwargs": {
@@ -90,7 +88,7 @@ def get_cache_options(cache_type: CacheType, args: argparse.Namespace):
90
88
  }
91
89
  elif cache_type == CacheType.DBPrune:
92
90
  assert (
93
- args.taylorseer is False and args.encoder_taylorseer is False
91
+ args.taylorseer is False
94
92
  ), "DBPrune does not support TaylorSeer yet."
95
93
  cache_options = {
96
94
  "cache_type": CacheType.DBPrune,
@@ -122,7 +120,6 @@ def get_cache_options(cache_type: CacheType, args: argparse.Namespace):
122
120
  f"{cache_type_str}_F{args.Fn_compute_blocks}"
123
121
  f"B{args.Bn_compute_blocks}S{args.Bn_steps}"
124
122
  f"W{args.warmup_steps}T{int(args.taylorseer)}"
125
- f"ET{int(args.encoder_taylorseer)}"
126
123
  f"O{args.taylorseer_order}"
127
124
  )
128
125
  elif cache_type == CacheType.DBPrune:
@@ -132,7 +129,7 @@ def get_cache_options(cache_type: CacheType, args: argparse.Namespace):
132
129
  )
133
130
  elif cache_type == CacheType.FBCache:
134
131
  cache_type_str = (
135
- f"{cache_type_str}_W{args.warmup_steps}" f"T{int(args.taylorseer)}"
132
+ f"{cache_type_str}_W{args.warmup_steps}T{int(args.taylorseer)}"
136
133
  )
137
134
  return cache_options, cache_type_str
138
135
 
@@ -201,8 +198,10 @@ def main():
201
198
  "Only compile transformer blocks not the whole model "
202
199
  "for FluxTransformer2DModel to keep higher precision."
203
200
  )
204
- if args.taylorseer_order <= 2 or (
205
- not args.taylorseer and not args.encoder_taylorseer
201
+ if (
202
+ args.taylorseer_order <= 2
203
+ or not args.taylorseer
204
+ or args.force_compile_all
206
205
  ):
207
206
  # NOTE: Seems like compiling the whole transformer
208
207
  # will cause precision issues while using TaylorSeer
@@ -0,0 +1,57 @@
1
+ # Examples for CacheDiT
2
+
3
+ ## Install requirements
4
+
5
+ ```bash
6
+ pip3 install -r requirements.txt
7
+ ```
8
+
9
+ ## Run examples
10
+
11
+ - FLUX.1-dev
12
+
13
+ ```bash
14
+ python3 run_flux.py # baseline
15
+ python3 run_flux.py --cache --Fn 8 --Bn 8
16
+ python3 run_flux.py --cache --Fn 8 --Bn 0 --taylorseer
17
+ ```
18
+
19
+ - FLUX.1-Fill-dev
20
+
21
+ ```bash
22
+ python3 run_flux_fill.py # baseline
23
+ python3 run_flux_fill.py --cache --Fn 8 --Bn 8
24
+ python3 run_flux_fill.py --cache --Fn 8 --Bn 0 --taylorseer
25
+ ```
26
+
27
+ - CogVideoX
28
+
29
+ ```bash
30
+ python3 run_cogvideox.py # baseline
31
+ python3 run_cogvideox.py --cache --Fn 8 --Bn 8
32
+ python3 run_cogvideox.py --cache --Fn 8 --Bn 0 --taylorseer
33
+ ```
34
+
35
+ - Wan2.1
36
+
37
+ ```bash
38
+ python3 run_wan.py # baseline
39
+ python3 run_wan.py --cache --Fn 8 --Bn 8
40
+ python3 run_wan.py --cache --Fn 8 --Bn 0 --taylorseer
41
+ ```
42
+
43
+ - Mochi
44
+
45
+ ```bash
46
+ python3 run_mochi.py # baseline
47
+ python3 run_mochi.py --cache --Fn 8 --Bn 8
48
+ python3 run_mochi.py --cache --Fn 8 --Bn 0 --taylorseer
49
+ ```
50
+
51
+ - HunyuanVideo
52
+
53
+ ```bash
54
+ python3 run_hunyuan_video.py # baseline
55
+ python3 run_hunyuan_video.py --cache --Fn 8 --Bn 8
56
+ python3 run_hunyuan_video.py --cache --Fn 8 --Bn 0 --taylorseer
57
+ ```
@@ -0,0 +1,142 @@
1
+ import os
2
+ import time
3
+ import torch
4
+ import argparse
5
+ from diffusers.utils import export_to_video
6
+ from diffusers import CogVideoXPipeline, AutoencoderKLCogVideoX
7
+ from cache_dit.cache_factory import apply_cache_on_pipe, CacheType
8
+
9
+
10
+ def get_args() -> argparse.ArgumentParser:
11
+ parser = argparse.ArgumentParser()
12
+ # General arguments
13
+ parser.add_argument("--cache", action="store_true", default=False)
14
+ parser.add_argument("--taylorseer", action="store_true", default=False)
15
+ parser.add_argument("--taylorseer-order", "--order", type=int, default=2)
16
+ parser.add_argument("--Fn-compute-blocks", "--Fn", type=int, default=1)
17
+ parser.add_argument("--Bn-compute-blocks", "--Bn", type=int, default=0)
18
+ parser.add_argument("--rdt", type=float, default=0.08)
19
+ parser.add_argument("--warmup-steps", type=int, default=0)
20
+ return parser.parse_args()
21
+
22
+
23
+ args = get_args()
24
+ print(args)
25
+
26
+
27
+ model_id = os.environ.get("COGVIDEOX_DIR", "THUDM/CogVideoX-5b")
28
+
29
+
30
+ def is_cogvideox_1_5():
31
+ return "CogVideoX1.5" in model_id or "THUDM/CogVideoX1.5" in model_id
32
+
33
+
34
+ def get_gpu_memory_in_gib():
35
+ if not torch.cuda.is_available():
36
+ return 0
37
+
38
+ try:
39
+ total_memory_bytes = torch.cuda.get_device_properties(
40
+ torch.cuda.current_device(),
41
+ ).total_memory
42
+ total_memory_gib = total_memory_bytes / (1024**3)
43
+ return int(total_memory_gib)
44
+ except Exception:
45
+ return 0
46
+
47
+
48
+ pipe = CogVideoXPipeline.from_pretrained(
49
+ model_id,
50
+ torch_dtype=torch.bfloat16,
51
+ ).to("cuda")
52
+
53
+
54
+ if args.cache:
55
+ cache_options = {
56
+ "cache_type": CacheType.DBCache,
57
+ "warmup_steps": args.warmup_steps,
58
+ "max_cached_steps": -1, # -1 means no limit
59
+ # Fn=1, Bn=0, means FB Cache, otherwise, Dual Block Cache
60
+ "Fn_compute_blocks": args.Fn_compute_blocks, # Fn, F8, etc.
61
+ "Bn_compute_blocks": args.Bn_compute_blocks, # Bn, B16, etc.
62
+ "residual_diff_threshold": args.rdt,
63
+ # relative token diff threshold, default is 0.0
64
+ "important_condition_threshold": 0.05,
65
+ # CFG: classifier free guidance or not
66
+ # CogVideoX fuses CFG and non-CFG into a single forward step
67
+ # so, we set do_separate_classifier_free_guidance as False.
68
+ "do_separate_classifier_free_guidance": False,
69
+ "cfg_compute_first": False,
70
+ "enable_taylorseer": args.taylorseer,
71
+ "enable_encoder_taylorseer": args.taylorseer,
72
+ # TaylorSeer cache type can be hidden_states or residual
73
+ "taylorseer_cache_type": "residual",
74
+ "taylorseer_kwargs": {
75
+ "n_derivatives": args.taylorseer_order,
76
+ },
77
+ }
78
+ cache_type_str = "DBCACHE"
79
+ cache_type_str = (
80
+ f"{cache_type_str}_F{args.Fn_compute_blocks}"
81
+ f"B{args.Bn_compute_blocks}W{args.warmup_steps}"
82
+ f"T{int(args.taylorseer)}O{args.taylorseer_order}"
83
+ )
84
+ print(f"cache options:\n{cache_options}")
85
+
86
+ apply_cache_on_pipe(pipe, **cache_options)
87
+ else:
88
+ cache_type_str = "NONE"
89
+
90
+
91
+ pipe.enable_model_cpu_offload()
92
+ assert isinstance(pipe.vae, AutoencoderKLCogVideoX) # enable type check for IDE
93
+ pipe.vae.enable_slicing()
94
+ pipe.vae.enable_tiling()
95
+
96
+ start = time.time()
97
+ prompt = (
98
+ "A panda, dressed in a small, red jacket and a tiny hat, "
99
+ "sits on a wooden stool in a serene bamboo forest. The "
100
+ "panda's fluffy paws strum a miniature acoustic guitar, "
101
+ "producing soft, melodic tunes. Nearby, a few other pandas "
102
+ "gather, watching curiously and some clapping in rhythm. "
103
+ "Sunlight filters through the tall bamboo, casting a gentle "
104
+ "glow on the scene. The panda's face is expressive, showing "
105
+ "concentration and joy as it plays. The background includes "
106
+ "a small, flowing stream and vibrant green foliage, enhancing "
107
+ "the peaceful and magical atmosphere of this unique musical "
108
+ "performance."
109
+ )
110
+ video = pipe(
111
+ prompt=prompt,
112
+ num_videos_per_prompt=1,
113
+ num_inference_steps=50,
114
+ num_frames=(
115
+ # Avoid OOM for CogVideoX1.5 model on 48GB GPU
116
+ 16
117
+ if (is_cogvideox_1_5() and get_gpu_memory_in_gib() < 48)
118
+ else 49
119
+ ),
120
+ guidance_scale=6,
121
+ generator=torch.Generator("cpu").manual_seed(0),
122
+ ).frames[0]
123
+ end = time.time()
124
+
125
+ if hasattr(pipe.transformer, "_cached_steps"):
126
+ cached_steps = pipe.transformer._cached_steps
127
+ residual_diffs = pipe.transformer._residual_diffs
128
+ print(f"Cache Steps: {len(cached_steps)}, {cached_steps}")
129
+ print(f"Residual Diffs: {len(residual_diffs)}, {residual_diffs}")
130
+ if hasattr(pipe.transformer, "_cfg_cached_steps"):
131
+ cfg_cached_steps = pipe.transformer._cfg_cached_steps
132
+ cfg_residual_diffs = pipe.transformer._cfg_residual_diffs
133
+ print(f"CFG Cache Steps: {len(cfg_cached_steps)}, {cfg_cached_steps} ")
134
+ print(
135
+ f"CFG Residual Diffs: {len(cfg_residual_diffs)}, {cfg_residual_diffs}"
136
+ )
137
+
138
+ time_cost = end - start
139
+ save_path = f"cogvideox.{cache_type_str}.mp4"
140
+ print(f"Time cost: {time_cost:.2f}s")
141
+ print(f"Saving video to {save_path}")
142
+ export_to_video(video, save_path, fps=8)
@@ -0,0 +1,96 @@
1
+ import os
2
+ import time
3
+ import torch
4
+ import argparse
5
+ from diffusers import FluxPipeline
6
+ from cache_dit.cache_factory import apply_cache_on_pipe, CacheType
7
+
8
+
9
+ def get_args() -> argparse.ArgumentParser:
10
+ parser = argparse.ArgumentParser()
11
+ # General arguments
12
+ parser.add_argument("--cache", action="store_true", default=False)
13
+ parser.add_argument("--taylorseer", action="store_true", default=False)
14
+ parser.add_argument("--taylorseer-order", "--order", type=int, default=2)
15
+ parser.add_argument("--Fn-compute-blocks", "--Fn", type=int, default=1)
16
+ parser.add_argument("--Bn-compute-blocks", "--Bn", type=int, default=0)
17
+ parser.add_argument("--rdt", type=float, default=0.08)
18
+ parser.add_argument("--warmup-steps", type=int, default=0)
19
+ return parser.parse_args()
20
+
21
+
22
+ args = get_args()
23
+ print(args)
24
+
25
+
26
+ pipe = FluxPipeline.from_pretrained(
27
+ os.environ.get(
28
+ "FLUX_DIR",
29
+ "black-forest-labs/FLUX.1-dev",
30
+ ),
31
+ torch_dtype=torch.bfloat16,
32
+ ).to("cuda")
33
+
34
+
35
+ if args.cache:
36
+ cache_options = {
37
+ "cache_type": CacheType.DBCache,
38
+ "warmup_steps": args.warmup_steps,
39
+ "max_cached_steps": -1, # -1 means no limit
40
+ # Fn=1, Bn=0, means FB Cache, otherwise, Dual Block Cache
41
+ "Fn_compute_blocks": args.Fn_compute_blocks, # Fn, F8, etc.
42
+ "Bn_compute_blocks": args.Bn_compute_blocks, # Bn, B16, etc.
43
+ "residual_diff_threshold": args.rdt,
44
+ # CFG: classifier free guidance or not
45
+ # FLUX.1 dev does not have CFG, so we set
46
+ # do_separate_classifier_free_guidance as False.
47
+ "do_separate_classifier_free_guidance": False,
48
+ "cfg_compute_first": False,
49
+ "enable_taylorseer": args.taylorseer,
50
+ "enable_encoder_taylorseer": args.taylorseer,
51
+ # TaylorSeer cache type can be hidden_states or residual
52
+ "taylorseer_cache_type": "residual",
53
+ "taylorseer_kwargs": {
54
+ "n_derivatives": args.taylorseer_order,
55
+ },
56
+ }
57
+ cache_type_str = "DBCACHE"
58
+ cache_type_str = (
59
+ f"{cache_type_str}_F{args.Fn_compute_blocks}"
60
+ f"B{args.Bn_compute_blocks}W{args.warmup_steps}"
61
+ f"T{int(args.taylorseer)}O{args.taylorseer_order}"
62
+ )
63
+ print(f"cache options:\n{cache_options}")
64
+
65
+ apply_cache_on_pipe(pipe, **cache_options)
66
+ else:
67
+ cache_type_str = "NONE"
68
+
69
+
70
+ start = time.time()
71
+ image = pipe(
72
+ "A cat holding a sign that says hello world",
73
+ num_inference_steps=28,
74
+ generator=torch.Generator("cpu").manual_seed(0),
75
+ ).images[0]
76
+
77
+ end = time.time()
78
+
79
+ if hasattr(pipe.transformer, "_cached_steps"):
80
+ cached_steps = pipe.transformer._cached_steps
81
+ residual_diffs = pipe.transformer._residual_diffs
82
+ print(f"Cache Steps: {len(cached_steps)}, {cached_steps}")
83
+ print(f"Residual Diffs: {len(residual_diffs)}, {residual_diffs}")
84
+ if hasattr(pipe.transformer, "_cfg_cached_steps"):
85
+ cfg_cached_steps = pipe.transformer._cfg_cached_steps
86
+ cfg_residual_diffs = pipe.transformer._cfg_residual_diffs
87
+ print(f"CFG Cache Steps: {len(cfg_cached_steps)}, {cfg_cached_steps} ")
88
+ print(
89
+ f"CFG Residual Diffs: {len(cfg_residual_diffs)}, {cfg_residual_diffs}"
90
+ )
91
+
92
+ time_cost = end - start
93
+ save_path = f"flux.{cache_type_str}.png"
94
+ print(f"Time cost: {time_cost:.2f}s")
95
+ print(f"Saving image to {save_path}")
96
+ image.save(save_path)
@@ -0,0 +1,100 @@
1
+ import os
2
+ import time
3
+ import torch
4
+ import argparse
5
+ from diffusers import FluxFillPipeline
6
+ from diffusers.utils import load_image
7
+ from cache_dit.cache_factory import apply_cache_on_pipe, CacheType
8
+
9
+
10
def get_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """Parse the command-line options of this example script.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, which is the behavior the
            script had before this parameter existed.

    Returns:
        The parsed ``argparse.Namespace`` with the attributes ``cache``,
        ``taylorseer``, ``taylorseer_order``, ``Fn_compute_blocks``,
        ``Bn_compute_blocks``, ``rdt`` and ``warmup_steps``.
    """
    parser = argparse.ArgumentParser()
    # General arguments
    parser.add_argument("--cache", action="store_true", default=False)
    parser.add_argument("--taylorseer", action="store_true", default=False)
    parser.add_argument("--taylorseer-order", "--order", type=int, default=2)
    parser.add_argument("--Fn-compute-blocks", "--Fn", type=int, default=1)
    parser.add_argument("--Bn-compute-blocks", "--Bn", type=int, default=0)
    parser.add_argument("--rdt", type=float, default=0.08)
    parser.add_argument("--warmup-steps", type=int, default=0)
    # parse_args() returns a Namespace, not the parser itself.
    return parser.parse_args(argv)
21
+
22
+
23
# Parse the command-line options once and echo them for the run log.
args = get_args()
print(args)


# Checkpoint location: the FLUX_FILL_DIR environment variable takes
# precedence; otherwise the hub repository id is used.
model_path = os.environ.get(
    "FLUX_FILL_DIR",
    "black-forest-labs/FLUX.1-Fill-dev",
)
pipe = FluxFillPipeline.from_pretrained(model_path, torch_dtype=torch.bfloat16)
pipe = pipe.to("cuda")
34
+
35
+
36
if not args.cache:
    # No caching requested; the tag is only used in the output file name.
    cache_type_str = "NONE"
else:
    # Keyword order mirrors the original literal because the dict is
    # printed verbatim below.
    cache_options = dict(
        cache_type=CacheType.DBCache,
        warmup_steps=args.warmup_steps,
        max_cached_steps=-1,  # -1 means no limit
        # Fn=1, Bn=0 means FB Cache; otherwise, Dual Block Cache.
        Fn_compute_blocks=args.Fn_compute_blocks,  # Fn, F8, etc.
        Bn_compute_blocks=args.Bn_compute_blocks,  # Bn, B16, etc.
        residual_diff_threshold=args.rdt,
        # CFG: classifier-free guidance or not.
        # FLUX.1 dev does not have CFG, so
        # do_separate_classifier_free_guidance is set to False.
        do_separate_classifier_free_guidance=False,
        cfg_compute_first=False,
        enable_taylorseer=args.taylorseer,
        enable_encoder_taylorseer=args.taylorseer,
        # TaylorSeer cache type can be hidden_states or residual.
        taylorseer_cache_type="residual",
        taylorseer_kwargs=dict(n_derivatives=args.taylorseer_order),
    )
    # Tag encoding the configuration: F<Fn>B<Bn>W<warmup>T<0|1>O<order>.
    tag_parts = (
        "DBCACHE",
        f"_F{args.Fn_compute_blocks}",
        f"B{args.Bn_compute_blocks}",
        f"W{args.warmup_steps}",
        f"T{int(args.taylorseer)}",
        f"O{args.taylorseer_order}",
    )
    cache_type_str = "".join(tag_parts)
    print(f"cache options:\n{cache_options}")

    apply_cache_on_pipe(pipe, **cache_options)
69
+
70
# Time the pipeline call with perf_counter(): it is monotonic, so the
# measured duration cannot be skewed by system clock adjustments the way
# a time.time() difference can. Loading the two input images happens
# inside the timed region, as before.
start = time.perf_counter()
image = pipe(
    prompt="a white paper cup",
    image=load_image("data/cup.png"),
    mask_image=load_image("data/cup_mask.png"),
    guidance_scale=30,
    num_inference_steps=28,
    max_sequence_length=512,
    # Fixed CPU seed so runs are comparable across cache settings.
    generator=torch.Generator("cpu").manual_seed(0),
).images[0]
end = time.perf_counter()

# Report cache statistics if the transformer exposes them
# (presumably attached by the cache wrapper -- confirm against cache_dit).
transformer = pipe.transformer
if hasattr(transformer, "_cached_steps"):
    cached_steps = transformer._cached_steps
    residual_diffs = transformer._residual_diffs
    print(f"Cache Steps: {len(cached_steps)}, {cached_steps}")
    print(f"Residual Diffs: {len(residual_diffs)}, {residual_diffs}")
if hasattr(transformer, "_cfg_cached_steps"):
    cfg_cached_steps = transformer._cfg_cached_steps
    cfg_residual_diffs = transformer._cfg_residual_diffs
    print(f"CFG Cache Steps: {len(cfg_cached_steps)}, {cfg_cached_steps} ")
    print(
        f"CFG Residual Diffs: {len(cfg_residual_diffs)}, {cfg_residual_diffs}"
    )

# The cache tag is embedded in the file name so outputs from different
# configurations do not overwrite each other.
time_cost = end - start
save_path = f"flux-fill.{cache_type_str}.png"
print(f"Time cost: {time_cost:.2f}s")
print(f"Saving image to {save_path}")
image.save(save_path)