cache-dit 0.2.13__py3-none-any.whl → 0.2.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cache-dit might be problematic.
- cache_dit/__init__.py +5 -0
- cache_dit/_version.py +2 -2
- cache_dit/cache_factory/dual_block_cache/cache_context.py +85 -45
- cache_dit/cache_factory/dual_block_cache/diffusers_adapters/__init__.py +4 -0
- cache_dit/cache_factory/dual_block_cache/diffusers_adapters/qwen_image.py +88 -0
- cache_dit/cache_factory/dual_block_cache/diffusers_adapters/wan.py +2 -2
- cache_dit/cache_factory/dynamic_block_prune/prune_context.py +40 -22
- cache_dit/cache_factory/first_block_cache/cache_context.py +3 -3
- cache_dit/utils.py +7 -0
- {cache_dit-0.2.13.dist-info → cache_dit-0.2.15.dist-info}/METADATA +62 -114
- {cache_dit-0.2.13.dist-info → cache_dit-0.2.15.dist-info}/RECORD +15 -13
- {cache_dit-0.2.13.dist-info → cache_dit-0.2.15.dist-info}/WHEEL +0 -0
- {cache_dit-0.2.13.dist-info → cache_dit-0.2.15.dist-info}/entry_points.txt +0 -0
- {cache_dit-0.2.13.dist-info → cache_dit-0.2.15.dist-info}/licenses/LICENSE +0 -0
- {cache_dit-0.2.13.dist-info → cache_dit-0.2.15.dist-info}/top_level.txt +0 -0
cache_dit/__init__.py
CHANGED
cache_dit/_version.py
CHANGED
cache_dit/cache_factory/dual_block_cache/cache_context.py
CHANGED

@@ -8,8 +8,9 @@ from typing import Any, DefaultDict, Dict, List, Optional, Union

 import torch

-import cache_dit.primitives as
+import cache_dit.primitives as primitives
 from cache_dit.cache_factory.taylorseer import TaylorSeer
+from cache_dit.utils import is_diffusers_at_least_0_3_5
 from cache_dit.logger import init_logger

 logger = init_logger(__name__)
@@ -772,8 +773,8 @@ def are_two_tensors_similar(
     mean_t1 = t1.abs().mean()

     if parallelized:
-        mean_diff =
-        mean_t1 =
+        mean_diff = primitives.all_reduce_sync(mean_diff, "avg")
+        mean_t1 = primitives.all_reduce_sync(mean_t1, "avg")

     # D = (t1 - t2) / t1 = 1 - (t2 / t1), if D = 0, then t1 = t2.
     # Futher, if we assume that (H(t, 0) - H(t-1,0)) ~ 0, then,
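For reference, `are_two_tensors_similar` boils down to a relative mean-L1 distance between the two tensors, with an optional cross-device all-reduce when running in parallel. A minimal standalone sketch of the unparallelized check (simplified from the code above):

```python
import torch


def relative_l1_diff(t1: torch.Tensor, t2: torch.Tensor) -> float:
    # D = mean|t1 - t2| / mean|t1|; D == 0 means t1 == t2.
    mean_diff = (t1 - t2).abs().mean()
    mean_t1 = t1.abs().mean()
    return (mean_diff / mean_t1).item()


# A block is considered similar enough to reuse cached residuals when
# relative_l1_diff(prev_hidden_states, hidden_states) < residual_diff_threshold.
```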
@@ -1365,6 +1366,7 @@ class DBCachedTransformerBlocks(torch.nn.Module):
     ):
         original_hidden_states = hidden_states
         original_encoder_hidden_states = encoder_hidden_states
+        # This condition branch is mainly for FLUX series.
         if self.single_transformer_blocks is not None:
             for block in self.transformer_blocks[Fn_compute_blocks() :]:
                 hidden_states = block(
@@ -1381,22 +1383,32 @@
                     hidden_states,
                 )

-[8 lines removed; content not captured in this rendering]
+            # https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/transformers/transformer_flux.py#L380
+            if is_diffusers_at_least_0_3_5():
+                for block in self._Mn_single_transformer_blocks():
+                    encoder_hidden_states, hidden_states = block(
+                        hidden_states,
+                        encoder_hidden_states,
+                        *args,
+                        **kwargs,
+                    )
+            else:
+                hidden_states = torch.cat(
+                    [encoder_hidden_states, hidden_states], dim=1
+                )
+                for block in self._Mn_single_transformer_blocks():
+                    hidden_states = block(
+                        hidden_states,
+                        *args,
+                        **kwargs,
+                    )
+                encoder_hidden_states, hidden_states = hidden_states.split(
+                    [
+                        encoder_hidden_states.shape[1],
+                        hidden_states.shape[1] - encoder_hidden_states.shape[1],
+                    ],
+                    dim=1,
                 )
-            encoder_hidden_states, hidden_states = hidden_states.split(
-                [
-                    encoder_hidden_states.shape[1],
-                    hidden_states.shape[1] - encoder_hidden_states.shape[1],
-                ],
-                dim=1,
-            )
         else:
             for block in self._Mn_transformer_blocks():
                 hidden_states = block(
@@ -1789,43 +1801,71 @@

         original_hidden_states = hidden_states
         original_encoder_hidden_states = encoder_hidden_states
+        # This condition branch is mainly for FLUX series.
         if self.single_transformer_blocks is not None:
             assert Bn_compute_blocks() <= len(self.single_transformer_blocks), (
                 f"Bn_compute_blocks {Bn_compute_blocks()} must be less than "
                 f"the number of single transformer blocks {len(self.single_transformer_blocks)}"
             )
-[12 lines removed; content not captured in this rendering]
+            if is_diffusers_at_least_0_3_5():
+                if len(Bn_compute_blocks_ids()) > 0:
+                    # NOTE: Reuse _compute_and_cache_transformer_block here.
+                    for i, block in enumerate(
+                        self._Bn_single_transformer_blocks()
+                    ):
+                        hidden_states, encoder_hidden_states = (
+                            self._compute_and_cache_transformer_block(
+                                i,
+                                block,
+                                hidden_states,
+                                encoder_hidden_states,
+                                *args,
+                                **kwargs,
+                            )
+                        )
+                else:
+                    # Compute all Bn blocks if no specific Bn compute blocks ids are set.
+                    for block in self._Bn_single_transformer_blocks():
+                        encoder_hidden_states, hidden_states = block(
                             hidden_states,
+                            encoder_hidden_states,
                             *args,
                             **kwargs,
                         )
-            )
             else:
-[6 lines removed; content not captured in this rendering]
-                )
-[7 lines removed; content not captured in this rendering]
+                hidden_states = torch.cat(
+                    [encoder_hidden_states, hidden_states], dim=1
+                )
+                if len(Bn_compute_blocks_ids()) > 0:
+                    for i, block in enumerate(
+                        self._Bn_single_transformer_blocks()
+                    ):
+                        hidden_states = (
+                            self._compute_and_cache_single_transformer_block(
+                                i,
+                                original_hidden_states,
+                                original_encoder_hidden_states,
+                                block,
+                                hidden_states,
+                                *args,
+                                **kwargs,
+                            )
+                        )
+                else:
+                    # Compute all Bn blocks if no specific Bn compute blocks ids are set.
+                    for block in self._Bn_single_transformer_blocks():
+                        hidden_states = block(
+                            hidden_states,
+                            *args,
+                            **kwargs,
+                        )
+                encoder_hidden_states, hidden_states = hidden_states.split(
+                    [
+                        encoder_hidden_states.shape[1],
+                        hidden_states.shape[1] - encoder_hidden_states.shape[1],
+                    ],
+                    dim=1,
+                )
         else:
             assert Bn_compute_blocks() <= len(self.transformer_blocks), (
                 f"Bn_compute_blocks {Bn_compute_blocks()} must be less than "
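The common thread in these cache_context.py changes is that newer diffusers releases changed how FLUX single transformer blocks are called. A simplified sketch of the two calling conventions the new `is_diffusers_at_least_0_3_5()` branch selects between (this is an illustration, not code from the diff; `block` stands in for a FLUX single transformer block and extra arguments are omitted):

```python
import torch
from cache_dit.utils import is_diffusers_at_least_0_3_5


def run_single_block(block, hidden_states, encoder_hidden_states):
    if is_diffusers_at_least_0_3_5():
        # Newer diffusers: single blocks take and return the text and image
        # streams separately.
        encoder_hidden_states, hidden_states = block(
            hidden_states, encoder_hidden_states
        )
    else:
        # Older diffusers: the streams are concatenated along the sequence
        # dimension, run through the block, then split back apart.
        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
        hidden_states = block(hidden_states)
        encoder_hidden_states, hidden_states = hidden_states.split(
            [
                encoder_hidden_states.shape[1],
                hidden_states.shape[1] - encoder_hidden_states.shape[1],
            ],
            dim=1,
        )
    return hidden_states, encoder_hidden_states
```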
cache_dit/cache_factory/dual_block_cache/diffusers_adapters/__init__.py
CHANGED

@@ -15,6 +15,8 @@ def apply_db_cache_on_transformer(transformer, *args, **kwargs):
         adapter_name = "wan"
     elif transformer_cls_name.startswith("HunyuanVideo"):
         adapter_name = "hunyuan_video"
+    elif transformer_cls_name.startswith("QwenImage"):
+        adapter_name = "qwen_image"
     else:
         raise ValueError(
             f"Unknown transformer class name: {transformer_cls_name}"
@@ -41,6 +43,8 @@ def apply_db_cache_on_pipe(pipe: DiffusionPipeline, *args, **kwargs):
         adapter_name = "wan"
     elif pipe_cls_name.startswith("HunyuanVideo"):
         adapter_name = "hunyuan_video"
+    elif pipe_cls_name.startswith("QwenImage"):
+        adapter_name = "qwen_image"
     else:
         raise ValueError(f"Unknown pipeline class name: {pipe_cls_name}")

cache_dit/cache_factory/dual_block_cache/diffusers_adapters/qwen_image.py
ADDED

@@ -0,0 +1,88 @@
+import functools
+import unittest
+
+import torch
+from diffusers import QwenImagePipeline, QwenImageTransformer2DModel
+
+from cache_dit.cache_factory.dual_block_cache import cache_context
+
+
+def apply_db_cache_on_transformer(
+    transformer: QwenImageTransformer2DModel,
+):
+    if getattr(transformer, "_is_cached", False):
+        return transformer
+
+    transformer_blocks = torch.nn.ModuleList(
+        [
+            cache_context.DBCachedTransformerBlocks(
+                transformer.transformer_blocks,
+                transformer=transformer,
+                return_hidden_states_first=False,
+            )
+        ]
+    )
+
+    original_forward = transformer.forward
+
+    @functools.wraps(transformer.__class__.forward)
+    def new_forward(
+        self,
+        *args,
+        **kwargs,
+    ):
+        with unittest.mock.patch.object(
+            self,
+            "transformer_blocks",
+            transformer_blocks,
+        ):
+            return original_forward(
+                *args,
+                **kwargs,
+            )
+
+    transformer.forward = new_forward.__get__(transformer)
+
+    transformer._is_cached = True
+
+    return transformer
+
+
+def apply_db_cache_on_pipe(
+    pipe: QwenImagePipeline,
+    *,
+    shallow_patch: bool = False,
+    residual_diff_threshold=0.06,
+    downsample_factor=1,
+    warmup_steps=0,
+    max_cached_steps=-1,
+    **kwargs,
+):
+    cache_kwargs, kwargs = cache_context.collect_cache_kwargs(
+        default_attrs={
+            "residual_diff_threshold": residual_diff_threshold,
+            "downsample_factor": downsample_factor,
+            "warmup_steps": warmup_steps,
+            "max_cached_steps": max_cached_steps,
+        },
+        **kwargs,
+    )
+    if not getattr(pipe, "_is_cached", False):
+        original_call = pipe.__class__.__call__
+
+        @functools.wraps(original_call)
+        def new_call(self, *args, **kwargs):
+            with cache_context.cache_context(
+                cache_context.create_cache_context(
+                    **cache_kwargs,
+                )
+            ):
+                return original_call(self, *args, **kwargs)
+
+        pipe.__class__.__call__ = new_call
+        pipe.__class__._is_cached = True
+
+    if not shallow_patch:
+        apply_db_cache_on_transformer(pipe.transformer)
+
+    return pipe
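For context, a minimal usage sketch of the new Qwen-Image adapter (not part of the diff; the checkpoint id and argument values below are assumptions, adjust to your setup):

```python
import torch
from diffusers import QwenImagePipeline

from cache_dit.cache_factory.dual_block_cache.diffusers_adapters import (
    apply_db_cache_on_pipe,
)

# Assumed checkpoint id; any QwenImagePipeline checkpoint should work.
pipe = QwenImagePipeline.from_pretrained(
    "Qwen/Qwen-Image", torch_dtype=torch.bfloat16
).to("cuda")

# Patch the pipeline so DBCache is active for every __call__.
apply_db_cache_on_pipe(pipe, residual_diff_threshold=0.06, warmup_steps=0)

image = pipe("A cat holding a sign that says hello world").images[0]
image.save("qwen_image_dbcache.png")
```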
cache_dit/cache_factory/dual_block_cache/diffusers_adapters/wan.py
CHANGED

@@ -2,7 +2,7 @@ import functools
 import unittest

 import torch
-from diffusers import
+from diffusers import WanPipeline, WanTransformer3DModel

 from cache_dit.cache_factory.dual_block_cache import cache_context

@@ -49,7 +49,7 @@ def apply_db_cache_on_transformer(


 def apply_db_cache_on_pipe(
-    pipe:
+    pipe: WanPipeline,
     *,
     shallow_patch: bool = False,
     residual_diff_threshold=0.03,
cache_dit/cache_factory/dynamic_block_prune/prune_context.py
CHANGED

@@ -7,7 +7,8 @@ from typing import Any, Dict, List, Optional, Union

 import torch

-import cache_dit.primitives as
+import cache_dit.primitives as primitives
+from cache_dit.utils import is_diffusers_at_least_0_3_5
 from cache_dit.logger import init_logger

 logger = init_logger(__name__)
@@ -446,8 +447,8 @@ def are_two_tensors_similar(
     mean_t1 = t1.abs().mean()

     if parallelized:
-        mean_diff =
-        mean_t1 =
+        mean_diff = primitives.all_reduce_sync(mean_diff, "avg")
+        mean_t1 = primitives.all_reduce_sync(mean_t1, "avg")

     # D = (t1 - t2) / t1 = 1 - (t2 / t1), if D = 0, then t1 = t2.
     # Futher, if we assume that (H(t, 0) - H(t-1,0)) ~ 0, then,
@@ -936,27 +937,44 @@ class DBPrunedTransformerBlocks(torch.nn.Module):
         )

         if self.single_transformer_blocks is not None:
-[12 lines removed; content not captured in this rendering]
+            # https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/transformers/transformer_flux.py#L380
+            if is_diffusers_at_least_0_3_5():
+                for j, block in enumerate(self.single_transformer_blocks):
+                    # NOTE: Reuse _compute_or_prune_transformer_block here.
+                    hidden_states, encoder_hidden_states = (
+                        self._compute_or_prune_transformer_block(
+                            j + len(self.transformer_blocks),
+                            block,
+                            hidden_states,
+                            encoder_hidden_states,
+                            *args,
+                            **kwargs,
+                        )
+                    )
+            else:
+                hidden_states = torch.cat(
+                    [encoder_hidden_states, hidden_states], dim=1
                 )
+                for j, block in enumerate(self.single_transformer_blocks):
+                    hidden_states = (
+                        self._compute_or_prune_single_transformer_block(
+                            j + len(self.transformer_blocks),
+                            original_hidden_states,
+                            original_encoder_hidden_states,
+                            block,
+                            hidden_states,
+                            *args,
+                            **kwargs,
+                        )
+                    )

-[7 lines removed; content not captured in this rendering]
+                encoder_hidden_states, hidden_states = hidden_states.split(
+                    [
+                        encoder_hidden_states.shape[1],
+                        hidden_states.shape[1] - encoder_hidden_states.shape[1],
+                    ],
+                    dim=1,
+                )

         hidden_states = (
             hidden_states.reshape(-1)
cache_dit/cache_factory/first_block_cache/cache_context.py
CHANGED

@@ -7,7 +7,7 @@ from typing import Any, DefaultDict, Dict, List, Optional, Union

 import torch

-import cache_dit.primitives as
+import cache_dit.primitives as primitives
 from cache_dit.cache_factory.taylorseer import TaylorSeer
 from cache_dit.logger import init_logger

@@ -348,8 +348,8 @@ def are_two_tensors_similar(
     mean_diff = (t1 - t2).abs().mean()
     mean_t1 = t1.abs().mean()
     if parallelized:
-        mean_diff =
-        mean_t1 =
+        mean_diff = primitives.all_reduce_sync(mean_diff, "avg")
+        mean_t1 = primitives.all_reduce_sync(mean_t1, "avg")
     diff = (mean_diff / mean_t1).item()

     add_residual_diff(diff)
cache_dit/utils.py
ADDED
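The body of the new cache_dit/utils.py is not captured in this rendering; based on the import sites above, it provides `is_diffusers_at_least_0_3_5()`. A minimal sketch of such a version gate (the 0.35.0 cutoff and the use of `packaging` are assumptions, not taken from the diff):

```python
# Sketch only; the released utils.py may differ.
import diffusers
from packaging import version


def is_diffusers_at_least_0_3_5() -> bool:
    # Assumed: gate on diffusers >= 0.35.0, where the FLUX single-block API changed.
    return version.parse(diffusers.__version__) >= version.parse("0.35.0")
```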
{cache_dit-0.2.13.dist-info → cache_dit-0.2.15.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cache_dit
-Version: 0.2.13
+Version: 0.2.15
 Summary: 🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration Toolbox for Diffusion Transformers
 Author: DefTruth, vipshop.com, etc.
 Maintainer: DefTruth, vipshop.com, etc
@@ -52,109 +52,14 @@ Dynamic: requires-python
 <img src=https://img.shields.io/badge/Python-3.10|3.11|3.12-9cf.svg >
 <img src=https://img.shields.io/badge/Release-v0.2-brightgreen.svg >
 </div>
-
-DeepCache is for UNet not DiT. Most DiT cache speedups are complex and not training-free. CacheDiT offers <br>a set of training-free cache accelerators for DiT: <b>🔥<a href="#dbcache">DBCache</a>, <a href="#dbprune">DBPrune</a>, <a href="#taylorseer">TaylorSeer</a>, <a href="#fbcache">FBCache</a></b>, etc🔥
-</p>
-</div>
-
-<div align="center">
-<p align="center">
-<b>♥️ Please consider to leave a ⭐️ Star to support us ~ ♥️</b>
-</p>
-</div>
-
-## 🔥News🔥
-- [2025-07-18] 🎉First caching mechanism in **[🤗huggingface/flux-fast](https://github.com/huggingface/flux-fast)** with **[cache-dit](https://github.com/vipshop/cache-dit)**, also check the **[PR](https://github.com/huggingface/flux-fast/pull/13)**.
-- [2025-07-13] **[🤗flux-faster](https://github.com/xlite-dev/flux-faster)** is released! A forked version of **[🤗huggingface/flux-fast](https://github.com/huggingface/flux-fast)** that **makes flux-fast even faster** with **[cache-dit](https://github.com/vipshop/cache-dit)**, **3.3x** speedup on NVIDIA L20.
-
-## 🤗 Introduction
-
-<div align="center">
-<p align="center">
-<h3>🔥DBCache: Dual Block Caching for Diffusion Transformers</h3>
-</p>
-</div>
-
-**DBCache**: **Dual Block Caching** for Diffusion Transformers. We have enhanced `FBCache` into a more general and customizable cache algorithm, namely `DBCache`, enabling it to achieve fully `UNet-style` cache acceleration for DiT models. Different configurations of compute blocks (**F8B12**, etc.) can be customized in DBCache. Moreover, it can be entirely **training**-**free**. DBCache can strike a perfect **balance** between performance and precision!
-
-<div align="center">
-<p align="center">
-DBCache, <b> L20x1 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
-</p>
-</div>
-
-|Baseline(L20x1)|F1B0 (0.08)|F1B0 (0.20)|F8B8 (0.15)|F12B12 (0.20)|F16B16 (0.20)|
-|:---:|:---:|:---:|:---:|:---:|:---:|
-|24.85s|15.59s|8.58s|15.41s|15.11s|17.74s|
-|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.08_S11.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.2_S19.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F8B8S1_R0.15_S15.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F12B12S4_R0.2_S16.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F16B16S4_R0.2_S13.png width=105px>|
-|**Baseline(L20x1)**|**F1B0 (0.08)**|**F8B8 (0.12)**|**F8B12 (0.12)**|**F8B16 (0.20)**|**F8B20 (0.20)**|
-|27.85s|6.04s|5.88s|5.77s|6.01s|6.20s|
-|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_NONE_R0.08.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F1B0_R0.08.png width=105px> |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B8_R0.12.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B12_R0.12.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B16_R0.2.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B20_R0.2.png width=105px>|
-
-<div align="center">
-<p align="center">
-DBCache, <b> L20x4 </b>, Steps: 20, case to show the texture recovery ability of DBCache
-</p>
-</div>
-
-These case studies demonstrate that even with relatively high thresholds (such as 0.12, 0.15, 0.2, etc.) under the DBCache **F12B12** or **F8B16** configuration, the detailed texture of the kitten's fur, colored cloth, and the clarity of text can still be preserved. This suggests that users can leverage DBCache to effectively balance performance and precision in their workflows!
-
-<div align="center">
-<p align="center">
-<h3>🔥DBPrune: Dynamic Block Prune with Residual Caching</h3>
-</p>
-</div>
-
-**DBPrune**: We have further implemented a new **Dynamic Block Prune** algorithm based on **Residual Caching** for Diffusion Transformers, referred to as DBPrune. DBPrune caches each block's hidden states and residuals, then **dynamically prunes** blocks during inference by computing the L1 distance between previous hidden states. When a block is pruned, its output is approximated using the cached residuals.
-
-<div align="center">
-<p align="center">
-DBPrune, <b> L20x1 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
-</p>
-</div>
-
-|Baseline(L20x1)|Pruned(24%)|Pruned(35%)|Pruned(38%)|Pruned(45%)|Pruned(60%)|
-|:---:|:---:|:---:|:---:|:---:|:---:|
-|24.85s|19.43s|16.82s|15.95s|14.24s|10.66s|
-|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png width=105px>|
-
-<div align="center">
-<p align="center">
-<h3>🔥Context Parallelism and Torch Compile</h3>
-</p>
-</div>
-
-Moreover, **CacheDiT** are **plug-and-play** solutions that works hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can easily tap into its **Context Parallelism** features for distributed inference. CacheDiT is designed to work compatibly with **torch.compile.** You can easily use CacheDiT with torch.compile to further achieve a better performance.
-
-<div align="center">
-<p align="center">
-DBPrune + <b>torch.compile + context parallelism</b> <br>Steps: 28, "A cat holding a sign that says hello world with complex background"
-</p>
+<b>🔥<a href="#dbcache">DBCache</a> | <a href="#dbprune">DBPrune</a> | <a href="#taylorseer">Hybrid TaylorSeer</a> | <a href="#cfg">Hybrid Cache CFG</a> | <a href="#fbcache">FBCache</a></b>🔥
 </div>

-[6 lines removed; content not captured in this rendering]
-## ©️Citations
-
-```BibTeX
-@misc{CacheDiT@2025,
-  title={CacheDiT: A Training-free and Easy-to-use cache acceleration Toolbox for Diffusion Transformers},
-  url={https://github.com/vipshop/cache-dit.git},
-  note={Open-source software available at https://github.com/vipshop/cache-dit.git},
-  author={vipshop.com},
-  year={2025}
-}
-```
-
-## 👋Reference
-
-<div id="reference"></div>
-
-The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi/ParaAttention/tree/main/src/para_attn/first_block_cache). Special thanks to their excellent work!
+## 🔥News
+- [2025-08-11] 🔥[Qwen-Image](./examples/run_qwen_image.py) is supported! Please check [run_qwen_image.py](./examples/run_qwen_image.py) as an example.
+- [2025-08-10] 🔥[FLUX.1-Kontext-dev](./examples/run_flux_kontext.py) is supported! Please check [run_flux_kontext.py](./examples/run_flux_kontext.py) as an example.
+- [2025-07-18] 🎉First caching mechanism in [🤗huggingface/flux-fast](https://github.com/huggingface/flux-fast) with **[cache-dit](https://github.com/vipshop/cache-dit)**, also check the [PR](https://github.com/huggingface/flux-fast/pull/13).
+- [2025-07-13] **[🤗flux-faster](https://github.com/xlite-dev/flux-faster)** is released! **3.3x** speedup for FLUX.1 on NVIDIA L20 with `cache-dit`.

 ## 📖Contents

@@ -172,6 +77,7 @@ The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi
 - [⚙️Metrics CLI](#metrics)
 - [👋Contribute](#contribute)
 - [©️License](#license)
+- [©️Citations](#citations)

 ## ⚙️Installation

@@ -192,8 +98,10 @@ pip3 install git+https://github.com/vipshop/cache-dit.git

 <div id="supported"></div>

+- [🚀Qwen-Image](https://github.com/vipshop/cache-dit/raw/main/examples)
 - [🚀FLUX.1-dev](https://github.com/vipshop/cache-dit/raw/main/examples)
 - [🚀FLUX.1-Fill-dev](https://github.com/vipshop/cache-dit/raw/main/examples)
+- [🚀FLUX.1-Kontext-dev](https://github.com/vipshop/cache-dit/raw/main/examples)
 - [🚀mochi-1-preview](https://github.com/vipshop/cache-dit/raw/main/examples)
 - [🚀CogVideoX](https://github.com/vipshop/cache-dit/raw/main/examples)
 - [🚀CogVideoX1.5](https://github.com/vipshop/cache-dit/raw/main/examples)
@@ -208,6 +116,32 @@ pip3 install git+https://github.com/vipshop/cache-dit.git

 

+
+**DBCache**: **Dual Block Caching** for Diffusion Transformers. We have enhanced `FBCache` into a more general and customizable cache algorithm, namely `DBCache`, enabling it to achieve fully `UNet-style` cache acceleration for DiT models. Different configurations of compute blocks (**F8B12**, etc.) can be customized in DBCache. Moreover, it can be entirely **training**-**free**. DBCache can strike a perfect **balance** between performance and precision!
+
+<div align="center">
+<p align="center">
+DBCache, <b> L20x1 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
+</p>
+</div>
+
+|Baseline(L20x1)|F1B0 (0.08)|F1B0 (0.20)|F8B8 (0.15)|F12B12 (0.20)|F16B16 (0.20)|
+|:---:|:---:|:---:|:---:|:---:|:---:|
+|24.85s|15.59s|8.58s|15.41s|15.11s|17.74s|
+|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.08_S11.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.2_S19.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F8B8S1_R0.15_S15.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F12B12S4_R0.2_S16.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F16B16S4_R0.2_S13.png width=105px>|
+|**Baseline(L20x1)**|**F1B0 (0.08)**|**F8B8 (0.12)**|**F8B12 (0.12)**|**F8B16 (0.20)**|**F8B20 (0.20)**|
+|27.85s|6.04s|5.88s|5.77s|6.01s|6.20s|
+|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_NONE_R0.08.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F1B0_R0.08.png width=105px> |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B8_R0.12.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B12_R0.12.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B16_R0.2.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B20_R0.2.png width=105px>|
+
+<div align="center">
+<p align="center">
+DBCache, <b> L20x4 </b>, Steps: 20, case to show the texture recovery ability of DBCache
+</p>
+</div>
+
+These case studies demonstrate that even with relatively high thresholds (such as 0.12, 0.15, 0.2, etc.) under the DBCache **F12B12** or **F8B16** configuration, the detailed texture of the kitten's fur, colored cloth, and the clarity of text can still be preserved. This suggests that users can leverage DBCache to effectively balance performance and precision in their workflows!
+
+
 **DBCache** provides configurable parameters for custom optimization, enabling a balanced trade-off between performance and precision:

 - **Fn**: Specifies that DBCache uses the **first n** Transformer blocks to fit the information at time step t, enabling the calculation of a more stable L1 diff and delivering more accurate information to subsequent blocks.
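Editor's aside: the Fn/Bn knobs described above feed the `cache_options` dict that the README configures (partially visible in the next hunk). A hedged sketch of a typical F8B8 setup, assuming the package exposes `apply_cache_on_pipe` and `CacheType` from `cache_dit.cache_factory` (entry points not shown in this diff):

```python
import torch
from diffusers import FluxPipeline

# Assumed public API of cache_dit.cache_factory; verify against the installed version.
from cache_dit.cache_factory import CacheType, apply_cache_on_pipe

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

cache_options = {
    "cache_type": CacheType.DBCache,   # assumed enum value
    "warmup_steps": 8,                 # compute the first steps without caching
    "max_cached_steps": -1,            # -1: no limit on cached steps
    "Fn_compute_blocks": 8,            # F8: always compute the first 8 blocks
    "Bn_compute_blocks": 8,            # B8: always compute the last 8 blocks
    "residual_diff_threshold": 0.12,
}
apply_cache_on_pipe(pipe, **cache_options)
```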
@@ -259,17 +193,6 @@ cache_options = {
 }
 ```

-<div align="center">
-<p align="center">
-DBCache, <b> L20x1 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
-</p>
-</div>
-
-|Baseline(L20x1)|F1B0 (0.08)|F1B0 (0.20)|F8B8 (0.15)|F12B12 (0.20)|F16B16 (0.20)|
-|:---:|:---:|:---:|:---:|:---:|:---:|
-|24.85s|15.59s|8.58s|15.41s|15.11s|17.74s|
-|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.08_S11.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.2_S19.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F8B8S1_R0.15_S15.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F12B12S4_R0.2_S16.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F16B16S4_R0.2_S13.png width=105px>|
-
 ## 🔥Hybrid TaylorSeer

 <div id="taylorseer"></div>
@@ -490,6 +413,18 @@ Then, run the python test script with `torchrun`:
 torchrun --nproc_per_node=4 parallel_cache.py
 ```

+<div align="center">
+<p align="center">
+DBPrune + <b>torch.compile + context parallelism</b> <br>Steps: 28, "A cat holding a sign that says hello world with complex background"
+</p>
+</div>
+
+|Baseline|Pruned(24%)|Pruned(35%)|Pruned(38%)|Pruned(45%)|Pruned(60%)|
+|:---:|:---:|:---:|:---:|:---:|:---:|
+|+compile:20.43s|16.25s|14.12s|13.41s|12.00s|8.86s|
+|+L20x4:7.75s|6.62s|6.03s|5.81s|5.24s|3.93s|
+|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_NONE_R0.08_S0_T20.43s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.03_P24.0_T16.25s.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.04_P34.6_T14.12s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.045_P38.2_T13.41s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.055_P45.1_T12.00s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.2_P59.5_T8.86s.png width=105px>|
+
 ## 🔥Torch Compile

 <div id="compile"></div>
@@ -551,5 +486,18 @@ How to contribute? Star ⭐️ this repo to support us or check [CONTRIBUTE.md](

 <div id="license"></div>

+The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi/ParaAttention/tree/main/src/para_attn/first_block_cache). Special thanks to their excellent work! We have followed the original License from [FBCache](https://github.com/chengzeyi/ParaAttention), please check [LICENSE](https://github.com/vipshop/cache-dit/raw/main/LICENSE) for more details.
+
+## ©️Citations
+
+<div id="citations"></div>

-
+```BibTeX
+@misc{CacheDiT@2025,
+  title={CacheDiT: A Training-free and Easy-to-use cache acceleration Toolbox for Diffusion Transformers},
+  url={https://github.com/vipshop/cache-dit.git},
+  note={Open-source software available at https://github.com/vipshop/cache-dit.git},
+  author={vipshop.com},
+  year={2025}
+}
+```
{cache_dit-0.2.13.dist-info → cache_dit-0.2.15.dist-info}/RECORD
CHANGED

@@ -1,21 +1,23 @@
-cache_dit/__init__.py,sha256=
-cache_dit/_version.py,sha256=
+cache_dit/__init__.py,sha256=0-B173-fLi3IA8nJXoS71zK0zD33Xplysd9skmLfEOY,171
+cache_dit/_version.py,sha256=TspyaLI34cMH7HNr-lrUKFnY4CF39yGfG_421vC3fhg,513
 cache_dit/logger.py,sha256=0zsu42hN-3-rgGC_C29ms1IvVpV4_b4_SwJCKSenxBE,4304
 cache_dit/primitives.py,sha256=A2iG9YLot3gOsZSPp-_gyjqjLgJvWQRx8aitD4JQ23Y,3877
+cache_dit/utils.py,sha256=4cFNh0asch6Zgsixq0bS1ElfwBu_6BG5ZSmaa1khjyg,144
 cache_dit/cache_factory/__init__.py,sha256=iYQwLwB_XLoYl0OB9unZGDbBtrYvZaLkOAmhGRwdW2E,191
 cache_dit/cache_factory/adapters.py,sha256=QMCaXnmqM7NT7sx4bCF1mMLn-QcXX9h1RmgLAypDedg,5256
 cache_dit/cache_factory/taylorseer.py,sha256=LKSNo2ode69EVo9xrxjxAMEjz0yDGiGADeDYnEqddA8,3987
 cache_dit/cache_factory/utils.py,sha256=V-Mb5Jn07geEUUWo4QAfh6pmSzkL-2OGDn0VAXbG6hQ,1799
 cache_dit/cache_factory/dual_block_cache/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cache_dit/cache_factory/dual_block_cache/cache_context.py,sha256=
-cache_dit/cache_factory/dual_block_cache/diffusers_adapters/__init__.py,sha256=
+cache_dit/cache_factory/dual_block_cache/cache_context.py,sha256=sJ9yxQlcrX4qkPln94FrL0WDe2WIn3_UD2-Mk8YtjSw,73301
+cache_dit/cache_factory/dual_block_cache/diffusers_adapters/__init__.py,sha256=uSqF5aD2-feHB25vEbx1STBQVjWVAOn_wYTdAEmS4NU,2045
 cache_dit/cache_factory/dual_block_cache/diffusers_adapters/cogvideox.py,sha256=3xUjvDzor9AkBkDUc0N7kZqM86MIdajuigesnicNzXE,2260
 cache_dit/cache_factory/dual_block_cache/diffusers_adapters/flux.py,sha256=cIsov6Pf0dRyddqkzTA2CU-jSDotof8LQr-HIoY9T9M,2615
 cache_dit/cache_factory/dual_block_cache/diffusers_adapters/hunyuan_video.py,sha256=SO4q39PQuQ5QVHy5Z-ubiKdstzvQPedONN2J5oiGUh0,9955
 cache_dit/cache_factory/dual_block_cache/diffusers_adapters/mochi.py,sha256=8W9m-WeEVE2ytYi9udKEA8Wtb0EnvP3eT2A1Tu-d29k,2252
-cache_dit/cache_factory/dual_block_cache/diffusers_adapters/
+cache_dit/cache_factory/dual_block_cache/diffusers_adapters/qwen_image.py,sha256=ZIjB4GFIL0_xY8FkXdOJJ9-Xcft54rnBCrz43VWZLi0,2296
+cache_dit/cache_factory/dual_block_cache/diffusers_adapters/wan.py,sha256=EH2PpYFJ76KQLb35za6T-nM8Q10owLNatv6cd480ydE,2584
 cache_dit/cache_factory/dynamic_block_prune/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cache_dit/cache_factory/dynamic_block_prune/prune_context.py,sha256=
+cache_dit/cache_factory/dynamic_block_prune/prune_context.py,sha256=1qarKAsEFiaaN2_ghko2dqGz_R7BTQSOyGtb_eQq38Y,35716
 cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/__init__.py,sha256=hVBTXj9MMGFGVezT3j8MntFRBiphSaUL4YhSOd8JtuY,1870
 cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/cogvideox.py,sha256=KP8NxtHAKzzBOoX0lhvlMgY_5dmP4Z3T5TOfwl4SSyg,2273
 cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/flux.py,sha256=kCB7lL4OIq8TZn-baMIF8D_PVPTFW60omCMVQCb8ebs,2628
@@ -23,7 +25,7 @@ cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/hunyuan_video.py,
 cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/mochi.py,sha256=zXgoRDDjus3a2WSjtNh4ERtQp20ceb6nzohHMDlo2zY,2265
 cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/wan.py,sha256=PA7nuLgfAelnaI8usQx0Kxi8XATzMapyR1WndEdFoZA,2604
 cache_dit/cache_factory/first_block_cache/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cache_dit/cache_factory/first_block_cache/cache_context.py,sha256=
+cache_dit/cache_factory/first_block_cache/cache_context.py,sha256=qn4zWJ_eEMIPYzrxXoslunxbzK0WueuNtC54Pp5Q57k,23241
 cache_dit/cache_factory/first_block_cache/diffusers_adapters/__init__.py,sha256=-FFgA2MoudEo7uDacg4aWgm1KwfLZFsEDTVxatgbq9M,2146
 cache_dit/cache_factory/first_block_cache/diffusers_adapters/cogvideox.py,sha256=qO5CWyurtwW30mvOe6cxeQPTSXLDlPJcezm72zEjDq8,2375
 cache_dit/cache_factory/first_block_cache/diffusers_adapters/flux.py,sha256=Dcd4OzABCtyQCZNX2KNnUTdVoO1E1ApM7P8gcVYzcK0,2733
@@ -40,9 +42,9 @@ cache_dit/metrics/fid.py,sha256=9Ivtazl6mW0Bon2VXa-Ia5Xj2ewxRD3V1Qkd69zYM3Y,1706
 cache_dit/metrics/inception.py,sha256=pBVe2X6ylLPIXTG4-GWDM9DWnCviMJbJ45R3ulhktR0,12759
 cache_dit/metrics/lpips.py,sha256=I2qCNi6qJh5TRsaIsdxO0WoRX1DN7U_H3zS0oCSahYM,1032
 cache_dit/metrics/metrics.py,sha256=8jvM1sF-nDxUuwCRy44QEoo4dYVLCQVh1QyAMs4eaQY,27840
-cache_dit-0.2.
-cache_dit-0.2.
-cache_dit-0.2.
-cache_dit-0.2.
-cache_dit-0.2.
-cache_dit-0.2.
+cache_dit-0.2.15.dist-info/licenses/LICENSE,sha256=Dqb07Ik2dV41s9nIdMUbiRWEfDqo7-dQeRiY7kPO8PE,3769
+cache_dit-0.2.15.dist-info/METADATA,sha256=7cxw9ZaYpCFbtA1iDt-CWhk07mDXuRPECxfuO-wB0IE,25153
+cache_dit-0.2.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+cache_dit-0.2.15.dist-info/entry_points.txt,sha256=FX2gysXaZx6NeK1iCLMcIdP8Q4_qikkIHtEmi3oWn8o,65
+cache_dit-0.2.15.dist-info/top_level.txt,sha256=ZJDydonLEhujzz0FOkVbO-BqfzO9d_VqRHmZU-3MOZo,10
+cache_dit-0.2.15.dist-info/RECORD,,
Files without changes (per the +0 -0 entries in the file list above): {cache_dit-0.2.13.dist-info → cache_dit-0.2.15.dist-info}/WHEEL, entry_points.txt, licenses/LICENSE, top_level.txt