PyPI - cache-dit - Versions diffs - 0.3.2__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

cache-dit 0.3.2py3-none-any.whl → 1.0.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cache-dit might be problematic. Click here for more details.

Files changed (13) hide show

cache_dit/_version.py CHANGED Viewed

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
-__version__ = version = '0.3.2'
-__version_tuple__ = version_tuple = (0, 3, 2)
+__version__ = version = '1.0.0'
+__version_tuple__ = version_tuple = (1, 0, 0)
 __commit_id__ = commit_id = None

cache_dit/cache_factory/block_adapters/block_adapters.py CHANGED Viewed

@@ -113,6 +113,19 @@ class BlockAdapter:
         if any((self.pipe is not None, self.transformer is not None)):
             self.maybe_fill_attrs()
             self.maybe_patchify()
+            self.maybe_skip_checks()
+    def maybe_skip_checks(self):
+        if getattr(self.transformer, "_hf_hook", None) is not None:
+            logger.warning("_hf_hook is not None, force skip pattern check!")
+            self.check_forward_pattern = False
+            self.check_num_outputs = False
+        elif getattr(self.transformer, "_diffusers_hook", None) is not None:
+            logger.warning(
+                "_diffusers_hook is not None, force skip pattern check!"
+            )
+            self.check_forward_pattern = False
+            self.check_num_outputs = False
     def maybe_fill_attrs(self):
         # NOTE: This func should be called before normalize.

cache_dit/cache_factory/cache_adapters/cache_adapter.py CHANGED Viewed

@@ -1,10 +1,8 @@
 import torch
 import unittest
 import functools
 from contextlib import ExitStack
-from typing import Dict, List, Tuple, Any, Union, Callable
+from typing import Dict, List, Tuple, Any, Union, Callable, Optional
 from diffusers import DiffusionPipeline
@@ -16,7 +14,7 @@ from cache_dit.cache_factory.cache_contexts import CachedContextManager
 from cache_dit.cache_factory.cache_contexts import BasicCacheConfig
 from cache_dit.cache_factory.cache_contexts import CalibratorConfig
 from cache_dit.cache_factory.cache_blocks import CachedBlocks
-from cache_dit.cache_factory.cache_blocks.utils import (
+from cache_dit.cache_factory.cache_blocks import (
     patch_cached_stats,
     remove_cached_stats,
 )
@@ -330,7 +328,26 @@ class CachedAdapter:
         assert isinstance(dummy_blocks_names, list)
-        @functools.wraps(original_forward)
+        from accelerate import hooks
+        _hf_hook: Optional[hooks.ModelHook] = None
+        if getattr(transformer, "_hf_hook", None) is not None:
+            _hf_hook = transformer._hf_hook  # hooks from accelerate.hooks
+            if hasattr(transformer, "_old_forward"):
+                logger.warning(
+                    "_hf_hook is not None, so, we have to re-direct transformer's "
+                    f"original_forward({id(original_forward)}) to transformer's "
+                    f"_old_forward({id(transformer._old_forward)})"
+                )
+                original_forward = transformer._old_forward
+        # TODO: remove group offload hooks, then re-apply them after the cache is applied.
+        # hooks = _diffusers_hook.hooks.copy(); _diffusers_hook.hooks.clear()
+        # re-apply hooks to transformer after cache applied.
+        # from diffusers.hooks.hooks import HookFunctionReference, HookRegistry
+        # from diffusers.hooks.group_offloading import apply_group_offloading
         def new_forward(self, *args, **kwargs):
             with ExitStack() as stack:
                 for name, context_name in zip(
@@ -348,9 +365,27 @@ class CachedAdapter:
                             self, dummy_name, dummy_blocks
                         )
                     )
-                return original_forward(*args, **kwargs)
+                outputs = original_forward(*args, **kwargs)
+            return outputs
+        def new_forward_with_hf_hook(self, *args, **kwargs):
+            # Compatible with model cpu offload
+            if _hf_hook is not None and hasattr(_hf_hook, "pre_forward"):
+                args, kwargs = _hf_hook.pre_forward(self, *args, **kwargs)
+            outputs = new_forward(self, *args, **kwargs)
+            if _hf_hook is not None and hasattr(_hf_hook, "post_forward"):
+                outputs = _hf_hook.post_forward(self, outputs)
+            return outputs
+        # NOTE: Still not fully compatible with group offloading
+        transformer.forward = functools.update_wrapper(
+            functools.partial(new_forward_with_hf_hook, transformer),
+            new_forward_with_hf_hook,
+        )
-        transformer.forward = new_forward.__get__(transformer)
         transformer._original_forward = original_forward
         transformer._is_cached = True

cache_dit/cache_factory/cache_blocks/__init__.py CHANGED Viewed

@@ -12,6 +12,10 @@ from cache_dit.cache_factory.cache_blocks.pattern_0_1_2 import (
 from cache_dit.cache_factory.cache_blocks.pattern_3_4_5 import (
     CachedBlocks_Pattern_3_4_5,
 )
+from cache_dit.cache_factory.cache_blocks.pattern_utils import (
+    patch_cached_stats,
+    remove_cached_stats,
+)
 from cache_dit.logger import init_logger

cache_dit/cache_factory/cache_blocks/offload_utils.py ADDED Viewed

@@ -0,0 +1,115 @@
+import torch
+import asyncio
+import logging
+from contextlib import contextmanager
+from typing import Generator, Optional, List
+from diffusers.hooks.group_offloading import _is_group_offload_enabled
+from cache_dit.logger import init_logger
+logger = init_logger(__name__)
+@torch.compiler.disable
+@contextmanager
+def maybe_onload(
+    block: torch.nn.Module,
+    reference_tensor: torch.Tensor,
+    pending_tasks: List[asyncio.Task] = [],
+) -> Generator:
+    if not _is_group_offload_enabled(block):
+        yield block
+        return
+    original_devices: Optional[List[torch.device]] = None
+    if hasattr(block, "parameters"):
+        params = list(block.parameters())
+        if params:
+            original_devices = [param.data.device for param in params]
+    target_device: torch.device = reference_tensor.device
+    move_task: Optional[asyncio.Task] = None
+    need_restore: bool = False
+    try:
+        if original_devices is not None:
+            unique_devices = list(set(original_devices))
+            if len(unique_devices) > 1 or unique_devices[0] != target_device:
+                if logger.isEnabledFor(logging.DEBUG):
+                    logger.debug(
+                        f"Onloading from {unique_devices} to {target_device}"
+                    )
+                has_meta_params = any(
+                    dev.type == "meta" for dev in original_devices
+                )
+                if has_meta_params:  # compatible with sequential cpu offload
+                    block = block.to_empty(device=target_device)
+                else:
+                    block = block.to(target_device, non_blocking=False)
+                need_restore = True
+        yield block
+    finally:
+        if need_restore and original_devices:
+            async def restore_device():
+                for param, original_device in zip(
+                    block.parameters(), original_devices
+                ):
+                    param.data = await asyncio.to_thread(
+                        lambda p, d: p.to(d, non_blocking=True),
+                        param.data,  # type: torch.Tensor
+                        original_device,  # type: torch.device
+                    )  # type: ignore[assignment]
+            loop = get_event_loop()
+            move_task = loop.create_task(restore_device())
+            if move_task:
+                pending_tasks.append(move_task)
+def get_event_loop() -> asyncio.AbstractEventLoop:
+    try:
+        loop = asyncio.get_running_loop()
+    except RuntimeError:
+        try:
+            loop = asyncio.get_event_loop()
+        except RuntimeError:
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+    if not loop.is_running():
+        def run_loop() -> None:
+            asyncio.set_event_loop(loop)
+            loop.run_forever()
+        import threading
+        if not any(t.name == "_my_loop" for t in threading.enumerate()):
+            threading.Thread(
+                target=run_loop, name="_my_loop", daemon=True
+            ).start()
+    return loop
+@torch.compiler.disable
+def maybe_offload(
+    pending_tasks: List[asyncio.Task],
+) -> None:
+    if not pending_tasks:
+        return
+    loop = get_event_loop()
+    async def gather_tasks():
+        return await asyncio.gather(*pending_tasks)
+    future = asyncio.run_coroutine_threadsafe(gather_tasks(), loop)
+    try:
+        future.result(timeout=30.0)
+    except Exception as e:
+        logger.error(f"May Offload Error: {e}")
+    pending_tasks.clear()

cache_dit/cache_factory/cache_blocks/pattern_base.py CHANGED Viewed

@@ -1,7 +1,9 @@
 import inspect
+import asyncio
 import torch
 import torch.distributed as dist
+from typing import List
 from cache_dit.cache_factory.cache_contexts.cache_context import CachedContext
 from cache_dit.cache_factory.cache_contexts.cache_manager import (
     CachedContextManager,
@@ -45,6 +47,7 @@ class CachedBlocks_Pattern_Base(torch.nn.Module):
         self.cache_prefix = cache_prefix
         self.cache_context = cache_context
         self.cache_manager = cache_manager
+        self.pending_tasks: List[asyncio.Task] = []
         self._check_forward_pattern()
         logger.info(

{cache_dit-0.3.2.dist-info → cache_dit-1.0.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cache_dit
-Version: 0.3.2
+Version: 1.0.0
 Summary: A Unified, Flexible and Training-free Cache Acceleration Framework for 🤗Diffusers.
 Author: DefTruth, vipshop.com, etc.
 Maintainer: DefTruth, vipshop.com, etc
@@ -48,23 +48,31 @@ Dynamic: requires-python
 <div align="center">
   <img src=https://github.com/vipshop/cache-dit/raw/main/assets/cache-dit-logo.png height="120">
-<p align="center">
+  <p align="center">
     A <b>Unified</b>, Flexible and Training-free <b>Cache Acceleration</b> Framework for <b>🤗Diffusers</b> <br>
     ♥️ Cache Acceleration with <b>One-line</b> Code ~ ♥️
   </p>
   <div align='center'>
+      <img src="./assets/image-reward-bench.png" width=580px >
+  </div>
+  <div align='center'>
+      <a href="https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit"><img src=https://img.shields.io/badge/🤗Diffusers-ecosystem-yellow.svg ></a>
       <img src=https://img.shields.io/badge/Language-Python-brightgreen.svg >
-      <img src=https://img.shields.io/badge/PRs-welcome-9cf.svg >
+      <img src=https://img.shields.io/badge/PRs-welcome-blue.svg >
       <img src=https://img.shields.io/badge/PyPI-pass-brightgreen.svg >
       <img src=https://static.pepy.tech/badge/cache-dit >
-      <img src=https://img.shields.io/badge/Python-3.10|3.11|3.12-9cf.svg >
-      <img src=https://img.shields.io/badge/Release-v0.3-brightgreen.svg >
- </div>
+      <img src=https://img.shields.io/github/stars/vipshop/cache-dit.svg?style=dark >
+  </div>
+  <div align='center'>
+      <a href="./README.md">📚English</a> | <a href="./README_CN.md">📚中文阅读 </a> | <a href="./docs/User_Guide.md#api-documentation"> 📚API Documentation </a> | <a href="https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit">🤗Diffusers' Documentation</a>
+  </div>
+<!--
   <p align="center">
-    <b><a href="#unified">📚Unified Cache APIs</a></b> | <a href="#forward-pattern-matching">📚Forward Pattern Matching</a> | <a href="#automatic-block-adapter">📚Automatic Block Adapter</a><br>
-    <a href="#hybird-forward-pattern">📚Hybrid Forward Pattern</a> | <a href="#dbcache">📚DBCache</a> | <a href="#taylorseer">📚TaylorSeer Calibrator</a> | <a href="#cfg">📚Cache CFG</a><br>
+    <b><a href="#unified">📚Unified Cache APIs</a></b> | <a href="#forward-pattern-matching">📚Forward Pattern Matching</a> | <a href="./docs/User_Guide.md">📚Automatic Block Adapter</a><br>
+    <a href="./docs/User_Guide.md">📚Hybrid Forward Pattern</a> | <a href="#dbcache">📚DBCache</a> | <a href="./docs/User_Guide.md">📚TaylorSeer Calibrator</a> | <a href="./docs/User_Guide.md">📚Cache CFG</a><br>
     <a href="#benchmarks">📚Text2Image DrawBench</a> | <a href="#benchmarks">📚Text2Image Distillation DrawBench</a>
   </p>
+-->
   <p align="center">
     🎉Now, <b>cache-dit</b> covers almost <b>All</b> Diffusers' <b>DiT</b> Pipelines🎉<br>
     🔥<a href="#supported">Qwen-Image</a> | <a href="#supported">FLUX.1</a> | <a href="#supported">Qwen-Image-Lightning</a> | <a href="#supported"> Wan 2.1 </a> | <a href="#supported"> Wan 2.2 </a>🔥<br>
@@ -74,6 +82,8 @@ Dynamic: requires-python
     🔥<a href="#supported">Chroma</a> | <a href="#supported">Sana</a> | <a href="#supported">Allegro</a> | <a href="#supported">Mochi</a> | <a href="#supported">SD 3/3.5</a> | <a href="#supported">Amused</a> | <a href="#supported"> ... </a> | <a href="#supported">DiT-XL</a>🔥
   </p>
 </div>
 <div align='center'>
   <img src=https://github.com/vipshop/cache-dit/raw/main/assets/gifs/wan2.2.C0_Q0_NONE.gif width=124px>
   <img src=https://github.com/vipshop/cache-dit/raw/main/assets/gifs/wan2.2.C1_Q0_DBCACHE_F1B0_W2M8MC2_T1O2_R0.08.gif width=124px>
@@ -85,12 +95,6 @@ Dynamic: requires-python
   <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux.C0_Q0_NONE_T23.69s.png width=90px>
   <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux.C0_Q0_DBCACHE_F1B0_W4M0MC0_T1O2_R0.15_S16_T11.39s.png width=90px>
   <p><b>🔥Qwen-Image</b> | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.8x↑🎉 | <b>FLUX.1-dev</b> | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:2.1x↑🎉</p>
-  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext-cat.C0_L0_Q0_NONE.png width=100px>
-  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_NONE.png width=100px>
-  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_DBCACHE_F8B0_W8M0MC0_T0O2_R0.08_S10.png width=100px>
-  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_DBCACHE_F1B0_W8M0MC2_T0O2_R0.12_S12.png width=100px>
-  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_DBCACHE_F1B0_W2M0MC2_T0O2_R0.15_S15.png width=100px>
-  <p><b>🔥FLUX-Kontext-dev</b> | Baseline | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.3x↑🎉 | 1.7x↑🎉 | 2.0x↑ 🎉</p>
   <img src=https://github.com/vipshop/cache-dit/raw/main/assets/qwen-image-lightning.4steps.C0_L1_Q0_NONE.png width=160px>
   <img src=https://github.com/vipshop/cache-dit/raw/main/assets/qwen-image-lightning.4steps.C0_L1_Q0_DBCACHE_F16B16_W2M1MC1_T0O2_R0.9_S1.png width=160px>
   <img src=https://github.com/vipshop/cache-dit/raw/main/assets/hunyuan-image-2.1.C0_L0_Q1_fp8_w8a16_wo_NONE.png width=90px>
@@ -100,7 +104,22 @@ Dynamic: requires-python
   <img src=https://github.com/vipshop/cache-dit/raw/main/assets/qwen-image-edit.C0_L0_Q0_NONE.png width=125px>
   <img src=https://github.com/vipshop/cache-dit/raw/main/assets/qwen-image-edit.C0_L0_Q0_DBCACHE_F8B0_W8M0MC0_T0O2_R0.08_S18.png width=125px>
   <img src=https://github.com/vipshop/cache-dit/raw/main/assets/qwen-image-edit.C0_L0_Q0_DBCACHE_F1B0_W8M0MC2_T0O2_R0.12_S24.png width=125px>
-  <p><b>🔥Qwen-Image-Edit</b> | Input w/o Edit | Baseline | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.6x↑🎉 | 1.9x↑🎉 </p>
+  <p><b>🔥Qwen-Image-Edit</b> | Input w/o Edit | Baseline | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.6x↑🎉 | 1.9x↑🎉
+  <br>♥️ Please consider leaving a <b>⭐️ Star</b> to support us ~ ♥️
+  </p>
+</div>
+<details align='center'>
+<summary>Click here to show more Image/Video cases</summary>
+<div  align='center'>
+  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext-cat.C0_L0_Q0_NONE.png width=100px>
+  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_NONE.png width=100px>
+  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_DBCACHE_F8B0_W8M0MC0_T0O2_R0.08_S10.png width=100px>
+  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_DBCACHE_F1B0_W8M0MC2_T0O2_R0.12_S12.png width=100px>
+  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_DBCACHE_F1B0_W2M0MC2_T0O2_R0.15_S15.png width=100px>
+  <p><b>🔥FLUX-Kontext-dev</b> | Baseline | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.3x↑🎉 | 1.7x↑🎉 | 2.0x↑ 🎉</p>
   <img src=https://github.com/vipshop/cache-dit/raw/main/assets/hidream.C0_L0_Q0_NONE.png width=100px>
   <img src=https://github.com/vipshop/cache-dit/raw/main/assets/hidream.C0_L0_Q0_DBCACHE_F1B0_W8M0MC0_T0O2_R0.08_S24.png width=100px>
   <img src=https://github.com/vipshop/cache-dit/raw/main/assets/cogview4.C0_L0_Q0_NONE.png width=100px>
@@ -160,24 +179,26 @@ Dynamic: requires-python
   <p><b>🔥Amused</b> | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.1x↑🎉 | 1.2x↑🎉 | <b>DiT-XL-256</b> | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.8x↑🎉
   <br>♥️ Please consider to leave a <b>⭐️ Star</b> to support us ~ ♥️</p>
 </div>
+</details>
 ## 🔥News
-- [2025-09-10] 🎉Day 1 support [**HunyuanImage-2.1**](https://github.com/Tencent-Hunyuan/HunyuanImage-2.1) with **1.7x↑🎉** speedup! Check this [example](https://github.com/vipshop/cache-dit/raw/main/examples/pipeline/run_hunyuan_image_2.1.py).
-- [2025-09-08] 🔥[**Qwen-Image-Lightning**](https://github.com/vipshop/cache-dit/raw/main/examples/pipeline/run_qwen_image_lightning.py) **7.1/3.5 steps🎉** inference with **[DBCache: F16B16](https://github.com/vipshop/cache-dit)**.
-- [2025-09-03] 🎉[**Wan2.2-MoE**](https://github.com/Wan-Video) **2.4x↑🎉** speedup! Please refer to [run_wan_2.2.py](https://github.com/vipshop/cache-dit/raw/main/examples/pipeline/run_wan_2.2.py) as an example.
-- [2025-08-19] 🔥[**Qwen-Image-Edit**](https://github.com/QwenLM/Qwen-Image) **2x↑🎉** speedup! Check the example: [run_qwen_image_edit.py](https://github.com/vipshop/cache-dit/raw/main/examples/pipeline/run_qwen_image_edit.py).
-- [2025-08-11] 🔥[**Qwen-Image**](https://github.com/QwenLM/Qwen-Image) **1.8x↑🎉** speedup! Please refer to [run_qwen_image.py](https://github.com/vipshop/cache-dit/raw/main/examples/pipeline/run_qwen_image.py) as an example.
-- [2025-07-13] 🎉[**FLUX.1-dev**](https://github.com/xlite-dev/flux-faster) **3.3x↑🎉** speedup! NVIDIA L20 with **[cache-dit](https://github.com/vipshop/cache-dit)** + **compile + FP8 DQ**.
+- [2025-09-24] 🔥**cache-dit** has now joined the 🤗 Diffusers community ecosystem as the **first** cache acceleration framework for DiTs! Check out the documentation here: **[Diffusers Docs](https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit)**. <a href="https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit"><img src=https://img.shields.io/badge/🤗Diffusers-ecosystem-yellow.svg ></a>
+- [2025-09-10] 🎉Day 1 support [**HunyuanImage-2.1**](https://github.com/Tencent-Hunyuan/HunyuanImage-2.1) with **1.7x↑🎉** speedup! Check this [example](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_hunyuan_image_2.1.py).
+- [2025-09-08] 🔥[**Qwen-Image-Lightning**](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image_lightning.py) **7.1/3.5 steps🎉** inference with **[DBCache: F16B16](https://github.com/vipshop/cache-dit)**.
+- [2025-09-03] 🎉[**Wan2.2-MoE**](https://github.com/Wan-Video) **2.4x↑🎉** speedup! Please refer to [run_wan_2.2.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_wan_2.2.py) as an example.
+- [2025-08-19] 🔥[**Qwen-Image-Edit**](https://github.com/QwenLM/Qwen-Image) **2x↑🎉** speedup! Check the example: [run_qwen_image_edit.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image_edit.py).
+- [2025-08-11] 🔥[**Qwen-Image**](https://github.com/QwenLM/Qwen-Image) **1.8x↑🎉** speedup! Please refer to [run_qwen_image.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image.py) as an example.
 <details>
 <summary> Previous News </summary>
+- [2025-07-13] 🎉[**FLUX.1-dev**](https://github.com/xlite-dev/flux-faster) **3.3x↑🎉** speedup! NVIDIA L20 with **[cache-dit](https://github.com/vipshop/cache-dit)** + **compile + FP8 DQ**.
 - [2025-09-08] 🎉First caching mechanism in [Qwen-Image-Lightning](https://github.com/ModelTC/Qwen-Image-Lightning) with **[cache-dit](https://github.com/vipshop/cache-dit)**, check this [PR](https://github.com/ModelTC/Qwen-Image-Lightning/pull/35).
 - [2025-09-08] 🎉First caching mechanism in [Wan2.2](https://github.com/Wan-Video/Wan2.2) with **[cache-dit](https://github.com/vipshop/cache-dit)**, check this [PR](https://github.com/Wan-Video/Wan2.2/pull/127) for more details.
 - [2025-08-12] 🎉First caching mechanism in [QwenLM/Qwen-Image](https://github.com/QwenLM/Qwen-Image) with **[cache-dit](https://github.com/vipshop/cache-dit)**, check this [PR](https://github.com/QwenLM/Qwen-Image/pull/61).
-- [2025-09-01] 📚[**Hybird Forward Pattern**](#unified) is supported! Please check [FLUX.1-dev](https://github.com/vipshop/cache-dit/raw/main/examples/run_flux_adapter.py) as an example.
-- [2025-08-10] 🔥[**FLUX.1-Kontext-dev**](https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev) is supported! Please refer [run_flux_kontext.py](https://github.com/vipshop/cache-dit/raw/main/examples/pipeline/run_flux_kontext.py) as an example.
+- [2025-09-01] 📚[**Hybrid Forward Pattern**](#unified) is supported! Please check [FLUX.1-dev](https://github.com/vipshop/cache-dit/blob/main/examples/run_flux_adapter.py) as an example.
+- [2025-08-10] 🔥[**FLUX.1-Kontext-dev**](https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev) is supported! Please refer to [run_flux_kontext.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_flux_kontext.py) as an example.
 - [2025-07-18] 🎉First caching mechanism in [🤗huggingface/flux-fast](https://github.com/huggingface/flux-fast) with **[cache-dit](https://github.com/vipshop/cache-dit)**, check the [PR](https://github.com/huggingface/flux-fast/pull/13).
 </details>
@@ -188,19 +209,13 @@ Dynamic: requires-python
 - [⚙️Installation](#️installation)
 - [🔥Benchmarks](#benchmarks)
-- [🔥Supported Pipelines](#supported)
-- [🎉Unified Cache APIs](#unified)
-  - [📚Forward Pattern Matching](#forward-pattern-matching)
-  - [♥️Cache with One-line Code](#%EF%B8%8Fcache-acceleration-with-one-line-code)
-  - [🔥Automatic Block Adapter](#automatic-block-adapter)
-  - [📚Hybird Forward Pattern](#automatic-block-adapter)
-  - [📚Implement Patch Functor](#implement-patch-functor)
-  - [🤖Cache Acceleration Stats](#cache-acceleration-stats-summary)
+- [🔥Quick Start](#quick-start)
+- [📚Pattern Matching](#forward-pattern-matching)
 - [⚡️Dual Block Cache](#dbcache)
 - [🔥TaylorSeer Calibrator](#taylorseer)
-- [⚡️Hybrid Cache CFG](#cfg)
-- [⚙️Torch Compile](#compile)
-- [🛠Metrics CLI](#metrics)
+- [📚Hybrid Cache CFG](#cfg)
+- [🎉User Guide](#user-guide)
+- [©️Citations](#citations)
 ## ⚙️Installation
@@ -217,82 +232,13 @@ Or you can install the latest develop version from GitHub:
 pip3 install git+https://github.com/vipshop/cache-dit.git
 ```
-## 🔥Supported Pipelines
-<div id="supported"></div>
-Currently, **cache-dit** library supports almost **Any** Diffusion Transformers (with **Transformer Blocks** that match the specific Input and Output **patterns**). Please check [🎉Examples](https://github.com/vipshop/cache-dit/raw/main/examples/pipeline) for more details. Here are just some of the tested models listed.
-```python
->>> import cache_dit
->>> cache_dit.supported_pipelines()
-(30, ['Flux*', 'Mochi*', 'CogVideoX*', 'Wan*', 'HunyuanVideo*', 'QwenImage*', 'LTX*', 'Allegro*',
-'CogView3Plus*', 'CogView4*', 'Cosmos*', 'EasyAnimate*', 'SkyReelsV2*', 'StableDiffusion3*',
-'ConsisID*', 'DiT*', 'Amused*', 'Bria*', 'Lumina*', 'OmniGen*', 'PixArt*', 'Sana*', 'StableAudio*',
-'VisualCloze*', 'AuraFlow*', 'Chroma*', 'ShapE*', 'HiDream*', 'HunyuanDiT*', 'HunyuanDiTPAG*'])
-```
-<details>
-<summary> Show all pipelines </summary>
-- [🚀HunyuanImage-2.1](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀Qwen-Image-Lightning](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀Qwen-Image-Edit](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀Qwen-Image](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀FLUX.1-dev](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀FLUX.1-Fill-dev](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀FLUX.1-Kontext-dev](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀CogView4](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀Wan2.2-T2V](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀HunyuanVideo](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀HiDream-I1-Full](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀HunyuanDiT](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀Wan2.1-T2V](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀Wan2.1-FLF2V](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀SkyReelsV2](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀Chroma1-HD](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀CogVideoX1.5](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀CogView3-Plus](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀CogVideoX](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀VisualCloze](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀LTXVideo](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀OmniGen](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀Lumina2](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀mochi-1-preview](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀AuraFlow-v0.3](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀PixArt-Alpha](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀PixArt-Sigma](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀NVIDIA Sana](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀SD-3/3.5](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀ConsisID](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀Allegro](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀Amused](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀DiT-XL](https://github.com/vipshop/cache-dit/raw/main/examples)
-- ...
-</details>
 ## 🔥Benchmarks
 <div id="benchmarks"></div>
-cache-dit will support more mainstream Cache acceleration algorithms in the future. More benchmarks will be released, please stay tuned for update. Here, only the results of some precision and performance benchmarks are presented. The test dataset is **DrawBench**. For a complete benchmark, please refer to [📚Benchmarks](https://github.com/vipshop/cache-dit/raw/main/bench/).
+![image-reward-bench](https://github.com/vipshop/cache-dit/raw/main/assets/image-reward-bench.png)
-### 📚Text2Image DrawBench: FLUX.1-dev
-Comparisons between different FnBn compute block configurations show that **more compute blocks result in higher precision**. For example, the F8B0_W8MC0 configuration achieves the best Clip Score (33.007) and ImageReward (1.0333). **Device**: NVIDIA L20. **F**: Fn_compute_blocks, **B**: Bn_compute_blocks, 50 steps.
-| Config | Clip Score(↑) | ImageReward(↑) | PSNR(↑) | TFLOPs(↓) | SpeedUp(↑) |
-| --- | --- | --- | --- | --- | --- |
-| [**FLUX.1**-dev]: 50 steps | 32.9217 | 1.0412 | INF | 3726.87 | 1.00x |
-| F8B0_W4MC0_R0.08 | 32.9871 | 1.0370 | 33.8317 | 2064.81 | 1.80x |
-| F8B0_W4MC2_R0.12 | 32.9535 | 1.0185 | 32.7346 | 1935.73 | 1.93x |
-| F8B0_W4MC3_R0.12 | 32.9234 | 1.0085 | 32.5385 | 1816.58 | 2.05x |
-| F4B0_W4MC3_R0.12 | 32.8981 | 1.0130 | 31.8031 | 1507.83 | 2.47x |
-| F4B0_W4MC4_R0.12 | 32.8384 | 1.0065 | 31.5292 | 1400.08 | 2.66x |
-The comparison between **cache-dit: DBCache** and algorithms such as Δ-DiT, Chipmunk, FORA, DuCa, TaylorSeer and FoCa is as follows. Now, in the comparison with a speedup ratio less than **3x**, cache-dit achieved the best accuracy. Please check [📚How to Reproduce?](https://github.com/vipshop/cache-dit/raw/main/bench/) for more details.
+The comparison between **cache-dit: DBCache** and algorithms such as Δ-DiT, Chipmunk, FORA, DuCa, TaylorSeer and FoCa is as follows. Among the methods with a speedup ratio below **3x**, cache-dit achieves the best accuracy. Surprisingly, cache-dit: DBCache still works on extremely few-step distilled models. For a complete benchmark, please refer to [📚Benchmarks](https://github.com/vipshop/cache-dit/raw/main/bench/).
 | Method | TFLOPs(↓) | SpeedUp(↑) | ImageReward(↑) | Clip Score(↑) |
 | --- | --- | --- | --- | --- |
@@ -314,6 +260,8 @@ The comparison between **cache-dit: DBCache** and algorithms such as Δ-DiT, Chi
 <details>
 <summary> Show all comparison </summary>
+![clip-score-bench](https://github.com/vipshop/cache-dit/raw/main/assets/clip-score-bench.png)
 | Method | TFLOPs(↓) | SpeedUp(↑) | ImageReward(↑) | Clip Score(↑) |
 | --- | --- | --- | --- | --- |
 | [**FLUX.1**-dev]: 50 steps | 3726.87 | 1.00× | 0.9898 | 32.404 |
@@ -350,192 +298,84 @@ NOTE: Except for DBCache, other performance data are referenced from the paper [
 </details>
-### 📚Text2Image Distillation DrawBench: Qwen-Image-Lightning
-Surprisingly, cache-dit: DBCache still works in the extremely few-step distill model. For example,  **Qwen-Image-Lightning w/ 4 steps**, with the F16B16 configuration, the PSNR is 34.8163, the Clip Score is 35.6109, and the ImageReward is 1.2614. It maintained a relatively high precision.
-| Config                     |  PSNR(↑)      | Clip Score(↑) | ImageReward(↑) | TFLOPs(↓)   | SpeedUp(↑) |
-|----------------------------|-----------|------------|--------------|----------|------------|
-| [**Lightning**]: 4 steps   | INF       | 35.5797    | 1.2630       | 274.33   | 1.00x       |
-| F24B24_W2MC1_R0.8          | 36.3242   | 35.6224    | 1.2630       | 264.74   | 1.04x       |
-| F16B16_W2MC1_R0.8          | 34.8163   | 35.6109    | 1.2614       | 244.25   | 1.12x       |
-| F12B12_W2MC1_R0.8          | 33.8953   | 35.6535    | 1.2549       | 234.63   | 1.17x       |
-| F8B8_W2MC1_R0.8            | 33.1374   | 35.7284    | 1.2517       | 224.29   | 1.22x       |
-| F1B0_W2MC1_R0.8            | 31.8317   | 35.6651    | 1.2397       | 206.90   | 1.33x       |
-## 🎉Unified Cache APIs
+## 🔥Quick Start
 <div id="unified"></div>
-### 📚Forward Pattern Matching
-Currently, for any **Diffusion** models with **Transformer Blocks** that match the specific **Input/Output patterns**, we can use the **Unified Cache APIs** from **cache-dit**, namely, the `cache_dit.enable_cache(...)` API. The **Unified Cache APIs** are currently in the experimental phase; please stay tuned for updates. The supported patterns are listed as follows:
-![](https://github.com/vipshop/cache-dit/raw/main/assets/patterns-v1.png)
-### ♥️Cache Acceleration with One-line Code
-In most cases, you only need to call **one-line** of code, that is `cache_dit.enable_cache(...)`. After this API is called, you just need to call the pipe as normal. The `pipe` param can be **any** Diffusion Pipeline. Please refer to [Qwen-Image](https://github.com/vipshop/cache-dit/raw/main/examples/pipeline/run_qwen_image.py) as an example.
-```python
-import cache_dit
-from diffusers import DiffusionPipeline
-# Can be any diffusion pipeline
-pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image")
-# One-line code with default cache options.
-cache_dit.enable_cache(pipe)
-# Just call the pipe as normal.
-output = pipe(...)
-# Disable cache and run original pipe.
-cache_dit.disable_cache(pipe)
-```
-### 🔥Automatic Block Adapter
-But in some cases, you may have a **modified** Diffusion Pipeline or Transformer that is not located in the diffusers library or not officially supported by **cache-dit** at this time. The **BlockAdapter** can help you solve this problems. Please refer to [🔥Qwen-Image w/ BlockAdapter](https://github.com/vipshop/cache-dit/raw/main/examples/adapter/run_qwen_image_adapter.py) as an example.
-```python
-from cache_dit import ForwardPattern, BlockAdapter
-# Use 🔥BlockAdapter with `auto` mode.
-cache_dit.enable_cache(
-    BlockAdapter(
-        # Any DiffusionPipeline, Qwen-Image, etc.
-        pipe=pipe, auto=True,
-        # Check `📚Forward Pattern Matching` documentation and hack the code of
-        # of Qwen-Image, you will find that it has satisfied `FORWARD_PATTERN_1`.
-        forward_pattern=ForwardPattern.Pattern_1,
-    ),
-)
-# Or, manually setup transformer configurations.
-cache_dit.enable_cache(
-    BlockAdapter(
-        pipe=pipe, # Qwen-Image, etc.
-        transformer=pipe.transformer,
-        blocks=pipe.transformer.transformer_blocks,
-        forward_pattern=ForwardPattern.Pattern_1,
-    ),
-)
-```
-For such situations, **BlockAdapter** can help you quickly apply various cache acceleration features to your own Diffusion Pipelines and Transformers. Please check the [📚BlockAdapter.md](https://github.com/vipshop/cache-dit/raw/main/docs/BlockAdapter.md) for more details.
+<div id="quick-start"></div>
-### 📚Hybird Forward Pattern
-Sometimes, a Transformer class will contain more than one transformer `blocks`. For example, **FLUX.1** (HiDream, Chroma, etc) contains transformer_blocks and single_transformer_blocks (with different forward patterns). The **BlockAdapter** can also help you solve this problem. Please refer to [📚FLUX.1](https://github.com/vipshop/cache-dit/raw/main/examples/adapter/run_flux_adapter.py) as an example.
+In most cases, you only need to call ♥️**one line**♥️ of code, that is `cache_dit.enable_cache(...)`. After this API is called, you just need to call the pipe as normal. The `pipe` param can be **any** Diffusion Pipeline. Please refer to [Qwen-Image](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image.py) as an example.
 ```python
-# For diffusers <= 0.34.0, FLUX.1 transformer_blocks and
-# single_transformer_blocks have different forward patterns.
-cache_dit.enable_cache(
-    BlockAdapter(
-        pipe=pipe, # FLUX.1, etc.
-        transformer=pipe.transformer,
-        blocks=[
-            pipe.transformer.transformer_blocks,
-            pipe.transformer.single_transformer_blocks,
-        ],
-        forward_pattern=[
-            ForwardPattern.Pattern_1,
-            ForwardPattern.Pattern_3,
-        ],
-    ),
-)
+>>> import cache_dit
+>>> from diffusers import DiffusionPipeline
+>>> pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image") # Can be any diffusion pipeline
+>>> cache_dit.enable_cache(pipe) # One-line code with default cache options.
+>>> output = pipe(...) # Just call the pipe as normal.
+>>> stats = cache_dit.summary(pipe) # Then, get the summary of cache acceleration stats.
+>>> cache_dit.disable_cache(pipe) # Disable cache and run original pipe.
 ```
-Even sometimes you have more complex cases, such as **Wan 2.2 MoE**, which has more than one Transformer (namely `transformer` and `transformer_2`) in its structure. Fortunately, **cache-dit** can also handle this situation very well. Please refer to [📚Wan 2.2 MoE](https://github.com/vipshop/cache-dit/raw/main/examples/pipeline/run_wan_2.2.py) as an example.
-```python
-from cache_dit import ForwardPattern, BlockAdapter, ParamsModifier, BasicCacheConfig
+## 📚Forward Pattern Matching
-cache_dit.enable_cache(
-    BlockAdapter(
-        pipe=pipe,
-        transformer=[
-            pipe.transformer,
-            pipe.transformer_2,
-        ],
-        blocks=[
-            pipe.transformer.blocks,
-            pipe.transformer_2.blocks,
-        ],
-        forward_pattern=[
-            ForwardPattern.Pattern_2,
-            ForwardPattern.Pattern_2,
-        ],
-        # Setup different cache params for each 'blocks'. You can
-        # pass any specific cache params to ParamModifier, the old
-        # value will be overwrite by the new one.
-        params_modifiers=[
-            ParamsModifier(
-                cache_config=BasicCacheConfig(
-                    max_warmup_steps=4,
-                    max_cached_steps=8,
-                ),
-            ),
-            ParamsModifier(
-                cache_config=BasicCacheConfig(
-                    max_warmup_steps=2,
-                    max_cached_steps=20,
-                ),
-            ),
-        ],
-        has_separate_cfg=True,
-    ),
-)
-```
-### 📚Implement Patch Functor
-For any PATTERN not in {0...5}, we introduced the simple abstract concept of **Patch Functor**. Users can implement a subclass of Patch Functor to convert an unknown Pattern into a known PATTERN, and for some models, users may also need to fuse the operations within the blocks for loop into block forward.
+<div id="supported"></div>
-![](https://github.com/vipshop/cache-dit/raw/main/assets/patch-functor.png)
+<div id="forward-pattern-matching"></div>
-Some Patch functors have already been provided in cache-dit: [📚HiDreamPatchFunctor](https://github.com/vipshop/cache-dit/raw/main/src/cache_dit/cache_factory/patch_functors/functor_hidream.py), [📚ChromaPatchFunctor](https://github.com/vipshop/cache-dit/raw/main/src/cache_dit/cache_factory/patch_functors/functor_chroma.py), etc. After implementing Patch Functor, users need to set the `patch_functor` property of **BlockAdapter**.
+cache-dit works by matching specific input/output patterns as shown below.
-```python
-@BlockAdapterRegistry.register("HiDream")
-def hidream_adapter(pipe, **kwargs) -> BlockAdapter:
-    from diffusers import HiDreamImageTransformer2DModel
-    from cache_dit.cache_factory.patch_functors import HiDreamPatchFunctor
-    assert isinstance(pipe.transformer, HiDreamImageTransformer2DModel)
-    return BlockAdapter(
-        pipe=pipe,
-        transformer=pipe.transformer,
-        blocks=[
-            pipe.transformer.double_stream_blocks,
-            pipe.transformer.single_stream_blocks,
-        ],
-        forward_pattern=[
-            ForwardPattern.Pattern_0,
-            ForwardPattern.Pattern_3,
-        ],
-        # NOTE: Setup your custom patch functor here.
-        patch_functor=HiDreamPatchFunctor(),
-        **kwargs,
-    )
-```
+![](https://github.com/vipshop/cache-dit/raw/main/assets/patterns-v1.png)
-### 🤖Cache Acceleration Stats Summary
+Please check [🎉Examples](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline) for more details. Only some of the tested models are listed here.
-After finishing each inference of `pipe(...)`, you can call the `cache_dit.summary()` API on pipe to get the details of the **Cache Acceleration Stats** for the current inference.
 ```python
-stats = cache_dit.summary(pipe)
+>>> import cache_dit
+>>> cache_dit.supported_pipelines()
+(30, ['Flux*', 'Mochi*', 'CogVideoX*', 'Wan*', 'HunyuanVideo*', 'QwenImage*', 'LTX*', 'Allegro*',
+'CogView3Plus*', 'CogView4*', 'Cosmos*', 'EasyAnimate*', 'SkyReelsV2*', 'StableDiffusion3*',
+'ConsisID*', 'DiT*', 'Amused*', 'Bria*', 'Lumina*', 'OmniGen*', 'PixArt*', 'Sana*', 'StableAudio*',
+'VisualCloze*', 'AuraFlow*', 'Chroma*', 'ShapE*', 'HiDream*', 'HunyuanDiT*', 'HunyuanDiTPAG*'])
 ```
-You can set `details` param as `True` to show more details of cache stats. (markdown table format) Sometimes, this may help you analyze what values of the residual diff threshold would be better.
+<details>
+<summary> Show all pipelines </summary>
-```python
-⚡️Cache Steps and Residual Diffs Statistics: QwenImagePipeline
+- [🚀HunyuanImage-2.1](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀Qwen-Image-Lightning](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀Qwen-Image-Edit](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀Qwen-Image](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀FLUX.1-dev](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀FLUX.1-Fill-dev](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀FLUX.1-Kontext-dev](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀CogView4](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀Wan2.2-T2V](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀HunyuanVideo](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀HiDream-I1-Full](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀HunyuanDiT](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀Wan2.1-T2V](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀Wan2.1-FLF2V](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀SkyReelsV2](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀Chroma1-HD](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀CogVideoX1.5](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀CogView3-Plus](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀CogVideoX](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀VisualCloze](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀LTXVideo](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀OmniGen](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀Lumina2](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀mochi-1-preview](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀AuraFlow-v0.3](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀PixArt-Alpha](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀PixArt-Sigma](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀NVIDIA Sana](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀SD-3/3.5](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀ConsisID](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀Allegro](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀Amused](https://github.com/vipshop/cache-dit/blob/main/examples)
+- [🚀DiT-XL](https://github.com/vipshop/cache-dit/blob/main/examples)
+- ...
-| Cache Steps | Diffs Min | Diffs P25 | Diffs P50 | Diffs P75 | Diffs P95 | Diffs Max |
-|-------------|-----------|-----------|-----------|-----------|-----------|-----------|
-| 23          | 0.045     | 0.084     | 0.114     | 0.147     | 0.241     | 0.297     |
-```
+</details>
 ## ⚡️DBCache: Dual Block Cache
@@ -543,20 +383,9 @@ You can set `details` param as `True` to show more details of cache stats. (mark
 ![](https://github.com/vipshop/cache-dit/raw/main/assets/dbcache-v1.png)
-**DBCache**: **Dual Block Caching** for Diffusion Transformers. Different configurations of compute blocks (**F8B12**, etc.) can be customized in DBCache, enabling a balanced trade-off between performance and precision. Moreover, it can be entirely **training**-**free**. Please check [DBCache.md](https://github.com/vipshop/cache-dit/raw/main/docs/DBCache.md) docs for more design details.
-- **Fn**: Specifies that DBCache uses the **first n** Transformer blocks to fit the information at time step t, enabling the calculation of a more stable L1 diff and delivering more accurate information to subsequent blocks.
-- **Bn**: Further fuses approximate information in the **last n** Transformer blocks to enhance prediction accuracy. These blocks act as an auto-scaler for approximate hidden states that use residual cache.
+**DBCache**: **Dual Block Caching** for Diffusion Transformers. Different configurations of compute blocks (**F8B12**, etc.) can be customized in DBCache, enabling a balanced trade-off between performance and precision. Moreover, it can be entirely **training**-**free**. Please check the [DBCache](https://github.com/vipshop/cache-dit/blob/main/docs/DBCache.md) and [User Guide](https://github.com/vipshop/cache-dit/blob/main/docs/User_Guide.md#dbcache) docs for details.
 ```python
-import cache_dit
-from diffusers import FluxPipeline
-pipe_or_adapter = FluxPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    torch_dtype=torch.bfloat16,
-).to("cuda")
 # Default options, F8B0, 8 warmup steps, and unlimited cached
 # steps for good balance between performance and precision
 cache_dit.enable_cache(pipe_or_adapter)
@@ -576,28 +405,13 @@ cache_dit.enable_cache(
 )
 ```
-<div align="center">
-  <p align="center">
-    DBCache, <b> L20x1 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
-  </p>
-</div>
-|Baseline(L20x1)|F1B0 (0.08)|F1B0 (0.20)|F8B8 (0.15)|F12B12 (0.20)|F16B16 (0.20)|
-|:---:|:---:|:---:|:---:|:---:|:---:|
-|24.85s|15.59s|8.58s|15.41s|15.11s|17.74s|
-|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.08_S11.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.2_S19.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F8B8S1_R0.15_S15.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F12B12S4_R0.2_S16.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F16B16S4_R0.2_S13.png width=105px>|
 ## 🔥TaylorSeer Calibrator
 <div id="taylorseer"></div>
-We have supported the [TaylorSeers: From Reusing to Forecasting: Accelerating Diffusion Models with TaylorSeers](https://arxiv.org/pdf/2503.06923) algorithm to further improve the precision of DBCache in cases where the cached steps are large, namely, **Hybrid TaylorSeer + DBCache**. At timesteps with significant intervals, the feature similarity in diffusion models decreases substantially, significantly harming the generation quality.
-$$
-\mathcal{F}\_{\text {pred }, m}\left(x_{t-k}^l\right)=\mathcal{F}\left(x_t^l\right)+\sum_{i=1}^m \frac{\Delta^i \mathcal{F}\left(x_t^l\right)}{i!\cdot N^i}(-k)^i
-$$
+The [TaylorSeers](https://huggingface.co/papers/2503.06923) algorithm further improves the precision of DBCache in cases where the cached steps are large (Hybrid TaylorSeer + DBCache). At timesteps with significant intervals, the feature similarity in diffusion models decreases substantially, significantly harming the generation quality.
-**TaylorSeer** employs a differential method to approximate the higher-order derivatives of features and predict features in future timesteps with Taylor series expansion. The TaylorSeer implemented in cache-dit supports both hidden states and residual cache types. That is $\mathcal{F}\_{\text {pred }, m}\left(x_{t-k}^l\right)$ can be a residual cache or a hidden-state cache.
+TaylorSeer employs a differential method to approximate the higher-order derivatives of features and predict features in future timesteps with Taylor series expansion. The TaylorSeer implemented in CacheDiT supports both hidden states and residual cache types. F_pred can be a residual cache or a hidden-state cache.
 ```python
 from cache_dit import BasicCacheConfig, TaylorSeerCalibratorConfig
@@ -620,25 +434,14 @@ cache_dit.enable_cache(
 )
 ```
-> [!Important]
-> Please note that if you have used TaylorSeer as the calibrator for approximate hidden states, the **Bn** param of DBCache can be set to **0**. In essence, DBCache's Bn is also act as a calibrator, so you can choose either Bn > 0 or TaylorSeer. We recommend using the configuration scheme of **TaylorSeer** + **DBCache FnB0**.
-<div align="center">
-  <p align="center">
-    <b>DBCache F1B0 + TaylorSeer</b>, L20x1, Steps: 28, <br>"A cat holding a sign that says hello world with complex background"
-  </p>
-</div>
-|Baseline(L20x1)|F1B0 (0.12)|+TaylorSeer|F1B0 (0.15)|+TaylorSeer|+compile|
-|:---:|:---:|:---:|:---:|:---:|:---:|
-|24.85s|12.85s|12.86s|10.27s|10.28s|8.48s|
-|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.12_S14_T12.85s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.12_S14_T12.86s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.15_S17_T10.27s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T10.28s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T8.48s.png width=105px>|
+> [!TIP]
+> The `Bn_compute_blocks` parameter of DBCache can be set to `0` if you use TaylorSeer as the calibrator for approximate hidden states. DBCache's `Bn_compute_blocks` also acts as a calibrator, so you can choose either `Bn_compute_blocks` > 0 or TaylorSeer. We recommend using the configuration scheme of TaylorSeer + DBCache FnB0.
-## ⚡️Hybrid Cache CFG
+## 📚Hybrid Cache CFG
 <div id="cfg"></div>
-cache-dit supports caching for **CFG (classifier-free guidance)**. For models that fuse CFG and non-CFG into a single forward step, or models that do not include CFG (classifier-free guidance) in the forward step, please set `enable_separate_cfg` param to **False (default, None)**. Otherwise, set it to True. For examples:
+cache-dit supports caching for CFG (classifier-free guidance). For models that fuse CFG and non-CFG into a single forward step, or models that do not include CFG (classifier-free guidance) in the forward step, please set the `enable_separate_cfg` parameter to `False (default, None)`. Otherwise, set it to `True`.
 ```python
 from cache_dit import BasicCacheConfig
@@ -647,75 +450,35 @@ cache_dit.enable_cache(
     pipe_or_adapter,
     cache_config=BasicCacheConfig(
         ...,
-        # CFG: classifier free guidance or not
-        # For model that fused CFG and non-CFG into single forward step,
-        # should set enable_separate_cfg as False. For example, set it as True
-        # for Wan 2.1/Qwen-Image and set it as False for FLUX.1, HunyuanVideo,
-        # CogVideoX, Mochi, LTXVideo, Allegro, CogView3Plus, EasyAnimate, SD3, etc.
-        enable_separate_cfg=True, # Wan 2.1, Qwen-Image, CogView4, Cosmos, SkyReelsV2, etc.
-        # Compute cfg forward first or not, default False, namely,
-        # 0, 2, 4, ..., -> non-CFG step; 1, 3, 5, ... -> CFG step.
-        cfg_compute_first=False,
-        # Compute separate diff values for CFG and non-CFG step,
-        # default True. If False, we will use the computed diff from
-        # current non-CFG transformer step for current CFG step.
-        cfg_diff_compute_separate=True,
+        # For example, set it as True for Wan 2.1/Qwen-Image
+        # and set it as False for FLUX.1, HunyuanVideo, CogVideoX, etc.
+        enable_separate_cfg=True,
     ),
 )
 ```
-## ⚙️Torch Compile
-<div id="compile"></div>
-By the way, **cache-dit** is designed to work compatibly with **torch.compile.** You can easily use cache-dit with torch.compile to further achieve a better performance. For example:
-```python
-cache_dit.enable_cache(pipe)
-# Compile the Transformer module
-pipe.transformer = torch.compile(pipe.transformer)
-```
-However, users intending to use **cache-dit** for DiT with **dynamic input shapes** should consider increasing the **recompile** **limit** of `torch._dynamo`. Otherwise, the recompile_limit error may be triggered, causing the module to fall back to eager mode.
-```python
-torch._dynamo.config.recompile_limit = 96  # default is 8
-torch._dynamo.config.accumulated_recompile_limit = 2048  # default is 256
-```
-Please check [perf.py](https://github.com/vipshop/cache-dit/raw/main/bench/perf.py) for more details.
-## 🛠Metrics CLI
-<div id="metrics"></div>
-You can utilize the APIs provided by cache-dit to quickly evaluate the accuracy losses caused by different cache configurations. For example:
-```python
-from cache_dit.metrics import compute_psnr
-from cache_dit.metrics import compute_ssim
-from cache_dit.metrics import compute_fid
-from cache_dit.metrics import compute_lpips
-from cache_dit.metrics import compute_clip_score
-from cache_dit.metrics import compute_image_reward
-psnr,   n = compute_psnr("true.png", "test.png") # Num: n
-psnr,   n = compute_psnr("true_dir", "test_dir")
-ssim,   n = compute_ssim("true_dir", "test_dir")
-fid,    n = compute_fid("true_dir", "test_dir")
-lpips,  n = compute_lpips("true_dir", "test_dir")
-clip,   n = compute_clip_score("DrawBench200.txt", "test_dir")
-reward, n = compute_image_reward("DrawBench200.txt", "test_dir")
-```
-Or, you can use `cache-dit-metrics-cli` tool. For examples:
-```bash
-cache-dit-metrics-cli -h  # show usage
-# all: PSNR, FID, SSIM, MSE, ..., etc.
-cache-dit-metrics-cli all  -i1 true.png -i2 test.png  # image
-cache-dit-metrics-cli all  -i1 true_dir -i2 test_dir  # image dir
-```
+## 🎉User Guide
+<div id="user-guide"></div>
+For more advanced features such as **Unified Cache APIs**, **Forward Pattern Matching**, **Automatic Block Adapter**, **Hybrid Forward Pattern**, **DBCache**, **TaylorSeer Calibrator**, and **Hybrid Cache CFG**, please refer to the [🎉User_Guide.md](./docs/User_Guide.md) for details.
+- [⚙️Installation](./docs/User_Guide.md#️installation)
+- [🔥Benchmarks](./docs/User_Guide.md#benchmarks)
+- [🔥Supported Pipelines](./docs/User_Guide.md#supported-pipelines)
+- [🎉Unified Cache APIs](./docs/User_Guide.md#unified-cache-apis)
+  - [📚Forward Pattern Matching](./docs/User_Guide.md#forward-pattern-matching)
+  - [📚Cache with One-line Code](./docs/User_Guide.md#%EF%B8%8Fcache-acceleration-with-one-line-code)
+  - [🔥Automatic Block Adapter](./docs/User_Guide.md#automatic-block-adapter)
+  - [📚Hybrid Forward Pattern](./docs/User_Guide.md#hybird-forward-pattern)
+  - [📚Implement Patch Functor](./docs/User_Guide.md#implement-patch-functor)
+  - [🤖Cache Acceleration Stats](./docs/User_Guide.md#cache-acceleration-stats-summary)
+- [⚡️Dual Block Cache](./docs/User_Guide.md#️dbcache-dual-block-cache)
+- [🔥TaylorSeer Calibrator](./docs/User_Guide.md#taylorseer-calibrator)
+- [⚡️Hybrid Cache CFG](./docs/User_Guide.md#️hybrid-cache-cfg)
+- [⚙️Torch Compile](./docs/User_Guide.md#️torch-compile)
+- [🛠Metrics CLI](./docs/User_Guide.md#metrics-cli)
+- [📚API Documents](./docs/User_Guide.md#api-documentation)
 ## 👋Contribute
 <div id="contribute"></div>
@@ -738,13 +501,17 @@ How to contribute? Star ⭐️ this repo to support us or check [CONTRIBUTE.md](
 The **cache-dit** codebase is adapted from FBCache. Over time its codebase diverged a lot, and **cache-dit** API is no longer compatible with FBCache.
+## ©️Special Acknowledgements
+Special thanks to vipshop's Computer Vision AI Team for supporting the documentation, testing, and production-level deployment of this project.
 ## ©️Citations
 <div id="citations"></div>
 ```BibTeX
 @misc{cache-dit@2025,
-  title={cache-dit: A Unified, Flexible and Training-free Cache Acceleration Framework for 🤗Diffusers.},
+  title={cache-dit: A Unified, Flexible and Training-free Cache Acceleration Framework for Diffusers.},
   url={https://github.com/vipshop/cache-dit.git},
   note={Open-source software available at https://github.com/vipshop/cache-dit.git},
   author={vipshop.com},

{cache_dit-0.3.2.dist-info → cache_dit-1.0.0.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
 cache_dit/__init__.py,sha256=sHRg0swXZZiw6lvSQ53fcVtN9JRayx0az2lXAz5OOGI,1510
-cache_dit/_version.py,sha256=e8NqPtZ8fggRgk3GPrqZ_U_BDV8aSULw1u_Gn9NNbnk,704
+cache_dit/_version.py,sha256=vLA4ITz09S-S435nq6yTF6l3qiSz6w4euS1rOxXgd1M,704
 cache_dit/logger.py,sha256=0zsu42hN-3-rgGC_C29ms1IvVpV4_b4_SwJCKSenxBE,4304
 cache_dit/utils.py,sha256=AyYRwi5XBxYBH4GaXxOxv9-X24Te_IYOYwh54t_1d3A,10674
 cache_dit/cache_factory/.gitignore,sha256=5Cb-qT9wsTUoMJ7vACDF7ZcLpAXhi5v-xdcWSRit988,23
@@ -10,15 +10,16 @@ cache_dit/cache_factory/forward_pattern.py,sha256=FumlCuZ-TSmSYH0hGBHctSJ-oGLCft
 cache_dit/cache_factory/params_modifier.py,sha256=zYJJsInTYCaYHBZ7mZJOP-PZnkSg3iN1WPewNOayXos,3628
 cache_dit/cache_factory/utils.py,sha256=XkVM9AXcB9zYq8-S8QKAsGz80r3tA6U3lBNGDGeHOe4,1871
 cache_dit/cache_factory/block_adapters/__init__.py,sha256=33geXMz56TxFWMp0c-H4__MY5SGRzKMKj3TXnUYOMlc,17512
-cache_dit/cache_factory/block_adapters/block_adapters.py,sha256=jAgzMPTaY4rBuq7DLK2VeEWuYLy7lvw7bZcPY4S93b4,21660
+cache_dit/cache_factory/block_adapters/block_adapters.py,sha256=2TVK_KqiYXC7AKZ2s07fzdOzUoeUBc9P1SzQtLVzhf4,22249
 cache_dit/cache_factory/block_adapters/block_registers.py,sha256=2L7QeM4ygnaKQpC9PoJod0QRYyxidUKU2AYpysDCUwE,2572
 cache_dit/cache_factory/cache_adapters/__init__.py,sha256=py71WGD3JztQ1uk6qdLVbzYcQ1rvqFidNNaQYo7tqTo,79
-cache_dit/cache_factory/cache_adapters/cache_adapter.py,sha256=GrkSz4was9gg_dYkfBobrOQ_eNqipQBqeuFfqcwkCXc,19650
-cache_dit/cache_factory/cache_blocks/__init__.py,sha256=08Ox7kD05lkRKCOsVTdEZeKAWBheqpxfrAT1Nz7eclI,2916
+cache_dit/cache_factory/cache_adapters/cache_adapter.py,sha256=7heGoy8LHMP54ISMwfJ-i_ALngkbnUdeQDBRrE-MTgs,21303
+cache_dit/cache_factory/cache_blocks/__init__.py,sha256=mivvm8YOfqT7YHs8y_MzGOGztPw8LxAqKGXuSRXxCv0,3032
+cache_dit/cache_factory/cache_blocks/offload_utils.py,sha256=wusgcqaCrwEjvv7Guy-6VXhNOgPPUrBV2sSVuRmGuvo,3513
 cache_dit/cache_factory/cache_blocks/pattern_0_1_2.py,sha256=ElMps6_7uI74tSF9GDR_dEI0bZEhdzcepM29xFWnYo8,428
 cache_dit/cache_factory/cache_blocks/pattern_3_4_5.py,sha256=Bv56qETXhsREvCrNvnZpSqDIIHsi6Ze3FJW4Yk2x3uI,8597
-cache_dit/cache_factory/cache_blocks/pattern_base.py,sha256=d4H9kEB0AgnVMT8aF0Y54SUMUQUxw5HQ8gRkoCuTQ_A,14577
-cache_dit/cache_factory/cache_blocks/utils.py,sha256=dGOC1tMMOvcbvEgx44eTESKn_jsv-0RZ3tRHPa3wmQ4,1315
+cache_dit/cache_factory/cache_blocks/pattern_base.py,sha256=wdh0bbcpKO08AW2FTsj9X_tTbFCLkDmBjrstMxTf7MQ,14668
+cache_dit/cache_factory/cache_blocks/pattern_utils.py,sha256=dGOC1tMMOvcbvEgx44eTESKn_jsv-0RZ3tRHPa3wmQ4,1315
 cache_dit/cache_factory/cache_contexts/__init__.py,sha256=T6Vak3x7Rs0Oy15Tou49p-rPQRA2jiuYtJBsbv1lBBU,388
 cache_dit/cache_factory/cache_contexts/cache_context.py,sha256=3EhaMCz3VUQ_NF81VgYwWoSEGIvhScPxPYhjL1OcgxE,15240
 cache_dit/cache_factory/cache_contexts/cache_manager.py,sha256=hSKAeP1CxmO3RFUxjFjAK1xdvVvTmeayh5jEHMaQXNE,30225
@@ -48,9 +49,9 @@ cache_dit/metrics/metrics.py,sha256=7UV-H2NRbhfr6dvrXEzU97Zy-BSQ5zEfm9CKtaK4ldg,
 cache_dit/quantize/__init__.py,sha256=kWYoMAyZgBXu9BJlZjTQ0dRffW9GqeeY9_iTkXrb70A,59
 cache_dit/quantize/quantize_ao.py,sha256=Fx1KW4l3gdEkdrcAYtPoDW7WKBJWrs3glOHiEwW_TgE,6160
 cache_dit/quantize/quantize_interface.py,sha256=2s_R7xPSKuJeFpEGeLwRxnq_CqJcBG3a3lzyW5wh-UM,1241
-cache_dit-0.3.2.dist-info/licenses/LICENSE,sha256=Dqb07Ik2dV41s9nIdMUbiRWEfDqo7-dQeRiY7kPO8PE,3769
-cache_dit-0.3.2.dist-info/METADATA,sha256=L8vWXW0w9Z4GXVXylKnqmhnfpKJ8YeL0LKIuwLL8HEo,47858
-cache_dit-0.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-cache_dit-0.3.2.dist-info/entry_points.txt,sha256=FX2gysXaZx6NeK1iCLMcIdP8Q4_qikkIHtEmi3oWn8o,65
-cache_dit-0.3.2.dist-info/top_level.txt,sha256=ZJDydonLEhujzz0FOkVbO-BqfzO9d_VqRHmZU-3MOZo,10
-cache_dit-0.3.2.dist-info/RECORD,,
+cache_dit-1.0.0.dist-info/licenses/LICENSE,sha256=Dqb07Ik2dV41s9nIdMUbiRWEfDqo7-dQeRiY7kPO8PE,3769
+cache_dit-1.0.0.dist-info/METADATA,sha256=HbV42qlhu8PFIO6FD_PuIo1dO-7K-yBiPCc5fikKIsg,35959
+cache_dit-1.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+cache_dit-1.0.0.dist-info/entry_points.txt,sha256=FX2gysXaZx6NeK1iCLMcIdP8Q4_qikkIHtEmi3oWn8o,65
+cache_dit-1.0.0.dist-info/top_level.txt,sha256=ZJDydonLEhujzz0FOkVbO-BqfzO9d_VqRHmZU-3MOZo,10
+cache_dit-1.0.0.dist-info/RECORD,,

/cache_dit/cache_factory/cache_blocks/{utils.py → pattern_utils.py} RENAMED Viewed

File without changes

{cache_dit-0.3.2.dist-info → cache_dit-1.0.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{cache_dit-0.3.2.dist-info → cache_dit-1.0.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{cache_dit-0.3.2.dist-info → cache_dit-1.0.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{cache_dit-0.3.2.dist-info → cache_dit-1.0.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

cache-dit 0.3.2__py3-none-any.whl → 1.0.0__py3-none-any.whl

Potentially problematic release.

cache-dit 0.3.2py3-none-any.whl → 1.0.0py3-none-any.whl