cache-dit 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cache-dit might be problematic. Click here for more details.

cache_dit/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.3.2'
32
- __version_tuple__ = version_tuple = (0, 3, 2)
31
+ __version__ = version = '0.3.3'
32
+ __version_tuple__ = version_tuple = (0, 3, 3)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -113,6 +113,19 @@ class BlockAdapter:
113
113
  if any((self.pipe is not None, self.transformer is not None)):
114
114
  self.maybe_fill_attrs()
115
115
  self.maybe_patchify()
116
+ self.maybe_skip_checks()
117
+
118
def maybe_skip_checks(self):
    """Disable the pattern checks when the transformer carries a hook.

    Looks for a non-None ``_hf_hook`` first and a non-None
    ``_diffusers_hook`` second on ``self.transformer``. For the first one
    found, a warning is logged and both ``check_forward_pattern`` and
    ``check_num_outputs`` are set to False. When neither attribute is
    present (or both are None) nothing is changed.
    """
    # Probe order matters: only the first matching hook is reported.
    hook_warnings = (
        ("_hf_hook", "_hf_hook is not None, force skip pattern check!"),
        (
            "_diffusers_hook",
            "_diffusers_hook is not None, force skip pattern check!",
        ),
    )
    for hook_attr, warning_text in hook_warnings:
        if getattr(self.transformer, hook_attr, None) is None:
            continue
        logger.warning(warning_text)
        self.check_forward_pattern = False
        self.check_num_outputs = False
        return
116
129
 
117
130
  def maybe_fill_attrs(self):
118
131
  # NOTE: This func should be called before normalize.
@@ -1,10 +1,8 @@
1
1
  import torch
2
-
3
2
  import unittest
4
3
  import functools
5
-
6
4
  from contextlib import ExitStack
7
- from typing import Dict, List, Tuple, Any, Union, Callable
5
+ from typing import Dict, List, Tuple, Any, Union, Callable, Optional
8
6
 
9
7
  from diffusers import DiffusionPipeline
10
8
 
@@ -16,7 +14,7 @@ from cache_dit.cache_factory.cache_contexts import CachedContextManager
16
14
  from cache_dit.cache_factory.cache_contexts import BasicCacheConfig
17
15
  from cache_dit.cache_factory.cache_contexts import CalibratorConfig
18
16
  from cache_dit.cache_factory.cache_blocks import CachedBlocks
19
- from cache_dit.cache_factory.cache_blocks.utils import (
17
+ from cache_dit.cache_factory.cache_blocks import (
20
18
  patch_cached_stats,
21
19
  remove_cached_stats,
22
20
  )
@@ -330,7 +328,19 @@ class CachedAdapter:
330
328
 
331
329
  assert isinstance(dummy_blocks_names, list)
332
330
 
333
- @functools.wraps(original_forward)
331
+ from accelerate import hooks
332
+
333
+ _hf_hook: Optional[hooks.ModelHook] = None
334
+
335
+ if getattr(transformer, "_hf_hook", None) is not None:
336
+ _hf_hook = transformer._hf_hook # hooks from accelerate.hooks
337
+
338
+ # TODO: remove group offload hooks, then re-apply them after cache is applied.
339
+ # hooks = _diffusers_hook.hooks.copy(); _diffusers_hook.hooks.clear()
340
+ # re-apply hooks to transformer after cache applied.
341
+ # from diffusers.hooks.hooks import HookFunctionReference, HookRegistry
342
+ # from diffusers.hooks.group_offloading import apply_group_offloading
343
+
334
344
  def new_forward(self, *args, **kwargs):
335
345
  with ExitStack() as stack:
336
346
  for name, context_name in zip(
@@ -348,9 +358,27 @@ class CachedAdapter:
348
358
  self, dummy_name, dummy_blocks
349
359
  )
350
360
  )
351
- return original_forward(*args, **kwargs)
361
+ outputs = original_forward(*args, **kwargs)
362
+ return outputs
363
+
364
def new_forward_with_hf_hook(self, *args, **kwargs):
    """Run ``new_forward`` bracketed by the captured ``_hf_hook`` callbacks.

    ``_hf_hook`` and ``new_forward`` come from the enclosing scope. When a
    hook was captured and exposes ``pre_forward``, it may rewrite the
    positional and keyword arguments before the cached forward runs; when
    it exposes ``post_forward``, it may rewrite the outputs afterwards.
    With no captured hook this is a plain call to ``new_forward``.
    """
    # Compatible with model cpu offload
    hook_present = _hf_hook is not None

    if hook_present and hasattr(_hf_hook, "pre_forward"):
        args, kwargs = _hf_hook.pre_forward(self, *args, **kwargs)

    result = new_forward(self, *args, **kwargs)

    if hook_present and hasattr(_hf_hook, "post_forward"):
        return _hf_hook.post_forward(self, result)
    return result
375
+
376
+ # NOTE: Still not fully compatible with group offloading
377
+ transformer.forward = functools.update_wrapper(
378
+ functools.partial(new_forward_with_hf_hook, transformer),
379
+ new_forward_with_hf_hook,
380
+ )
352
381
 
353
- transformer.forward = new_forward.__get__(transformer)
354
382
  transformer._original_forward = original_forward
355
383
  transformer._is_cached = True
356
384
 
@@ -12,6 +12,10 @@ from cache_dit.cache_factory.cache_blocks.pattern_0_1_2 import (
12
12
  from cache_dit.cache_factory.cache_blocks.pattern_3_4_5 import (
13
13
  CachedBlocks_Pattern_3_4_5,
14
14
  )
15
+ from cache_dit.cache_factory.cache_blocks.pattern_utils import (
16
+ patch_cached_stats,
17
+ remove_cached_stats,
18
+ )
15
19
 
16
20
  from cache_dit.logger import init_logger
17
21
 
@@ -0,0 +1,115 @@
1
+ import torch
2
+ import asyncio
3
+ import logging
4
+ from contextlib import contextmanager
5
+ from typing import Generator, Optional, List
6
+ from diffusers.hooks.group_offloading import _is_group_offload_enabled
7
+ from cache_dit.logger import init_logger
8
+
9
+ logger = init_logger(__name__)
10
+
11
+
12
@torch.compiler.disable
@contextmanager
def maybe_onload(
    block: torch.nn.Module,
    reference_tensor: torch.Tensor,
    pending_tasks: Optional[List[asyncio.Task]] = None,
) -> Generator:
    """Temporarily move ``block`` to the device of ``reference_tensor``.

    When group offloading is not enabled for ``block`` (as reported by
    ``_is_group_offload_enabled``) the block is yielded untouched.
    Otherwise, if its parameters are not all on the reference tensor's
    device, the block is moved there for the duration of the ``with`` body
    and an asyncio task is scheduled on exit to move every parameter back
    to the device it was on originally.

    Args:
        block: Module to make available on the target device.
        reference_tensor: Tensor whose ``.device`` is the target device.
        pending_tasks: Optional list that receives the scheduled restore
            task so a caller can wait on it later. Defaults to ``None``
            rather than a shared ``[]`` literal: a mutable default would be
            reused by every call and would accumulate tasks (and the
            modules they capture) for the life of the process. When
            ``None``, the restore task is scheduled but not recorded.

    Yields:
        The block, moved to the target device if a move was needed.
    """
    if not _is_group_offload_enabled(block):
        yield block
        return

    # Remember where every parameter lived so it can be put back later.
    original_devices: Optional[List[torch.device]] = None
    if hasattr(block, "parameters"):
        params = list(block.parameters())
        if params:
            original_devices = [param.data.device for param in params]

    target_device: torch.device = reference_tensor.device
    need_restore: bool = False

    try:
        if original_devices is not None:
            unique_devices = list(set(original_devices))
            if len(unique_devices) > 1 or unique_devices[0] != target_device:
                if logger.isEnabledFor(logging.DEBUG):
                    logger.debug(
                        f"Onloading from {unique_devices} to {target_device}"
                    )

                has_meta_params = any(
                    dev.type == "meta" for dev in original_devices
                )
                if has_meta_params:  # compatible with sequential cpu offload
                    block = block.to_empty(device=target_device)
                else:
                    block = block.to(target_device, non_blocking=False)
                need_restore = True
        yield block
    finally:
        if need_restore and original_devices:

            async def restore_device():
                # Move each parameter back to its recorded device, running
                # the copy in a worker thread via asyncio.to_thread.
                for param, original_device in zip(
                    block.parameters(), original_devices
                ):
                    param.data = await asyncio.to_thread(
                        lambda p, d: p.to(d, non_blocking=True),
                        param.data,
                        original_device,
                    )

            loop = get_event_loop()
            # NOTE(review): create_task is called from the current thread
            # even when the loop runs in a background thread; confirm this
            # is safe for the loops get_event_loop() can return.
            move_task = loop.create_task(restore_device())
            if pending_tasks is not None:
                pending_tasks.append(move_task)
69
+
70
+
71
def get_event_loop() -> asyncio.AbstractEventLoop:
    """Return an event loop that is running or about to be run.

    Resolution order: the loop running in the current thread, then the
    current thread's event loop, and finally a freshly created loop that is
    installed as the current thread's loop.

    If the resolved loop is not running, a daemon thread is started that
    runs it forever. The driver thread is named after the specific loop it
    drives, so the "already started" guard is evaluated per loop. A single
    process-wide thread name would make any second non-running loop (for
    example one created for a different calling thread) look as if it were
    already being driven, and work scheduled on it would never execute.

    Returns:
        The resolved event loop.
    """
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)

    if not loop.is_running():
        import threading

        def run_loop() -> None:
            asyncio.set_event_loop(loop)
            loop.run_forever()

        # The running thread keeps `loop` alive through the closure, so
        # id(loop) cannot be reused by another loop while the thread lives.
        thread_name = f"_my_loop_{id(loop)}"

        # is_running() may still be False right after the thread has been
        # started, so also check for an existing driver thread by name.
        if not any(t.name == thread_name for t in threading.enumerate()):
            threading.Thread(
                target=run_loop, name=thread_name, daemon=True
            ).start()

    return loop
95
+
96
+
97
@torch.compiler.disable
def maybe_offload(
    pending_tasks: List[asyncio.Task],
) -> None:
    """Wait for the given tasks to finish, then empty the list.

    Does nothing when ``pending_tasks`` is empty. Otherwise the tasks are
    gathered on the loop returned by ``get_event_loop()`` and the calling
    thread blocks for up to 30 seconds. Any exception raised while waiting
    (including a timeout) is logged rather than propagated, and the list is
    cleared in every case.

    Args:
        pending_tasks: Tasks to wait for; cleared in place before returning.
    """
    if pending_tasks:
        event_loop = get_event_loop()

        async def _drain():
            return await asyncio.gather(*pending_tasks)

        waiter = asyncio.run_coroutine_threadsafe(_drain(), event_loop)
        try:
            waiter.result(timeout=30.0)
        except Exception as exc:
            # Best effort: report the failure but still reset the list.
            logger.error(f"May Offload Error: {exc}")

        pending_tasks.clear()
@@ -1,7 +1,9 @@
1
1
  import inspect
2
+ import asyncio
2
3
  import torch
3
4
  import torch.distributed as dist
4
5
 
6
+ from typing import List
5
7
  from cache_dit.cache_factory.cache_contexts.cache_context import CachedContext
6
8
  from cache_dit.cache_factory.cache_contexts.cache_manager import (
7
9
  CachedContextManager,
@@ -45,6 +47,7 @@ class CachedBlocks_Pattern_Base(torch.nn.Module):
45
47
  self.cache_prefix = cache_prefix
46
48
  self.cache_context = cache_context
47
49
  self.cache_manager = cache_manager
50
+ self.pending_tasks: List[asyncio.Task] = []
48
51
 
49
52
  self._check_forward_pattern()
50
53
  logger.info(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cache_dit
3
- Version: 0.3.2
3
+ Version: 0.3.3
4
4
  Summary: A Unified, Flexible and Training-free Cache Acceleration Framework for 🤗Diffusers.
5
5
  Author: DefTruth, vipshop.com, etc.
6
6
  Maintainer: DefTruth, vipshop.com, etc
@@ -45,6 +45,8 @@ Dynamic: provides-extra
45
45
  Dynamic: requires-dist
46
46
  Dynamic: requires-python
47
47
 
48
+ <a href="./README.md">📚English</a> | <a href="./README_CN.md">📚中文阅读</a>
49
+
48
50
  <div align="center">
49
51
  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/cache-dit-logo.png height="120">
50
52
 
@@ -57,12 +59,12 @@ Dynamic: requires-python
57
59
  <img src=https://img.shields.io/badge/PRs-welcome-9cf.svg >
58
60
  <img src=https://img.shields.io/badge/PyPI-pass-brightgreen.svg >
59
61
  <img src=https://static.pepy.tech/badge/cache-dit >
62
+ <img src=https://img.shields.io/github/stars/vipshop/cache-dit.svg?style=dark >
60
63
  <img src=https://img.shields.io/badge/Python-3.10|3.11|3.12-9cf.svg >
61
- <img src=https://img.shields.io/badge/Release-v0.3-brightgreen.svg >
62
64
  </div>
63
65
  <p align="center">
64
- <b><a href="#unified">📚Unified Cache APIs</a></b> | <a href="#forward-pattern-matching">📚Forward Pattern Matching</a> | <a href="#automatic-block-adapter">📚Automatic Block Adapter</a><br>
65
- <a href="#hybird-forward-pattern">📚Hybrid Forward Pattern</a> | <a href="#dbcache">📚DBCache</a> | <a href="#taylorseer">📚TaylorSeer Calibrator</a> | <a href="#cfg">📚Cache CFG</a><br>
66
+ <b><a href="#unified">📚Unified Cache APIs</a></b> | <a href="#forward-pattern-matching">📚Forward Pattern Matching</a> | <a href="./docs/User_Guide.md">📚Automatic Block Adapter</a><br>
67
+ <a href="./docs/User_Guide.md">📚Hybrid Forward Pattern</a> | <a href="#dbcache">📚DBCache</a> | <a href="./docs/User_Guide.md">📚TaylorSeer Calibrator</a> | <a href="./docs/User_Guide.md">📚Cache CFG</a><br>
66
68
  <a href="#benchmarks">📚Text2Image DrawBench</a> | <a href="#benchmarks">📚Text2Image Distillation DrawBench</a>
67
69
  </p>
68
70
  <p align="center">
@@ -74,6 +76,8 @@ Dynamic: requires-python
74
76
  🔥<a href="#supported">Chroma</a> | <a href="#supported">Sana</a> | <a href="#supported">Allegro</a> | <a href="#supported">Mochi</a> | <a href="#supported">SD 3/3.5</a> | <a href="#supported">Amused</a> | <a href="#supported"> ... </a> | <a href="#supported">DiT-XL</a>🔥
75
77
  </p>
76
78
  </div>
79
+
80
+
77
81
  <div align='center'>
78
82
  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/gifs/wan2.2.C0_Q0_NONE.gif width=124px>
79
83
  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/gifs/wan2.2.C1_Q0_DBCACHE_F1B0_W2M8MC2_T1O2_R0.08.gif width=124px>
@@ -85,12 +89,6 @@ Dynamic: requires-python
85
89
  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux.C0_Q0_NONE_T23.69s.png width=90px>
86
90
  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux.C0_Q0_DBCACHE_F1B0_W4M0MC0_T1O2_R0.15_S16_T11.39s.png width=90px>
87
91
  <p><b>🔥Qwen-Image</b> | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.8x↑🎉 | <b>FLUX.1-dev</b> | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:2.1x↑🎉</p>
88
- <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext-cat.C0_L0_Q0_NONE.png width=100px>
89
- <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_NONE.png width=100px>
90
- <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_DBCACHE_F8B0_W8M0MC0_T0O2_R0.08_S10.png width=100px>
91
- <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_DBCACHE_F1B0_W8M0MC2_T0O2_R0.12_S12.png width=100px>
92
- <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_DBCACHE_F1B0_W2M0MC2_T0O2_R0.15_S15.png width=100px>
93
- <p><b>🔥FLUX-Kontext-dev</b> | Baseline | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.3x↑🎉 | 1.7x↑🎉 | 2.0x↑ 🎉</p>
94
92
  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/qwen-image-lightning.4steps.C0_L1_Q0_NONE.png width=160px>
95
93
  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/qwen-image-lightning.4steps.C0_L1_Q0_DBCACHE_F16B16_W2M1MC1_T0O2_R0.9_S1.png width=160px>
96
94
  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/hunyuan-image-2.1.C0_L0_Q1_fp8_w8a16_wo_NONE.png width=90px>
@@ -100,7 +98,22 @@ Dynamic: requires-python
100
98
  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/qwen-image-edit.C0_L0_Q0_NONE.png width=125px>
101
99
  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/qwen-image-edit.C0_L0_Q0_DBCACHE_F8B0_W8M0MC0_T0O2_R0.08_S18.png width=125px>
102
100
  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/qwen-image-edit.C0_L0_Q0_DBCACHE_F1B0_W8M0MC2_T0O2_R0.12_S24.png width=125px>
103
- <p><b>🔥Qwen-Image-Edit</b> | Input w/o Edit | Baseline | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.6x↑🎉 | 1.9x↑🎉 </p>
101
+ <p><b>🔥Qwen-Image-Edit</b> | Input w/o Edit | Baseline | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.6x↑🎉 | 1.9x↑🎉
102
+ <br>♥️ Please consider leaving a <b>⭐️ Star</b> to support us ~ ♥️
103
+ </p>
104
+ </div>
105
+
106
+ <details align='center'>
107
+
108
+ <summary>Click here to show more Image/Video cases</summary>
109
+
110
+ <div align='center'>
111
+ <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext-cat.C0_L0_Q0_NONE.png width=100px>
112
+ <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_NONE.png width=100px>
113
+ <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_DBCACHE_F8B0_W8M0MC0_T0O2_R0.08_S10.png width=100px>
114
+ <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_DBCACHE_F1B0_W8M0MC2_T0O2_R0.12_S12.png width=100px>
115
+ <img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_DBCACHE_F1B0_W2M0MC2_T0O2_R0.15_S15.png width=100px>
116
+ <p><b>🔥FLUX-Kontext-dev</b> | Baseline | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.3x↑🎉 | 1.7x↑🎉 | 2.0x↑ 🎉</p>
104
117
  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/hidream.C0_L0_Q0_NONE.png width=100px>
105
118
  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/hidream.C0_L0_Q0_DBCACHE_F1B0_W8M0MC0_T0O2_R0.08_S24.png width=100px>
106
119
  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/cogview4.C0_L0_Q0_NONE.png width=100px>
@@ -160,24 +173,25 @@ Dynamic: requires-python
160
173
  <p><b>🔥Amused</b> | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.1x↑🎉 | 1.2x↑🎉 | <b>DiT-XL-256</b> | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.8x↑🎉
161
174
  <br>♥️ Please consider leaving a <b>⭐️ Star</b> to support us ~ ♥️</p>
162
175
  </div>
176
+ </details>
163
177
 
164
178
  ## 🔥News
165
179
 
166
- - [2025-09-10] 🎉Day 1 support [**HunyuanImage-2.1**](https://github.com/Tencent-Hunyuan/HunyuanImage-2.1) with **1.7x↑🎉** speedup! Check this [example](https://github.com/vipshop/cache-dit/raw/main/examples/pipeline/run_hunyuan_image_2.1.py).
167
- - [2025-09-08] 🔥[**Qwen-Image-Lightning**](https://github.com/vipshop/cache-dit/raw/main/examples/pipeline/run_qwen_image_lightning.py) **7.1/3.5 steps🎉** inference with **[DBCache: F16B16](https://github.com/vipshop/cache-dit)**.
168
- - [2025-09-03] 🎉[**Wan2.2-MoE**](https://github.com/Wan-Video) **2.4x↑🎉** speedup! Please refer to [run_wan_2.2.py](https://github.com/vipshop/cache-dit/raw/main/examples/pipeline/run_wan_2.2.py) as an example.
169
- - [2025-08-19] 🔥[**Qwen-Image-Edit**](https://github.com/QwenLM/Qwen-Image) **2x↑🎉** speedup! Check the example: [run_qwen_image_edit.py](https://github.com/vipshop/cache-dit/raw/main/examples/pipeline/run_qwen_image_edit.py).
170
- - [2025-08-11] 🔥[**Qwen-Image**](https://github.com/QwenLM/Qwen-Image) **1.8x↑🎉** speedup! Please refer to [run_qwen_image.py](https://github.com/vipshop/cache-dit/raw/main/examples/pipeline/run_qwen_image.py) as an example.
171
- - [2025-07-13] 🎉[**FLUX.1-dev**](https://github.com/xlite-dev/flux-faster) **3.3x↑🎉** speedup! NVIDIA L20 with **[cache-dit](https://github.com/vipshop/cache-dit)** + **compile + FP8 DQ**.
180
+ - [2025-09-10] 🎉Day 1 support [**HunyuanImage-2.1**](https://github.com/Tencent-Hunyuan/HunyuanImage-2.1) with **1.7x↑🎉** speedup! Check this [example](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_hunyuan_image_2.1.py).
181
+ - [2025-09-08] 🔥[**Qwen-Image-Lightning**](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image_lightning.py) **7.1/3.5 steps🎉** inference with **[DBCache: F16B16](https://github.com/vipshop/cache-dit)**.
182
+ - [2025-09-03] 🎉[**Wan2.2-MoE**](https://github.com/Wan-Video) **2.4x↑🎉** speedup! Please refer to [run_wan_2.2.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_wan_2.2.py) as an example.
183
+ - [2025-08-19] 🔥[**Qwen-Image-Edit**](https://github.com/QwenLM/Qwen-Image) **2x↑🎉** speedup! Check the example: [run_qwen_image_edit.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image_edit.py).
184
+ - [2025-08-11] 🔥[**Qwen-Image**](https://github.com/QwenLM/Qwen-Image) **1.8x↑🎉** speedup! Please refer to [run_qwen_image.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image.py) as an example.
172
185
 
173
186
  <details>
174
187
  <summary> Previous News </summary>
175
188
 
189
+ - [2025-07-13] 🎉[**FLUX.1-dev**](https://github.com/xlite-dev/flux-faster) **3.3x↑🎉** speedup! NVIDIA L20 with **[cache-dit](https://github.com/vipshop/cache-dit)** + **compile + FP8 DQ**.
176
190
  - [2025-09-08] 🎉First caching mechanism in [Qwen-Image-Lightning](https://github.com/ModelTC/Qwen-Image-Lightning) with **[cache-dit](https://github.com/vipshop/cache-dit)**, check this [PR](https://github.com/ModelTC/Qwen-Image-Lightning/pull/35).
177
191
  - [2025-09-08] 🎉First caching mechanism in [Wan2.2](https://github.com/Wan-Video/Wan2.2) with **[cache-dit](https://github.com/vipshop/cache-dit)**, check this [PR](https://github.com/Wan-Video/Wan2.2/pull/127) for more details.
178
192
  - [2025-08-12] 🎉First caching mechanism in [QwenLM/Qwen-Image](https://github.com/QwenLM/Qwen-Image) with **[cache-dit](https://github.com/vipshop/cache-dit)**, check this [PR](https://github.com/QwenLM/Qwen-Image/pull/61).
179
- - [2025-09-01] 📚[**Hybird Forward Pattern**](#unified) is supported! Please check [FLUX.1-dev](https://github.com/vipshop/cache-dit/raw/main/examples/run_flux_adapter.py) as an example.
180
- - [2025-08-10] 🔥[**FLUX.1-Kontext-dev**](https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev) is supported! Please refer [run_flux_kontext.py](https://github.com/vipshop/cache-dit/raw/main/examples/pipeline/run_flux_kontext.py) as an example.
193
+ - [2025-09-01] 📚[**Hybrid Forward Pattern**](#unified) is supported! Please check [FLUX.1-dev](https://github.com/vipshop/cache-dit/blob/main/examples/run_flux_adapter.py) as an example.
194
+ - [2025-08-10] 🔥[**FLUX.1-Kontext-dev**](https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev) is supported! Please refer to [run_flux_kontext.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_flux_kontext.py) as an example.
181
195
  - [2025-07-18] 🎉First caching mechanism in [🤗huggingface/flux-fast](https://github.com/huggingface/flux-fast) with **[cache-dit](https://github.com/vipshop/cache-dit)**, check the [PR](https://github.com/huggingface/flux-fast/pull/13).
182
196
 
183
197
  </details>
@@ -187,20 +201,14 @@ Dynamic: requires-python
187
201
  <div id="contents"></div>
188
202
 
189
203
  - [⚙️Installation](#️installation)
190
- - [🔥Benchmarks](#benchmarks)
191
- - [🔥Supported Pipelines](#supported)
192
- - [🎉Unified Cache APIs](#unified)
193
- - [📚Forward Pattern Matching](#forward-pattern-matching)
194
- - [♥️Cache with One-line Code](#%EF%B8%8Fcache-acceleration-with-one-line-code)
195
- - [🔥Automatic Block Adapter](#automatic-block-adapter)
196
- - [📚Hybird Forward Pattern](#automatic-block-adapter)
197
- - [📚Implement Patch Functor](#implement-patch-functor)
198
- - [🤖Cache Acceleration Stats](#cache-acceleration-stats-summary)
204
+ - [🔥Quick Start](#quick-start)
205
+ - [📚Pattern Matching](#forward-pattern-matching)
199
206
  - [⚡️Dual Block Cache](#dbcache)
200
207
  - [🔥TaylorSeer Calibrator](#taylorseer)
201
- - [⚡️Hybrid Cache CFG](#cfg)
202
- - [⚙️Torch Compile](#compile)
203
- - [🛠Metrics CLI](#metrics)
208
+ - [📚Hybrid Cache CFG](#cfg)
209
+ - [🔥Benchmarks](#benchmarks)
210
+ - [🎉User Guide](#user-guide)
211
+ - [©️Citations](#citations)
204
212
 
205
213
  ## ⚙️Installation
206
214
 
@@ -217,325 +225,84 @@ Or you can install the latest develop version from GitHub:
217
225
  pip3 install git+https://github.com/vipshop/cache-dit.git
218
226
  ```
219
227
 
220
- ## 🔥Supported Pipelines
221
-
222
- <div id="supported"></div>
223
-
224
- Currently, **cache-dit** library supports almost **Any** Diffusion Transformers (with **Transformer Blocks** that match the specific Input and Output **patterns**). Please check [🎉Examples](https://github.com/vipshop/cache-dit/raw/main/examples/pipeline) for more details. Here are just some of the tested models listed.
225
-
226
- ```python
227
- >>> import cache_dit
228
- >>> cache_dit.supported_pipelines()
229
- (30, ['Flux*', 'Mochi*', 'CogVideoX*', 'Wan*', 'HunyuanVideo*', 'QwenImage*', 'LTX*', 'Allegro*',
230
- 'CogView3Plus*', 'CogView4*', 'Cosmos*', 'EasyAnimate*', 'SkyReelsV2*', 'StableDiffusion3*',
231
- 'ConsisID*', 'DiT*', 'Amused*', 'Bria*', 'Lumina*', 'OmniGen*', 'PixArt*', 'Sana*', 'StableAudio*',
232
- 'VisualCloze*', 'AuraFlow*', 'Chroma*', 'ShapE*', 'HiDream*', 'HunyuanDiT*', 'HunyuanDiTPAG*'])
233
- ```
234
-
235
- <details>
236
- <summary> Show all pipelines </summary>
237
-
238
- - [🚀HunyuanImage-2.1](https://github.com/vipshop/cache-dit/raw/main/examples)
239
- - [🚀Qwen-Image-Lightning](https://github.com/vipshop/cache-dit/raw/main/examples)
240
- - [🚀Qwen-Image-Edit](https://github.com/vipshop/cache-dit/raw/main/examples)
241
- - [🚀Qwen-Image](https://github.com/vipshop/cache-dit/raw/main/examples)
242
- - [🚀FLUX.1-dev](https://github.com/vipshop/cache-dit/raw/main/examples)
243
- - [🚀FLUX.1-Fill-dev](https://github.com/vipshop/cache-dit/raw/main/examples)
244
- - [🚀FLUX.1-Kontext-dev](https://github.com/vipshop/cache-dit/raw/main/examples)
245
- - [🚀CogView4](https://github.com/vipshop/cache-dit/raw/main/examples)
246
- - [🚀Wan2.2-T2V](https://github.com/vipshop/cache-dit/raw/main/examples)
247
- - [🚀HunyuanVideo](https://github.com/vipshop/cache-dit/raw/main/examples)
248
- - [🚀HiDream-I1-Full](https://github.com/vipshop/cache-dit/raw/main/examples)
249
- - [🚀HunyuanDiT](https://github.com/vipshop/cache-dit/raw/main/examples)
250
- - [🚀Wan2.1-T2V](https://github.com/vipshop/cache-dit/raw/main/examples)
251
- - [🚀Wan2.1-FLF2V](https://github.com/vipshop/cache-dit/raw/main/examples)
252
- - [🚀SkyReelsV2](https://github.com/vipshop/cache-dit/raw/main/examples)
253
- - [🚀Chroma1-HD](https://github.com/vipshop/cache-dit/raw/main/examples)
254
- - [🚀CogVideoX1.5](https://github.com/vipshop/cache-dit/raw/main/examples)
255
- - [🚀CogView3-Plus](https://github.com/vipshop/cache-dit/raw/main/examples)
256
- - [🚀CogVideoX](https://github.com/vipshop/cache-dit/raw/main/examples)
257
- - [🚀VisualCloze](https://github.com/vipshop/cache-dit/raw/main/examples)
258
- - [🚀LTXVideo](https://github.com/vipshop/cache-dit/raw/main/examples)
259
- - [🚀OmniGen](https://github.com/vipshop/cache-dit/raw/main/examples)
260
- - [🚀Lumina2](https://github.com/vipshop/cache-dit/raw/main/examples)
261
- - [🚀mochi-1-preview](https://github.com/vipshop/cache-dit/raw/main/examples)
262
- - [🚀AuraFlow-v0.3](https://github.com/vipshop/cache-dit/raw/main/examples)
263
- - [🚀PixArt-Alpha](https://github.com/vipshop/cache-dit/raw/main/examples)
264
- - [🚀PixArt-Sigma](https://github.com/vipshop/cache-dit/raw/main/examples)
265
- - [🚀NVIDIA Sana](https://github.com/vipshop/cache-dit/raw/main/examples)
266
- - [🚀SD-3/3.5](https://github.com/vipshop/cache-dit/raw/main/examples)
267
- - [🚀ConsisID](https://github.com/vipshop/cache-dit/raw/main/examples)
268
- - [🚀Allegro](https://github.com/vipshop/cache-dit/raw/main/examples)
269
- - [🚀Amused](https://github.com/vipshop/cache-dit/raw/main/examples)
270
- - [🚀DiT-XL](https://github.com/vipshop/cache-dit/raw/main/examples)
271
- - ...
272
-
273
- </details>
274
-
275
- ## 🔥Benchmarks
276
-
277
- <div id="benchmarks"></div>
278
-
279
- cache-dit will support more mainstream Cache acceleration algorithms in the future. More benchmarks will be released, please stay tuned for update. Here, only the results of some precision and performance benchmarks are presented. The test dataset is **DrawBench**. For a complete benchmark, please refer to [📚Benchmarks](https://github.com/vipshop/cache-dit/raw/main/bench/).
280
-
281
- ### 📚Text2Image DrawBench: FLUX.1-dev
282
-
283
- Comparisons between different FnBn compute block configurations show that **more compute blocks result in higher precision**. For example, the F8B0_W8MC0 configuration achieves the best Clip Score (33.007) and ImageReward (1.0333). **Device**: NVIDIA L20. **F**: Fn_compute_blocks, **B**: Bn_compute_blocks, 50 steps.
284
-
285
-
286
- | Config | Clip Score(↑) | ImageReward(↑) | PSNR(↑) | TFLOPs(↓) | SpeedUp(↑) |
287
- | --- | --- | --- | --- | --- | --- |
288
- | [**FLUX.1**-dev]: 50 steps | 32.9217 | 1.0412 | INF | 3726.87 | 1.00x |
289
- | F8B0_W4MC0_R0.08 | 32.9871 | 1.0370 | 33.8317 | 2064.81 | 1.80x |
290
- | F8B0_W4MC2_R0.12 | 32.9535 | 1.0185 | 32.7346 | 1935.73 | 1.93x |
291
- | F8B0_W4MC3_R0.12 | 32.9234 | 1.0085 | 32.5385 | 1816.58 | 2.05x |
292
- | F4B0_W4MC3_R0.12 | 32.8981 | 1.0130 | 31.8031 | 1507.83 | 2.47x |
293
- | F4B0_W4MC4_R0.12 | 32.8384 | 1.0065 | 31.5292 | 1400.08 | 2.66x |
294
-
295
- The comparison between **cache-dit: DBCache** and algorithms such as Δ-DiT, Chipmunk, FORA, DuCa, TaylorSeer and FoCa is as follows. Now, in the comparison with a speedup ratio less than **3x**, cache-dit achieved the best accuracy. Please check [📚How to Reproduce?](https://github.com/vipshop/cache-dit/raw/main/bench/) for more details.
296
-
297
- | Method | TFLOPs(↓) | SpeedUp(↑) | ImageReward(↑) | Clip Score(↑) |
298
- | --- | --- | --- | --- | --- |
299
- | [**FLUX.1**-dev]: 50 steps | 3726.87 | 1.00× | 0.9898 | 32.404 |
300
- | [**FLUX.1**-dev]: 60% steps | 2231.70 | 1.67× | 0.9663 | 32.312 |
301
- | Δ-DiT(N=2) | 2480.01 | 1.50× | 0.9444 | 32.273 |
302
- | Δ-DiT(N=3) | 1686.76 | 2.21× | 0.8721 | 32.102 |
303
- | [**FLUX.1**-dev]: 34% steps | 1264.63 | 3.13× | 0.9453 | 32.114 |
304
- | Chipmunk | 1505.87 | 2.47× | 0.9936 | 32.776 |
305
- | FORA(N=3) | 1320.07 | 2.82× | 0.9776 | 32.266 |
306
- | **[DBCache(F=4,B=0,W=4,MC=4)](https://github.com/vipshop/cache-dit)** | 1400.08 | **2.66×** | **1.0065** | 32.838 |
307
- | **[DBCache+TaylorSeer(F=1,B=0,O=1)](https://github.com/vipshop/cache-dit)** | 1153.05 | **3.23×** | **1.0221** | 32.819 |
308
- | DuCa(N=5) | 978.76 | 3.80× | 0.9955 | 32.241 |
309
- | TaylorSeer(N=4,O=2) | 1042.27 | 3.57× | 0.9857 | 32.413 |
310
- | **[DBCache(F=1,B=0,W=4,MC=6)](https://github.com/vipshop/cache-dit)** | 944.75 | **3.94×** | 0.9997 | 32.849 |
311
- | **[DBCache+TaylorSeer(F=1,B=0,O=1)](https://github.com/vipshop/cache-dit)** | 944.75 | **3.94×** | **1.0107** | 32.865 |
312
- | **[FoCa(N=5): arxiv.2508.16211](https://arxiv.org/pdf/2508.16211)** | 893.54 | **4.16×** | **1.0029** | **32.948** |
313
-
314
- <details>
315
- <summary> Show all comparison </summary>
316
-
317
- | Method | TFLOPs(↓) | SpeedUp(↑) | ImageReward(↑) | Clip Score(↑) |
318
- | --- | --- | --- | --- | --- |
319
- | [**FLUX.1**-dev]: 50 steps | 3726.87 | 1.00× | 0.9898 | 32.404 |
320
- | [**FLUX.1**-dev]: 60% steps | 2231.70 | 1.67× | 0.9663 | 32.312 |
321
- | Δ-DiT(N=2) | 2480.01 | 1.50× | 0.9444 | 32.273 |
322
- | Δ-DiT(N=3) | 1686.76 | 2.21× | 0.8721 | 32.102 |
323
- | [**FLUX.1**-dev]: 34% steps | 1264.63 | 3.13× | 0.9453 | 32.114 |
324
- | Chipmunk | 1505.87 | 2.47× | 0.9936 | 32.776 |
325
- | FORA(N=3) | 1320.07 | 2.82× | 0.9776 | 32.266 |
326
- | **[DBCache(F=4,B=0,W=4,MC=4)](https://github.com/vipshop/cache-dit)** | 1400.08 | **2.66×** | **1.0065** | 32.838 |
327
- | DuCa(N=5) | 978.76 | 3.80× | 0.9955 | 32.241 |
328
- | TaylorSeer(N=4,O=2) | 1042.27 | 3.57× | 0.9857 | 32.413 |
329
- | **[DBCache+TaylorSeer(F=1,B=0,O=1)](https://github.com/vipshop/cache-dit)** | 1153.05 | **3.23×** | **1.0221** | 32.819 |
330
- | **[DBCache(F=1,B=0,W=4,MC=6)](https://github.com/vipshop/cache-dit)** | 944.75 | **3.94×** | 0.9997 | 32.849 |
331
- | **[DBCache+TaylorSeer(F=1,B=0,O=1)](https://github.com/vipshop/cache-dit)** | 944.75 | **3.94×** | **1.0107** | 32.865 |
332
- | **[FoCa(N=5): arxiv.2508.16211](https://arxiv.org/pdf/2508.16211)** | 893.54 | **4.16×** | **1.0029** | **32.948** |
333
- | [**FLUX.1**-dev]: 22% steps | 818.29 | 4.55× | 0.8183 | 31.772 |
334
- | FORA(N=4) | 967.91 | 3.84× | 0.9730 | 32.142 |
335
- | ToCa(N=8) | 784.54 | 4.74× | 0.9451 | 31.993 |
336
- | DuCa(N=7) | 760.14 | 4.89× | 0.9757 | 32.066 |
337
- | TeaCache(l=0.8) | 892.35 | 4.17× | 0.8683 | 31.704 |
338
- | **[DBCache(F=4,B=0,W=4,MC=10)](https://github.com/vipshop/cache-dit)** | 816.65 | 4.56x | 0.8245 | 32.191 |
339
- | TaylorSeer(N=5,O=2) | 893.54 | 4.16× | 0.9768 | 32.467 |
340
- | **[FoCa(N=7): arxiv.2508.16211](https://arxiv.org/pdf/2508.16211)** | 670.44 | **5.54×** | **0.9891** | **32.920** |
341
- | FORA(N=7) | 670.14 | 5.55× | 0.7418 | 31.519 |
342
- | ToCa(N=12) | 644.70 | 5.77× | 0.7155 | 31.808 |
343
- | DuCa(N=10) | 606.91 | 6.13× | 0.8382 | 31.759 |
344
- | TeaCache(l=1.2) | 669.27 | 5.56× | 0.7394 | 31.704 |
345
- | **[DBCache(F=1,B=0,W=4,MC=10)](https://github.com/vipshop/cache-dit)** | 651.90 | **5.72x** | 0.8796 | **32.318** |
346
- | TaylorSeer(N=7,O=2) | 670.44 | 5.54× | 0.9128 | 32.128 |
347
- | **[FoCa(N=8): arxiv.2508.16211](https://arxiv.org/pdf/2508.16211)** | 596.07 | **6.24×** | **0.9502** | **32.706** |
348
-
349
- NOTE: Except for DBCache, other performance data are referenced from the paper [FoCa, arxiv.2508.16211](https://arxiv.org/pdf/2508.16211).
350
-
351
- </details>
352
-
353
- ### 📚Text2Image Distillation DrawBench: Qwen-Image-Lightning
354
-
355
- Surprisingly, cache-dit: DBCache still works in the extremely few-step distill model. For example, **Qwen-Image-Lightning w/ 4 steps**, with the F16B16 configuration, the PSNR is 34.8163, the Clip Score is 35.6109, and the ImageReward is 1.2614. It maintained a relatively high precision.
356
-
357
- | Config | PSNR(↑) | Clip Score(↑) | ImageReward(↑) | TFLOPs(↓) | SpeedUp(↑) |
358
- |----------------------------|-----------|------------|--------------|----------|------------|
359
- | [**Lightning**]: 4 steps | INF | 35.5797 | 1.2630 | 274.33 | 1.00x |
360
- | F24B24_W2MC1_R0.8 | 36.3242 | 35.6224 | 1.2630 | 264.74 | 1.04x |
361
- | F16B16_W2MC1_R0.8 | 34.8163 | 35.6109 | 1.2614 | 244.25 | 1.12x |
362
- | F12B12_W2MC1_R0.8 | 33.8953 | 35.6535 | 1.2549 | 234.63 | 1.17x |
363
- | F8B8_W2MC1_R0.8 | 33.1374 | 35.7284 | 1.2517 | 224.29 | 1.22x |
364
- | F1B0_W2MC1_R0.8 | 31.8317 | 35.6651 | 1.2397 | 206.90 | 1.33x |
365
-
366
- ## 🎉Unified Cache APIs
228
+ ## 🔥Quick Start
367
229
 
368
230
  <div id="unified"></div>
369
231
 
370
- ### 📚Forward Pattern Matching
371
-
372
- Currently, for any **Diffusion** models with **Transformer Blocks** that match the specific **Input/Output patterns**, we can use the **Unified Cache APIs** from **cache-dit**, namely, the `cache_dit.enable_cache(...)` API. The **Unified Cache APIs** are currently in the experimental phase; please stay tuned for updates. The supported patterns are listed as follows:
373
-
374
- ![](https://github.com/vipshop/cache-dit/raw/main/assets/patterns-v1.png)
375
-
376
- ### ♥️Cache Acceleration with One-line Code
377
-
378
- In most cases, you only need to call **one-line** of code, that is `cache_dit.enable_cache(...)`. After this API is called, you just need to call the pipe as normal. The `pipe` param can be **any** Diffusion Pipeline. Please refer to [Qwen-Image](https://github.com/vipshop/cache-dit/raw/main/examples/pipeline/run_qwen_image.py) as an example.
379
-
380
- ```python
381
- import cache_dit
382
- from diffusers import DiffusionPipeline
383
-
384
- # Can be any diffusion pipeline
385
- pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image")
386
-
387
- # One-line code with default cache options.
388
- cache_dit.enable_cache(pipe)
232
+ <div id="quick-start"></div>
389
233
 
390
- # Just call the pipe as normal.
391
- output = pipe(...)
392
-
393
- # Disable cache and run original pipe.
394
- cache_dit.disable_cache(pipe)
395
- ```
396
-
397
- ### 🔥Automatic Block Adapter
398
-
399
- But in some cases, you may have a **modified** Diffusion Pipeline or Transformer that is not located in the diffusers library or not officially supported by **cache-dit** at this time. The **BlockAdapter** can help you solve this problems. Please refer to [🔥Qwen-Image w/ BlockAdapter](https://github.com/vipshop/cache-dit/raw/main/examples/adapter/run_qwen_image_adapter.py) as an example.
400
-
401
- ```python
402
- from cache_dit import ForwardPattern, BlockAdapter
403
-
404
- # Use 🔥BlockAdapter with `auto` mode.
405
- cache_dit.enable_cache(
406
- BlockAdapter(
407
- # Any DiffusionPipeline, Qwen-Image, etc.
408
- pipe=pipe, auto=True,
409
- # Check `📚Forward Pattern Matching` documentation and hack the code of
410
- # of Qwen-Image, you will find that it has satisfied `FORWARD_PATTERN_1`.
411
- forward_pattern=ForwardPattern.Pattern_1,
412
- ),
413
- )
414
-
415
- # Or, manually setup transformer configurations.
416
- cache_dit.enable_cache(
417
- BlockAdapter(
418
- pipe=pipe, # Qwen-Image, etc.
419
- transformer=pipe.transformer,
420
- blocks=pipe.transformer.transformer_blocks,
421
- forward_pattern=ForwardPattern.Pattern_1,
422
- ),
423
- )
424
- ```
425
- For such situations, **BlockAdapter** can help you quickly apply various cache acceleration features to your own Diffusion Pipelines and Transformers. Please check the [📚BlockAdapter.md](https://github.com/vipshop/cache-dit/raw/main/docs/BlockAdapter.md) for more details.
426
-
427
- ### 📚Hybird Forward Pattern
428
-
429
- Sometimes, a Transformer class will contain more than one transformer `blocks`. For example, **FLUX.1** (HiDream, Chroma, etc) contains transformer_blocks and single_transformer_blocks (with different forward patterns). The **BlockAdapter** can also help you solve this problem. Please refer to [📚FLUX.1](https://github.com/vipshop/cache-dit/raw/main/examples/adapter/run_flux_adapter.py) as an example.
234
+ In most cases, you only need ♥️**one line**♥️ of code: `cache_dit.enable_cache(...)`. After calling this API, just call the pipe as normal. The `pipe` param can be **any** Diffusion Pipeline. Please refer to [Qwen-Image](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image.py) as an example.
430
235
 
431
236
  ```python
432
- # For diffusers <= 0.34.0, FLUX.1 transformer_blocks and
433
- # single_transformer_blocks have different forward patterns.
434
- cache_dit.enable_cache(
435
- BlockAdapter(
436
- pipe=pipe, # FLUX.1, etc.
437
- transformer=pipe.transformer,
438
- blocks=[
439
- pipe.transformer.transformer_blocks,
440
- pipe.transformer.single_transformer_blocks,
441
- ],
442
- forward_pattern=[
443
- ForwardPattern.Pattern_1,
444
- ForwardPattern.Pattern_3,
445
- ],
446
- ),
447
- )
237
+ >>> import cache_dit
238
+ >>> from diffusers import DiffusionPipeline
239
+ >>> pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image") # Can be any diffusion pipeline
240
+ >>> cache_dit.enable_cache(pipe) # One-line code with default cache options.
241
+ >>> output = pipe(...) # Just call the pipe as normal.
242
+ >>> stats = cache_dit.summary(pipe) # Then, get the summary of cache acceleration stats.
243
+ >>> cache_dit.disable_cache(pipe) # Disable cache and run original pipe.
448
244
  ```
449
245
 
450
- Even sometimes you have more complex cases, such as **Wan 2.2 MoE**, which has more than one Transformer (namely `transformer` and `transformer_2`) in its structure. Fortunately, **cache-dit** can also handle this situation very well. Please refer to [📚Wan 2.2 MoE](https://github.com/vipshop/cache-dit/raw/main/examples/pipeline/run_wan_2.2.py) as an example.
451
-
452
- ```python
453
- from cache_dit import ForwardPattern, BlockAdapter, ParamsModifier, BasicCacheConfig
246
+ ## 📚Forward Pattern Matching
454
247
 
455
- cache_dit.enable_cache(
456
- BlockAdapter(
457
- pipe=pipe,
458
- transformer=[
459
- pipe.transformer,
460
- pipe.transformer_2,
461
- ],
462
- blocks=[
463
- pipe.transformer.blocks,
464
- pipe.transformer_2.blocks,
465
- ],
466
- forward_pattern=[
467
- ForwardPattern.Pattern_2,
468
- ForwardPattern.Pattern_2,
469
- ],
470
- # Setup different cache params for each 'blocks'. You can
471
- # pass any specific cache params to ParamModifier, the old
472
- # value will be overwrite by the new one.
473
- params_modifiers=[
474
- ParamsModifier(
475
- cache_config=BasicCacheConfig(
476
- max_warmup_steps=4,
477
- max_cached_steps=8,
478
- ),
479
- ),
480
- ParamsModifier(
481
- cache_config=BasicCacheConfig(
482
- max_warmup_steps=2,
483
- max_cached_steps=20,
484
- ),
485
- ),
486
- ],
487
- has_separate_cfg=True,
488
- ),
489
- )
490
- ```
491
- ### 📚Implement Patch Functor
492
-
493
- For any PATTERN not in {0...5}, we introduced the simple abstract concept of **Patch Functor**. Users can implement a subclass of Patch Functor to convert an unknown Pattern into a known PATTERN, and for some models, users may also need to fuse the operations within the blocks for loop into block forward.
248
+ <div id="supported"></div>
494
249
 
495
- ![](https://github.com/vipshop/cache-dit/raw/main/assets/patch-functor.png)
250
+ <div id="forward-pattern-matching"></div>
496
251
 
497
- Some Patch functors have already been provided in cache-dit: [📚HiDreamPatchFunctor](https://github.com/vipshop/cache-dit/raw/main/src/cache_dit/cache_factory/patch_functors/functor_hidream.py), [📚ChromaPatchFunctor](https://github.com/vipshop/cache-dit/raw/main/src/cache_dit/cache_factory/patch_functors/functor_chroma.py), etc. After implementing Patch Functor, users need to set the `patch_functor` property of **BlockAdapter**.
252
+ cache-dit works by matching specific input/output patterns as shown below.
498
253
 
499
- ```python
500
- @BlockAdapterRegistry.register("HiDream")
501
- def hidream_adapter(pipe, **kwargs) -> BlockAdapter:
502
- from diffusers import HiDreamImageTransformer2DModel
503
- from cache_dit.cache_factory.patch_functors import HiDreamPatchFunctor
504
-
505
- assert isinstance(pipe.transformer, HiDreamImageTransformer2DModel)
506
- return BlockAdapter(
507
- pipe=pipe,
508
- transformer=pipe.transformer,
509
- blocks=[
510
- pipe.transformer.double_stream_blocks,
511
- pipe.transformer.single_stream_blocks,
512
- ],
513
- forward_pattern=[
514
- ForwardPattern.Pattern_0,
515
- ForwardPattern.Pattern_3,
516
- ],
517
- # NOTE: Setup your custom patch functor here.
518
- patch_functor=HiDreamPatchFunctor(),
519
- **kwargs,
520
- )
521
- ```
254
+ ![](https://github.com/vipshop/cache-dit/raw/main/assets/patterns-v1.png)
522
255
 
523
- ### 🤖Cache Acceleration Stats Summary
256
+ Please check [🎉Examples](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline) for more details. Only some of the tested models are listed here.
524
257
 
525
- After finishing each inference of `pipe(...)`, you can call the `cache_dit.summary()` API on pipe to get the details of the **Cache Acceleration Stats** for the current inference.
526
258
  ```python
527
- stats = cache_dit.summary(pipe)
259
+ >>> import cache_dit
260
+ >>> cache_dit.supported_pipelines()
261
+ (30, ['Flux*', 'Mochi*', 'CogVideoX*', 'Wan*', 'HunyuanVideo*', 'QwenImage*', 'LTX*', 'Allegro*',
262
+ 'CogView3Plus*', 'CogView4*', 'Cosmos*', 'EasyAnimate*', 'SkyReelsV2*', 'StableDiffusion3*',
263
+ 'ConsisID*', 'DiT*', 'Amused*', 'Bria*', 'Lumina*', 'OmniGen*', 'PixArt*', 'Sana*', 'StableAudio*',
264
+ 'VisualCloze*', 'AuraFlow*', 'Chroma*', 'ShapE*', 'HiDream*', 'HunyuanDiT*', 'HunyuanDiTPAG*'])
528
265
  ```
529
266
 
530
- You can set `details` param as `True` to show more details of cache stats. (markdown table format) Sometimes, this may help you analyze what values of the residual diff threshold would be better.
267
+ <details>
268
+ <summary> Show all pipelines </summary>
531
269
 
532
- ```python
533
- ⚡️Cache Steps and Residual Diffs Statistics: QwenImagePipeline
270
+ - [🚀HunyuanImage-2.1](https://github.com/vipshop/cache-dit/blob/main/examples)
271
+ - [🚀Qwen-Image-Lightning](https://github.com/vipshop/cache-dit/blob/main/examples)
272
+ - [🚀Qwen-Image-Edit](https://github.com/vipshop/cache-dit/blob/main/examples)
273
+ - [🚀Qwen-Image](https://github.com/vipshop/cache-dit/blob/main/examples)
274
+ - [🚀FLUX.1-dev](https://github.com/vipshop/cache-dit/blob/main/examples)
275
+ - [🚀FLUX.1-Fill-dev](https://github.com/vipshop/cache-dit/blob/main/examples)
276
+ - [🚀FLUX.1-Kontext-dev](https://github.com/vipshop/cache-dit/blob/main/examples)
277
+ - [🚀CogView4](https://github.com/vipshop/cache-dit/blob/main/examples)
278
+ - [🚀Wan2.2-T2V](https://github.com/vipshop/cache-dit/blob/main/examples)
279
+ - [🚀HunyuanVideo](https://github.com/vipshop/cache-dit/blob/main/examples)
280
+ - [🚀HiDream-I1-Full](https://github.com/vipshop/cache-dit/blob/main/examples)
281
+ - [🚀HunyuanDiT](https://github.com/vipshop/cache-dit/blob/main/examples)
282
+ - [🚀Wan2.1-T2V](https://github.com/vipshop/cache-dit/blob/main/examples)
283
+ - [🚀Wan2.1-FLF2V](https://github.com/vipshop/cache-dit/blob/main/examples)
284
+ - [🚀SkyReelsV2](https://github.com/vipshop/cache-dit/blob/main/examples)
285
+ - [🚀Chroma1-HD](https://github.com/vipshop/cache-dit/blob/main/examples)
286
+ - [🚀CogVideoX1.5](https://github.com/vipshop/cache-dit/blob/main/examples)
287
+ - [🚀CogView3-Plus](https://github.com/vipshop/cache-dit/blob/main/examples)
288
+ - [🚀CogVideoX](https://github.com/vipshop/cache-dit/blob/main/examples)
289
+ - [🚀VisualCloze](https://github.com/vipshop/cache-dit/blob/main/examples)
290
+ - [🚀LTXVideo](https://github.com/vipshop/cache-dit/blob/main/examples)
291
+ - [🚀OmniGen](https://github.com/vipshop/cache-dit/blob/main/examples)
292
+ - [🚀Lumina2](https://github.com/vipshop/cache-dit/blob/main/examples)
293
+ - [🚀mochi-1-preview](https://github.com/vipshop/cache-dit/blob/main/examples)
294
+ - [🚀AuraFlow-v0.3](https://github.com/vipshop/cache-dit/blob/main/examples)
295
+ - [🚀PixArt-Alpha](https://github.com/vipshop/cache-dit/blob/main/examples)
296
+ - [🚀PixArt-Sigma](https://github.com/vipshop/cache-dit/blob/main/examples)
297
+ - [🚀NVIDIA Sana](https://github.com/vipshop/cache-dit/blob/main/examples)
298
+ - [🚀SD-3/3.5](https://github.com/vipshop/cache-dit/blob/main/examples)
299
+ - [🚀ConsisID](https://github.com/vipshop/cache-dit/blob/main/examples)
300
+ - [🚀Allegro](https://github.com/vipshop/cache-dit/blob/main/examples)
301
+ - [🚀Amused](https://github.com/vipshop/cache-dit/blob/main/examples)
302
+ - [🚀DiT-XL](https://github.com/vipshop/cache-dit/blob/main/examples)
303
+ - ...
534
304
 
535
- | Cache Steps | Diffs Min | Diffs P25 | Diffs P50 | Diffs P75 | Diffs P95 | Diffs Max |
536
- |-------------|-----------|-----------|-----------|-----------|-----------|-----------|
537
- | 23 | 0.045 | 0.084 | 0.114 | 0.147 | 0.241 | 0.297 |
538
- ```
305
+ </details>
539
306
 
540
307
  ## ⚡️DBCache: Dual Block Cache
541
308
 
@@ -543,20 +310,9 @@ You can set `details` param as `True` to show more details of cache stats. (mark
543
310
 
544
311
  ![](https://github.com/vipshop/cache-dit/raw/main/assets/dbcache-v1.png)
545
312
 
546
- **DBCache**: **Dual Block Caching** for Diffusion Transformers. Different configurations of compute blocks (**F8B12**, etc.) can be customized in DBCache, enabling a balanced trade-off between performance and precision. Moreover, it can be entirely **training**-**free**. Please check [DBCache.md](https://github.com/vipshop/cache-dit/raw/main/docs/DBCache.md) docs for more design details.
547
-
548
- - **Fn**: Specifies that DBCache uses the **first n** Transformer blocks to fit the information at time step t, enabling the calculation of a more stable L1 diff and delivering more accurate information to subsequent blocks.
549
- - **Bn**: Further fuses approximate information in the **last n** Transformer blocks to enhance prediction accuracy. These blocks act as an auto-scaler for approximate hidden states that use residual cache.
313
+ **DBCache**: **Dual Block Caching** for Diffusion Transformers. Different configurations of compute blocks (**F8B12**, etc.) can be customized in DBCache, enabling a balanced trade-off between performance and precision. Moreover, it is entirely **training**-**free**. Please check the [DBCache](https://github.com/vipshop/cache-dit/blob/main/docs/DBCache.md) and [User Guide](https://github.com/vipshop/cache-dit/blob/main/docs/User_Guide.md#dbcache) docs for more design details.
550
314
 
551
315
  ```python
552
- import cache_dit
553
- from diffusers import FluxPipeline
554
-
555
- pipe_or_adapter = FluxPipeline.from_pretrained(
556
- "black-forest-labs/FLUX.1-dev",
557
- torch_dtype=torch.bfloat16,
558
- ).to("cuda")
559
-
560
316
  # Default options, F8B0, 8 warmup steps, and unlimited cached
561
317
  # steps for good balance between performance and precision
562
318
  cache_dit.enable_cache(pipe_or_adapter)
@@ -576,28 +332,13 @@ cache_dit.enable_cache(
576
332
  )
577
333
  ```
578
334
 
579
- <div align="center">
580
- <p align="center">
581
- DBCache, <b> L20x1 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
582
- </p>
583
- </div>
584
-
585
- |Baseline(L20x1)|F1B0 (0.08)|F1B0 (0.20)|F8B8 (0.15)|F12B12 (0.20)|F16B16 (0.20)|
586
- |:---:|:---:|:---:|:---:|:---:|:---:|
587
- |24.85s|15.59s|8.58s|15.41s|15.11s|17.74s|
588
- |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.08_S11.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.2_S19.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F8B8S1_R0.15_S15.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F12B12S4_R0.2_S16.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F16B16S4_R0.2_S13.png width=105px>|
589
-
590
335
  ## 🔥TaylorSeer Calibrator
591
336
 
592
337
  <div id="taylorseer"></div>
593
338
 
594
- We have supported the [TaylorSeers: From Reusing to Forecasting: Accelerating Diffusion Models with TaylorSeers](https://arxiv.org/pdf/2503.06923) algorithm to further improve the precision of DBCache in cases where the cached steps are large, namely, **Hybrid TaylorSeer + DBCache**. At timesteps with significant intervals, the feature similarity in diffusion models decreases substantially, significantly harming the generation quality.
339
+ The [TaylorSeers](https://huggingface.co/papers/2503.06923) algorithm further improves the precision of DBCache in cases where the cached steps are large (Hybrid TaylorSeer + DBCache). At timesteps with significant intervals, the feature similarity in diffusion models decreases substantially, significantly harming the generation quality.
595
340
 
596
- $$
597
- \mathcal{F}\_{\text {pred }, m}\left(x_{t-k}^l\right)=\mathcal{F}\left(x_t^l\right)+\sum_{i=1}^m \frac{\Delta^i \mathcal{F}\left(x_t^l\right)}{i!\cdot N^i}(-k)^i
598
- $$
599
-
600
- **TaylorSeer** employs a differential method to approximate the higher-order derivatives of features and predict features in future timesteps with Taylor series expansion. The TaylorSeer implemented in cache-dit supports both hidden states and residual cache types. That is $\mathcal{F}\_{\text {pred }, m}\left(x_{t-k}^l\right)$ can be a residual cache or a hidden-state cache.
341
+ TaylorSeer employs a differential method to approximate the higher-order derivatives of features and predict features in future timesteps with Taylor series expansion. The TaylorSeer implemented in CacheDiT supports both hidden states and residual cache types. That is, the predicted feature (F_pred) can be either a residual cache or a hidden-state cache.
601
342
 
602
343
  ```python
603
344
  from cache_dit import BasicCacheConfig, TaylorSeerCalibratorConfig
@@ -620,25 +361,14 @@ cache_dit.enable_cache(
620
361
  )
621
362
  ```
622
363
 
623
- > [!Important]
624
- > Please note that if you have used TaylorSeer as the calibrator for approximate hidden states, the **Bn** param of DBCache can be set to **0**. In essence, DBCache's Bn is also act as a calibrator, so you can choose either Bn > 0 or TaylorSeer. We recommend using the configuration scheme of **TaylorSeer** + **DBCache FnB0**.
364
+ > [!TIP]
365
+ > The `Bn_compute_blocks` parameter of DBCache can be set to `0` if you use TaylorSeer as the calibrator for approximate hidden states. DBCache's `Bn_compute_blocks` also acts as a calibrator, so you can choose either `Bn_compute_blocks` > 0 or TaylorSeer. We recommend using the configuration scheme of TaylorSeer + DBCache FnB0.
625
366
 
626
- <div align="center">
627
- <p align="center">
628
- <b>DBCache F1B0 + TaylorSeer</b>, L20x1, Steps: 28, <br>"A cat holding a sign that says hello world with complex background"
629
- </p>
630
- </div>
631
-
632
- |Baseline(L20x1)|F1B0 (0.12)|+TaylorSeer|F1B0 (0.15)|+TaylorSeer|+compile|
633
- |:---:|:---:|:---:|:---:|:---:|:---:|
634
- |24.85s|12.85s|12.86s|10.27s|10.28s|8.48s|
635
- |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.12_S14_T12.85s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.12_S14_T12.86s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.15_S17_T10.27s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T10.28s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T8.48s.png width=105px>|
636
-
637
- ## ⚡️Hybrid Cache CFG
367
+ ## 📚Hybrid Cache CFG
638
368
 
639
369
  <div id="cfg"></div>
640
370
 
641
- cache-dit supports caching for **CFG (classifier-free guidance)**. For models that fuse CFG and non-CFG into a single forward step, or models that do not include CFG (classifier-free guidance) in the forward step, please set `enable_separate_cfg` param to **False (default, None)**. Otherwise, set it to True. For examples:
371
+ cache-dit supports caching for CFG (classifier-free guidance). For models that fuse CFG and non-CFG into a single forward step, or models that do not include CFG (classifier-free guidance) in the forward step, please set `enable_separate_cfg` parameter to `False (default, None)`. Otherwise, set it to `True`.
642
372
 
643
373
  ```python
644
374
  from cache_dit import BasicCacheConfig
@@ -647,75 +377,97 @@ cache_dit.enable_cache(
647
377
  pipe_or_adapter,
648
378
  cache_config=BasicCacheConfig(
649
379
  ...,
650
- # CFG: classifier free guidance or not
651
- # For model that fused CFG and non-CFG into single forward step,
652
- # should set enable_separate_cfg as False. For example, set it as True
653
- # for Wan 2.1/Qwen-Image and set it as False for FLUX.1, HunyuanVideo,
654
- # CogVideoX, Mochi, LTXVideo, Allegro, CogView3Plus, EasyAnimate, SD3, etc.
655
- enable_separate_cfg=True, # Wan 2.1, Qwen-Image, CogView4, Cosmos, SkyReelsV2, etc.
656
- # Compute cfg forward first or not, default False, namely,
657
- # 0, 2, 4, ..., -> non-CFG step; 1, 3, 5, ... -> CFG step.
658
- cfg_compute_first=False,
659
- # Compute separate diff values for CFG and non-CFG step,
660
- # default True. If False, we will use the computed diff from
661
- # current non-CFG transformer step for current CFG step.
662
- cfg_diff_compute_separate=True,
380
+ # For example, set it as True for Wan 2.1/Qwen-Image
381
+ # and set it as False for FLUX.1, HunyuanVideo, CogVideoX, etc.
382
+ enable_separate_cfg=True,
663
383
  ),
664
384
  )
665
385
  ```
666
386
 
667
- ## ⚙️Torch Compile
668
-
669
- <div id="compile"></div>
670
-
671
- By the way, **cache-dit** is designed to work compatibly with **torch.compile.** You can easily use cache-dit with torch.compile to further achieve a better performance. For example:
672
-
673
- ```python
674
- cache_dit.enable_cache(pipe)
675
-
676
- # Compile the Transformer module
677
- pipe.transformer = torch.compile(pipe.transformer)
678
- ```
679
- However, users intending to use **cache-dit** for DiT with **dynamic input shapes** should consider increasing the **recompile** **limit** of `torch._dynamo`. Otherwise, the recompile_limit error may be triggered, causing the module to fall back to eager mode.
680
- ```python
681
- torch._dynamo.config.recompile_limit = 96 # default is 8
682
- torch._dynamo.config.accumulated_recompile_limit = 2048 # default is 256
683
- ```
387
+ ## 🔥Benchmarks
684
388
 
685
- Please check [perf.py](https://github.com/vipshop/cache-dit/raw/main/bench/perf.py) for more details.
389
+ <div id="benchmarks"></div>
686
390
 
391
+ The comparison between **cache-dit: DBCache** and algorithms such as Δ-DiT, Chipmunk, FORA, DuCa, TaylorSeer and FoCa is as follows. Among the methods with a speedup ratio below **3x**, cache-dit achieves the best accuracy. Surprisingly, cache-dit: DBCache still works on extremely few-step distilled models. For a complete benchmark, please refer to [📚Benchmarks](https://github.com/vipshop/cache-dit/raw/main/bench/).
687
392
 
688
- ## 🛠Metrics CLI
393
+ | Method | TFLOPs(↓) | SpeedUp(↑) | ImageReward(↑) | Clip Score(↑) |
394
+ | --- | --- | --- | --- | --- |
395
+ | [**FLUX.1**-dev]: 50 steps | 3726.87 | 1.00× | 0.9898 | 32.404 |
396
+ | [**FLUX.1**-dev]: 60% steps | 2231.70 | 1.67× | 0.9663 | 32.312 |
397
+ | Δ-DiT(N=2) | 2480.01 | 1.50× | 0.9444 | 32.273 |
398
+ | Δ-DiT(N=3) | 1686.76 | 2.21× | 0.8721 | 32.102 |
399
+ | [**FLUX.1**-dev]: 34% steps | 1264.63 | 3.13× | 0.9453 | 32.114 |
400
+ | Chipmunk | 1505.87 | 2.47× | 0.9936 | 32.776 |
401
+ | FORA(N=3) | 1320.07 | 2.82× | 0.9776 | 32.266 |
402
+ | **[DBCache(F=4,B=0,W=4,MC=4)](https://github.com/vipshop/cache-dit)** | 1400.08 | **2.66×** | **1.0065** | 32.838 |
403
+ | **[DBCache+TaylorSeer(F=1,B=0,O=1)](https://github.com/vipshop/cache-dit)** | 1153.05 | **3.23×** | **1.0221** | 32.819 |
404
+ | DuCa(N=5) | 978.76 | 3.80× | 0.9955 | 32.241 |
405
+ | TaylorSeer(N=4,O=2) | 1042.27 | 3.57× | 0.9857 | 32.413 |
406
+ | **[DBCache(F=1,B=0,W=4,MC=6)](https://github.com/vipshop/cache-dit)** | 944.75 | **3.94×** | 0.9997 | 32.849 |
407
+ | **[DBCache+TaylorSeer(F=1,B=0,O=1)](https://github.com/vipshop/cache-dit)** | 944.75 | **3.94×** | **1.0107** | 32.865 |
408
+ | **[FoCa(N=5): arxiv.2508.16211](https://arxiv.org/pdf/2508.16211)** | 893.54 | **4.16×** | **1.0029** | **32.948** |
689
409
 
690
- <div id="metrics"></div>
410
+ <details>
411
+ <summary> Show all comparisons </summary>
691
412
 
692
- You can utilize the APIs provided by cache-dit to quickly evaluate the accuracy losses caused by different cache configurations. For example:
413
+ | Method | TFLOPs(↓) | SpeedUp(↑) | ImageReward(↑) | Clip Score(↑) |
414
+ | --- | --- | --- | --- | --- |
415
+ | [**FLUX.1**-dev]: 50 steps | 3726.87 | 1.00× | 0.9898 | 32.404 |
416
+ | [**FLUX.1**-dev]: 60% steps | 2231.70 | 1.67× | 0.9663 | 32.312 |
417
+ | Δ-DiT(N=2) | 2480.01 | 1.50× | 0.9444 | 32.273 |
418
+ | Δ-DiT(N=3) | 1686.76 | 2.21× | 0.8721 | 32.102 |
419
+ | [**FLUX.1**-dev]: 34% steps | 1264.63 | 3.13× | 0.9453 | 32.114 |
420
+ | Chipmunk | 1505.87 | 2.47× | 0.9936 | 32.776 |
421
+ | FORA(N=3) | 1320.07 | 2.82× | 0.9776 | 32.266 |
422
+ | **[DBCache(F=4,B=0,W=4,MC=4)](https://github.com/vipshop/cache-dit)** | 1400.08 | **2.66×** | **1.0065** | 32.838 |
423
+ | DuCa(N=5) | 978.76 | 3.80× | 0.9955 | 32.241 |
424
+ | TaylorSeer(N=4,O=2) | 1042.27 | 3.57× | 0.9857 | 32.413 |
425
+ | **[DBCache+TaylorSeer(F=1,B=0,O=1)](https://github.com/vipshop/cache-dit)** | 1153.05 | **3.23×** | **1.0221** | 32.819 |
426
+ | **[DBCache(F=1,B=0,W=4,MC=6)](https://github.com/vipshop/cache-dit)** | 944.75 | **3.94×** | 0.9997 | 32.849 |
427
+ | **[DBCache+TaylorSeer(F=1,B=0,O=1)](https://github.com/vipshop/cache-dit)** | 944.75 | **3.94×** | **1.0107** | 32.865 |
428
+ | **[FoCa(N=5): arxiv.2508.16211](https://arxiv.org/pdf/2508.16211)** | 893.54 | **4.16×** | **1.0029** | **32.948** |
429
+ | [**FLUX.1**-dev]: 22% steps | 818.29 | 4.55× | 0.8183 | 31.772 |
430
+ | FORA(N=4) | 967.91 | 3.84× | 0.9730 | 32.142 |
431
+ | ToCa(N=8) | 784.54 | 4.74× | 0.9451 | 31.993 |
432
+ | DuCa(N=7) | 760.14 | 4.89× | 0.9757 | 32.066 |
433
+ | TeaCache(l=0.8) | 892.35 | 4.17× | 0.8683 | 31.704 |
434
+ | **[DBCache(F=4,B=0,W=4,MC=10)](https://github.com/vipshop/cache-dit)** | 816.65 | 4.56× | 0.8245 | 32.191 |
435
+ | TaylorSeer(N=5,O=2) | 893.54 | 4.16× | 0.9768 | 32.467 |
436
+ | **[FoCa(N=7): arxiv.2508.16211](https://arxiv.org/pdf/2508.16211)** | 670.44 | **5.54×** | **0.9891** | **32.920** |
437
+ | FORA(N=7) | 670.14 | 5.55× | 0.7418 | 31.519 |
438
+ | ToCa(N=12) | 644.70 | 5.77× | 0.7155 | 31.808 |
439
+ | DuCa(N=10) | 606.91 | 6.13× | 0.8382 | 31.759 |
440
+ | TeaCache(l=1.2) | 669.27 | 5.56× | 0.7394 | 31.704 |
441
+ | **[DBCache(F=1,B=0,W=4,MC=10)](https://github.com/vipshop/cache-dit)** | 651.90 | **5.72×** | 0.8796 | **32.318** |
442
+ | TaylorSeer(N=7,O=2) | 670.44 | 5.54× | 0.9128 | 32.128 |
443
+ | **[FoCa(N=8): arxiv.2508.16211](https://arxiv.org/pdf/2508.16211)** | 596.07 | **6.24×** | **0.9502** | **32.706** |
693
444
 
694
- ```python
695
- from cache_dit.metrics import compute_psnr
696
- from cache_dit.metrics import compute_ssim
697
- from cache_dit.metrics import compute_fid
698
- from cache_dit.metrics import compute_lpips
699
- from cache_dit.metrics import compute_clip_score
700
- from cache_dit.metrics import compute_image_reward
701
-
702
- psnr, n = compute_psnr("true.png", "test.png") # Num: n
703
- psnr, n = compute_psnr("true_dir", "test_dir")
704
- ssim, n = compute_ssim("true_dir", "test_dir")
705
- fid, n = compute_fid("true_dir", "test_dir")
706
- lpips, n = compute_lpips("true_dir", "test_dir")
707
- clip, n = compute_clip_score("DrawBench200.txt", "test_dir")
708
- reward, n = compute_image_reward("DrawBench200.txt", "test_dir")
709
- ```
445
+ NOTE: Except for DBCache, other performance data are referenced from the paper [FoCa, arxiv.2508.16211](https://arxiv.org/pdf/2508.16211).
710
446
 
711
- Or, you can use `cache-dit-metrics-cli` tool. For examples:
447
+ </details>
712
448
 
713
- ```bash
714
- cache-dit-metrics-cli -h # show usage
715
- # all: PSNR, FID, SSIM, MSE, ..., etc.
716
- cache-dit-metrics-cli all -i1 true.png -i2 test.png # image
717
- cache-dit-metrics-cli all -i1 true_dir -i2 test_dir # image dir
718
- ```
449
+ ## 🎉User Guide
450
+
451
+ <div id="user-guide"></div>
452
+
453
+ For more advanced features such as **Unified Cache APIs**, **Forward Pattern Matching**, **Automatic Block Adapter**, **Hybrid Forward Pattern**, **DBCache**, **TaylorSeer Calibrator**, and **Hybrid Cache CFG**, please refer to the [🎉User_Guide.md](./docs/User_Guide.md) for details.
454
+
455
+ - [⚙️Installation](./docs/User_Guide.md#️installation)
456
+ - [🔥Benchmarks](./docs/User_Guide.md#benchmarks)
457
+ - [🔥Supported Pipelines](./docs/User_Guide.md#supported-pipelines)
458
+ - [🎉Unified Cache APIs](./docs/User_Guide.md#unified-cache-apis)
459
+ - [📚Forward Pattern Matching](./docs/User_Guide.md#forward-pattern-matching)
460
+ - [📚Cache with One-line Code](./docs/User_Guide.md#%EF%B8%8Fcache-acceleration-with-one-line-code)
461
+ - [🔥Automatic Block Adapter](./docs/User_Guide.md#automatic-block-adapter)
462
+ - [📚Hybrid Forward Pattern](./docs/User_Guide.md#hybird-forward-pattern)
463
+ - [📚Implement Patch Functor](./docs/User_Guide.md#implement-patch-functor)
464
+ - [🤖Cache Acceleration Stats](./docs/User_Guide.md#cache-acceleration-stats-summary)
465
+ - [⚡️Dual Block Cache](./docs/User_Guide.md#️dbcache-dual-block-cache)
466
+ - [🔥TaylorSeer Calibrator](./docs/User_Guide.md#taylorseer-calibrator)
467
+ - [⚡️Hybrid Cache CFG](./docs/User_Guide.md#️hybrid-cache-cfg)
468
+ - [⚙️Torch Compile](./docs/User_Guide.md#️torch-compile)
469
+ - [🛠Metrics CLI](./docs/User_Guide.md#metrics-cli)
470
+ - [📚API Documents](./docs/User_Guide.md#api-documentation)
719
471
 
720
472
  ## 👋Contribute
721
473
  <div id="contribute"></div>
@@ -744,7 +496,7 @@ The **cache-dit** codebase is adapted from FBCache. Over time its codebase diver
744
496
 
745
497
  ```BibTeX
746
498
  @misc{cache-dit@2025,
747
- title={cache-dit: A Unified, Flexible and Training-free Cache Acceleration Framework for 🤗Diffusers.},
499
+ title={cache-dit: A Unified, Flexible and Training-free Cache Acceleration Framework for Diffusers.},
748
500
  url={https://github.com/vipshop/cache-dit.git},
749
501
  note={Open-source software available at https://github.com/vipshop/cache-dit.git},
750
502
  author={vipshop.com},
@@ -1,5 +1,5 @@
1
1
  cache_dit/__init__.py,sha256=sHRg0swXZZiw6lvSQ53fcVtN9JRayx0az2lXAz5OOGI,1510
2
- cache_dit/_version.py,sha256=e8NqPtZ8fggRgk3GPrqZ_U_BDV8aSULw1u_Gn9NNbnk,704
2
+ cache_dit/_version.py,sha256=lemL_4Kl75FgrO6lVuFrrtw6-Dcf9wtXBalKkXuzkO4,704
3
3
  cache_dit/logger.py,sha256=0zsu42hN-3-rgGC_C29ms1IvVpV4_b4_SwJCKSenxBE,4304
4
4
  cache_dit/utils.py,sha256=AyYRwi5XBxYBH4GaXxOxv9-X24Te_IYOYwh54t_1d3A,10674
5
5
  cache_dit/cache_factory/.gitignore,sha256=5Cb-qT9wsTUoMJ7vACDF7ZcLpAXhi5v-xdcWSRit988,23
@@ -10,15 +10,16 @@ cache_dit/cache_factory/forward_pattern.py,sha256=FumlCuZ-TSmSYH0hGBHctSJ-oGLCft
10
10
  cache_dit/cache_factory/params_modifier.py,sha256=zYJJsInTYCaYHBZ7mZJOP-PZnkSg3iN1WPewNOayXos,3628
11
11
  cache_dit/cache_factory/utils.py,sha256=XkVM9AXcB9zYq8-S8QKAsGz80r3tA6U3lBNGDGeHOe4,1871
12
12
  cache_dit/cache_factory/block_adapters/__init__.py,sha256=33geXMz56TxFWMp0c-H4__MY5SGRzKMKj3TXnUYOMlc,17512
13
- cache_dit/cache_factory/block_adapters/block_adapters.py,sha256=jAgzMPTaY4rBuq7DLK2VeEWuYLy7lvw7bZcPY4S93b4,21660
13
+ cache_dit/cache_factory/block_adapters/block_adapters.py,sha256=2TVK_KqiYXC7AKZ2s07fzdOzUoeUBc9P1SzQtLVzhf4,22249
14
14
  cache_dit/cache_factory/block_adapters/block_registers.py,sha256=2L7QeM4ygnaKQpC9PoJod0QRYyxidUKU2AYpysDCUwE,2572
15
15
  cache_dit/cache_factory/cache_adapters/__init__.py,sha256=py71WGD3JztQ1uk6qdLVbzYcQ1rvqFidNNaQYo7tqTo,79
16
- cache_dit/cache_factory/cache_adapters/cache_adapter.py,sha256=GrkSz4was9gg_dYkfBobrOQ_eNqipQBqeuFfqcwkCXc,19650
17
- cache_dit/cache_factory/cache_blocks/__init__.py,sha256=08Ox7kD05lkRKCOsVTdEZeKAWBheqpxfrAT1Nz7eclI,2916
16
+ cache_dit/cache_factory/cache_adapters/cache_adapter.py,sha256=PuNFO0t9510MhOOJy93cz0uiG8PeWKsjgUWshNj76LQ,20906
17
+ cache_dit/cache_factory/cache_blocks/__init__.py,sha256=mivvm8YOfqT7YHs8y_MzGOGztPw8LxAqKGXuSRXxCv0,3032
18
+ cache_dit/cache_factory/cache_blocks/offload_utils.py,sha256=wusgcqaCrwEjvv7Guy-6VXhNOgPPUrBV2sSVuRmGuvo,3513
18
19
  cache_dit/cache_factory/cache_blocks/pattern_0_1_2.py,sha256=ElMps6_7uI74tSF9GDR_dEI0bZEhdzcepM29xFWnYo8,428
19
20
  cache_dit/cache_factory/cache_blocks/pattern_3_4_5.py,sha256=Bv56qETXhsREvCrNvnZpSqDIIHsi6Ze3FJW4Yk2x3uI,8597
20
- cache_dit/cache_factory/cache_blocks/pattern_base.py,sha256=d4H9kEB0AgnVMT8aF0Y54SUMUQUxw5HQ8gRkoCuTQ_A,14577
21
- cache_dit/cache_factory/cache_blocks/utils.py,sha256=dGOC1tMMOvcbvEgx44eTESKn_jsv-0RZ3tRHPa3wmQ4,1315
21
+ cache_dit/cache_factory/cache_blocks/pattern_base.py,sha256=wdh0bbcpKO08AW2FTsj9X_tTbFCLkDmBjrstMxTf7MQ,14668
22
+ cache_dit/cache_factory/cache_blocks/pattern_utils.py,sha256=dGOC1tMMOvcbvEgx44eTESKn_jsv-0RZ3tRHPa3wmQ4,1315
22
23
  cache_dit/cache_factory/cache_contexts/__init__.py,sha256=T6Vak3x7Rs0Oy15Tou49p-rPQRA2jiuYtJBsbv1lBBU,388
23
24
  cache_dit/cache_factory/cache_contexts/cache_context.py,sha256=3EhaMCz3VUQ_NF81VgYwWoSEGIvhScPxPYhjL1OcgxE,15240
24
25
  cache_dit/cache_factory/cache_contexts/cache_manager.py,sha256=hSKAeP1CxmO3RFUxjFjAK1xdvVvTmeayh5jEHMaQXNE,30225
@@ -48,9 +49,9 @@ cache_dit/metrics/metrics.py,sha256=7UV-H2NRbhfr6dvrXEzU97Zy-BSQ5zEfm9CKtaK4ldg,
48
49
  cache_dit/quantize/__init__.py,sha256=kWYoMAyZgBXu9BJlZjTQ0dRffW9GqeeY9_iTkXrb70A,59
49
50
  cache_dit/quantize/quantize_ao.py,sha256=Fx1KW4l3gdEkdrcAYtPoDW7WKBJWrs3glOHiEwW_TgE,6160
50
51
  cache_dit/quantize/quantize_interface.py,sha256=2s_R7xPSKuJeFpEGeLwRxnq_CqJcBG3a3lzyW5wh-UM,1241
51
- cache_dit-0.3.2.dist-info/licenses/LICENSE,sha256=Dqb07Ik2dV41s9nIdMUbiRWEfDqo7-dQeRiY7kPO8PE,3769
52
- cache_dit-0.3.2.dist-info/METADATA,sha256=L8vWXW0w9Z4GXVXylKnqmhnfpKJ8YeL0LKIuwLL8HEo,47858
53
- cache_dit-0.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
54
- cache_dit-0.3.2.dist-info/entry_points.txt,sha256=FX2gysXaZx6NeK1iCLMcIdP8Q4_qikkIHtEmi3oWn8o,65
55
- cache_dit-0.3.2.dist-info/top_level.txt,sha256=ZJDydonLEhujzz0FOkVbO-BqfzO9d_VqRHmZU-3MOZo,10
56
- cache_dit-0.3.2.dist-info/RECORD,,
52
+ cache_dit-0.3.3.dist-info/licenses/LICENSE,sha256=Dqb07Ik2dV41s9nIdMUbiRWEfDqo7-dQeRiY7kPO8PE,3769
53
+ cache_dit-0.3.3.dist-info/METADATA,sha256=2kUqLHOXsbb25iz6uO8Y3pzOVMSaRHs-st6o3imjX_o,34752
54
+ cache_dit-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
55
+ cache_dit-0.3.3.dist-info/entry_points.txt,sha256=FX2gysXaZx6NeK1iCLMcIdP8Q4_qikkIHtEmi3oWn8o,65
56
+ cache_dit-0.3.3.dist-info/top_level.txt,sha256=ZJDydonLEhujzz0FOkVbO-BqfzO9d_VqRHmZU-3MOZo,10
57
+ cache_dit-0.3.3.dist-info/RECORD,,