cache-dit 0.3.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cache-dit might be problematic. Click here for more details.
- cache_dit/_version.py +2 -2
- cache_dit/cache_factory/block_adapters/block_adapters.py +13 -0
- cache_dit/cache_factory/cache_adapters/cache_adapter.py +42 -7
- cache_dit/cache_factory/cache_blocks/__init__.py +4 -0
- cache_dit/cache_factory/cache_blocks/offload_utils.py +115 -0
- cache_dit/cache_factory/cache_blocks/pattern_base.py +3 -0
- {cache_dit-0.3.2.dist-info → cache_dit-1.0.0.dist-info}/METADATA +149 -382
- {cache_dit-0.3.2.dist-info → cache_dit-1.0.0.dist-info}/RECORD +13 -12
- /cache_dit/cache_factory/cache_blocks/{utils.py → pattern_utils.py} +0 -0
- {cache_dit-0.3.2.dist-info → cache_dit-1.0.0.dist-info}/WHEEL +0 -0
- {cache_dit-0.3.2.dist-info → cache_dit-1.0.0.dist-info}/entry_points.txt +0 -0
- {cache_dit-0.3.2.dist-info → cache_dit-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {cache_dit-0.3.2.dist-info → cache_dit-1.0.0.dist-info}/top_level.txt +0 -0
cache_dit/_version.py
CHANGED
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.
|
|
32
|
-
__version_tuple__ = version_tuple = (
|
|
31
|
+
__version__ = version = '1.0.0'
|
|
32
|
+
__version_tuple__ = version_tuple = (1, 0, 0)
|
|
33
33
|
|
|
34
34
|
__commit_id__ = commit_id = None
|
|
@@ -113,6 +113,19 @@ class BlockAdapter:
|
|
|
113
113
|
if any((self.pipe is not None, self.transformer is not None)):
|
|
114
114
|
self.maybe_fill_attrs()
|
|
115
115
|
self.maybe_patchify()
|
|
116
|
+
self.maybe_skip_checks()
|
|
117
|
+
|
|
118
|
+
def maybe_skip_checks(self):
    """Disable pattern checks when a hook is attached to the transformer.

    Looks for a non-``None`` ``_hf_hook`` attribute and then a non-``None``
    ``_diffusers_hook`` attribute on ``self.transformer``. For the first one
    found, a warning is logged and both ``check_forward_pattern`` and
    ``check_num_outputs`` are set to ``False``. When neither attribute is
    present the flags are left untouched.
    """
    hook_warnings = (
        ("_hf_hook", "_hf_hook is not None, force skip pattern check!"),
        (
            "_diffusers_hook",
            "_diffusers_hook is not None, force skip pattern check!",
        ),
    )
    for hook_attr, message in hook_warnings:
        if getattr(self.transformer, hook_attr, None) is None:
            continue
        logger.warning(message)
        self.check_forward_pattern = False
        self.check_num_outputs = False
        # Only the first attached hook is reported.
        return
|
|
116
129
|
|
|
117
130
|
def maybe_fill_attrs(self):
|
|
118
131
|
# NOTE: This func should be called before normalize.
|
|
@@ -1,10 +1,8 @@
|
|
|
1
1
|
import torch
|
|
2
|
-
|
|
3
2
|
import unittest
|
|
4
3
|
import functools
|
|
5
|
-
|
|
6
4
|
from contextlib import ExitStack
|
|
7
|
-
from typing import Dict, List, Tuple, Any, Union, Callable
|
|
5
|
+
from typing import Dict, List, Tuple, Any, Union, Callable, Optional
|
|
8
6
|
|
|
9
7
|
from diffusers import DiffusionPipeline
|
|
10
8
|
|
|
@@ -16,7 +14,7 @@ from cache_dit.cache_factory.cache_contexts import CachedContextManager
|
|
|
16
14
|
from cache_dit.cache_factory.cache_contexts import BasicCacheConfig
|
|
17
15
|
from cache_dit.cache_factory.cache_contexts import CalibratorConfig
|
|
18
16
|
from cache_dit.cache_factory.cache_blocks import CachedBlocks
|
|
19
|
-
from cache_dit.cache_factory.cache_blocks
|
|
17
|
+
from cache_dit.cache_factory.cache_blocks import (
|
|
20
18
|
patch_cached_stats,
|
|
21
19
|
remove_cached_stats,
|
|
22
20
|
)
|
|
@@ -330,7 +328,26 @@ class CachedAdapter:
|
|
|
330
328
|
|
|
331
329
|
assert isinstance(dummy_blocks_names, list)
|
|
332
330
|
|
|
333
|
-
|
|
331
|
+
from accelerate import hooks
|
|
332
|
+
|
|
333
|
+
_hf_hook: Optional[hooks.ModelHook] = None
|
|
334
|
+
|
|
335
|
+
if getattr(transformer, "_hf_hook", None) is not None:
|
|
336
|
+
_hf_hook = transformer._hf_hook # hooks from accelerate.hooks
|
|
337
|
+
if hasattr(transformer, "_old_forward"):
|
|
338
|
+
logger.warning(
|
|
339
|
+
"_hf_hook is not None, so, we have to re-direct transformer's "
|
|
340
|
+
f"original_forward({id(original_forward)}) to transformer's "
|
|
341
|
+
f"_old_forward({id(transformer._old_forward)})"
|
|
342
|
+
)
|
|
343
|
+
original_forward = transformer._old_forward
|
|
344
|
+
|
|
345
|
+
# TODO: remove group offload hooks, then re-apply them after the cache is applied.
|
|
346
|
+
# hooks = _diffusers_hook.hooks.copy(); _diffusers_hook.hooks.clear()
|
|
347
|
+
# re-apply hooks to transformer after cache applied.
|
|
348
|
+
# from diffusers.hooks.hooks import HookFunctionReference, HookRegistry
|
|
349
|
+
# from diffusers.hooks.group_offloading import apply_group_offloading
|
|
350
|
+
|
|
334
351
|
def new_forward(self, *args, **kwargs):
|
|
335
352
|
with ExitStack() as stack:
|
|
336
353
|
for name, context_name in zip(
|
|
@@ -348,9 +365,27 @@ class CachedAdapter:
|
|
|
348
365
|
self, dummy_name, dummy_blocks
|
|
349
366
|
)
|
|
350
367
|
)
|
|
351
|
-
|
|
368
|
+
outputs = original_forward(*args, **kwargs)
|
|
369
|
+
return outputs
|
|
370
|
+
|
|
371
|
+
def new_forward_with_hf_hook(self, *args, **kwargs):
|
|
372
|
+
# Compatible with model cpu offload
|
|
373
|
+
if _hf_hook is not None and hasattr(_hf_hook, "pre_forward"):
|
|
374
|
+
args, kwargs = _hf_hook.pre_forward(self, *args, **kwargs)
|
|
375
|
+
|
|
376
|
+
outputs = new_forward(self, *args, **kwargs)
|
|
377
|
+
|
|
378
|
+
if _hf_hook is not None and hasattr(_hf_hook, "post_forward"):
|
|
379
|
+
outputs = _hf_hook.post_forward(self, outputs)
|
|
380
|
+
|
|
381
|
+
return outputs
|
|
382
|
+
|
|
383
|
+
# NOTE: Still not fully compatible with group offloading
|
|
384
|
+
transformer.forward = functools.update_wrapper(
|
|
385
|
+
functools.partial(new_forward_with_hf_hook, transformer),
|
|
386
|
+
new_forward_with_hf_hook,
|
|
387
|
+
)
|
|
352
388
|
|
|
353
|
-
transformer.forward = new_forward.__get__(transformer)
|
|
354
389
|
transformer._original_forward = original_forward
|
|
355
390
|
transformer._is_cached = True
|
|
356
391
|
|
|
@@ -12,6 +12,10 @@ from cache_dit.cache_factory.cache_blocks.pattern_0_1_2 import (
|
|
|
12
12
|
from cache_dit.cache_factory.cache_blocks.pattern_3_4_5 import (
|
|
13
13
|
CachedBlocks_Pattern_3_4_5,
|
|
14
14
|
)
|
|
15
|
+
from cache_dit.cache_factory.cache_blocks.pattern_utils import (
|
|
16
|
+
patch_cached_stats,
|
|
17
|
+
remove_cached_stats,
|
|
18
|
+
)
|
|
15
19
|
|
|
16
20
|
from cache_dit.logger import init_logger
|
|
17
21
|
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
import asyncio
|
|
3
|
+
import logging
|
|
4
|
+
from contextlib import contextmanager
|
|
5
|
+
from typing import Generator, Optional, List
|
|
6
|
+
from diffusers.hooks.group_offloading import _is_group_offload_enabled
|
|
7
|
+
from cache_dit.logger import init_logger
|
|
8
|
+
|
|
9
|
+
logger = init_logger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@torch.compiler.disable
@contextmanager
def maybe_onload(
    block: torch.nn.Module,
    reference_tensor: torch.Tensor,
    pending_tasks: Optional[List[asyncio.Task]] = None,
) -> Generator:
    """Temporarily move ``block`` onto the device of ``reference_tensor``.

    When ``_is_group_offload_enabled(block)`` is false the block is yielded
    unchanged and nothing else happens. Otherwise the device of every
    parameter is recorded; if the parameters live on more than one device,
    or on a device other than ``reference_tensor.device``, the block is moved
    there before being yielded. On exit a task is scheduled that moves each
    parameter's data back to the device recorded for it.

    Args:
        block: Module to onload for the duration of the ``with`` body.
        reference_tensor: Tensor whose ``.device`` is the onload target.
        pending_tasks: List that receives the scheduled restore task so the
            caller can wait on it later. When omitted, the restore task is
            still scheduled but is not exposed to the caller.

    Yields:
        The (possibly moved) block.
    """
    # A literal ``[]`` default would be one list shared by every call that
    # omits the argument; use a fresh list per call instead.
    if pending_tasks is None:
        pending_tasks = []

    if not _is_group_offload_enabled(block):
        yield block
        return

    original_devices: Optional[List[torch.device]] = None
    if hasattr(block, "parameters"):
        params = list(block.parameters())
        if params:
            original_devices = [param.data.device for param in params]

    target_device: torch.device = reference_tensor.device
    move_task: Optional[asyncio.Task] = None
    need_restore: bool = False

    try:
        if original_devices is not None:
            unique_devices = list(set(original_devices))
            if len(unique_devices) > 1 or unique_devices[0] != target_device:
                if logger.isEnabledFor(logging.DEBUG):
                    logger.debug(
                        f"Onloading from {unique_devices} to {target_device}"
                    )

                has_meta_params = any(
                    dev.type == "meta" for dev in original_devices
                )
                if has_meta_params:  # compatible with sequential cpu offload
                    block = block.to_empty(device=target_device)
                else:
                    block = block.to(target_device, non_blocking=False)
                need_restore = True
        yield block
    finally:
        if need_restore and original_devices:

            async def restore_device():
                # Move every parameter back to the device recorded for it
                # before the onload, one worker-thread hop per parameter.
                for param, original_device in zip(
                    block.parameters(), original_devices
                ):
                    param.data = await asyncio.to_thread(
                        lambda p, d: p.to(d, non_blocking=True),
                        param.data,
                        original_device,
                    )  # type: ignore[assignment]

            loop = get_event_loop()
            # NOTE(review): when `loop` runs in another thread, create_task()
            # is not a thread-safe way to schedule work; the task may only
            # start once something wakes the loop -- confirm this is intended.
            move_task = loop.create_task(restore_device())
        if move_task is not None:
            pending_tasks.append(move_task)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
import threading

# Single background loop shared by every caller that is not already inside a
# running loop, and the lock that guards its lazy creation.
_shared_loop = None
_shared_loop_lock = threading.Lock()


def get_event_loop() -> asyncio.AbstractEventLoop:
    """Return an event loop that is running and can accept scheduled work.

    If the calling thread is currently executing inside a running loop, that
    loop is returned. Otherwise a process-wide background loop is returned;
    it is created on first use and driven by ``run_forever()`` in a daemon
    thread named ``_my_loop``. Every thread without a running loop of its own
    receives the same background loop, so work scheduled from one thread can
    be awaited from another.

    Returns:
        The running loop of the calling thread, or the shared background loop.
    """
    global _shared_loop

    try:
        # Called from a coroutine or callback: reuse the caller's loop.
        return asyncio.get_running_loop()
    except RuntimeError:
        pass

    with _shared_loop_lock:
        # Tie the loop's identity to module state, not to a thread name, so
        # a second thread can never be handed a loop that nothing runs.
        if _shared_loop is None or _shared_loop.is_closed():
            loop = asyncio.new_event_loop()

            def run_loop() -> None:
                asyncio.set_event_loop(loop)
                loop.run_forever()

            threading.Thread(
                target=run_loop, name="_my_loop", daemon=True
            ).start()
            _shared_loop = loop
        return _shared_loop
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@torch.compiler.disable
|
|
98
|
+
def maybe_offload(
|
|
99
|
+
pending_tasks: List[asyncio.Task],
|
|
100
|
+
) -> None:
|
|
101
|
+
if not pending_tasks:
|
|
102
|
+
return
|
|
103
|
+
|
|
104
|
+
loop = get_event_loop()
|
|
105
|
+
|
|
106
|
+
async def gather_tasks():
|
|
107
|
+
return await asyncio.gather(*pending_tasks)
|
|
108
|
+
|
|
109
|
+
future = asyncio.run_coroutine_threadsafe(gather_tasks(), loop)
|
|
110
|
+
try:
|
|
111
|
+
future.result(timeout=30.0)
|
|
112
|
+
except Exception as e:
|
|
113
|
+
logger.error(f"May Offload Error: {e}")
|
|
114
|
+
|
|
115
|
+
pending_tasks.clear()
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
import inspect
|
|
2
|
+
import asyncio
|
|
2
3
|
import torch
|
|
3
4
|
import torch.distributed as dist
|
|
4
5
|
|
|
6
|
+
from typing import List
|
|
5
7
|
from cache_dit.cache_factory.cache_contexts.cache_context import CachedContext
|
|
6
8
|
from cache_dit.cache_factory.cache_contexts.cache_manager import (
|
|
7
9
|
CachedContextManager,
|
|
@@ -45,6 +47,7 @@ class CachedBlocks_Pattern_Base(torch.nn.Module):
|
|
|
45
47
|
self.cache_prefix = cache_prefix
|
|
46
48
|
self.cache_context = cache_context
|
|
47
49
|
self.cache_manager = cache_manager
|
|
50
|
+
self.pending_tasks: List[asyncio.Task] = []
|
|
48
51
|
|
|
49
52
|
self._check_forward_pattern()
|
|
50
53
|
logger.info(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cache_dit
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: A Unified, Flexible and Training-free Cache Acceleration Framework for 🤗Diffusers.
|
|
5
5
|
Author: DefTruth, vipshop.com, etc.
|
|
6
6
|
Maintainer: DefTruth, vipshop.com, etc
|
|
@@ -48,23 +48,31 @@ Dynamic: requires-python
|
|
|
48
48
|
<div align="center">
|
|
49
49
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/cache-dit-logo.png height="120">
|
|
50
50
|
|
|
51
|
-
<p align="center">
|
|
51
|
+
<p align="center">
|
|
52
52
|
A <b>Unified</b>, Flexible and Training-free <b>Cache Acceleration</b> Framework for <b>🤗Diffusers</b> <br>
|
|
53
53
|
♥️ Cache Acceleration with <b>One-line</b> Code ~ ♥️
|
|
54
54
|
</p>
|
|
55
55
|
<div align='center'>
|
|
56
|
+
<img src="./assets/image-reward-bench.png" width=580px >
|
|
57
|
+
</div>
|
|
58
|
+
<div align='center'>
|
|
59
|
+
<a href="https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit"><img src=https://img.shields.io/badge/🤗Diffusers-ecosystem-yellow.svg ></a>
|
|
56
60
|
<img src=https://img.shields.io/badge/Language-Python-brightgreen.svg >
|
|
57
|
-
<img src=https://img.shields.io/badge/PRs-welcome-
|
|
61
|
+
<img src=https://img.shields.io/badge/PRs-welcome-blue.svg >
|
|
58
62
|
<img src=https://img.shields.io/badge/PyPI-pass-brightgreen.svg >
|
|
59
63
|
<img src=https://static.pepy.tech/badge/cache-dit >
|
|
60
|
-
<img src=https://img.shields.io/
|
|
61
|
-
|
|
62
|
-
|
|
64
|
+
<img src=https://img.shields.io/github/stars/vipshop/cache-dit.svg?style=dark >
|
|
65
|
+
</div>
|
|
66
|
+
<div align='center'>
|
|
67
|
+
<a href="./README.md">📚English</a> | <a href="./README_CN.md">📚中文阅读 </a> | <a href="./docs/User_Guide.md#api-documentation"> 📚API Documentation </a> | <a href="https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit">🤗Diffusers' Documentation</a>
|
|
68
|
+
</div>
|
|
69
|
+
<!--
|
|
63
70
|
<p align="center">
|
|
64
|
-
<b><a href="#unified">📚Unified Cache APIs</a></b> | <a href="#forward-pattern-matching">📚Forward Pattern Matching</a> | <a href="
|
|
65
|
-
<a href="
|
|
71
|
+
<b><a href="#unified">📚Unified Cache APIs</a></b> | <a href="#forward-pattern-matching">📚Forward Pattern Matching</a> | <a href="./docs/User_Guide.md">📚Automatic Block Adapter</a><br>
|
|
72
|
+
<a href="./docs/User_Guide.md">📚Hybrid Forward Pattern</a> | <a href="#dbcache">📚DBCache</a> | <a href="./docs/User_Guide.md">📚TaylorSeer Calibrator</a> | <a href="./docs/User_Guide.md">📚Cache CFG</a><br>
|
|
66
73
|
<a href="#benchmarks">📚Text2Image DrawBench</a> | <a href="#benchmarks">📚Text2Image Distillation DrawBench</a>
|
|
67
74
|
</p>
|
|
75
|
+
-->
|
|
68
76
|
<p align="center">
|
|
69
77
|
🎉Now, <b>cache-dit</b> covers almost <b>All</b> Diffusers' <b>DiT</b> Pipelines🎉<br>
|
|
70
78
|
🔥<a href="#supported">Qwen-Image</a> | <a href="#supported">FLUX.1</a> | <a href="#supported">Qwen-Image-Lightning</a> | <a href="#supported"> Wan 2.1 </a> | <a href="#supported"> Wan 2.2 </a>🔥<br>
|
|
@@ -74,6 +82,8 @@ Dynamic: requires-python
|
|
|
74
82
|
🔥<a href="#supported">Chroma</a> | <a href="#supported">Sana</a> | <a href="#supported">Allegro</a> | <a href="#supported">Mochi</a> | <a href="#supported">SD 3/3.5</a> | <a href="#supported">Amused</a> | <a href="#supported"> ... </a> | <a href="#supported">DiT-XL</a>🔥
|
|
75
83
|
</p>
|
|
76
84
|
</div>
|
|
85
|
+
|
|
86
|
+
|
|
77
87
|
<div align='center'>
|
|
78
88
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/gifs/wan2.2.C0_Q0_NONE.gif width=124px>
|
|
79
89
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/gifs/wan2.2.C1_Q0_DBCACHE_F1B0_W2M8MC2_T1O2_R0.08.gif width=124px>
|
|
@@ -85,12 +95,6 @@ Dynamic: requires-python
|
|
|
85
95
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux.C0_Q0_NONE_T23.69s.png width=90px>
|
|
86
96
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux.C0_Q0_DBCACHE_F1B0_W4M0MC0_T1O2_R0.15_S16_T11.39s.png width=90px>
|
|
87
97
|
<p><b>🔥Qwen-Image</b> | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.8x↑🎉 | <b>FLUX.1-dev</b> | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:2.1x↑🎉</p>
|
|
88
|
-
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext-cat.C0_L0_Q0_NONE.png width=100px>
|
|
89
|
-
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_NONE.png width=100px>
|
|
90
|
-
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_DBCACHE_F8B0_W8M0MC0_T0O2_R0.08_S10.png width=100px>
|
|
91
|
-
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_DBCACHE_F1B0_W8M0MC2_T0O2_R0.12_S12.png width=100px>
|
|
92
|
-
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_DBCACHE_F1B0_W2M0MC2_T0O2_R0.15_S15.png width=100px>
|
|
93
|
-
<p><b>🔥FLUX-Kontext-dev</b> | Baseline | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.3x↑🎉 | 1.7x↑🎉 | 2.0x↑ 🎉</p>
|
|
94
98
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/qwen-image-lightning.4steps.C0_L1_Q0_NONE.png width=160px>
|
|
95
99
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/qwen-image-lightning.4steps.C0_L1_Q0_DBCACHE_F16B16_W2M1MC1_T0O2_R0.9_S1.png width=160px>
|
|
96
100
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/hunyuan-image-2.1.C0_L0_Q1_fp8_w8a16_wo_NONE.png width=90px>
|
|
@@ -100,7 +104,22 @@ Dynamic: requires-python
|
|
|
100
104
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/qwen-image-edit.C0_L0_Q0_NONE.png width=125px>
|
|
101
105
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/qwen-image-edit.C0_L0_Q0_DBCACHE_F8B0_W8M0MC0_T0O2_R0.08_S18.png width=125px>
|
|
102
106
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/qwen-image-edit.C0_L0_Q0_DBCACHE_F1B0_W8M0MC2_T0O2_R0.12_S24.png width=125px>
|
|
103
|
-
<p><b>🔥Qwen-Image-Edit</b> | Input w/o Edit | Baseline | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.6x↑🎉 | 1.9x↑🎉
|
|
107
|
+
<p><b>🔥Qwen-Image-Edit</b> | Input w/o Edit | Baseline | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.6x↑🎉 | 1.9x↑🎉
|
|
108
|
+
<br>♥️ Please consider leaving a <b>⭐️ Star</b> to support us ~ ♥️
|
|
109
|
+
</p>
|
|
110
|
+
</div>
|
|
111
|
+
|
|
112
|
+
<details align='center'>
|
|
113
|
+
|
|
114
|
+
<summary>Click here to show more Image/Video cases</summary>
|
|
115
|
+
|
|
116
|
+
<div align='center'>
|
|
117
|
+
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext-cat.C0_L0_Q0_NONE.png width=100px>
|
|
118
|
+
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_NONE.png width=100px>
|
|
119
|
+
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_DBCACHE_F8B0_W8M0MC0_T0O2_R0.08_S10.png width=100px>
|
|
120
|
+
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_DBCACHE_F1B0_W8M0MC2_T0O2_R0.12_S12.png width=100px>
|
|
121
|
+
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_DBCACHE_F1B0_W2M0MC2_T0O2_R0.15_S15.png width=100px>
|
|
122
|
+
<p><b>🔥FLUX-Kontext-dev</b> | Baseline | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.3x↑🎉 | 1.7x↑🎉 | 2.0x↑ 🎉</p>
|
|
104
123
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/hidream.C0_L0_Q0_NONE.png width=100px>
|
|
105
124
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/hidream.C0_L0_Q0_DBCACHE_F1B0_W8M0MC0_T0O2_R0.08_S24.png width=100px>
|
|
106
125
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/cogview4.C0_L0_Q0_NONE.png width=100px>
|
|
@@ -160,24 +179,26 @@ Dynamic: requires-python
|
|
|
160
179
|
<p><b>🔥Amused</b> | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.1x↑🎉 | 1.2x↑🎉 | <b>DiT-XL-256</b> | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.8x↑🎉
|
|
161
180
|
<br>♥️ Please consider leaving a <b>⭐️ Star</b> to support us ~ ♥️</p>
|
|
162
181
|
</div>
|
|
182
|
+
</details>
|
|
163
183
|
|
|
164
184
|
## 🔥News
|
|
165
185
|
|
|
166
|
-
- [2025-09-
|
|
167
|
-
- [2025-09-
|
|
168
|
-
- [2025-09-
|
|
169
|
-
- [2025-
|
|
170
|
-
- [2025-08-
|
|
171
|
-
- [2025-
|
|
186
|
+
- [2025-09-24] 🔥**cache-dit** has now joined the 🤗 Diffusers community ecosystem as the **first** cache acceleration framework for DiTs! Check out the documentation here: **[Diffusers Docs](https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit)**. <a href="https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit"><img src=https://img.shields.io/badge/🤗Diffusers-ecosystem-yellow.svg ></a>
|
|
187
|
+
- [2025-09-10] 🎉Day 1 support [**HunyuanImage-2.1**](https://github.com/Tencent-Hunyuan/HunyuanImage-2.1) with **1.7x↑🎉** speedup! Check this [example](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_hunyuan_image_2.1.py).
|
|
188
|
+
- [2025-09-08] 🔥[**Qwen-Image-Lightning**](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image_lightning.py) **7.1/3.5 steps🎉** inference with **[DBCache: F16B16](https://github.com/vipshop/cache-dit)**.
|
|
189
|
+
- [2025-09-03] 🎉[**Wan2.2-MoE**](https://github.com/Wan-Video) **2.4x↑🎉** speedup! Please refer to [run_wan_2.2.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_wan_2.2.py) as an example.
|
|
190
|
+
- [2025-08-19] 🔥[**Qwen-Image-Edit**](https://github.com/QwenLM/Qwen-Image) **2x↑🎉** speedup! Check the example: [run_qwen_image_edit.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image_edit.py).
|
|
191
|
+
- [2025-08-11] 🔥[**Qwen-Image**](https://github.com/QwenLM/Qwen-Image) **1.8x↑🎉** speedup! Please refer to [run_qwen_image.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image.py) as an example.
|
|
172
192
|
|
|
173
193
|
<details>
|
|
174
194
|
<summary> Previous News </summary>
|
|
175
195
|
|
|
196
|
+
- [2025-07-13] 🎉[**FLUX.1-dev**](https://github.com/xlite-dev/flux-faster) **3.3x↑🎉** speedup! NVIDIA L20 with **[cache-dit](https://github.com/vipshop/cache-dit)** + **compile + FP8 DQ**.
|
|
176
197
|
- [2025-09-08] 🎉First caching mechanism in [Qwen-Image-Lightning](https://github.com/ModelTC/Qwen-Image-Lightning) with **[cache-dit](https://github.com/vipshop/cache-dit)**, check this [PR](https://github.com/ModelTC/Qwen-Image-Lightning/pull/35).
|
|
177
198
|
- [2025-09-08] 🎉First caching mechanism in [Wan2.2](https://github.com/Wan-Video/Wan2.2) with **[cache-dit](https://github.com/vipshop/cache-dit)**, check this [PR](https://github.com/Wan-Video/Wan2.2/pull/127) for more details.
|
|
178
199
|
- [2025-08-12] 🎉First caching mechanism in [QwenLM/Qwen-Image](https://github.com/QwenLM/Qwen-Image) with **[cache-dit](https://github.com/vipshop/cache-dit)**, check this [PR](https://github.com/QwenLM/Qwen-Image/pull/61).
|
|
179
|
-
- [2025-09-01] 📚[**Hybird Forward Pattern**](#unified) is supported! Please check [FLUX.1-dev](https://github.com/vipshop/cache-dit/
|
|
180
|
-
- [2025-08-10] 🔥[**FLUX.1-Kontext-dev**](https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev) is supported! Please refer [run_flux_kontext.py](https://github.com/vipshop/cache-dit/
|
|
200
|
+
- [2025-09-01] 📚[**Hybrid Forward Pattern**](#unified) is supported! Please check [FLUX.1-dev](https://github.com/vipshop/cache-dit/blob/main/examples/run_flux_adapter.py) as an example.
|
|
201
|
+
- [2025-08-10] 🔥[**FLUX.1-Kontext-dev**](https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev) is supported! Please refer to [run_flux_kontext.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_flux_kontext.py) as an example.
|
|
181
202
|
- [2025-07-18] 🎉First caching mechanism in [🤗huggingface/flux-fast](https://github.com/huggingface/flux-fast) with **[cache-dit](https://github.com/vipshop/cache-dit)**, check the [PR](https://github.com/huggingface/flux-fast/pull/13).
|
|
182
203
|
|
|
183
204
|
</details>
|
|
@@ -188,19 +209,13 @@ Dynamic: requires-python
|
|
|
188
209
|
|
|
189
210
|
- [⚙️Installation](#️installation)
|
|
190
211
|
- [🔥Benchmarks](#benchmarks)
|
|
191
|
-
- [🔥
|
|
192
|
-
- [
|
|
193
|
-
- [📚Forward Pattern Matching](#forward-pattern-matching)
|
|
194
|
-
- [♥️Cache with One-line Code](#%EF%B8%8Fcache-acceleration-with-one-line-code)
|
|
195
|
-
- [🔥Automatic Block Adapter](#automatic-block-adapter)
|
|
196
|
-
- [📚Hybird Forward Pattern](#automatic-block-adapter)
|
|
197
|
-
- [📚Implement Patch Functor](#implement-patch-functor)
|
|
198
|
-
- [🤖Cache Acceleration Stats](#cache-acceleration-stats-summary)
|
|
212
|
+
- [🔥Quick Start](#quick-start)
|
|
213
|
+
- [📚Pattern Matching](#forward-pattern-matching)
|
|
199
214
|
- [⚡️Dual Block Cache](#dbcache)
|
|
200
215
|
- [🔥TaylorSeer Calibrator](#taylorseer)
|
|
201
|
-
- [
|
|
202
|
-
- [
|
|
203
|
-
- [
|
|
216
|
+
- [📚Hybrid Cache CFG](#cfg)
|
|
217
|
+
- [🎉User Guide](#user-guide)
|
|
218
|
+
- [©️Citations](#citations)
|
|
204
219
|
|
|
205
220
|
## ⚙️Installation
|
|
206
221
|
|
|
@@ -217,82 +232,13 @@ Or you can install the latest develop version from GitHub:
|
|
|
217
232
|
pip3 install git+https://github.com/vipshop/cache-dit.git
|
|
218
233
|
```
|
|
219
234
|
|
|
220
|
-
## 🔥Supported Pipelines
|
|
221
|
-
|
|
222
|
-
<div id="supported"></div>
|
|
223
|
-
|
|
224
|
-
Currently, **cache-dit** library supports almost **Any** Diffusion Transformers (with **Transformer Blocks** that match the specific Input and Output **patterns**). Please check [🎉Examples](https://github.com/vipshop/cache-dit/raw/main/examples/pipeline) for more details. Here are just some of the tested models listed.
|
|
225
|
-
|
|
226
|
-
```python
|
|
227
|
-
>>> import cache_dit
|
|
228
|
-
>>> cache_dit.supported_pipelines()
|
|
229
|
-
(30, ['Flux*', 'Mochi*', 'CogVideoX*', 'Wan*', 'HunyuanVideo*', 'QwenImage*', 'LTX*', 'Allegro*',
|
|
230
|
-
'CogView3Plus*', 'CogView4*', 'Cosmos*', 'EasyAnimate*', 'SkyReelsV2*', 'StableDiffusion3*',
|
|
231
|
-
'ConsisID*', 'DiT*', 'Amused*', 'Bria*', 'Lumina*', 'OmniGen*', 'PixArt*', 'Sana*', 'StableAudio*',
|
|
232
|
-
'VisualCloze*', 'AuraFlow*', 'Chroma*', 'ShapE*', 'HiDream*', 'HunyuanDiT*', 'HunyuanDiTPAG*'])
|
|
233
|
-
```
|
|
234
|
-
|
|
235
|
-
<details>
|
|
236
|
-
<summary> Show all pipelines </summary>
|
|
237
|
-
|
|
238
|
-
- [🚀HunyuanImage-2.1](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
239
|
-
- [🚀Qwen-Image-Lightning](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
240
|
-
- [🚀Qwen-Image-Edit](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
241
|
-
- [🚀Qwen-Image](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
242
|
-
- [🚀FLUX.1-dev](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
243
|
-
- [🚀FLUX.1-Fill-dev](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
244
|
-
- [🚀FLUX.1-Kontext-dev](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
245
|
-
- [🚀CogView4](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
246
|
-
- [🚀Wan2.2-T2V](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
247
|
-
- [🚀HunyuanVideo](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
248
|
-
- [🚀HiDream-I1-Full](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
249
|
-
- [🚀HunyuanDiT](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
250
|
-
- [🚀Wan2.1-T2V](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
251
|
-
- [🚀Wan2.1-FLF2V](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
252
|
-
- [🚀SkyReelsV2](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
253
|
-
- [🚀Chroma1-HD](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
254
|
-
- [🚀CogVideoX1.5](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
255
|
-
- [🚀CogView3-Plus](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
256
|
-
- [🚀CogVideoX](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
257
|
-
- [🚀VisualCloze](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
258
|
-
- [🚀LTXVideo](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
259
|
-
- [🚀OmniGen](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
260
|
-
- [🚀Lumina2](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
261
|
-
- [🚀mochi-1-preview](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
262
|
-
- [🚀AuraFlow-v0.3](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
263
|
-
- [🚀PixArt-Alpha](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
264
|
-
- [🚀PixArt-Sigma](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
265
|
-
- [🚀NVIDIA Sana](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
266
|
-
- [🚀SD-3/3.5](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
267
|
-
- [🚀ConsisID](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
268
|
-
- [🚀Allegro](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
269
|
-
- [🚀Amused](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
270
|
-
- [🚀DiT-XL](https://github.com/vipshop/cache-dit/raw/main/examples)
|
|
271
|
-
- ...
|
|
272
|
-
|
|
273
|
-
</details>
|
|
274
|
-
|
|
275
235
|
## 🔥Benchmarks
|
|
276
236
|
|
|
277
237
|
<div id="benchmarks"></div>
|
|
278
238
|
|
|
279
|
-
|
|
239
|
+

|
|
280
240
|
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
Comparisons between different FnBn compute block configurations show that **more compute blocks result in higher precision**. For example, the F8B0_W8MC0 configuration achieves the best Clip Score (33.007) and ImageReward (1.0333). **Device**: NVIDIA L20. **F**: Fn_compute_blocks, **B**: Bn_compute_blocks, 50 steps.
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
| Config | Clip Score(↑) | ImageReward(↑) | PSNR(↑) | TFLOPs(↓) | SpeedUp(↑) |
|
|
287
|
-
| --- | --- | --- | --- | --- | --- |
|
|
288
|
-
| [**FLUX.1**-dev]: 50 steps | 32.9217 | 1.0412 | INF | 3726.87 | 1.00x |
|
|
289
|
-
| F8B0_W4MC0_R0.08 | 32.9871 | 1.0370 | 33.8317 | 2064.81 | 1.80x |
|
|
290
|
-
| F8B0_W4MC2_R0.12 | 32.9535 | 1.0185 | 32.7346 | 1935.73 | 1.93x |
|
|
291
|
-
| F8B0_W4MC3_R0.12 | 32.9234 | 1.0085 | 32.5385 | 1816.58 | 2.05x |
|
|
292
|
-
| F4B0_W4MC3_R0.12 | 32.8981 | 1.0130 | 31.8031 | 1507.83 | 2.47x |
|
|
293
|
-
| F4B0_W4MC4_R0.12 | 32.8384 | 1.0065 | 31.5292 | 1400.08 | 2.66x |
|
|
294
|
-
|
|
295
|
-
The comparison between **cache-dit: DBCache** and algorithms such as Δ-DiT, Chipmunk, FORA, DuCa, TaylorSeer and FoCa is as follows. Now, in the comparison with a speedup ratio less than **3x**, cache-dit achieved the best accuracy. Please check [📚How to Reproduce?](https://github.com/vipshop/cache-dit/raw/main/bench/) for more details.
|
|
241
|
+
The comparison between **cache-dit: DBCache** and algorithms such as Δ-DiT, Chipmunk, FORA, DuCa, TaylorSeer and FoCa is shown below. Among methods with a speedup ratio below **3x**, cache-dit achieves the best accuracy. Notably, cache-dit: DBCache also works with extremely few-step distilled models. For the complete benchmark, please refer to [📚Benchmarks](https://github.com/vipshop/cache-dit/raw/main/bench/).
|
|
296
242
|
|
|
297
243
|
| Method | TFLOPs(↓) | SpeedUp(↑) | ImageReward(↑) | Clip Score(↑) |
|
|
298
244
|
| --- | --- | --- | --- | --- |
|
|
@@ -314,6 +260,8 @@ The comparison between **cache-dit: DBCache** and algorithms such as Δ-DiT, Chi
|
|
|
314
260
|
<details>
|
|
315
261
|
<summary> Show all comparison </summary>
|
|
316
262
|
|
|
263
|
+

|
|
264
|
+
|
|
317
265
|
| Method | TFLOPs(↓) | SpeedUp(↑) | ImageReward(↑) | Clip Score(↑) |
|
|
318
266
|
| --- | --- | --- | --- | --- |
|
|
319
267
|
| [**FLUX.1**-dev]: 50 steps | 3726.87 | 1.00× | 0.9898 | 32.404 |
|
|
@@ -350,192 +298,84 @@ NOTE: Except for DBCache, other performance data are referenced from the paper [
|
|
|
350
298
|
|
|
351
299
|
</details>
|
|
352
300
|
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
Surprisingly, cache-dit: DBCache still works with extremely few-step distilled models. For example, for **Qwen-Image-Lightning w/ 4 steps** with the F16B16 configuration, the PSNR is 34.8163, the Clip Score is 35.6109, and the ImageReward is 1.2614, so it maintains relatively high precision.
|
|
356
|
-
|
|
357
|
-
| Config | PSNR(↑) | Clip Score(↑) | ImageReward(↑) | TFLOPs(↓) | SpeedUp(↑) |
|
|
358
|
-
|----------------------------|-----------|------------|--------------|----------|------------|
|
|
359
|
-
| [**Lightning**]: 4 steps | INF | 35.5797 | 1.2630 | 274.33 | 1.00x |
|
|
360
|
-
| F24B24_W2MC1_R0.8 | 36.3242 | 35.6224 | 1.2630 | 264.74 | 1.04x |
|
|
361
|
-
| F16B16_W2MC1_R0.8 | 34.8163 | 35.6109 | 1.2614 | 244.25 | 1.12x |
|
|
362
|
-
| F12B12_W2MC1_R0.8 | 33.8953 | 35.6535 | 1.2549 | 234.63 | 1.17x |
|
|
363
|
-
| F8B8_W2MC1_R0.8 | 33.1374 | 35.7284 | 1.2517 | 224.29 | 1.22x |
|
|
364
|
-
| F1B0_W2MC1_R0.8 | 31.8317 | 35.6651 | 1.2397 | 206.90 | 1.33x |
|
|
365
|
-
|
|
366
|
-
## 🎉Unified Cache APIs
|
|
301
|
+
## 🔥Quick Start
|
|
367
302
|
|
|
368
303
|
<div id="unified"></div>
|
|
369
304
|
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
Currently, for any **Diffusion** models with **Transformer Blocks** that match the specific **Input/Output patterns**, we can use the **Unified Cache APIs** from **cache-dit**, namely, the `cache_dit.enable_cache(...)` API. The **Unified Cache APIs** are currently in the experimental phase; please stay tuned for updates. The supported patterns are listed as follows:
|
|
373
|
-
|
|
374
|
-

|
|
375
|
-
|
|
376
|
-
### ♥️Cache Acceleration with One-line Code
|
|
377
|
-
|
|
378
|
-
In most cases, you only need to call **one-line** of code, that is `cache_dit.enable_cache(...)`. After this API is called, you just need to call the pipe as normal. The `pipe` param can be **any** Diffusion Pipeline. Please refer to [Qwen-Image](https://github.com/vipshop/cache-dit/raw/main/examples/pipeline/run_qwen_image.py) as an example.
|
|
379
|
-
|
|
380
|
-
```python
|
|
381
|
-
import cache_dit
|
|
382
|
-
from diffusers import DiffusionPipeline
|
|
383
|
-
|
|
384
|
-
# Can be any diffusion pipeline
|
|
385
|
-
pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image")
|
|
386
|
-
|
|
387
|
-
# One-line code with default cache options.
|
|
388
|
-
cache_dit.enable_cache(pipe)
|
|
389
|
-
|
|
390
|
-
# Just call the pipe as normal.
|
|
391
|
-
output = pipe(...)
|
|
392
|
-
|
|
393
|
-
# Disable cache and run original pipe.
|
|
394
|
-
cache_dit.disable_cache(pipe)
|
|
395
|
-
```
|
|
396
|
-
|
|
397
|
-
### 🔥Automatic Block Adapter
|
|
398
|
-
|
|
399
|
-
But in some cases, you may have a **modified** Diffusion Pipeline or Transformer that is not located in the diffusers library or not officially supported by **cache-dit** at this time. The **BlockAdapter** can help you solve this problem. Please refer to [🔥Qwen-Image w/ BlockAdapter](https://github.com/vipshop/cache-dit/raw/main/examples/adapter/run_qwen_image_adapter.py) as an example.
|
|
400
|
-
|
|
401
|
-
```python
|
|
402
|
-
from cache_dit import ForwardPattern, BlockAdapter
|
|
403
|
-
|
|
404
|
-
# Use 🔥BlockAdapter with `auto` mode.
|
|
405
|
-
cache_dit.enable_cache(
|
|
406
|
-
BlockAdapter(
|
|
407
|
-
# Any DiffusionPipeline, Qwen-Image, etc.
|
|
408
|
-
pipe=pipe, auto=True,
|
|
409
|
-
# Check `📚Forward Pattern Matching` documentation and hack the code of
|
|
410
|
-
# Qwen-Image, you will find that it has satisfied `FORWARD_PATTERN_1`.
|
|
411
|
-
forward_pattern=ForwardPattern.Pattern_1,
|
|
412
|
-
),
|
|
413
|
-
)
|
|
414
|
-
|
|
415
|
-
# Or, manually setup transformer configurations.
|
|
416
|
-
cache_dit.enable_cache(
|
|
417
|
-
BlockAdapter(
|
|
418
|
-
pipe=pipe, # Qwen-Image, etc.
|
|
419
|
-
transformer=pipe.transformer,
|
|
420
|
-
blocks=pipe.transformer.transformer_blocks,
|
|
421
|
-
forward_pattern=ForwardPattern.Pattern_1,
|
|
422
|
-
),
|
|
423
|
-
)
|
|
424
|
-
```
|
|
425
|
-
For such situations, **BlockAdapter** can help you quickly apply various cache acceleration features to your own Diffusion Pipelines and Transformers. Please check the [📚BlockAdapter.md](https://github.com/vipshop/cache-dit/raw/main/docs/BlockAdapter.md) for more details.
|
|
305
|
+
<div id="quick-start"></div>
|
|
426
306
|
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
Sometimes, a Transformer class will contain more than one transformer `blocks`. For example, **FLUX.1** (HiDream, Chroma, etc) contains transformer_blocks and single_transformer_blocks (with different forward patterns). The **BlockAdapter** can also help you solve this problem. Please refer to [📚FLUX.1](https://github.com/vipshop/cache-dit/raw/main/examples/adapter/run_flux_adapter.py) as an example.
|
|
307
|
+
In most cases, you only need to call ♥️**one-line**♥️ of code, that is `cache_dit.enable_cache(...)`. After this API is called, you just need to call the pipe as normal. The `pipe` param can be **any** Diffusion Pipeline. Please refer to [Qwen-Image](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image.py) as an example.
|
|
430
308
|
|
|
431
309
|
```python
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
pipe.transformer.transformer_blocks,
|
|
440
|
-
pipe.transformer.single_transformer_blocks,
|
|
441
|
-
],
|
|
442
|
-
forward_pattern=[
|
|
443
|
-
ForwardPattern.Pattern_1,
|
|
444
|
-
ForwardPattern.Pattern_3,
|
|
445
|
-
],
|
|
446
|
-
),
|
|
447
|
-
)
|
|
310
|
+
>>> import cache_dit
|
|
311
|
+
>>> from diffusers import DiffusionPipeline
|
|
312
|
+
>>> pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image") # Can be any diffusion pipeline
|
|
313
|
+
>>> cache_dit.enable_cache(pipe) # One-line code with default cache options.
|
|
314
|
+
>>> output = pipe(...) # Just call the pipe as normal.
|
|
315
|
+
>>> stats = cache_dit.summary(pipe) # Then, get the summary of cache acceleration stats.
|
|
316
|
+
>>> cache_dit.disable_cache(pipe) # Disable cache and run original pipe.
|
|
448
317
|
```
|
|
449
318
|
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
```python
|
|
453
|
-
from cache_dit import ForwardPattern, BlockAdapter, ParamsModifier, BasicCacheConfig
|
|
319
|
+
## 📚Forward Pattern Matching
|
|
454
320
|
|
|
455
|
-
|
|
456
|
-
BlockAdapter(
|
|
457
|
-
pipe=pipe,
|
|
458
|
-
transformer=[
|
|
459
|
-
pipe.transformer,
|
|
460
|
-
pipe.transformer_2,
|
|
461
|
-
],
|
|
462
|
-
blocks=[
|
|
463
|
-
pipe.transformer.blocks,
|
|
464
|
-
pipe.transformer_2.blocks,
|
|
465
|
-
],
|
|
466
|
-
forward_pattern=[
|
|
467
|
-
ForwardPattern.Pattern_2,
|
|
468
|
-
ForwardPattern.Pattern_2,
|
|
469
|
-
],
|
|
470
|
-
# Setup different cache params for each 'blocks'. You can
|
|
471
|
-
# pass any specific cache params to ParamsModifier, the old
|
|
472
|
-
# value will be overwritten by the new one.
|
|
473
|
-
params_modifiers=[
|
|
474
|
-
ParamsModifier(
|
|
475
|
-
cache_config=BasicCacheConfig(
|
|
476
|
-
max_warmup_steps=4,
|
|
477
|
-
max_cached_steps=8,
|
|
478
|
-
),
|
|
479
|
-
),
|
|
480
|
-
ParamsModifier(
|
|
481
|
-
cache_config=BasicCacheConfig(
|
|
482
|
-
max_warmup_steps=2,
|
|
483
|
-
max_cached_steps=20,
|
|
484
|
-
),
|
|
485
|
-
),
|
|
486
|
-
],
|
|
487
|
-
has_separate_cfg=True,
|
|
488
|
-
),
|
|
489
|
-
)
|
|
490
|
-
```
|
|
491
|
-
### 📚Implement Patch Functor
|
|
492
|
-
|
|
493
|
-
For any PATTERN not in {0...5}, we introduced the simple abstract concept of **Patch Functor**. Users can implement a subclass of Patch Functor to convert an unknown Pattern into a known PATTERN, and for some models, users may also need to fuse the operations within the blocks for loop into block forward.
|
|
321
|
+
<div id="supported"></div>
|
|
494
322
|
|
|
495
|
-
|
|
323
|
+
<div id="forward-pattern-matching"></div>
|
|
496
324
|
|
|
497
|
-
|
|
325
|
+
cache-dit works by matching specific input/output patterns as shown below.
|
|
498
326
|
|
|
499
|
-
|
|
500
|
-
@BlockAdapterRegistry.register("HiDream")
|
|
501
|
-
def hidream_adapter(pipe, **kwargs) -> BlockAdapter:
|
|
502
|
-
from diffusers import HiDreamImageTransformer2DModel
|
|
503
|
-
from cache_dit.cache_factory.patch_functors import HiDreamPatchFunctor
|
|
504
|
-
|
|
505
|
-
assert isinstance(pipe.transformer, HiDreamImageTransformer2DModel)
|
|
506
|
-
return BlockAdapter(
|
|
507
|
-
pipe=pipe,
|
|
508
|
-
transformer=pipe.transformer,
|
|
509
|
-
blocks=[
|
|
510
|
-
pipe.transformer.double_stream_blocks,
|
|
511
|
-
pipe.transformer.single_stream_blocks,
|
|
512
|
-
],
|
|
513
|
-
forward_pattern=[
|
|
514
|
-
ForwardPattern.Pattern_0,
|
|
515
|
-
ForwardPattern.Pattern_3,
|
|
516
|
-
],
|
|
517
|
-
# NOTE: Setup your custom patch functor here.
|
|
518
|
-
patch_functor=HiDreamPatchFunctor(),
|
|
519
|
-
**kwargs,
|
|
520
|
-
)
|
|
521
|
-
```
|
|
327
|
+

|
|
522
328
|
|
|
523
|
-
|
|
329
|
+
Please check [🎉Examples](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline) for more details. Only some of the tested models are listed here.
|
|
524
330
|
|
|
525
|
-
After finishing each inference of `pipe(...)`, you can call the `cache_dit.summary()` API on pipe to get the details of the **Cache Acceleration Stats** for the current inference.
|
|
526
331
|
```python
|
|
527
|
-
|
|
332
|
+
>>> import cache_dit
|
|
333
|
+
>>> cache_dit.supported_pipelines()
|
|
334
|
+
(30, ['Flux*', 'Mochi*', 'CogVideoX*', 'Wan*', 'HunyuanVideo*', 'QwenImage*', 'LTX*', 'Allegro*',
|
|
335
|
+
'CogView3Plus*', 'CogView4*', 'Cosmos*', 'EasyAnimate*', 'SkyReelsV2*', 'StableDiffusion3*',
|
|
336
|
+
'ConsisID*', 'DiT*', 'Amused*', 'Bria*', 'Lumina*', 'OmniGen*', 'PixArt*', 'Sana*', 'StableAudio*',
|
|
337
|
+
'VisualCloze*', 'AuraFlow*', 'Chroma*', 'ShapE*', 'HiDream*', 'HunyuanDiT*', 'HunyuanDiTPAG*'])
|
|
528
338
|
```
|
|
529
339
|
|
|
530
|
-
|
|
340
|
+
<details>
|
|
341
|
+
<summary> Show all pipelines </summary>
|
|
531
342
|
|
|
532
|
-
|
|
533
|
-
|
|
343
|
+
- [🚀HunyuanImage-2.1](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
344
|
+
- [🚀Qwen-Image-Lightning](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
345
|
+
- [🚀Qwen-Image-Edit](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
346
|
+
- [🚀Qwen-Image](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
347
|
+
- [🚀FLUX.1-dev](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
348
|
+
- [🚀FLUX.1-Fill-dev](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
349
|
+
- [🚀FLUX.1-Kontext-dev](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
350
|
+
- [🚀CogView4](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
351
|
+
- [🚀Wan2.2-T2V](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
352
|
+
- [🚀HunyuanVideo](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
353
|
+
- [🚀HiDream-I1-Full](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
354
|
+
- [🚀HunyuanDiT](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
355
|
+
- [🚀Wan2.1-T2V](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
356
|
+
- [🚀Wan2.1-FLF2V](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
357
|
+
- [🚀SkyReelsV2](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
358
|
+
- [🚀Chroma1-HD](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
359
|
+
- [🚀CogVideoX1.5](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
360
|
+
- [🚀CogView3-Plus](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
361
|
+
- [🚀CogVideoX](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
362
|
+
- [🚀VisualCloze](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
363
|
+
- [🚀LTXVideo](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
364
|
+
- [🚀OmniGen](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
365
|
+
- [🚀Lumina2](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
366
|
+
- [🚀mochi-1-preview](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
367
|
+
- [🚀AuraFlow-v0.3](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
368
|
+
- [🚀PixArt-Alpha](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
369
|
+
- [🚀PixArt-Sigma](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
370
|
+
- [🚀NVIDIA Sana](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
371
|
+
- [🚀SD-3/3.5](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
372
|
+
- [🚀ConsisID](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
373
|
+
- [🚀Allegro](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
374
|
+
- [🚀Amused](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
375
|
+
- [🚀DiT-XL](https://github.com/vipshop/cache-dit/blob/main/examples)
|
|
376
|
+
- ...
|
|
534
377
|
|
|
535
|
-
|
|
536
|
-
|-------------|-----------|-----------|-----------|-----------|-----------|-----------|
|
|
537
|
-
| 23 | 0.045 | 0.084 | 0.114 | 0.147 | 0.241 | 0.297 |
|
|
538
|
-
```
|
|
378
|
+
</details>
|
|
539
379
|
|
|
540
380
|
## ⚡️DBCache: Dual Block Cache
|
|
541
381
|
|
|
@@ -543,20 +383,9 @@ You can set `details` param as `True` to show more details of cache stats. (mark
|
|
|
543
383
|
|
|
544
384
|

|
|
545
385
|
|
|
546
|
-
**DBCache**: **Dual Block Caching** for Diffusion Transformers. Different configurations of compute blocks (**F8B12**, etc.) can be customized in DBCache, enabling a balanced trade-off between performance and precision. Moreover, it can be entirely **training**-**free**. Please
|
|
547
|
-
|
|
548
|
-
- **Fn**: Specifies that DBCache uses the **first n** Transformer blocks to fit the information at time step t, enabling the calculation of a more stable L1 diff and delivering more accurate information to subsequent blocks.
|
|
549
|
-
- **Bn**: Further fuses approximate information in the **last n** Transformer blocks to enhance prediction accuracy. These blocks act as an auto-scaler for approximate hidden states that use residual cache.
|
|
386
|
+
**DBCache**: **Dual Block Caching** for Diffusion Transformers. Different configurations of compute blocks (**F8B12**, etc.) can be customized in DBCache, enabling a balanced trade-off between performance and precision. Moreover, it can be entirely **training**-**free**. Please check the [DBCache](https://github.com/vipshop/cache-dit/blob/main/docs/DBCache.md) and [User Guide](https://github.com/vipshop/cache-dit/blob/main/docs/User_Guide.md#dbcache) docs for details.
|
|
550
387
|
|
|
551
388
|
```python
|
|
552
|
-
import cache_dit
|
|
553
|
-
from diffusers import FluxPipeline
|
|
554
|
-
|
|
555
|
-
pipe_or_adapter = FluxPipeline.from_pretrained(
|
|
556
|
-
"black-forest-labs/FLUX.1-dev",
|
|
557
|
-
torch_dtype=torch.bfloat16,
|
|
558
|
-
).to("cuda")
|
|
559
|
-
|
|
560
389
|
# Default options, F8B0, 8 warmup steps, and unlimited cached
|
|
561
390
|
# steps for good balance between performance and precision
|
|
562
391
|
cache_dit.enable_cache(pipe_or_adapter)
|
|
@@ -576,28 +405,13 @@ cache_dit.enable_cache(
|
|
|
576
405
|
)
|
|
577
406
|
```
|
|
578
407
|
|
|
579
|
-
<div align="center">
|
|
580
|
-
<p align="center">
|
|
581
|
-
DBCache, <b> L20x1 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
|
|
582
|
-
</p>
|
|
583
|
-
</div>
|
|
584
|
-
|
|
585
|
-
|Baseline(L20x1)|F1B0 (0.08)|F1B0 (0.20)|F8B8 (0.15)|F12B12 (0.20)|F16B16 (0.20)|
|
|
586
|
-
|:---:|:---:|:---:|:---:|:---:|:---:|
|
|
587
|
-
|24.85s|15.59s|8.58s|15.41s|15.11s|17.74s|
|
|
588
|
-
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.08_S11.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.2_S19.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F8B8S1_R0.15_S15.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F12B12S4_R0.2_S16.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F16B16S4_R0.2_S13.png width=105px>|
|
|
589
|
-
|
|
590
408
|
## 🔥TaylorSeer Calibrator
|
|
591
409
|
|
|
592
410
|
<div id="taylorseer"></div>
|
|
593
411
|
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
$$
|
|
597
|
-
\mathcal{F}\_{\text {pred }, m}\left(x_{t-k}^l\right)=\mathcal{F}\left(x_t^l\right)+\sum_{i=1}^m \frac{\Delta^i \mathcal{F}\left(x_t^l\right)}{i!\cdot N^i}(-k)^i
|
|
598
|
-
$$
|
|
412
|
+
The [TaylorSeers](https://huggingface.co/papers/2503.06923) algorithm further improves the precision of DBCache in cases where the cached steps are large (Hybrid TaylorSeer + DBCache). At timesteps with significant intervals, the feature similarity in diffusion models decreases substantially, significantly harming the generation quality.
|
|
599
413
|
|
|
600
|
-
|
|
414
|
+
TaylorSeer employs a differential method to approximate the higher-order derivatives of features and predict features in future timesteps with Taylor series expansion. The TaylorSeer implemented in CacheDiT supports both hidden states and residual cache types. F_pred can be a residual cache or a hidden-state cache.
|
|
601
415
|
|
|
602
416
|
```python
|
|
603
417
|
from cache_dit import BasicCacheConfig, TaylorSeerCalibratorConfig
|
|
@@ -620,25 +434,14 @@ cache_dit.enable_cache(
|
|
|
620
434
|
)
|
|
621
435
|
```
|
|
622
436
|
|
|
623
|
-
> [!
|
|
624
|
-
>
|
|
625
|
-
|
|
626
|
-
<div align="center">
|
|
627
|
-
<p align="center">
|
|
628
|
-
<b>DBCache F1B0 + TaylorSeer</b>, L20x1, Steps: 28, <br>"A cat holding a sign that says hello world with complex background"
|
|
629
|
-
</p>
|
|
630
|
-
</div>
|
|
631
|
-
|
|
632
|
-
|Baseline(L20x1)|F1B0 (0.12)|+TaylorSeer|F1B0 (0.15)|+TaylorSeer|+compile|
|
|
633
|
-
|:---:|:---:|:---:|:---:|:---:|:---:|
|
|
634
|
-
|24.85s|12.85s|12.86s|10.27s|10.28s|8.48s|
|
|
635
|
-
|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.12_S14_T12.85s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.12_S14_T12.86s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.15_S17_T10.27s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T10.28s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T8.48s.png width=105px>|
|
|
437
|
+
> [!TIP]
|
|
438
|
+
> The `Bn_compute_blocks` parameter of DBCache can be set to `0` if you use TaylorSeer as the calibrator for approximate hidden states. DBCache's `Bn_compute_blocks` also acts as a calibrator, so you can choose either `Bn_compute_blocks` > 0 or TaylorSeer. We recommend using the configuration scheme of TaylorSeer + DBCache FnB0.
|
|
636
439
|
|
|
637
|
-
##
|
|
440
|
+
## 📚Hybrid Cache CFG
|
|
638
441
|
|
|
639
442
|
<div id="cfg"></div>
|
|
640
443
|
|
|
641
|
-
cache-dit supports caching for
|
|
444
|
+
cache-dit supports caching for CFG (classifier-free guidance). For models that fuse CFG and non-CFG into a single forward step, or models that do not include CFG (classifier-free guidance) in the forward step, please set the `enable_separate_cfg` parameter to `False (default, None)`. Otherwise, set it to `True`.
|
|
642
445
|
|
|
643
446
|
```python
|
|
644
447
|
from cache_dit import BasicCacheConfig
|
|
@@ -647,75 +450,35 @@ cache_dit.enable_cache(
|
|
|
647
450
|
pipe_or_adapter,
|
|
648
451
|
cache_config=BasicCacheConfig(
|
|
649
452
|
...,
|
|
650
|
-
#
|
|
651
|
-
#
|
|
652
|
-
|
|
653
|
-
# for Wan 2.1/Qwen-Image and set it as False for FLUX.1, HunyuanVideo,
|
|
654
|
-
# CogVideoX, Mochi, LTXVideo, Allegro, CogView3Plus, EasyAnimate, SD3, etc.
|
|
655
|
-
enable_separate_cfg=True, # Wan 2.1, Qwen-Image, CogView4, Cosmos, SkyReelsV2, etc.
|
|
656
|
-
# Compute cfg forward first or not, default False, namely,
|
|
657
|
-
# 0, 2, 4, ..., -> non-CFG step; 1, 3, 5, ... -> CFG step.
|
|
658
|
-
cfg_compute_first=False,
|
|
659
|
-
# Compute separate diff values for CFG and non-CFG step,
|
|
660
|
-
# default True. If False, we will use the computed diff from
|
|
661
|
-
# current non-CFG transformer step for current CFG step.
|
|
662
|
-
cfg_diff_compute_separate=True,
|
|
453
|
+
# For example, set it as True for Wan 2.1/Qwen-Image
|
|
454
|
+
# and set it as False for FLUX.1, HunyuanVideo, CogVideoX, etc.
|
|
455
|
+
enable_separate_cfg=True,
|
|
663
456
|
),
|
|
664
457
|
)
|
|
665
458
|
```
|
|
666
459
|
|
|
667
|
-
##
|
|
668
|
-
|
|
669
|
-
<div id="
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
<div id="metrics"></div>
|
|
691
|
-
|
|
692
|
-
You can utilize the APIs provided by cache-dit to quickly evaluate the accuracy losses caused by different cache configurations. For example:
|
|
693
|
-
|
|
694
|
-
```python
|
|
695
|
-
from cache_dit.metrics import compute_psnr
|
|
696
|
-
from cache_dit.metrics import compute_ssim
|
|
697
|
-
from cache_dit.metrics import compute_fid
|
|
698
|
-
from cache_dit.metrics import compute_lpips
|
|
699
|
-
from cache_dit.metrics import compute_clip_score
|
|
700
|
-
from cache_dit.metrics import compute_image_reward
|
|
701
|
-
|
|
702
|
-
psnr, n = compute_psnr("true.png", "test.png") # Num: n
|
|
703
|
-
psnr, n = compute_psnr("true_dir", "test_dir")
|
|
704
|
-
ssim, n = compute_ssim("true_dir", "test_dir")
|
|
705
|
-
fid, n = compute_fid("true_dir", "test_dir")
|
|
706
|
-
lpips, n = compute_lpips("true_dir", "test_dir")
|
|
707
|
-
clip, n = compute_clip_score("DrawBench200.txt", "test_dir")
|
|
708
|
-
reward, n = compute_image_reward("DrawBench200.txt", "test_dir")
|
|
709
|
-
```
|
|
710
|
-
|
|
711
|
-
Or, you can use the `cache-dit-metrics-cli` tool. For example:
|
|
712
|
-
|
|
713
|
-
```bash
|
|
714
|
-
cache-dit-metrics-cli -h # show usage
|
|
715
|
-
# all: PSNR, FID, SSIM, MSE, ..., etc.
|
|
716
|
-
cache-dit-metrics-cli all -i1 true.png -i2 test.png # image
|
|
717
|
-
cache-dit-metrics-cli all -i1 true_dir -i2 test_dir # image dir
|
|
718
|
-
```
|
|
460
|
+
## 🎉User Guide
|
|
461
|
+
|
|
462
|
+
<div id="user-guide"></div>
|
|
463
|
+
|
|
464
|
+
For more advanced features such as **Unified Cache APIs**, **Forward Pattern Matching**, **Automatic Block Adapter**, **Hybrid Forward Pattern**, **DBCache**, **TaylorSeer Calibrator**, and **Hybrid Cache CFG**, please refer to the [🎉User_Guide.md](./docs/User_Guide.md) for details.
|
|
465
|
+
|
|
466
|
+
- [⚙️Installation](./docs/User_Guide.md#️installation)
|
|
467
|
+
- [🔥Benchmarks](./docs/User_Guide.md#benchmarks)
|
|
468
|
+
- [🔥Supported Pipelines](./docs/User_Guide.md#supported-pipelines)
|
|
469
|
+
- [🎉Unified Cache APIs](./docs/User_Guide.md#unified-cache-apis)
|
|
470
|
+
- [📚Forward Pattern Matching](./docs/User_Guide.md#forward-pattern-matching)
|
|
471
|
+
- [📚Cache with One-line Code](./docs/User_Guide.md#%EF%B8%8Fcache-acceleration-with-one-line-code)
|
|
472
|
+
- [🔥Automatic Block Adapter](./docs/User_Guide.md#automatic-block-adapter)
|
|
473
|
+
- [📚Hybrid Forward Pattern](./docs/User_Guide.md#hybird-forward-pattern)
|
|
474
|
+
- [📚Implement Patch Functor](./docs/User_Guide.md#implement-patch-functor)
|
|
475
|
+
- [🤖Cache Acceleration Stats](./docs/User_Guide.md#cache-acceleration-stats-summary)
|
|
476
|
+
- [⚡️Dual Block Cache](./docs/User_Guide.md#️dbcache-dual-block-cache)
|
|
477
|
+
- [🔥TaylorSeer Calibrator](./docs/User_Guide.md#taylorseer-calibrator)
|
|
478
|
+
- [⚡️Hybrid Cache CFG](./docs/User_Guide.md#️hybrid-cache-cfg)
|
|
479
|
+
- [⚙️Torch Compile](./docs/User_Guide.md#️torch-compile)
|
|
480
|
+
- [🛠Metrics CLI](./docs/User_Guide.md#metrics-cli)
|
|
481
|
+
- [📚API Documents](./docs/User_Guide.md#api-documentation)
|
|
719
482
|
|
|
720
483
|
## 👋Contribute
|
|
721
484
|
<div id="contribute"></div>
|
|
@@ -738,13 +501,17 @@ How to contribute? Star ⭐️ this repo to support us or check [CONTRIBUTE.md](
|
|
|
738
501
|
|
|
739
502
|
The **cache-dit** codebase is adapted from FBCache. Over time its codebase diverged a lot, and **cache-dit** API is no longer compatible with FBCache.
|
|
740
503
|
|
|
504
|
+
## ©️Special Acknowledgements
|
|
505
|
+
|
|
506
|
+
Special thanks to vipshop's Computer Vision AI Team for supporting documentation, testing, and production-level deployment of this project.
|
|
507
|
+
|
|
741
508
|
## ©️Citations
|
|
742
509
|
|
|
743
510
|
<div id="citations"></div>
|
|
744
511
|
|
|
745
512
|
```BibTeX
|
|
746
513
|
@misc{cache-dit@2025,
|
|
747
|
-
title={cache-dit: A Unified, Flexible and Training-free Cache Acceleration Framework for
|
|
514
|
+
title={cache-dit: A Unified, Flexible and Training-free Cache Acceleration Framework for Diffusers.},
|
|
748
515
|
url={https://github.com/vipshop/cache-dit.git},
|
|
749
516
|
note={Open-source software available at https://github.com/vipshop/cache-dit.git},
|
|
750
517
|
author={vipshop.com},
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
cache_dit/__init__.py,sha256=sHRg0swXZZiw6lvSQ53fcVtN9JRayx0az2lXAz5OOGI,1510
|
|
2
|
-
cache_dit/_version.py,sha256=
|
|
2
|
+
cache_dit/_version.py,sha256=vLA4ITz09S-S435nq6yTF6l3qiSz6w4euS1rOxXgd1M,704
|
|
3
3
|
cache_dit/logger.py,sha256=0zsu42hN-3-rgGC_C29ms1IvVpV4_b4_SwJCKSenxBE,4304
|
|
4
4
|
cache_dit/utils.py,sha256=AyYRwi5XBxYBH4GaXxOxv9-X24Te_IYOYwh54t_1d3A,10674
|
|
5
5
|
cache_dit/cache_factory/.gitignore,sha256=5Cb-qT9wsTUoMJ7vACDF7ZcLpAXhi5v-xdcWSRit988,23
|
|
@@ -10,15 +10,16 @@ cache_dit/cache_factory/forward_pattern.py,sha256=FumlCuZ-TSmSYH0hGBHctSJ-oGLCft
|
|
|
10
10
|
cache_dit/cache_factory/params_modifier.py,sha256=zYJJsInTYCaYHBZ7mZJOP-PZnkSg3iN1WPewNOayXos,3628
|
|
11
11
|
cache_dit/cache_factory/utils.py,sha256=XkVM9AXcB9zYq8-S8QKAsGz80r3tA6U3lBNGDGeHOe4,1871
|
|
12
12
|
cache_dit/cache_factory/block_adapters/__init__.py,sha256=33geXMz56TxFWMp0c-H4__MY5SGRzKMKj3TXnUYOMlc,17512
|
|
13
|
-
cache_dit/cache_factory/block_adapters/block_adapters.py,sha256=
|
|
13
|
+
cache_dit/cache_factory/block_adapters/block_adapters.py,sha256=2TVK_KqiYXC7AKZ2s07fzdOzUoeUBc9P1SzQtLVzhf4,22249
|
|
14
14
|
cache_dit/cache_factory/block_adapters/block_registers.py,sha256=2L7QeM4ygnaKQpC9PoJod0QRYyxidUKU2AYpysDCUwE,2572
|
|
15
15
|
cache_dit/cache_factory/cache_adapters/__init__.py,sha256=py71WGD3JztQ1uk6qdLVbzYcQ1rvqFidNNaQYo7tqTo,79
|
|
16
|
-
cache_dit/cache_factory/cache_adapters/cache_adapter.py,sha256=
|
|
17
|
-
cache_dit/cache_factory/cache_blocks/__init__.py,sha256=
|
|
16
|
+
cache_dit/cache_factory/cache_adapters/cache_adapter.py,sha256=7heGoy8LHMP54ISMwfJ-i_ALngkbnUdeQDBRrE-MTgs,21303
|
|
17
|
+
cache_dit/cache_factory/cache_blocks/__init__.py,sha256=mivvm8YOfqT7YHs8y_MzGOGztPw8LxAqKGXuSRXxCv0,3032
|
|
18
|
+
cache_dit/cache_factory/cache_blocks/offload_utils.py,sha256=wusgcqaCrwEjvv7Guy-6VXhNOgPPUrBV2sSVuRmGuvo,3513
|
|
18
19
|
cache_dit/cache_factory/cache_blocks/pattern_0_1_2.py,sha256=ElMps6_7uI74tSF9GDR_dEI0bZEhdzcepM29xFWnYo8,428
|
|
19
20
|
cache_dit/cache_factory/cache_blocks/pattern_3_4_5.py,sha256=Bv56qETXhsREvCrNvnZpSqDIIHsi6Ze3FJW4Yk2x3uI,8597
|
|
20
|
-
cache_dit/cache_factory/cache_blocks/pattern_base.py,sha256=
|
|
21
|
-
cache_dit/cache_factory/cache_blocks/
|
|
21
|
+
cache_dit/cache_factory/cache_blocks/pattern_base.py,sha256=wdh0bbcpKO08AW2FTsj9X_tTbFCLkDmBjrstMxTf7MQ,14668
|
|
22
|
+
cache_dit/cache_factory/cache_blocks/pattern_utils.py,sha256=dGOC1tMMOvcbvEgx44eTESKn_jsv-0RZ3tRHPa3wmQ4,1315
|
|
22
23
|
cache_dit/cache_factory/cache_contexts/__init__.py,sha256=T6Vak3x7Rs0Oy15Tou49p-rPQRA2jiuYtJBsbv1lBBU,388
|
|
23
24
|
cache_dit/cache_factory/cache_contexts/cache_context.py,sha256=3EhaMCz3VUQ_NF81VgYwWoSEGIvhScPxPYhjL1OcgxE,15240
|
|
24
25
|
cache_dit/cache_factory/cache_contexts/cache_manager.py,sha256=hSKAeP1CxmO3RFUxjFjAK1xdvVvTmeayh5jEHMaQXNE,30225
|
|
@@ -48,9 +49,9 @@ cache_dit/metrics/metrics.py,sha256=7UV-H2NRbhfr6dvrXEzU97Zy-BSQ5zEfm9CKtaK4ldg,
|
|
|
48
49
|
cache_dit/quantize/__init__.py,sha256=kWYoMAyZgBXu9BJlZjTQ0dRffW9GqeeY9_iTkXrb70A,59
|
|
49
50
|
cache_dit/quantize/quantize_ao.py,sha256=Fx1KW4l3gdEkdrcAYtPoDW7WKBJWrs3glOHiEwW_TgE,6160
|
|
50
51
|
cache_dit/quantize/quantize_interface.py,sha256=2s_R7xPSKuJeFpEGeLwRxnq_CqJcBG3a3lzyW5wh-UM,1241
|
|
51
|
-
cache_dit-0.
|
|
52
|
-
cache_dit-0.
|
|
53
|
-
cache_dit-0.
|
|
54
|
-
cache_dit-0.
|
|
55
|
-
cache_dit-0.
|
|
56
|
-
cache_dit-0.
|
|
52
|
+
cache_dit-1.0.0.dist-info/licenses/LICENSE,sha256=Dqb07Ik2dV41s9nIdMUbiRWEfDqo7-dQeRiY7kPO8PE,3769
|
|
53
|
+
cache_dit-1.0.0.dist-info/METADATA,sha256=HbV42qlhu8PFIO6FD_PuIo1dO-7K-yBiPCc5fikKIsg,35959
|
|
54
|
+
cache_dit-1.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
55
|
+
cache_dit-1.0.0.dist-info/entry_points.txt,sha256=FX2gysXaZx6NeK1iCLMcIdP8Q4_qikkIHtEmi3oWn8o,65
|
|
56
|
+
cache_dit-1.0.0.dist-info/top_level.txt,sha256=ZJDydonLEhujzz0FOkVbO-BqfzO9d_VqRHmZU-3MOZo,10
|
|
57
|
+
cache_dit-1.0.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|