cache-dit 0.2.21__py3-none-any.whl → 0.2.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cache_dit/__init__.py +1 -1
- cache_dit/_version.py +2 -2
- cache_dit/cache_factory/__init__.py +4 -61
- cache_dit/cache_factory/cache_adapters.py +4 -15
- cache_dit/cache_factory/cache_context.py +40 -40
- cache_dit/cache_factory/cache_interface.py +149 -0
- cache_dit/cache_factory/cache_types.py +21 -52
- cache_dit/cache_factory/taylorseer.py +0 -2
- cache_dit/cache_factory/utils.py +4 -0
- cache_dit/compile/utils.py +6 -2
- cache_dit/utils.py +33 -4
- {cache_dit-0.2.21.dist-info → cache_dit-0.2.22.dist-info}/METADATA +8 -13
- {cache_dit-0.2.21.dist-info → cache_dit-0.2.22.dist-info}/RECORD +17 -16
- {cache_dit-0.2.21.dist-info → cache_dit-0.2.22.dist-info}/WHEEL +0 -0
- {cache_dit-0.2.21.dist-info → cache_dit-0.2.22.dist-info}/entry_points.txt +0 -0
- {cache_dit-0.2.21.dist-info → cache_dit-0.2.22.dist-info}/licenses/LICENSE +0 -0
- {cache_dit-0.2.21.dist-info → cache_dit-0.2.22.dist-info}/top_level.txt +0 -0
cache_dit/__init__.py
CHANGED
@@ -7,13 +7,13 @@ except ImportError:
 from cache_dit.cache_factory import load_options
 from cache_dit.cache_factory import enable_cache
 from cache_dit.cache_factory import cache_type
-from cache_dit.cache_factory import default_options
 from cache_dit.cache_factory import block_range
 from cache_dit.cache_factory import CacheType
 from cache_dit.cache_factory import ForwardPattern
 from cache_dit.cache_factory import BlockAdapterParams
 from cache_dit.compile import set_compile_configs
 from cache_dit.utils import summary
+from cache_dit.utils import strify
 from cache_dit.logger import init_logger

 NONE = CacheType.NONE
cache_dit/_version.py
CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID

-__version__ = version = '0.2.21'
-__version_tuple__ = version_tuple = (0, 2, 21)
+__version__ = version = '0.2.22'
+__version_tuple__ = version_tuple = (0, 2, 22)

 __commit_id__ = commit_id = None
cache_dit/cache_factory/__init__.py
CHANGED

@@ -1,65 +1,8 @@
-from typing import Dict, List
-from diffusers import DiffusionPipeline
 from cache_dit.cache_factory.forward_pattern import ForwardPattern
 from cache_dit.cache_factory.cache_types import CacheType
+from cache_dit.cache_factory.cache_types import cache_type
+from cache_dit.cache_factory.cache_types import block_range
 from cache_dit.cache_factory.cache_adapters import BlockAdapterParams
 from cache_dit.cache_factory.cache_adapters import UnifiedCacheAdapter
-from cache_dit.cache_factory.…
-
-from cache_dit.logger import init_logger
-
-logger = init_logger(__name__)
-
-
-def load_options(path: str):
-    return load_cache_options_from_yaml(path)
-
-
-def cache_type(
-    type_hint: "CacheType | str",
-) -> CacheType:
-    return CacheType.type(cache_type=type_hint)
-
-
-def default_options(
-    cache_type: CacheType = CacheType.DBCache,
-) -> Dict:
-    return CacheType.default_options(cache_type)
-
-
-def block_range(
-    start: int,
-    end: int,
-    step: int = 1,
-) -> List[int]:
-    return CacheType.block_range(
-        start,
-        end,
-        step,
-    )
-
-
-def enable_cache(
-    pipe_or_adapter: DiffusionPipeline | BlockAdapterParams,
-    forward_pattern: ForwardPattern = ForwardPattern.Pattern_0,
-    **cache_options_kwargs,
-) -> DiffusionPipeline:
-    if isinstance(pipe_or_adapter, BlockAdapterParams):
-        return UnifiedCacheAdapter.apply(
-            pipe=None,
-            adapter_params=pipe_or_adapter,
-            forward_pattern=forward_pattern,
-            **cache_options_kwargs,
-        )
-    elif isinstance(pipe_or_adapter, DiffusionPipeline):
-        return UnifiedCacheAdapter.apply(
-            pipe=pipe_or_adapter,
-            adapter_params=None,
-            forward_pattern=forward_pattern,
-            **cache_options_kwargs,
-        )
-    else:
-        raise ValueError(
-            "Please pass DiffusionPipeline or BlockAdapterParams"
-            "(BlockAdapter) for the 1 position param: pipe_or_adapter"
-        )
+from cache_dit.cache_factory.cache_interface import enable_cache
+from cache_dit.cache_factory.utils import load_options
cache_dit/cache_factory/cache_adapters.py
CHANGED

@@ -350,7 +350,7 @@ class UnifiedCacheAdapter:
         return adapter_params.pipe

     @classmethod
-    def …
+    def has_separate_cfg(
         cls,
         pipe_or_transformer: DiffusionPipeline | Any,
     ) -> bool:

@@ -364,20 +364,9 @@ class UnifiedCacheAdapter:
     @classmethod
     def check_context_kwargs(cls, pipe, **cache_context_kwargs):
         # Check cache_context_kwargs
-        if not cache_context_kwargs:
-            …
-            …
-            cache_context_kwargs["do_separate_classifier_free_guidance"] = (
-                True
-            )
-            logger.warning(
-                "cache_context_kwargs is empty, use default "
-                f"cache options: {cache_context_kwargs}"
-            )
-        else:
-            # Allow empty cache_type, we only support DBCache now.
-            if cache_context_kwargs.get("cache_type", None):
-                cache_context_kwargs["cache_type"] = CacheType.DBCache
+        if not cache_context_kwargs["do_separate_cfg"]:
+            # Check cfg for some specific case if users don't set it as True
+            cache_context_kwargs["do_separate_cfg"] = cls.has_separate_cfg(pipe)

         if cache_type := cache_context_kwargs.pop("cache_type", None):
             assert (
cache_dit/cache_factory/cache_context.py
CHANGED

@@ -69,11 +69,11 @@ class DBCacheContext:
     taylorseer: Optional[TaylorSeer] = None
     encoder_tarlorseer: Optional[TaylorSeer] = None

-    # Support …
+    # Support do_separate_cfg, such as Wan 2.1,
     # Qwen-Image. For model that fused CFG and non-CFG into single
-    # forward step, should set …
-    # …
-    …
+    # forward step, should set do_separate_cfg as False.
+    # For example: CogVideoX, HunyuanVideo, Mochi.
+    do_separate_cfg: bool = False
     # Compute cfg forward first or not, default False, namely,
     # 0, 2, 4, ..., -> non-CFG step; 1, 3, 5, ... -> CFG step.
     cfg_compute_first: bool = False

@@ -97,10 +97,10 @@ class DBCacheContext:
     @torch.compiler.disable
     def __post_init__(self):
         # Some checks for settings
-        if self.…
+        if self.do_separate_cfg:
             assert self.enable_alter_cache is False, (
                 "enable_alter_cache must set as False if "
-                "…
+                "do_separate_cfg is enabled."
             )
         if self.cfg_diff_compute_separate:
             assert self.cfg_compute_first is False, (

@@ -123,12 +123,12 @@ class DBCacheContext:

         if self.enable_taylorseer:
             self.taylorseer = TaylorSeer(**self.taylorseer_kwargs)
-            if self.…
+            if self.do_separate_cfg:
                 self.cfg_taylorseer = TaylorSeer(**self.taylorseer_kwargs)

         if self.enable_encoder_taylorseer:
             self.encoder_tarlorseer = TaylorSeer(**self.taylorseer_kwargs)
-            if self.…
+            if self.do_separate_cfg:
                 self.cfg_encoder_taylorseer = TaylorSeer(
                     **self.taylorseer_kwargs
                 )

@@ -175,16 +175,16 @@ class DBCacheContext:
         # incr step: prev 0 -> 1; prev 1 -> 2
         # current step: incr step - 1
         self.transformer_executed_steps += 1
-        if not self.…
+        if not self.do_separate_cfg:
             self.executed_steps += 1
         else:
             # 0,1 -> 0 + 1, 2,3 -> 1 + 1, ...
             if not self.cfg_compute_first:
-                if not self.…
+                if not self.is_separate_cfg_step():
                     # transformer step: 0,2,4,...
                     self.executed_steps += 1
             else:
-                if self.…
+                if self.is_separate_cfg_step():
                     # transformer step: 0,2,4,...
                     self.executed_steps += 1

@@ -217,9 +217,9 @@ class DBCacheContext:

         # mark_step_begin of TaylorSeer must be called after the cache is reset.
         if self.enable_taylorseer or self.enable_encoder_taylorseer:
-            if self.…
+            if self.do_separate_cfg:
                 # Assume non-CFG steps: 0, 2, 4, 6, ...
-                if not self.…
+                if not self.is_separate_cfg_step():
                     taylorseer, encoder_taylorseer = self.get_taylorseers()
                     if taylorseer is not None:
                         taylorseer.mark_step_begin()

@@ -251,7 +251,7 @@ class DBCacheContext:
         # step: executed_steps - 1, not transformer_steps - 1
         step = str(self.get_current_step())
         # Only add the diff if it is not already recorded for this step
-        if not self.…
+        if not self.is_separate_cfg_step():
             if step not in self.residual_diffs:
                 self.residual_diffs[step] = diff
         else:

@@ -268,7 +268,7 @@ class DBCacheContext:

     @torch.compiler.disable
     def add_cached_step(self):
-        if not self.…
+        if not self.is_separate_cfg_step():
             self.cached_steps.append(self.get_current_step())
         else:
             self.cfg_cached_steps.append(self.get_current_step())

@@ -290,8 +290,8 @@ class DBCacheContext:
         return self.transformer_executed_steps - 1

     @torch.compiler.disable
-    def …
-        if not self.…
+    def is_separate_cfg_step(self):
+        if not self.do_separate_cfg:
             return False
         if self.cfg_compute_first:
             # CFG steps: 0, 2, 4, 6, ...

@@ -589,17 +589,17 @@ def Bn_compute_blocks_ids():


 @torch.compiler.disable
-def …
+def do_separate_cfg():
     cache_context = get_current_cache_context()
     assert cache_context is not None, "cache_context must be set before"
-    return cache_context.…
+    return cache_context.do_separate_cfg


 @torch.compiler.disable
-def …
+def is_separate_cfg_step():
     cache_context = get_current_cache_context()
     assert cache_context is not None, "cache_context must be set before"
-    return cache_context.…
+    return cache_context.is_separate_cfg_step()


 @torch.compiler.disable

@@ -710,8 +710,8 @@ def are_two_tensors_similar(

     if all(
         (
-            …
-            …
+            do_separate_cfg(),
+            is_separate_cfg_step(),
             not cfg_diff_compute_separate(),
             get_current_step_residual_diff() is not None,
         )

@@ -789,7 +789,7 @@ def set_Fn_buffer(buffer: torch.Tensor, prefix: str = "Fn"):
     if downsample_factor > 1:
         buffer = buffer[..., ::downsample_factor]
         buffer = buffer.contiguous()
-    if …
+    if is_separate_cfg_step():
         _debugging_set_buffer(f"{prefix}_buffer_cfg")
         set_buffer(f"{prefix}_buffer_cfg", buffer)
     else:

@@ -799,7 +799,7 @@ def set_Fn_buffer(buffer: torch.Tensor, prefix: str = "Fn"):

 @torch.compiler.disable
 def get_Fn_buffer(prefix: str = "Fn"):
-    if …
+    if is_separate_cfg_step():
         _debugging_get_buffer(f"{prefix}_buffer_cfg")
         return get_buffer(f"{prefix}_buffer_cfg")
     _debugging_get_buffer(f"{prefix}_buffer")

@@ -808,7 +808,7 @@ def get_Fn_buffer(prefix: str = "Fn"):

 @torch.compiler.disable
 def set_Fn_encoder_buffer(buffer: torch.Tensor, prefix: str = "Fn"):
-    if …
+    if is_separate_cfg_step():
         _debugging_set_buffer(f"{prefix}_encoder_buffer_cfg")
         set_buffer(f"{prefix}_encoder_buffer_cfg", buffer)
     else:

@@ -818,7 +818,7 @@ def set_Fn_encoder_buffer(buffer: torch.Tensor, prefix: str = "Fn"):

 @torch.compiler.disable
 def get_Fn_encoder_buffer(prefix: str = "Fn"):
-    if …
+    if is_separate_cfg_step():
         _debugging_get_buffer(f"{prefix}_encoder_buffer_cfg")
         return get_buffer(f"{prefix}_encoder_buffer_cfg")
     _debugging_get_buffer(f"{prefix}_encoder_buffer")

@@ -832,7 +832,7 @@ def set_Bn_buffer(buffer: torch.Tensor, prefix: str = "Bn"):
     # This buffer is use for hidden states approximation.
     if is_taylorseer_enabled():
         # taylorseer, encoder_taylorseer
-        if …
+        if is_separate_cfg_step():
             taylorseer, _ = get_cfg_taylorseers()
         else:
             taylorseer, _ = get_taylorseers()

@@ -846,14 +846,14 @@ def set_Bn_buffer(buffer: torch.Tensor, prefix: str = "Bn"):
                 "TaylorSeer is enabled but not set in the cache context. "
                 "Falling back to default buffer retrieval."
             )
-            if …
+            if is_separate_cfg_step():
                 _debugging_set_buffer(f"{prefix}_buffer_cfg")
                 set_buffer(f"{prefix}_buffer_cfg", buffer)
             else:
                 _debugging_set_buffer(f"{prefix}_buffer")
                 set_buffer(f"{prefix}_buffer", buffer)
     else:
-        if …
+        if is_separate_cfg_step():
             _debugging_set_buffer(f"{prefix}_buffer_cfg")
             set_buffer(f"{prefix}_buffer_cfg", buffer)
         else:

@@ -865,7 +865,7 @@ def set_Bn_buffer(buffer: torch.Tensor, prefix: str = "Bn"):
 def get_Bn_buffer(prefix: str = "Bn"):
     if is_taylorseer_enabled():
         # taylorseer, encoder_taylorseer
-        if …
+        if is_separate_cfg_step():
             taylorseer, _ = get_cfg_taylorseers()
         else:
             taylorseer, _ = get_taylorseers()

@@ -879,13 +879,13 @@ def get_Bn_buffer(prefix: str = "Bn"):
                 "Falling back to default buffer retrieval."
             )
             # Fallback to default buffer retrieval
-            if …
+            if is_separate_cfg_step():
                 _debugging_get_buffer(f"{prefix}_buffer_cfg")
                 return get_buffer(f"{prefix}_buffer_cfg")
             _debugging_get_buffer(f"{prefix}_buffer")
             return get_buffer(f"{prefix}_buffer")
     else:
-        if …
+        if is_separate_cfg_step():
             _debugging_get_buffer(f"{prefix}_buffer_cfg")
             return get_buffer(f"{prefix}_buffer_cfg")
             _debugging_get_buffer(f"{prefix}_buffer")

@@ -897,7 +897,7 @@ def set_Bn_encoder_buffer(buffer: torch.Tensor, prefix: str = "Bn"):
     # This buffer is use for encoder hidden states approximation.
     if is_encoder_taylorseer_enabled():
         # taylorseer, encoder_taylorseer
-        if …
+        if is_separate_cfg_step():
             _, encoder_taylorseer = get_cfg_taylorseers()
         else:
             _, encoder_taylorseer = get_taylorseers()

@@ -911,14 +911,14 @@ def set_Bn_encoder_buffer(buffer: torch.Tensor, prefix: str = "Bn"):
                 "TaylorSeer is enabled but not set in the cache context. "
                 "Falling back to default buffer retrieval."
             )
-            if …
+            if is_separate_cfg_step():
                 _debugging_set_buffer(f"{prefix}_encoder_buffer_cfg")
                 set_buffer(f"{prefix}_encoder_buffer_cfg", buffer)
             else:
                 _debugging_set_buffer(f"{prefix}_encoder_buffer")
                 set_buffer(f"{prefix}_encoder_buffer", buffer)
     else:
-        if …
+        if is_separate_cfg_step():
             _debugging_set_buffer(f"{prefix}_encoder_buffer_cfg")
             set_buffer(f"{prefix}_encoder_buffer_cfg", buffer)
         else:

@@ -929,7 +929,7 @@ def set_Bn_encoder_buffer(buffer: torch.Tensor, prefix: str = "Bn"):
 @torch.compiler.disable
 def get_Bn_encoder_buffer(prefix: str = "Bn"):
     if is_encoder_taylorseer_enabled():
-        if …
+        if is_separate_cfg_step():
             _, encoder_taylorseer = get_cfg_taylorseers()
         else:
             _, encoder_taylorseer = get_taylorseers()

@@ -944,13 +944,13 @@ def get_Bn_encoder_buffer(prefix: str = "Bn"):
                 "Falling back to default buffer retrieval."
             )
             # Fallback to default buffer retrieval
-            if …
+            if is_separate_cfg_step():
                 _debugging_get_buffer(f"{prefix}_encoder_buffer_cfg")
                 return get_buffer(f"{prefix}_encoder_buffer_cfg")
             _debugging_get_buffer(f"{prefix}_encoder_buffer")
             return get_buffer(f"{prefix}_encoder_buffer")
     else:
-        if …
+        if is_separate_cfg_step():
             _debugging_get_buffer(f"{prefix}_encoder_buffer_cfg")
             return get_buffer(f"{prefix}_encoder_buffer_cfg")
             _debugging_get_buffer(f"{prefix}_encoder_buffer")

@@ -1021,7 +1021,7 @@ def get_can_use_cache(
         return False

     max_cached_steps = get_max_cached_steps()
-    if not …
+    if not is_separate_cfg_step():
         cached_steps = get_cached_steps()
     else:
         cached_steps = get_cfg_cached_steps()
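The separate-CFG bookkeeping above boils down to a parity check over the number of transformer calls per denoising step. Here is a standalone sketch of that arithmetic, based only on the comments in the hunks above; it is not the library's internal code and the helper name is illustrative:

```python
def is_separate_cfg_call(transformer_executed_steps: int, cfg_compute_first: bool = False) -> bool:
    # With do_separate_cfg, each denoising step issues two transformer calls.
    # Default ordering: calls 0, 2, 4, ... are non-CFG and 1, 3, 5, ... are CFG;
    # cfg_compute_first flips that parity.
    current_call = transformer_executed_steps - 1
    cfg_parity = 0 if cfg_compute_first else 1
    return current_call % 2 == cfg_parity

# Calls 1..4 map to non-CFG, CFG, non-CFG, CFG with the default ordering.
assert [is_separate_cfg_call(n) for n in (1, 2, 3, 4)] == [False, True, False, True]
```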
cache_dit/cache_factory/cache_interface.py
ADDED

@@ -0,0 +1,149 @@
+from diffusers import DiffusionPipeline
+from cache_dit.cache_factory.forward_pattern import ForwardPattern
+from cache_dit.cache_factory.cache_types import CacheType
+from cache_dit.cache_factory.cache_adapters import BlockAdapterParams
+from cache_dit.cache_factory.cache_adapters import UnifiedCacheAdapter
+
+from cache_dit.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def enable_cache(
+    # BlockAdapter & forward pattern
+    pipe_or_adapter: DiffusionPipeline | BlockAdapterParams,
+    forward_pattern: ForwardPattern = ForwardPattern.Pattern_0,
+    # Cache context kwargs
+    Fn_compute_blocks: int = 8,
+    Bn_compute_blocks: int = 0,
+    warmup_steps: int = 8,
+    max_cached_steps: int = -1,
+    residual_diff_threshold: float = 0.08,
+    # Cache CFG or not
+    do_separate_cfg: bool = False,
+    cfg_compute_first: bool = False,
+    cfg_diff_compute_separate: bool = False,
+    # Hybird TaylorSeer
+    enable_taylorseer: bool = False,
+    enable_encoder_taylorseer: bool = False,
+    taylorseer_cache_type: str = "residual",
+    taylorseer_order: int = 2,
+    **other_cache_kwargs,
+) -> DiffusionPipeline:
+    r"""
+    Unified Cache API for almost Any Diffusion Transformers (with Transformer Blocks
+    that match the specific Input and Output patterns).
+
+    For a good balance between performance and precision, DBCache is configured by default
+    with F8B0, 8 warmup steps, and unlimited cached steps.
+
+    Args:
+        pipe_or_adapter (`DiffusionPipeline` or `BlockAdapterParams`, *required*):
+            The standard Diffusion Pipeline or custom BlockAdapter (from cache-dit or user-defined).
+            For example: cache_dit.enable_cache(FluxPipeline(...)). Please check https://github.com/vipshop/cache-dit/blob/main/docs/BlockAdapter.md
+            for the usgae of BlockAdapter.
+        forward_pattern (`ForwardPattern`, *required*, defaults to `ForwardPattern.Pattern_0`):
+            The forward pattern of Transformer block, please check https://github.com/vipshop/cache-dit/tree/main?tab=readme-ov-file#forward-pattern-matching
+            for more details.
+        Fn_compute_blocks (`int`, *required*, defaults to 8):
+            Specifies that `DBCache` uses the **first n** Transformer blocks to fit the information
+            at time step t, enabling the calculation of a more stable L1 diff and delivering more
+            accurate information to subsequent blocks. Please check https://github.com/vipshop/cache-dit/blob/main/docs/DBCache.md
+            for more details of DBCache.
+        Bn_compute_blocks: (`int`, *required*, defaults to 0):
+            Further fuses approximate information in the **last n** Transformer blocks to enhance
+            prediction accuracy. These blocks act as an auto-scaler for approximate hidden states
+            that use residual cache.
+        warmup_steps (`int`, *required*, defaults to 8):
+            DBCache does not apply the caching strategy when the number of running steps is less than
+            or equal to this value, ensuring the model sufficiently learns basic features during warmup.
+        max_cached_steps (`int`, *required*, defaults to -1):
+            DBCache disables the caching strategy when the previous cached steps exceed this value to
+            prevent precision degradation.
+        residual_diff_threshold (`float`, *required*, defaults to 0.08):
+            he value of residual diff threshold, a higher value leads to faster performance at the
+            cost of lower precision.
+        do_separate_cfg (`bool`, *required*, defaults to False):
+            Whether to do separate cfg or not, such as Wan 2.1, Qwen-Image. For model that fused CFG
+            and non-CFG into single forward step, should set do_separate_cfg as False, for example:
+            CogVideoX, HunyuanVideo, Mochi, etc.
+        cfg_compute_first (`bool`, *required*, defaults to False):
+            Compute cfg forward first or not, default False, namely, 0, 2, 4, ..., -> non-CFG step;
+            1, 3, 5, ... -> CFG step.
+        cfg_diff_compute_separate (`bool`, *required*, defaults to True):
+            Compute spearate diff values for CFG and non-CFG step, default True. If False, we will
+            use the computed diff from current non-CFG transformer step for current CFG step.
+        enable_taylorseer (`bool`, *required*, defaults to False):
+            Enable the hybird TaylorSeer for hidden_states or not. We have supported the
+            [TaylorSeers: From Reusing to Forecasting: Accelerating Diffusion Models with TaylorSeers](https://arxiv.org/pdf/2503.06923) algorithm
+            to further improve the precision of DBCache in cases where the cached steps are large,
+            namely, **Hybrid TaylorSeer + DBCache**. At timesteps with significant intervals,
+            the feature similarity in diffusion models decreases substantially, significantly
+            harming the generation quality.
+        enable_encoder_taylorseer (`bool`, *required*, defaults to False):
+            Enable the hybird TaylorSeer for encoder_hidden_states or not.
+        taylorseer_cache_type (`str`, *required*, defaults to `residual`):
+            The TaylorSeer implemented in cache-dit supports both `hidden_states` and `residual` as cache type.
+        taylorseer_order (`int`, *required*, defaults to 2):
+            The order of taylorseer, higher values of n_derivatives will lead to longer computation time,
+            but may improve precision significantly.
+        other_cache_kwargs: (`dict`, *optional*, defaults to {})
+            Other cache context kwargs, please check https://github.com/vipshop/cache-dit/blob/main/src/cache_dit/cache_factory/cache_context.py
+            for more details.
+
+    Examples:
+    ```py
+    >>> import cache_dit
+    >>> from diffusers import DiffusionPipeline
+    >>> pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image") # Can be any diffusion pipeline
+    >>> cache_dit.enable_cache(pipe) # One-line code with default cache options.
+    >>> output = pipe(...) # Just call the pipe as normal.
+    >>> stats = cache_dit.summary(pipe) # Then, get the summary of cache acceleration stats.
+    """
+
+    # Collect cache context kwargs
+    cache_context_kwargs = other_cache_kwargs.copy()
+    cache_context_kwargs["cache_type"] = CacheType.DBCache
+    cache_context_kwargs["Fn_compute_blocks"] = Fn_compute_blocks
+    cache_context_kwargs["Bn_compute_blocks"] = Bn_compute_blocks
+    cache_context_kwargs["warmup_steps"] = warmup_steps
+    cache_context_kwargs["max_cached_steps"] = max_cached_steps
+    cache_context_kwargs["residual_diff_threshold"] = residual_diff_threshold
+    cache_context_kwargs["do_separate_cfg"] = do_separate_cfg
+    cache_context_kwargs["cfg_compute_first"] = cfg_compute_first
+    cache_context_kwargs["cfg_diff_compute_separate"] = (
+        cfg_diff_compute_separate
+    )
+    cache_context_kwargs["enable_taylorseer"] = enable_taylorseer
+    cache_context_kwargs["enable_encoder_taylorseer"] = (
+        enable_encoder_taylorseer
+    )
+    cache_context_kwargs["taylorseer_cache_type"] = taylorseer_cache_type
+    if "taylorseer_kwargs" in cache_context_kwargs:
+        cache_context_kwargs["taylorseer_kwargs"][
+            "n_derivatives"
+        ] = taylorseer_order
+    else:
+        cache_context_kwargs["taylorseer_kwargs"] = {
+            "n_derivatives": taylorseer_order
+        }
+
+    if isinstance(pipe_or_adapter, BlockAdapterParams):
+        return UnifiedCacheAdapter.apply(
+            pipe=None,
+            adapter_params=pipe_or_adapter,
+            forward_pattern=forward_pattern,
+            **cache_context_kwargs,
+        )
+    elif isinstance(pipe_or_adapter, DiffusionPipeline):
+        return UnifiedCacheAdapter.apply(
+            pipe=pipe_or_adapter,
+            adapter_params=None,
+            forward_pattern=forward_pattern,
+            **cache_context_kwargs,
+        )
+    else:
+        raise ValueError(
+            "Please pass DiffusionPipeline or BlockAdapterParams"
+            "(BlockAdapter) for the 1 position param: pipe_or_adapter"
+        )
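For reference, a minimal sketch of calling the new keyword-based interface; the pipeline and parameter values below are illustrative, not recommendations:

```python
import cache_dit
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image")  # any supported DiT pipeline

# DBCache F8B0 with separate CFG caching and hybrid TaylorSeer enabled.
cache_dit.enable_cache(
    pipe,
    Fn_compute_blocks=8,
    Bn_compute_blocks=0,
    warmup_steps=8,
    max_cached_steps=-1,           # -1 means no limit
    residual_diff_threshold=0.08,
    do_separate_cfg=True,          # Wan 2.1 / Qwen-Image style pipelines
    enable_taylorseer=True,
    taylorseer_order=2,
)

output = pipe("a photo of a cat")  # just call the pipe as normal
```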
cache_dit/cache_factory/cache_types.py
CHANGED

@@ -9,62 +9,31 @@ class CacheType(Enum):
     DBCache = "Dual_Block_Cache"

     @staticmethod
-    def type(…
-        if isinstance(…
-            return …
-        return …
+    def type(type_hint: "CacheType | str") -> "CacheType":
+        if isinstance(type_hint, CacheType):
+            return type_hint
+        return cache_type(type_hint)

-    @staticmethod
-    def cache_type(cache_type: "CacheType | str") -> "CacheType":
-        if cache_type is None:
-            return CacheType.NONE
-
-        if isinstance(cache_type, CacheType):
-            return cache_type

-…
-…
-            "db_cache",
-            "dbcache",
-            "db",
-        ):
-            return CacheType.DBCache
+def cache_type(type_hint: "CacheType | str") -> "CacheType":
+    if type_hint is None:
         return CacheType.NONE

-…
-…
-        if start > end or end <= 0 or step <= 1:
-            return []
-        # Always compute 0 and end - 1 blocks for DB Cache
-        return list(
-            sorted(set([0] + list(range(start, end, step)) + [end - 1]))
-        )
-
-    @staticmethod
-    def default_options(cache_type: "CacheType | str") -> dict:
-        _no_options = {
-            "cache_type": CacheType.NONE,
-        }
+    if isinstance(type_hint, CacheType):
+        return type_hint

-…
-…
+    elif type_hint.lower() in (
+        "dual_block_cache",
+        "db_cache",
+        "dbcache",
+        "db",
+    ):
+        return CacheType.DBCache
+    return CacheType.NONE

-        _db_options = {
-            "cache_type": CacheType.DBCache,
-            "residual_diff_threshold": 0.12,
-            "warmup_steps": 8,
-            "max_cached_steps": -1,  # -1 means no limit
-            "Fn_compute_blocks": _Fn_compute_blocks,
-            "Bn_compute_blocks": _Bn_compute_blocks,
-            "max_Fn_compute_blocks": 16,
-            "max_Bn_compute_blocks": 16,
-            "Fn_compute_blocks_ids": [],  # 0, 1, 2, ..., 7, etc.
-            "Bn_compute_blocks_ids": [],  # 0, 1, 2, ..., 7, etc.
-        }

-…
-…
-…
-…
-…
-        raise ValueError(f"Unknown cache type: {cache_type}")
+def block_range(start: int, end: int, step: int = 1) -> list[int]:
+    if start > end or end <= 0 or step <= 1:
+        return []
+    # Always compute 0 and end - 1 blocks for DB Cache
+    return list(sorted(set([0] + list(range(start, end, step)) + [end - 1])))
cache_dit/cache_factory/utils.py
CHANGED
cache_dit/compile/utils.py
CHANGED
@@ -24,14 +24,15 @@ def epilogue_prologue_fusion_enabled(**kwargs) -> bool:


 def set_compile_configs(
+    descent_tuning: bool = True,
     cuda_graphs: bool = False,
     force_disable_compile_caches: bool = False,
     use_fast_math: bool = False,
     **kwargs, # other kwargs
 ):
     # Alway increase recompile_limit for dynamic shape compilation
-    torch._dynamo.config.recompile_limit = …
-    torch._dynamo.config.accumulated_recompile_limit = …
+    torch._dynamo.config.recompile_limit = 1024  # default is 8
+    torch._dynamo.config.accumulated_recompile_limit = 8192  # default is 256
     # Handle compiler caches
     # https://github.com/vllm-project/vllm/blob/23baa2180b0ebba5ae94073ba9b8e93f88b75486/vllm/compilation/compiler_interface.py#L270
     torch._inductor.config.fx_graph_cache = True

@@ -47,6 +48,9 @@ def set_compile_configs(
         64 if "L20" in torch.cuda.get_device_name() else 300
     )

+    if not descent_tuning:
+        return
+
     FORCE_DISABLE_CUSTOM_COMPILE_CONFIG = (
         os.environ.get("CACHE_DIT_FORCE_DISABLE_CUSTOM_COMPILE_CONFIG", "0")
         == "1"
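A short sketch of how the new `descent_tuning` flag might be used; per the hunk above, setting it to False returns before the remaining custom inductor configuration is applied, and whether that helps is workload-dependent:

```python
import cache_dit

# Raise the dynamo recompile limits and set up compiler caches, but skip the
# extra inductor tuning options that would otherwise be applied afterwards.
cache_dit.set_compile_configs(descent_tuning=False)
```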
cache_dit/utils.py
CHANGED
@@ -26,14 +26,17 @@ class CacheStats:
     )


-def summary(pipe: DiffusionPipeline, details: bool = False):
+def summary(
+    pipe: DiffusionPipeline, details: bool = False, logging: bool = True
+):
     cache_stats = CacheStats()
     pipe_cls_name = pipe.__class__.__name__

     if hasattr(pipe, "_cache_options"):
         cache_options = pipe._cache_options
         cache_stats.cache_options = cache_options
-        …
+        if logging:
+            print(f"\n🤗Cache Options: {pipe_cls_name}\n\n{cache_options}")

     if hasattr(pipe.transformer, "_cached_steps"):
         cached_steps: list[int] = pipe.transformer._cached_steps

@@ -43,7 +46,7 @@ def summary(pipe: DiffusionPipeline, details: bool = False):
         cache_stats.cached_steps = cached_steps
         cache_stats.residual_diffs = residual_diffs

-        if residual_diffs:
+        if residual_diffs and logging:
             diffs_values = list(residual_diffs.values())
             qmin = np.min(diffs_values)
             q0 = np.percentile(diffs_values, 0)

@@ -90,7 +93,7 @@ def summary(pipe: DiffusionPipeline, details: bool = False):
         cache_stats.cfg_cached_steps = cfg_cached_steps
         cache_stats.cfg_residual_diffs = cfg_residual_diffs

-        if cfg_residual_diffs:
+        if cfg_residual_diffs and logging:
             cfg_diffs_values = list(cfg_residual_diffs.values())
             qmin = np.min(cfg_diffs_values)
             q0 = np.percentile(cfg_diffs_values, 0)

@@ -130,3 +133,29 @@ def summary(pipe: DiffusionPipeline, details: bool = False):
     )

     return cache_stats
+
+
+def strify(pipe_or_stats: DiffusionPipeline | CacheStats):
+    if not isinstance(pipe_or_stats, CacheStats):
+        stats = summary(pipe_or_stats, logging=False)
+    else:
+        stats = pipe_or_stats
+
+    cache_options = stats.cache_options
+    cached_steps = len(stats.cached_steps)
+
+    if not cache_options:
+        return "NONE"
+
+    cache_type_str = (
+        f"DBCACHE_F{cache_options['Fn_compute_blocks']}"
+        f"B{cache_options['Bn_compute_blocks']}"
+        f"W{cache_options['warmup_steps']}"
+        f"M{max(0, cache_options['max_cached_steps'])}"
+        f"T{int(cache_options['enable_taylorseer'])}"
+        f"O{cache_options['taylorseer_kwargs']['n_derivatives']}_"
+        f"R{cache_options['residual_diff_threshold']}_"
+        f"S{cached_steps}"  # skiped steps
+    )
+
+    return cache_type_str
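A minimal sketch of the new `logging` flag and the `strify` helper; the exact string depends on the configured options and on how many steps were actually cached:

```python
import cache_dit

# `pipe` is a DiffusionPipeline previously passed to cache_dit.enable_cache()
# and run at least once.
stats = cache_dit.summary(pipe, logging=False)  # collect stats without printing
print(cache_dit.strify(stats))  # e.g. "DBCACHE_F8B0W8M0T0O2_R0.08_S23"
print(cache_dit.strify(pipe))   # strify also accepts the pipeline directly
```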
{cache_dit-0.2.21.dist-info → cache_dit-0.2.22.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cache_dit
-Version: 0.2.21
+Version: 0.2.22
 Summary: 🤗 CacheDiT: An Unified and Training-free Cache Acceleration Toolbox for Diffusion Transformers
 Author: DefTruth, vipshop.com, etc.
 Maintainer: DefTruth, vipshop.com, etc

@@ -179,7 +179,6 @@ cache_dit.enable_cache(
         pipe=pipe, # Qwen-Image, etc.
         transformer=pipe.transformer,
         blocks=pipe.transformer.transformer_blocks,
-        blocks_name="transformer_blocks",
     ),
     # Check `📚Forward Pattern Matching` documentation and hack the code of
     # of Qwen-Image, you will find that it has satisfied `FORWARD_PATTERN_1`.

@@ -200,7 +199,7 @@ You can set `details` param as `True` to show more details of cache stats. (mark…
 ```python
 ⚡️Cache Steps and Residual Diffs Statistics: QwenImagePipeline

-| Cache Steps | Diffs …
+| Cache Steps | Diffs Min | Diffs P25 | Diffs P50 | Diffs P75 | Diffs P95 | Diffs Max |
 |-------------|-----------|-----------|-----------|-----------|-----------|-----------|
 | 23 | 0.045 | 0.084 | 0.114 | 0.147 | 0.241 | 0.297 |
 ```

@@ -209,7 +208,7 @@

 <div id="dbcache"></div>

-…

 **DBCache**: **Dual Block Caching** for Diffusion Transformers. Different configurations of compute blocks (**F8B12**, etc.) can be customized in DBCache, enabling a balanced trade-off between performance and precision. Moreover, it can be entirely **training**-**free**. Please check [DBCache.md](./docs/DBCache.md) docs for more design details.

@@ -314,7 +313,7 @@ cache_dit.enable_cache(

 <div id="cfg"></div>

-cache-dit supports caching for **CFG (classifier-free guidance)**. For models that fuse CFG and non-CFG into a single forward step, or models that do not include CFG (classifier-free guidance) in the forward step, please set `…
+cache-dit supports caching for **CFG (classifier-free guidance)**. For models that fuse CFG and non-CFG into a single forward step, or models that do not include CFG (classifier-free guidance) in the forward step, please set `do_separate_cfg` param to **False (default)**. Otherwise, set it to True. For examples:

 ```python
 cache_dit.enable_cache(

@@ -322,10 +321,10 @@ cache_dit.enable_cache(
     ...,
     # CFG: classifier free guidance or not
     # For model that fused CFG and non-CFG into single forward step,
-    # should set …
-    # …
-    # …
-    …
+    # should set do_separate_cfg as False. For example, set it as True
+    # for Wan 2.1/Qwen-Image and set it as False for FLUX.1, HunyuanVideo,
+    # CogVideoX, Mochi, etc.
+    do_separate_cfg=True, # Wan 2.1, Qwen-Image
     # Compute cfg forward first or not, default False, namely,
     # 0, 2, 4, ..., -> non-CFG step; 1, 3, 5, ... -> CFG step.
     cfg_compute_first=False,

@@ -381,10 +380,6 @@ cache-dit-metrics-cli -h # show usage
 # all: PSNR, FID, SSIM, MSE, ..., etc.
 cache-dit-metrics-cli all -i1 true.png -i2 test.png # image
 cache-dit-metrics-cli all -i1 true_dir -i2 test_dir # image dir
-cache-dit-metrics-cli all -v1 true.mp4 -v2 test.mp4 # video
-cache-dit-metrics-cli all -v1 true_dir -v2 test_dir # video dir
-cache-dit-metrics-cli fid -i1 true_dir -i2 test_dir # FID
-cache-dit-metrics-cli psnr -i1 true_dir -i2 test_dir # PSNR
 ```

 ## 👋Contribute
{cache_dit-0.2.21.dist-info → cache_dit-0.2.22.dist-info}/RECORD
CHANGED

@@ -1,21 +1,22 @@
-cache_dit/__init__.py,sha256=…
-cache_dit/_version.py,sha256=…
+cache_dit/__init__.py,sha256=wVOaj_LSDsgYygL0cDdUU80_6RINh_JctQFyDalZN7k,946
+cache_dit/_version.py,sha256=I7oxlElEVr-U2wT5qgQ2G41IxS87cokjF8Z2fKVHGrc,706
 cache_dit/logger.py,sha256=0zsu42hN-3-rgGC_C29ms1IvVpV4_b4_SwJCKSenxBE,4304
 cache_dit/primitives.py,sha256=A2iG9YLot3gOsZSPp-_gyjqjLgJvWQRx8aitD4JQ23Y,3877
-cache_dit/utils.py,sha256=…
+cache_dit/utils.py,sha256=3UgVhfmTFG28w6CV-Rfxp5u1uzLrRozocHwLCTGiQ5M,5865
 cache_dit/cache_factory/.gitignore,sha256=5Cb-qT9wsTUoMJ7vACDF7ZcLpAXhi5v-xdcWSRit988,23
-cache_dit/cache_factory/__init__.py,sha256=…
-cache_dit/cache_factory/cache_adapters.py,sha256=…
+cache_dit/cache_factory/__init__.py,sha256=cXqBAVvldXNStpxAmNIJnpfJEf2miDlzzyjIqDauFI8,505
+cache_dit/cache_factory/cache_adapters.py,sha256=bNcUz4SP3XFpVbkgSlehLdAqKbEXjQJcm-5oS8pKqxg,20289
 cache_dit/cache_factory/cache_blocks.py,sha256=kMEOoNvygzeiM2yvUSAPkKpHeQTOpXQYH2qz34TqXzs,18457
-cache_dit/cache_factory/cache_context.py,sha256=…
-cache_dit/cache_factory/…
+cache_dit/cache_factory/cache_context.py,sha256=4thx9NYxVaYZ_Nr2quUVE8bsNmTsXhZK0F960rccOc8,39000
+cache_dit/cache_factory/cache_interface.py,sha256=V1FbtwI78Qj-yoDnz956o5lpnPxH8bMmiZNhiuiYLQo,8090
+cache_dit/cache_factory/cache_types.py,sha256=FIFa6ZBfvvSMMHyBBhvarvgg2Y2wbRgITcG_uGylGe0,991
 cache_dit/cache_factory/forward_pattern.py,sha256=B2YeqV2t_zo2Ar8m7qimPBjwQgoXHGp2grPZmEAhi8s,1286
-cache_dit/cache_factory/taylorseer.py,sha256=…
-cache_dit/cache_factory/utils.py,sha256=…
+cache_dit/cache_factory/taylorseer.py,sha256=WeK2WlAJa4Px_pnAKokmnZXeqQYylQkPw4-EDqBIqeQ,3770
+cache_dit/cache_factory/utils.py,sha256=YGtn02O3fVlrfQ32gGV4WAtTRvzzwSXNxzP_FmnE2Uk,1867
 cache_dit/cache_factory/patch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cache_dit/cache_factory/patch/flux.py,sha256=iNQ-1RlOgXupZ4uPiEvJ__Ro6vKT_fOKja9JrpMrO78,8998
 cache_dit/compile/__init__.py,sha256=FcTVzCeyypl-mxlc59_ehHL3lBNiDAFsXuRoJ-5Cfi0,56
-cache_dit/compile/utils.py,sha256=…
+cache_dit/compile/utils.py,sha256=53KPsMWHyGmHGtw0T4oP0VY4O60cVXOFwFGWTlZrUqI,3857
 cache_dit/custom_ops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cache_dit/custom_ops/triton_taylorseer.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cache_dit/metrics/__init__.py,sha256=RaUhl5dieF40RqnizGzR30qoJJ9dyMUEADwgwMaMQrE,575

@@ -24,9 +25,9 @@ cache_dit/metrics/fid.py,sha256=9Ivtazl6mW0Bon2VXa-Ia5Xj2ewxRD3V1Qkd69zYM3Y,1706…
 cache_dit/metrics/inception.py,sha256=pBVe2X6ylLPIXTG4-GWDM9DWnCviMJbJ45R3ulhktR0,12759
 cache_dit/metrics/lpips.py,sha256=I2qCNi6qJh5TRsaIsdxO0WoRX1DN7U_H3zS0oCSahYM,1032
 cache_dit/metrics/metrics.py,sha256=8jvM1sF-nDxUuwCRy44QEoo4dYVLCQVh1QyAMs4eaQY,27840
-cache_dit-0.2.21.dist-info/…
-cache_dit-0.2.21.dist-info/…
-cache_dit-0.2.21.dist-info/…
-cache_dit-0.2.21.dist-info/…
-cache_dit-0.2.21.dist-info/…
-cache_dit-0.2.21.dist-info/…
+cache_dit-0.2.22.dist-info/licenses/LICENSE,sha256=Dqb07Ik2dV41s9nIdMUbiRWEfDqo7-dQeRiY7kPO8PE,3769
+cache_dit-0.2.22.dist-info/METADATA,sha256=BABVrkyVTakN0jel9xgApSd9IzDBRLqJHLHhauqka50,19566
+cache_dit-0.2.22.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+cache_dit-0.2.22.dist-info/entry_points.txt,sha256=FX2gysXaZx6NeK1iCLMcIdP8Q4_qikkIHtEmi3oWn8o,65
+cache_dit-0.2.22.dist-info/top_level.txt,sha256=ZJDydonLEhujzz0FOkVbO-BqfzO9d_VqRHmZU-3MOZo,10
+cache_dit-0.2.22.dist-info/RECORD,,
{cache_dit-0.2.21.dist-info → cache_dit-0.2.22.dist-info}/WHEEL
File without changes

{cache_dit-0.2.21.dist-info → cache_dit-0.2.22.dist-info}/entry_points.txt
File without changes

{cache_dit-0.2.21.dist-info → cache_dit-0.2.22.dist-info}/licenses/LICENSE
File without changes

{cache_dit-0.2.21.dist-info → cache_dit-0.2.22.dist-info}/top_level.txt
File without changes