cache-dit 0.2.21__py3-none-any.whl → 0.2.23__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of cache-dit might be problematic.

@@ -69,11 +69,11 @@ class DBCacheContext:
     taylorseer: Optional[TaylorSeer] = None
     encoder_tarlorseer: Optional[TaylorSeer] = None

-    # Support do_separate_classifier_free_guidance, such as Wan 2.1,
+    # Support do_separate_cfg, such as Wan 2.1,
     # Qwen-Image. For model that fused CFG and non-CFG into single
-    # forward step, should set do_separate_classifier_free_guidance
-    # as False. For example: CogVideoX, HunyuanVideo, Mochi.
-    do_separate_classifier_free_guidance: bool = False
+    # forward step, should set do_separate_cfg as False.
+    # For example: CogVideoX, HunyuanVideo, Mochi.
+    do_separate_cfg: bool = False

     # Compute cfg forward first or not, default False, namely,
     # 0, 2, 4, ..., -> non-CFG step; 1, 3, 5, ... -> CFG step.
     cfg_compute_first: bool = False
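The rename above only shortens the name; the semantics are unchanged: pipelines such as Wan 2.1 and Qwen-Image run the CFG and non-CFG branches as two separate transformer forwards and need the flag enabled, while models that fuse both branches into one forward (CogVideoX, HunyuanVideo, Mochi) keep it off. A minimal, hypothetical sketch of that decision (the pipeline class names below are illustrative, not an API of cache-dit):

```py
# Hypothetical helper, not part of the diff: pick the renamed flag per model family.
SEPARATE_CFG = {"WanPipeline", "QwenImagePipeline"}  # CFG runs as its own forward
FUSED_CFG = {"CogVideoXPipeline", "HunyuanVideoPipeline", "MochiPipeline"}


def guess_do_separate_cfg(pipe_cls_name: str) -> bool:
    # Separate-CFG models need do_separate_cfg=True so cached residuals are
    # tracked per branch; fused-CFG models leave it False (the default).
    return pipe_cls_name in SEPARATE_CFG
```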
@@ -97,10 +97,10 @@ class DBCacheContext:
     @torch.compiler.disable
     def __post_init__(self):
         # Some checks for settings
-        if self.do_separate_classifier_free_guidance:
+        if self.do_separate_cfg:
             assert self.enable_alter_cache is False, (
                 "enable_alter_cache must set as False if "
-                "do_separate_classifier_free_guidance is enabled."
+                "do_separate_cfg is enabled."
             )
         if self.cfg_diff_compute_separate:
             assert self.cfg_compute_first is False, (
@@ -123,12 +123,12 @@ class DBCacheContext:

         if self.enable_taylorseer:
             self.taylorseer = TaylorSeer(**self.taylorseer_kwargs)
-            if self.do_separate_classifier_free_guidance:
+            if self.do_separate_cfg:
                 self.cfg_taylorseer = TaylorSeer(**self.taylorseer_kwargs)

         if self.enable_encoder_taylorseer:
             self.encoder_tarlorseer = TaylorSeer(**self.taylorseer_kwargs)
-            if self.do_separate_classifier_free_guidance:
+            if self.do_separate_cfg:
                 self.cfg_encoder_taylorseer = TaylorSeer(
                     **self.taylorseer_kwargs
                 )
@@ -175,16 +175,16 @@ class DBCacheContext:
         # incr step: prev 0 -> 1; prev 1 -> 2
         # current step: incr step - 1
         self.transformer_executed_steps += 1
-        if not self.do_separate_classifier_free_guidance:
+        if not self.do_separate_cfg:
             self.executed_steps += 1
         else:
             # 0,1 -> 0 + 1, 2,3 -> 1 + 1, ...
             if not self.cfg_compute_first:
-                if not self.is_separate_classifier_free_guidance_step():
+                if not self.is_separate_cfg_step():
                     # transformer step: 0,2,4,...
                     self.executed_steps += 1
             else:
-                if self.is_separate_classifier_free_guidance_step():
+                if self.is_separate_cfg_step():
                     # transformer step: 0,2,4,...
                     self.executed_steps += 1

@@ -217,9 +217,9 @@ class DBCacheContext:

         # mark_step_begin of TaylorSeer must be called after the cache is reset.
         if self.enable_taylorseer or self.enable_encoder_taylorseer:
-            if self.do_separate_classifier_free_guidance:
+            if self.do_separate_cfg:
                 # Assume non-CFG steps: 0, 2, 4, 6, ...
-                if not self.is_separate_classifier_free_guidance_step():
+                if not self.is_separate_cfg_step():
                     taylorseer, encoder_taylorseer = self.get_taylorseers()
                     if taylorseer is not None:
                         taylorseer.mark_step_begin()
@@ -251,7 +251,7 @@ class DBCacheContext:
         # step: executed_steps - 1, not transformer_steps - 1
         step = str(self.get_current_step())
         # Only add the diff if it is not already recorded for this step
-        if not self.is_separate_classifier_free_guidance_step():
+        if not self.is_separate_cfg_step():
             if step not in self.residual_diffs:
                 self.residual_diffs[step] = diff
         else:
@@ -268,7 +268,7 @@ class DBCacheContext:

     @torch.compiler.disable
     def add_cached_step(self):
-        if not self.is_separate_classifier_free_guidance_step():
+        if not self.is_separate_cfg_step():
             self.cached_steps.append(self.get_current_step())
         else:
             self.cfg_cached_steps.append(self.get_current_step())
@@ -290,8 +290,8 @@ class DBCacheContext:
         return self.transformer_executed_steps - 1

     @torch.compiler.disable
-    def is_separate_classifier_free_guidance_step(self):
-        if not self.do_separate_classifier_free_guidance:
+    def is_separate_cfg_step(self):
+        if not self.do_separate_cfg:
             return False
         if self.cfg_compute_first:
             # CFG steps: 0, 2, 4, 6, ...
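With separate CFG enabled, the transformer runs twice per denoising step, so the bookkeeping above only advances `executed_steps` on the non-CFG half (or on the CFG half when `cfg_compute_first` is set), and `is_separate_cfg_step()` reduces to a parity test on the transformer call index. A small sketch of the resulting counting rule (a hypothetical helper, not part of the package):

```py
# Sketch of the bookkeeping above: two transformer calls map to one denoising step.
def executed_steps_after(transformer_calls: int, do_separate_cfg: bool) -> int:
    if not do_separate_cfg:
        return transformer_calls
    # Calls 0,1 belong to denoising step 0; calls 2,3 to step 1; and so on,
    # regardless of whether the CFG or the non-CFG half runs first.
    return (transformer_calls + 1) // 2


assert executed_steps_after(4, do_separate_cfg=False) == 4
assert executed_steps_after(4, do_separate_cfg=True) == 2
```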
@@ -589,17 +589,17 @@ def Bn_compute_blocks_ids():


 @torch.compiler.disable
-def do_separate_classifier_free_guidance():
+def do_separate_cfg():
     cache_context = get_current_cache_context()
     assert cache_context is not None, "cache_context must be set before"
-    return cache_context.do_separate_classifier_free_guidance
+    return cache_context.do_separate_cfg


 @torch.compiler.disable
-def is_separate_classifier_free_guidance_step():
+def is_separate_cfg_step():
     cache_context = get_current_cache_context()
     assert cache_context is not None, "cache_context must be set before"
-    return cache_context.is_separate_classifier_free_guidance_step()
+    return cache_context.is_separate_cfg_step()


 @torch.compiler.disable
@@ -710,8 +710,8 @@ def are_two_tensors_similar(

     if all(
         (
-            do_separate_classifier_free_guidance(),
-            is_separate_classifier_free_guidance_step(),
+            do_separate_cfg(),
+            is_separate_cfg_step(),
             not cfg_diff_compute_separate(),
             get_current_step_residual_diff() is not None,
         )
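The `all(...)` guard above is where the two renamed helpers interact: when CFG runs as a separate step and `cfg_diff_compute_separate` is disabled, the CFG step reuses the residual diff already recorded by the matching non-CFG step instead of recomputing it. Restated as a standalone predicate (a sketch that takes plain arguments instead of reading the cache context):

```py
# Sketch of the reuse condition inside are_two_tensors_similar above.
def reuse_non_cfg_diff(
    do_separate_cfg: bool,
    is_cfg_step: bool,
    cfg_diff_compute_separate: bool,
    recorded_diff,
) -> bool:
    return all(
        (
            do_separate_cfg,
            is_cfg_step,
            not cfg_diff_compute_separate,
            recorded_diff is not None,
        )
    )
```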
@@ -789,7 +789,7 @@ def set_Fn_buffer(buffer: torch.Tensor, prefix: str = "Fn"):
     if downsample_factor > 1:
         buffer = buffer[..., ::downsample_factor]
         buffer = buffer.contiguous()
-    if is_separate_classifier_free_guidance_step():
+    if is_separate_cfg_step():
         _debugging_set_buffer(f"{prefix}_buffer_cfg")
         set_buffer(f"{prefix}_buffer_cfg", buffer)
     else:
@@ -799,7 +799,7 @@ def set_Fn_buffer(buffer: torch.Tensor, prefix: str = "Fn"):

 @torch.compiler.disable
 def get_Fn_buffer(prefix: str = "Fn"):
-    if is_separate_classifier_free_guidance_step():
+    if is_separate_cfg_step():
         _debugging_get_buffer(f"{prefix}_buffer_cfg")
         return get_buffer(f"{prefix}_buffer_cfg")
     _debugging_get_buffer(f"{prefix}_buffer")
@@ -808,7 +808,7 @@ def get_Fn_buffer(prefix: str = "Fn"):

 @torch.compiler.disable
 def set_Fn_encoder_buffer(buffer: torch.Tensor, prefix: str = "Fn"):
-    if is_separate_classifier_free_guidance_step():
+    if is_separate_cfg_step():
         _debugging_set_buffer(f"{prefix}_encoder_buffer_cfg")
         set_buffer(f"{prefix}_encoder_buffer_cfg", buffer)
     else:
@@ -818,7 +818,7 @@ def set_Fn_encoder_buffer(buffer: torch.Tensor, prefix: str = "Fn"):

 @torch.compiler.disable
 def get_Fn_encoder_buffer(prefix: str = "Fn"):
-    if is_separate_classifier_free_guidance_step():
+    if is_separate_cfg_step():
         _debugging_get_buffer(f"{prefix}_encoder_buffer_cfg")
         return get_buffer(f"{prefix}_encoder_buffer_cfg")
     _debugging_get_buffer(f"{prefix}_encoder_buffer")
@@ -832,7 +832,7 @@ def set_Bn_buffer(buffer: torch.Tensor, prefix: str = "Bn"):
     # This buffer is use for hidden states approximation.
     if is_taylorseer_enabled():
         # taylorseer, encoder_taylorseer
-        if is_separate_classifier_free_guidance_step():
+        if is_separate_cfg_step():
             taylorseer, _ = get_cfg_taylorseers()
         else:
             taylorseer, _ = get_taylorseers()
@@ -846,14 +846,14 @@ def set_Bn_buffer(buffer: torch.Tensor, prefix: str = "Bn"):
                 "TaylorSeer is enabled but not set in the cache context. "
                 "Falling back to default buffer retrieval."
             )
-            if is_separate_classifier_free_guidance_step():
+            if is_separate_cfg_step():
                 _debugging_set_buffer(f"{prefix}_buffer_cfg")
                 set_buffer(f"{prefix}_buffer_cfg", buffer)
             else:
                 _debugging_set_buffer(f"{prefix}_buffer")
                 set_buffer(f"{prefix}_buffer", buffer)
     else:
-        if is_separate_classifier_free_guidance_step():
+        if is_separate_cfg_step():
             _debugging_set_buffer(f"{prefix}_buffer_cfg")
             set_buffer(f"{prefix}_buffer_cfg", buffer)
         else:
@@ -865,7 +865,7 @@ def set_Bn_buffer(buffer: torch.Tensor, prefix: str = "Bn"):
 def get_Bn_buffer(prefix: str = "Bn"):
     if is_taylorseer_enabled():
         # taylorseer, encoder_taylorseer
-        if is_separate_classifier_free_guidance_step():
+        if is_separate_cfg_step():
             taylorseer, _ = get_cfg_taylorseers()
         else:
             taylorseer, _ = get_taylorseers()
@@ -879,13 +879,13 @@ def get_Bn_buffer(prefix: str = "Bn"):
                 "Falling back to default buffer retrieval."
             )
             # Fallback to default buffer retrieval
-            if is_separate_classifier_free_guidance_step():
+            if is_separate_cfg_step():
                 _debugging_get_buffer(f"{prefix}_buffer_cfg")
                 return get_buffer(f"{prefix}_buffer_cfg")
             _debugging_get_buffer(f"{prefix}_buffer")
             return get_buffer(f"{prefix}_buffer")
     else:
-        if is_separate_classifier_free_guidance_step():
+        if is_separate_cfg_step():
             _debugging_get_buffer(f"{prefix}_buffer_cfg")
             return get_buffer(f"{prefix}_buffer_cfg")
         _debugging_get_buffer(f"{prefix}_buffer")
@@ -897,7 +897,7 @@ def set_Bn_encoder_buffer(buffer: torch.Tensor, prefix: str = "Bn"):
     # This buffer is use for encoder hidden states approximation.
     if is_encoder_taylorseer_enabled():
         # taylorseer, encoder_taylorseer
-        if is_separate_classifier_free_guidance_step():
+        if is_separate_cfg_step():
             _, encoder_taylorseer = get_cfg_taylorseers()
         else:
             _, encoder_taylorseer = get_taylorseers()
@@ -911,14 +911,14 @@ def set_Bn_encoder_buffer(buffer: torch.Tensor, prefix: str = "Bn"):
                 "TaylorSeer is enabled but not set in the cache context. "
                 "Falling back to default buffer retrieval."
             )
-            if is_separate_classifier_free_guidance_step():
+            if is_separate_cfg_step():
                 _debugging_set_buffer(f"{prefix}_encoder_buffer_cfg")
                 set_buffer(f"{prefix}_encoder_buffer_cfg", buffer)
             else:
                 _debugging_set_buffer(f"{prefix}_encoder_buffer")
                 set_buffer(f"{prefix}_encoder_buffer", buffer)
     else:
-        if is_separate_classifier_free_guidance_step():
+        if is_separate_cfg_step():
             _debugging_set_buffer(f"{prefix}_encoder_buffer_cfg")
             set_buffer(f"{prefix}_encoder_buffer_cfg", buffer)
         else:
@@ -929,7 +929,7 @@ def set_Bn_encoder_buffer(buffer: torch.Tensor, prefix: str = "Bn"):
 @torch.compiler.disable
 def get_Bn_encoder_buffer(prefix: str = "Bn"):
     if is_encoder_taylorseer_enabled():
-        if is_separate_classifier_free_guidance_step():
+        if is_separate_cfg_step():
             _, encoder_taylorseer = get_cfg_taylorseers()
         else:
             _, encoder_taylorseer = get_taylorseers()
@@ -944,13 +944,13 @@ def get_Bn_encoder_buffer(prefix: str = "Bn"):
                 "Falling back to default buffer retrieval."
             )
             # Fallback to default buffer retrieval
-            if is_separate_classifier_free_guidance_step():
+            if is_separate_cfg_step():
                 _debugging_get_buffer(f"{prefix}_encoder_buffer_cfg")
                 return get_buffer(f"{prefix}_encoder_buffer_cfg")
             _debugging_get_buffer(f"{prefix}_encoder_buffer")
             return get_buffer(f"{prefix}_encoder_buffer")
     else:
-        if is_separate_classifier_free_guidance_step():
+        if is_separate_cfg_step():
             _debugging_get_buffer(f"{prefix}_encoder_buffer_cfg")
             return get_buffer(f"{prefix}_encoder_buffer_cfg")
         _debugging_get_buffer(f"{prefix}_encoder_buffer")
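All of the `Fn`/`Bn` getter and setter changes above follow one pattern: on a separate CFG step, the helpers read and write buffers under a key with a `_cfg` suffix, so the CFG and non-CFG branches never overwrite each other's cached states. The naming scheme, condensed (a hypothetical helper; the real code calls `set_buffer`/`get_buffer` directly with these keys):

```py
# Condensed view of the buffer keys used by the helpers above.
def buffer_key(prefix: str, encoder: bool, is_cfg_step: bool) -> str:
    key = f"{prefix}_encoder_buffer" if encoder else f"{prefix}_buffer"
    return f"{key}_cfg" if is_cfg_step else key


assert buffer_key("Fn", encoder=False, is_cfg_step=True) == "Fn_buffer_cfg"
assert buffer_key("Bn", encoder=True, is_cfg_step=False) == "Bn_encoder_buffer"
```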
@@ -1021,7 +1021,7 @@ def get_can_use_cache(
         return False

     max_cached_steps = get_max_cached_steps()
-    if not is_separate_classifier_free_guidance_step():
+    if not is_separate_cfg_step():
         cached_steps = get_cached_steps()
     else:
         cached_steps = get_cfg_cached_steps()
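Because the rename reaches the public cache options, configurations written against 0.2.21 need a one-key update. A hypothetical migration helper (not shipped with the package):

```py
# Hypothetical migration for options dicts written against 0.2.21.
def migrate_cache_options(options: dict) -> dict:
    options = dict(options)
    if "do_separate_classifier_free_guidance" in options:
        options["do_separate_cfg"] = options.pop(
            "do_separate_classifier_free_guidance"
        )
    return options


assert migrate_cache_options(
    {"do_separate_classifier_free_guidance": True}
) == {"do_separate_cfg": True}
```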
@@ -0,0 +1,149 @@
+from diffusers import DiffusionPipeline
+from cache_dit.cache_factory.forward_pattern import ForwardPattern
+from cache_dit.cache_factory.cache_types import CacheType
+from cache_dit.cache_factory.cache_adapters import BlockAdapter
+from cache_dit.cache_factory.cache_adapters import UnifiedCacheAdapter
+
+from cache_dit.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def enable_cache(
+    # BlockAdapter & forward pattern
+    pipe_or_adapter: DiffusionPipeline | BlockAdapter,
+    forward_pattern: ForwardPattern = ForwardPattern.Pattern_0,
+    # Cache context kwargs
+    Fn_compute_blocks: int = 8,
+    Bn_compute_blocks: int = 0,
+    warmup_steps: int = 8,
+    max_cached_steps: int = -1,
+    residual_diff_threshold: float = 0.08,
+    # Cache CFG or not
+    do_separate_cfg: bool = False,
+    cfg_compute_first: bool = False,
+    cfg_diff_compute_separate: bool = False,
+    # Hybird TaylorSeer
+    enable_taylorseer: bool = False,
+    enable_encoder_taylorseer: bool = False,
+    taylorseer_cache_type: str = "residual",
+    taylorseer_order: int = 2,
+    **other_cache_kwargs,
+) -> DiffusionPipeline:
+    r"""
+    Unified Cache API for almost Any Diffusion Transformers (with Transformer Blocks
+    that match the specific Input and Output patterns).
+
+    For a good balance between performance and precision, DBCache is configured by default
+    with F8B0, 8 warmup steps, and unlimited cached steps.
+
+    Args:
+        pipe_or_adapter (`DiffusionPipeline` or `BlockAdapter`, *required*):
+            The standard Diffusion Pipeline or custom BlockAdapter (from cache-dit or user-defined).
+            For example: cache_dit.enable_cache(FluxPipeline(...)). Please check https://github.com/vipshop/cache-dit/blob/main/docs/BlockAdapter.md
+            for the usgae of BlockAdapter.
+        forward_pattern (`ForwardPattern`, *required*, defaults to `ForwardPattern.Pattern_0`):
+            The forward pattern of Transformer block, please check https://github.com/vipshop/cache-dit/tree/main?tab=readme-ov-file#forward-pattern-matching
+            for more details.
+        Fn_compute_blocks (`int`, *required*, defaults to 8):
+            Specifies that `DBCache` uses the **first n** Transformer blocks to fit the information
+            at time step t, enabling the calculation of a more stable L1 diff and delivering more
+            accurate information to subsequent blocks. Please check https://github.com/vipshop/cache-dit/blob/main/docs/DBCache.md
+            for more details of DBCache.
+        Bn_compute_blocks: (`int`, *required*, defaults to 0):
+            Further fuses approximate information in the **last n** Transformer blocks to enhance
+            prediction accuracy. These blocks act as an auto-scaler for approximate hidden states
+            that use residual cache.
+        warmup_steps (`int`, *required*, defaults to 8):
+            DBCache does not apply the caching strategy when the number of running steps is less than
+            or equal to this value, ensuring the model sufficiently learns basic features during warmup.
+        max_cached_steps (`int`, *required*, defaults to -1):
+            DBCache disables the caching strategy when the previous cached steps exceed this value to
+            prevent precision degradation.
+        residual_diff_threshold (`float`, *required*, defaults to 0.08):
+            he value of residual diff threshold, a higher value leads to faster performance at the
+            cost of lower precision.
+        do_separate_cfg (`bool`, *required*, defaults to False):
+            Whether to do separate cfg or not, such as Wan 2.1, Qwen-Image. For model that fused CFG
+            and non-CFG into single forward step, should set do_separate_cfg as False, for example:
+            CogVideoX, HunyuanVideo, Mochi, etc.
+        cfg_compute_first (`bool`, *required*, defaults to False):
+            Compute cfg forward first or not, default False, namely, 0, 2, 4, ..., -> non-CFG step;
+            1, 3, 5, ... -> CFG step.
+        cfg_diff_compute_separate (`bool`, *required*, defaults to True):
+            Compute spearate diff values for CFG and non-CFG step, default True. If False, we will
+            use the computed diff from current non-CFG transformer step for current CFG step.
+        enable_taylorseer (`bool`, *required*, defaults to False):
+            Enable the hybird TaylorSeer for hidden_states or not. We have supported the
+            [TaylorSeers: From Reusing to Forecasting: Accelerating Diffusion Models with TaylorSeers](https://arxiv.org/pdf/2503.06923) algorithm
+            to further improve the precision of DBCache in cases where the cached steps are large,
+            namely, **Hybrid TaylorSeer + DBCache**. At timesteps with significant intervals,
+            the feature similarity in diffusion models decreases substantially, significantly
+            harming the generation quality.
+        enable_encoder_taylorseer (`bool`, *required*, defaults to False):
+            Enable the hybird TaylorSeer for encoder_hidden_states or not.
+        taylorseer_cache_type (`str`, *required*, defaults to `residual`):
+            The TaylorSeer implemented in cache-dit supports both `hidden_states` and `residual` as cache type.
+        taylorseer_order (`int`, *required*, defaults to 2):
+            The order of taylorseer, higher values of n_derivatives will lead to longer computation time,
+            but may improve precision significantly.
+        other_cache_kwargs: (`dict`, *optional*, defaults to {})
+            Other cache context kwargs, please check https://github.com/vipshop/cache-dit/blob/main/src/cache_dit/cache_factory/cache_context.py
+            for more details.
+
+    Examples:
+    ```py
+    >>> import cache_dit
+    >>> from diffusers import DiffusionPipeline
+    >>> pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image") # Can be any diffusion pipeline
+    >>> cache_dit.enable_cache(pipe) # One-line code with default cache options.
+    >>> output = pipe(...) # Just call the pipe as normal.
+    >>> stats = cache_dit.summary(pipe) # Then, get the summary of cache acceleration stats.
+    """
+
+    # Collect cache context kwargs
+    cache_context_kwargs = other_cache_kwargs.copy()
+    cache_context_kwargs["cache_type"] = CacheType.DBCache
+    cache_context_kwargs["Fn_compute_blocks"] = Fn_compute_blocks
+    cache_context_kwargs["Bn_compute_blocks"] = Bn_compute_blocks
+    cache_context_kwargs["warmup_steps"] = warmup_steps
+    cache_context_kwargs["max_cached_steps"] = max_cached_steps
+    cache_context_kwargs["residual_diff_threshold"] = residual_diff_threshold
+    cache_context_kwargs["do_separate_cfg"] = do_separate_cfg
+    cache_context_kwargs["cfg_compute_first"] = cfg_compute_first
+    cache_context_kwargs["cfg_diff_compute_separate"] = (
+        cfg_diff_compute_separate
+    )
+    cache_context_kwargs["enable_taylorseer"] = enable_taylorseer
+    cache_context_kwargs["enable_encoder_taylorseer"] = (
+        enable_encoder_taylorseer
+    )
+    cache_context_kwargs["taylorseer_cache_type"] = taylorseer_cache_type
+    if "taylorseer_kwargs" in cache_context_kwargs:
+        cache_context_kwargs["taylorseer_kwargs"][
+            "n_derivatives"
+        ] = taylorseer_order
+    else:
+        cache_context_kwargs["taylorseer_kwargs"] = {
+            "n_derivatives": taylorseer_order
+        }
+
+    if isinstance(pipe_or_adapter, BlockAdapter):
+        return UnifiedCacheAdapter.apply(
+            pipe=None,
+            block_adapter=pipe_or_adapter,
+            forward_pattern=forward_pattern,
+            **cache_context_kwargs,
+        )
+    elif isinstance(pipe_or_adapter, DiffusionPipeline):
+        return UnifiedCacheAdapter.apply(
+            pipe=pipe_or_adapter,
+            block_adapter=None,
+            forward_pattern=forward_pattern,
+            **cache_context_kwargs,
+        )
+    else:
+        raise ValueError(
+            "Please pass DiffusionPipeline or BlockAdapter"
+            "for the 1's position param: pipe_or_adapter"
+        )
@@ -9,62 +9,31 @@ class CacheType(Enum):
     DBCache = "Dual_Block_Cache"

     @staticmethod
-    def type(cache_type: "CacheType | str") -> "CacheType":
-        if isinstance(cache_type, CacheType):
-            return cache_type
-        return CacheType.cache_type(cache_type)
+    def type(type_hint: "CacheType | str") -> "CacheType":
+        if isinstance(type_hint, CacheType):
+            return type_hint
+        return cache_type(type_hint)

-    @staticmethod
-    def cache_type(cache_type: "CacheType | str") -> "CacheType":
-        if cache_type is None:
-            return CacheType.NONE
-
-        if isinstance(cache_type, CacheType):
-            return cache_type

-        elif cache_type.lower() in (
-            "dual_block_cache",
-            "db_cache",
-            "dbcache",
-            "db",
-        ):
-            return CacheType.DBCache
+def cache_type(type_hint: "CacheType | str") -> "CacheType":
+    if type_hint is None:
         return CacheType.NONE

-    @staticmethod
-    def block_range(start: int, end: int, step: int = 1) -> list[int]:
-        if start > end or end <= 0 or step <= 1:
-            return []
-        # Always compute 0 and end - 1 blocks for DB Cache
-        return list(
-            sorted(set([0] + list(range(start, end, step)) + [end - 1]))
-        )
-
-    @staticmethod
-    def default_options(cache_type: "CacheType | str") -> dict:
-        _no_options = {
-            "cache_type": CacheType.NONE,
-        }
+    if isinstance(type_hint, CacheType):
+        return type_hint

-        _Fn_compute_blocks = 8
-        _Bn_compute_blocks = 0
+    elif type_hint.lower() in (
+        "dual_block_cache",
+        "db_cache",
+        "dbcache",
+        "db",
+    ):
+        return CacheType.DBCache
+    return CacheType.NONE

-        _db_options = {
-            "cache_type": CacheType.DBCache,
-            "residual_diff_threshold": 0.12,
-            "warmup_steps": 8,
-            "max_cached_steps": -1,  # -1 means no limit
-            "Fn_compute_blocks": _Fn_compute_blocks,
-            "Bn_compute_blocks": _Bn_compute_blocks,
-            "max_Fn_compute_blocks": 16,
-            "max_Bn_compute_blocks": 16,
-            "Fn_compute_blocks_ids": [],  # 0, 1, 2, ..., 7, etc.
-            "Bn_compute_blocks_ids": [],  # 0, 1, 2, ..., 7, etc.
-        }

-        if cache_type == CacheType.DBCache:
-            return _db_options
-        elif cache_type == CacheType.NONE:
-            return _no_options
-        else:
-            raise ValueError(f"Unknown cache type: {cache_type}")
+def block_range(start: int, end: int, step: int = 1) -> list[int]:
+    if start > end or end <= 0 or step <= 1:
+        return []
+    # Always compute 0 and end - 1 blocks for DB Cache
+    return list(sorted(set([0] + list(range(start, end, step)) + [end - 1])))
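The refactor drops `default_options` and, judging by the new `return cache_type(type_hint)` call inside `CacheType.type`, moves `cache_type` and `block_range` from static methods to module-level functions. Assuming that reading, a usage sketch (importing the two module-level helpers is an assumption):

```py
from cache_dit.cache_factory.cache_types import CacheType, block_range, cache_type

assert cache_type("dbcache") is CacheType.DBCache  # aliases: dual_block_cache, db_cache, dbcache, db
assert cache_type(None) is CacheType.NONE
# Block 0 and the last block are always kept for DBCache.
assert block_range(2, 8, 2) == [0, 2, 4, 6, 7]
```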
@@ -1,5 +1,3 @@
-# Adapted from: https://github.com/chengzeyi/ParaAttention/blob/main/src/para_attn/first_block_cache/taylorseer.py
-# Reference: https://github.com/Shenyi-Z/TaylorSeer/TaylorSeer-FLUX/src/flux/taylor_utils/__init__.py
 import math
 import torch

@@ -51,3 +51,7 @@ def load_cache_options_from_yaml(yaml_file_path):
         )
     except yaml.YAMLError as e:
         raise yaml.YAMLError(f"YAML file parsing error: {str(e)}")
+
+
+def load_options(path: str):
+    return load_cache_options_from_yaml(path)
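`load_options` is a thin alias for `load_cache_options_from_yaml`. A hypothetical call site (the import path and the YAML keys below are assumptions, shown only to illustrate the intent):

```py
from cache_dit.cache_factory import load_options  # assumed export location

options = load_options("dbcache.yaml")
# dbcache.yaml might contain keys mirroring the cache context kwargs, e.g.:
#   cache_type: DBCache
#   Fn_compute_blocks: 8
#   residual_diff_threshold: 0.08
```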
@@ -24,14 +24,15 @@ def epilogue_prologue_fusion_enabled(**kwargs) -> bool:


 def set_compile_configs(
+    descent_tuning: bool = True,
     cuda_graphs: bool = False,
     force_disable_compile_caches: bool = False,
     use_fast_math: bool = False,
     **kwargs,  # other kwargs
 ):
     # Alway increase recompile_limit for dynamic shape compilation
-    torch._dynamo.config.recompile_limit = 96  # default is 8
-    torch._dynamo.config.accumulated_recompile_limit = 2048  # default is 256
+    torch._dynamo.config.recompile_limit = 1024  # default is 8
+    torch._dynamo.config.accumulated_recompile_limit = 8192  # default is 256
     # Handle compiler caches
     # https://github.com/vllm-project/vllm/blob/23baa2180b0ebba5ae94073ba9b8e93f88b75486/vllm/compilation/compiler_interface.py#L270
     torch._inductor.config.fx_graph_cache = True
@@ -47,6 +48,9 @@ def set_compile_configs(
         64 if "L20" in torch.cuda.get_device_name() else 300
     )

+    if not descent_tuning:
+        return
+
     FORCE_DISABLE_CUSTOM_COMPILE_CONFIG = (
         os.environ.get("CACHE_DIT_FORCE_DISABLE_CUSTOM_COMPILE_CONFIG", "0")
         == "1"
cache_dit/utils.py CHANGED
@@ -26,14 +26,17 @@ class CacheStats:
     )


-def summary(pipe: DiffusionPipeline, details: bool = False):
+def summary(
+    pipe: DiffusionPipeline, details: bool = False, logging: bool = True
+):
     cache_stats = CacheStats()
     pipe_cls_name = pipe.__class__.__name__

     if hasattr(pipe, "_cache_options"):
         cache_options = pipe._cache_options
         cache_stats.cache_options = cache_options
-        print(f"\n🤗Cache Options: {pipe_cls_name}\n\n{cache_options}")
+        if logging:
+            print(f"\n🤗Cache Options: {pipe_cls_name}\n\n{cache_options}")

     if hasattr(pipe.transformer, "_cached_steps"):
         cached_steps: list[int] = pipe.transformer._cached_steps
@@ -43,7 +46,7 @@ def summary(pipe: DiffusionPipeline, details: bool = False):
         cache_stats.cached_steps = cached_steps
         cache_stats.residual_diffs = residual_diffs

-        if residual_diffs:
+        if residual_diffs and logging:
             diffs_values = list(residual_diffs.values())
             qmin = np.min(diffs_values)
             q0 = np.percentile(diffs_values, 0)
@@ -90,7 +93,7 @@ def summary(pipe: DiffusionPipeline, details: bool = False):
         cache_stats.cfg_cached_steps = cfg_cached_steps
         cache_stats.cfg_residual_diffs = cfg_residual_diffs

-        if cfg_residual_diffs:
+        if cfg_residual_diffs and logging:
             cfg_diffs_values = list(cfg_residual_diffs.values())
             qmin = np.min(cfg_diffs_values)
             q0 = np.percentile(cfg_diffs_values, 0)
@@ -130,3 +133,29 @@ def summary(pipe: DiffusionPipeline, details: bool = False):
     )

     return cache_stats
+
+
+def strify(pipe_or_stats: DiffusionPipeline | CacheStats):
+    if not isinstance(pipe_or_stats, CacheStats):
+        stats = summary(pipe_or_stats, logging=False)
+    else:
+        stats = pipe_or_stats
+
+    cache_options = stats.cache_options
+    cached_steps = len(stats.cached_steps)
+
+    if not cache_options:
+        return "NONE"
+
+    cache_type_str = (
+        f"DBCACHE_F{cache_options['Fn_compute_blocks']}"
+        f"B{cache_options['Bn_compute_blocks']}"
+        f"W{cache_options['warmup_steps']}"
+        f"M{max(0, cache_options['max_cached_steps'])}"
+        f"T{int(cache_options['enable_taylorseer'])}"
+        f"O{cache_options['taylorseer_kwargs']['n_derivatives']}_"
+        f"R{cache_options['residual_diff_threshold']}_"
+        f"S{cached_steps}"  # skiped steps
+    )
+
+    return cache_type_str
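`strify` condenses the active cache options and the number of cached (skipped) steps into a short tag, calling `summary(..., logging=False)` internally so it can gather stats without printing. An illustrative call (strify is assumed to be re-exported at the package root alongside summary; the tag values below follow from the code above, not from a measured run):

```py
tag = cache_dit.strify(pipe)  # also accepts a CacheStats returned by summary()
# With the F8B0 defaults, 8 warmup steps, unlimited cached steps (-1 -> "M0"),
# TaylorSeer disabled (order 2), threshold 0.08 and, say, 12 cached steps,
# the tag has the shape:
#   DBCACHE_F8B0W8M0T0O2_R0.08_S12
print(tag)
```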