cache-dit 0.2.32__py3-none-any.whl → 0.2.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cache_dit/_version.py +2 -2
- cache_dit/cache_factory/block_adapters/__init__.py +14 -20
- cache_dit/cache_factory/block_adapters/block_adapters.py +47 -3
- cache_dit/cache_factory/block_adapters/block_registers.py +3 -2
- cache_dit/cache_factory/cache_adapters.py +8 -8
- cache_dit/cache_factory/cache_blocks/pattern_3_4_5.py +23 -62
- cache_dit/cache_factory/cache_blocks/pattern_base.py +23 -168
- cache_dit/cache_factory/cache_contexts/cache_context.py +18 -64
- cache_dit/cache_factory/cache_contexts/cache_manager.py +23 -71
- cache_dit/cache_factory/cache_contexts/taylorseer.py +11 -13
- cache_dit/cache_factory/cache_interface.py +9 -9
- cache_dit/cache_factory/patch_functors/__init__.py +1 -0
- cache_dit/cache_factory/patch_functors/functor_chroma.py +142 -52
- cache_dit/cache_factory/patch_functors/functor_dit.py +130 -0
- cache_dit/quantize/quantize_ao.py +3 -0
- {cache_dit-0.2.32.dist-info → cache_dit-0.2.34.dist-info}/METADATA +184 -39
- {cache_dit-0.2.32.dist-info → cache_dit-0.2.34.dist-info}/RECORD +21 -21
- cache_dit/quantize/quantize_svdq.py +0 -0
- {cache_dit-0.2.32.dist-info → cache_dit-0.2.34.dist-info}/WHEEL +0 -0
- {cache_dit-0.2.32.dist-info → cache_dit-0.2.34.dist-info}/entry_points.txt +0 -0
- {cache_dit-0.2.32.dist-info → cache_dit-0.2.34.dist-info}/licenses/LICENSE +0 -0
- {cache_dit-0.2.32.dist-info → cache_dit-0.2.34.dist-info}/top_level.txt +0 -0
@@ -14,13 +14,9 @@ logger = init_logger(__name__)
 @dataclasses.dataclass
 class CachedContext: # Internal CachedContext Impl class
     name: str = "default"
-    # Dual Block Cache
-    # Fn=1, Bn=0, means FB Cache, otherwise, Dual Block Cache
+    # Dual Block Cache with flexible FnBn configuration.
     Fn_compute_blocks: int = 1
     Bn_compute_blocks: int = 0
-    # We have added residual cache pattern for selected compute blocks
-    Fn_compute_blocks_ids: List[int] = dataclasses.field(default_factory=list)
-    Bn_compute_blocks_ids: List[int] = dataclasses.field(default_factory=list)
     # non compute blocks diff threshold, we don't skip the non
     # compute blocks if the diff >= threshold
     non_compute_blocks_diff_threshold: float = 0.08
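For orientation (not part of the diff): the FnBn fields above configure Dual Block Cache directly on the internal dataclass in cache_dit/cache_factory/cache_contexts/cache_context.py. Fn_compute_blocks=1 with Bn_compute_blocks=0 degenerates to plain FB Cache; larger values keep more leading/trailing blocks always computed. A minimal sketch with hypothetical values:

    # Sketch only: field names come from the dataclass shown above.
    from cache_dit.cache_factory.cache_contexts.cache_context import CachedContext

    ctx = CachedContext(
        name="fnbn-example",          # hypothetical context name
        Fn_compute_blocks=8,          # first 8 transformer blocks always computed
        Bn_compute_blocks=0,          # no trailing blocks always computed
        residual_diff_threshold=0.08,
    )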
@@ -31,13 +27,6 @@ class CachedContext: # Internal CachedContext Impl class
     l1_hidden_states_diff_threshold: float = None
     important_condition_threshold: float = 0.0

-    # Alter Cache Settings
-    # Pattern: 0 F 1 T 2 F 3 T 4 F 5 T ...
-    enable_alter_cache: bool = False
-    is_alter_cache: bool = True
-    # 1.0 means we always cache the residuals if alter_cache is enabled.
-    alter_residual_diff_threshold: Optional[Union[torch.Tensor, float]] = 1.0
-
     # Buffer for storing the residuals and other tensors
     buffers: Dict[str, Any] = dataclasses.field(default_factory=dict)
     incremental_name_counters: DefaultDict[str, int] = dataclasses.field(
@@ -63,22 +52,21 @@ class CachedContext: # Internal CachedContext Impl class
     # Url: https://arxiv.org/pdf/2503.06923
     enable_taylorseer: bool = False
     enable_encoder_taylorseer: bool = False
-    # NOTE: use residual cache for taylorseer may incur precision loss
     taylorseer_cache_type: str = "hidden_states" # residual or hidden_states
-    taylorseer_order: int =
+    taylorseer_order: int = 1 # The order for TaylorSeer
     taylorseer_kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict)
     taylorseer: Optional[TaylorSeer] = None
     encoder_tarlorseer: Optional[TaylorSeer] = None

-    # Support enable_spearate_cfg, such as Wan 2.1,
+    # Support enable_separate_cfg, such as Wan 2.1,
     # Qwen-Image. For model that fused CFG and non-CFG into single
-    # forward step, should set enable_spearate_cfg as False.
+    # forward step, should set enable_separate_cfg as False.
     # For example: CogVideoX, HunyuanVideo, Mochi.
-    enable_spearate_cfg: bool = False
+    enable_separate_cfg: bool = False
     # Compute cfg forward first or not, default False, namely,
     # 0, 2, 4, ..., -> non-CFG step; 1, 3, 5, ... -> CFG step.
     cfg_compute_first: bool = False
-    # Compute spearate diff values for CFG and non-CFG step,
+    # Compute separate diff values for CFG and non-CFG step,
     # default True. If False, we will use the computed diff from
     # current non-CFG transformer step for current CFG step.
     cfg_diff_compute_separate: bool = True
@@ -97,16 +85,11 @@ class CachedContext: # Internal CachedContext Impl class
     )
     cfg_continuous_cached_steps: int = 0

-    @torch.compiler.disable
     def __post_init__(self):
         if logger.isEnabledFor(logging.DEBUG):
             logger.info(f"Created _CacheContext: {self.name}")
         # Some checks for settings
-        if self.enable_spearate_cfg:
-            assert self.enable_alter_cache is False, (
-                "enable_alter_cache must set as False if "
-                "enable_spearate_cfg is enabled."
-            )
+        if self.enable_separate_cfg:
             if self.cfg_diff_compute_separate:
                 assert self.cfg_compute_first is False, (
                     "cfg_compute_first must set as False if "
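To make the constraint above concrete (hypothetical values, not from the diff): when separate-CFG mode and separate diff computation are both enabled, the context refuses cfg_compute_first=True at construction time.

    # Sketch only: this combination trips the assert shown in __post_init__ above.
    from cache_dit.cache_factory.cache_contexts.cache_context import CachedContext

    CachedContext(
        name="bad-cfg-order",            # hypothetical
        enable_separate_cfg=True,
        cfg_diff_compute_separate=True,  # diffs computed separately per CFG step
        cfg_compute_first=True,          # conflicts -> AssertionError
    )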
@@ -125,59 +108,44 @@ class CachedContext: # Internal CachedContext Impl class

         if self.enable_taylorseer:
             self.taylorseer = TaylorSeer(**self.taylorseer_kwargs)
-            if self.enable_spearate_cfg:
+            if self.enable_separate_cfg:
                 self.cfg_taylorseer = TaylorSeer(**self.taylorseer_kwargs)

         if self.enable_encoder_taylorseer:
             self.encoder_tarlorseer = TaylorSeer(**self.taylorseer_kwargs)
-            if self.enable_spearate_cfg:
+            if self.enable_separate_cfg:
                 self.cfg_encoder_taylorseer = TaylorSeer(
                     **self.taylorseer_kwargs
                 )

-    @torch.compiler.disable
     def get_residual_diff_threshold(self):
-        if self.enable_alter_cache:
-            residual_diff_threshold = self.alter_residual_diff_threshold
-        else:
-            residual_diff_threshold = self.residual_diff_threshold
-            if self.l1_hidden_states_diff_threshold is not None:
-                # Use the L1 hidden states diff threshold if set
-                residual_diff_threshold = self.l1_hidden_states_diff_threshold
+        residual_diff_threshold = self.residual_diff_threshold
+        if self.l1_hidden_states_diff_threshold is not None:
+            # Use the L1 hidden states diff threshold if set
+            residual_diff_threshold = self.l1_hidden_states_diff_threshold
         if isinstance(residual_diff_threshold, torch.Tensor):
             residual_diff_threshold = residual_diff_threshold.item()
         return residual_diff_threshold

-    @torch.compiler.disable
     def get_buffer(self, name):
-        if self.enable_alter_cache and self.is_alter_cache:
-            name = f"{name}_alter"
         return self.buffers.get(name)

-    @torch.compiler.disable
     def set_buffer(self, name, buffer):
-        if self.enable_alter_cache and self.is_alter_cache:
-            name = f"{name}_alter"
         self.buffers[name] = buffer

-    @torch.compiler.disable
     def remove_buffer(self, name):
-        if self.enable_alter_cache and self.is_alter_cache:
-            name = f"{name}_alter"
         if name in self.buffers:
             del self.buffers[name]

-    @torch.compiler.disable
     def clear_buffers(self):
         self.buffers.clear()

-    @torch.compiler.disable
     def mark_step_begin(self):
         # Always increase transformer executed steps
-        # incr
-        # current
+        # incr step: prev 0 -> 1; prev 1 -> 2
+        # current step: incr step - 1
         self.transformer_executed_steps += 1
-        if not self.enable_spearate_cfg:
+        if not self.enable_separate_cfg:
             self.executed_steps += 1
         else:
             # 0,1 -> 0 + 1, 2,3 -> 1 + 1, ...
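A worked example of the counting above, assuming enable_separate_cfg=True and cfg_compute_first=False: transformer forwards 0 and 1 belong to denoising step 0 (non-CFG then CFG), forwards 2 and 3 to step 1, and so on.

    # Sketch of the bookkeeping in mark_step_begin() / is_separate_cfg_step():
    # transformer step:        0      1      2      3
    # get_current_step():      0      0      1      1
    # is_separate_cfg_step():  False  True   False  True   (cfg_compute_first=False)
    for transformer_step in range(4):
        denoise_step = transformer_step // 2   # two forwards per denoising step
        is_cfg = transformer_step % 2 != 0     # odd forwards are CFG steps
        print(transformer_step, denoise_step, is_cfg)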
@@ -190,10 +158,6 @@ class CachedContext: # Internal CachedContext Impl class
                 # transformer step: 0,2,4,...
                 self.executed_steps += 1

-        if not self.enable_alter_cache:
-            # 0 F 1 T 2 F 3 T 4 F 5 T ...
-            self.is_alter_cache = not self.is_alter_cache
-
         # Reset the cached steps and residual diffs at the beginning
         # of each inference.
         if self.get_current_transformer_step() == 0:
@@ -219,7 +183,7 @@ class CachedContext: # Internal CachedContext Impl class

         # mark_step_begin of TaylorSeer must be called after the cache is reset.
         if self.enable_taylorseer or self.enable_encoder_taylorseer:
-            if self.enable_spearate_cfg:
+            if self.enable_separate_cfg:
                 # Assume non-CFG steps: 0, 2, 4, 6, ...
                 if not self.is_separate_cfg_step():
                     taylorseer, encoder_taylorseer = self.get_taylorseers()
@@ -248,7 +212,6 @@ class CachedContext: # Internal CachedContext Impl class
     def get_cfg_taylorseers(self) -> Tuple[TaylorSeer, TaylorSeer]:
         return self.cfg_taylorseer, self.cfg_encoder_taylorseer

-    @torch.compiler.disable
     def add_residual_diff(self, diff):
         # step: executed_steps - 1, not transformer_steps - 1
         step = str(self.get_current_step())
@@ -260,15 +223,12 @@ class CachedContext: # Internal CachedContext Impl class
             if step not in self.cfg_residual_diffs:
                 self.cfg_residual_diffs[step] = diff

-    @torch.compiler.disable
     def get_residual_diffs(self):
         return self.residual_diffs.copy()

-    @torch.compiler.disable
     def get_cfg_residual_diffs(self):
         return self.cfg_residual_diffs.copy()

-    @torch.compiler.disable
     def add_cached_step(self):
         curr_cached_step = self.get_current_step()
         if not self.is_separate_cfg_step():
@@ -296,25 +256,20 @@ class CachedContext: # Internal CachedContext Impl class

             self.cfg_cached_steps.append(curr_cached_step)

-    @torch.compiler.disable
     def get_cached_steps(self):
         return self.cached_steps.copy()

-    @torch.compiler.disable
     def get_cfg_cached_steps(self):
         return self.cfg_cached_steps.copy()

-    @torch.compiler.disable
     def get_current_step(self):
         return self.executed_steps - 1

-    @torch.compiler.disable
     def get_current_transformer_step(self):
         return self.transformer_executed_steps - 1

-    @torch.compiler.disable
     def is_separate_cfg_step(self):
-        if not self.enable_spearate_cfg:
+        if not self.enable_separate_cfg:
             return False
         if self.cfg_compute_first:
             # CFG steps: 0, 2, 4, 6, ...
@@ -322,6 +277,5 @@ class CachedContext: # Internal CachedContext Impl class
             # CFG steps: 1, 3, 5, 7, ...
             return self.get_current_transformer_step() % 2 != 0

-    @torch.compiler.disable
     def is_in_warmup(self):
         return self.get_current_step() < self.max_warmup_steps
@@ -74,8 +74,8 @@ class CachedContextManager:
             del self._cached_context_manager[cached_context]

     def clear_contexts(self):
-        for context_name in self._cached_context_manager:
-            self.remove_context(context_name)
+        for context_name in list(self._cached_context_manager.keys()):
+            self.remove_context(context_name)

     @contextlib.contextmanager
     def enter_context(self, cached_context: CachedContext | str):
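The clear_contexts() change above snapshots the keys before deleting. This matters because deleting from a dict while iterating it raises at runtime; a minimal, generic illustration:

    contexts = {"a": object(), "b": object()}

    # for name in contexts:              # RuntimeError: dictionary changed size during iteration
    #     del contexts[name]

    for name in list(contexts.keys()):   # snapshot first, as the new clear_contexts() does
        del contexts[name]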
@@ -122,10 +122,7 @@ class CachedContextManager:
             default_value,
         )

-    # Manually set sequence fields like Fn_compute_blocks_ids
-    # and Bn_compute_blocks_ids, which are lists or sets.
-    _safe_set_sequence_field("Fn_compute_blocks_ids", [])
-    _safe_set_sequence_field("Bn_compute_blocks_ids", [])
+    # Manually set sequence fields
     _safe_set_sequence_field("taylorseer_kwargs", {})

     for attr in cache_attrs:
@@ -301,18 +298,6 @@ class CachedContextManager:
             return self.is_taylorseer_cache_residual()
         return True

-    @torch.compiler.disable
-    def is_alter_cache_enabled(self) -> bool:
-        cached_context = self.get_context()
-        assert cached_context is not None, "cached_context must be set before"
-        return cached_context.enable_alter_cache
-
-    @torch.compiler.disable
-    def is_alter_cache(self) -> bool:
-        cached_context = self.get_context()
-        assert cached_context is not None, "cached_context must be set before"
-        return cached_context.is_alter_cache
-
     @torch.compiler.disable
     def is_in_warmup(self) -> bool:
         cached_context = self.get_context()
@@ -359,20 +344,6 @@ class CachedContextManager:
         )
         return cached_context.Fn_compute_blocks

-    @torch.compiler.disable
-    def Fn_compute_blocks_ids(self) -> List[int]:
-        cached_context = self.get_context()
-        assert cached_context is not None, "cached_context must be set before"
-        assert (
-            len(cached_context.Fn_compute_blocks_ids)
-            <= cached_context.Fn_compute_blocks
-        ), (
-            "The num of Fn_compute_blocks_ids must be <= Fn_compute_blocks "
-            f"{cached_context.Fn_compute_blocks}, but got "
-            f"{len(cached_context.Fn_compute_blocks_ids)}"
-        )
-        return cached_context.Fn_compute_blocks_ids
-
     @torch.compiler.disable
     def Bn_compute_blocks(self) -> int:
         cached_context = self.get_context()
@@ -393,24 +364,10 @@ class CachedContextManager:
         return cached_context.Bn_compute_blocks

     @torch.compiler.disable
-    def Bn_compute_blocks_ids(self) -> List[int]:
-        cached_context = self.get_context()
-        assert cached_context is not None, "cached_context must be set before"
-        assert (
-            len(cached_context.Bn_compute_blocks_ids)
-            <= cached_context.Bn_compute_blocks
-        ), (
-            "The num of Bn_compute_blocks_ids must be <= Bn_compute_blocks "
-            f"{cached_context.Bn_compute_blocks}, but got "
-            f"{len(cached_context.Bn_compute_blocks_ids)}"
-        )
-        return cached_context.Bn_compute_blocks_ids
-
-    @torch.compiler.disable
-    def enable_spearate_cfg(self) -> bool:
+    def enable_separate_cfg(self) -> bool:
         cached_context = self.get_context()
         assert cached_context is not None, "cached_context must be set before"
-        return cached_context.enable_spearate_cfg
+        return cached_context.enable_separate_cfg

     @torch.compiler.disable
     def is_separate_cfg_step(self) -> bool:
@@ -453,7 +410,7 @@ class CachedContextManager:

         if all(
             (
-                self.enable_spearate_cfg(),
+                self.enable_separate_cfg(),
                 self.is_separate_cfg_step(),
                 not self.cfg_diff_compute_separate(),
                 self.get_current_step_residual_diff() is not None,
@@ -525,6 +482,9 @@ class CachedContextManager:
     # Fn buffers
     @torch.compiler.disable
     def set_Fn_buffer(self, buffer: torch.Tensor, prefix: str = "Fn"):
+        # DON'T set None Buffer
+        if buffer is None:
+            return
         # Set hidden_states or residual for Fn blocks.
         # This buffer is only use for L1 diff calculation.
         downsample_factor = self.get_downsample_factor()
@@ -548,6 +508,9 @@ class CachedContextManager:

     @torch.compiler.disable
     def set_Fn_encoder_buffer(self, buffer: torch.Tensor, prefix: str = "Fn"):
+        # DON'T set None Buffer
+        if buffer is None:
+            return
         if self.is_separate_cfg_step():
             self._debugging_set_buffer(f"{prefix}_encoder_buffer_cfg")
             self.set_buffer(f"{prefix}_encoder_buffer_cfg", buffer)
@@ -566,6 +529,9 @@ class CachedContextManager:
     # Bn buffers
     @torch.compiler.disable
     def set_Bn_buffer(self, buffer: torch.Tensor, prefix: str = "Bn"):
+        # DON'T set None Buffer
+        if buffer is None:
+            return
         # Set hidden_states or residual for Bn blocks.
         # This buffer is use for hidden states approximation.
         if self.is_taylorseer_enabled():
@@ -820,26 +786,12 @@ class CachedContextManager:
         else:
             prev_states_tensor = self.get_Fn_buffer(prefix)

-        if not self.is_alter_cache_enabled():
-            # Dynamic cache according to the residual diff
-            can_cache = prev_states_tensor is not None and self.similarity(
-                prev_states_tensor,
-                states_tensor,
-                threshold=threshold,
-                parallelized=parallelized,
-                prefix=prefix,
-            )
-        else:
-            # Only cache in the alter cache steps
-            can_cache = (
-                prev_states_tensor is not None
-                and self.similarity(
-                    prev_states_tensor,
-                    states_tensor,
-                    threshold=threshold,
-                    parallelized=parallelized,
-                    prefix=prefix,
-                )
-                and self.is_alter_cache()
-            )
+        # Dynamic cache according to the residual diff
+        can_cache = prev_states_tensor is not None and self.similarity(
+            prev_states_tensor,
+            states_tensor,
+            threshold=threshold,
+            parallelized=parallelized,
+            prefix=prefix,
+        )
         return can_cache
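The similarity() helper itself is outside this diff; conceptually it compares the cached Fn-block states with the current ones against residual_diff_threshold. A hedged sketch of such a relative L1 check (the exact formula used by cache-dit is an assumption here):

    import torch

    def l1_relative_diff(prev: torch.Tensor, curr: torch.Tensor, threshold: float) -> bool:
        # Mean absolute change, normalized by the magnitude of the previous states.
        diff = (curr - prev).abs().mean() / (prev.abs().mean() + 1e-8)
        return diff.item() < threshold  # small enough -> safe to reuse cached residuals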
@@ -1,5 +1,6 @@
 import math
 import torch
+from typing import List, Dict


 class TaylorSeer:
@@ -17,16 +18,14 @@ class TaylorSeer:
         self.compute_step_map = compute_step_map
         self.reset_cache()

-    @torch.compiler.disable
     def reset_cache(self):
-        self.state = {
+        self.state: Dict[str, List[torch.Tensor]] = {
             "dY_prev": [None] * self.ORDER,
             "dY_current": [None] * self.ORDER,
         }
         self.current_step = -1
         self.last_non_approximated_step = -1

-    @torch.compiler.disable
     def should_compute_full(self, step=None):
         step = self.current_step if step is None else step
         if self.compute_step_map is not None:
@@ -39,16 +38,19 @@ class TaylorSeer:
                 return True
         return False

-
-    def approximate_derivative(self, Y):
+    def approximate_derivative(self, Y: torch.Tensor) -> List[torch.Tensor]:
         # n-th order Taylor expansion:
         # Y(t) = Y(0) + dY(0)/dt * t + d^2Y(0)/dt^2 * t^2 / 2!
         # + ... + d^nY(0)/dt^n * t^n / n!
         # TODO: Custom Triton/CUDA kernel for better performance,
         # especially for large n_derivatives.
-        dY_current = [None] * self.ORDER
+        dY_current: List[torch.Tensor] = [None] * self.ORDER
         dY_current[0] = Y
         window = self.current_step - self.last_non_approximated_step
+        if self.state["dY_prev"][0] is not None:
+            if dY_current[0].shape != self.state["dY_prev"][0].shape:
+                self.reset_cache()
+
         for i in range(self.n_derivatives):
             if self.state["dY_prev"][i] is not None and self.current_step > 1:
                 dY_current[i + 1] = (
@@ -58,8 +60,7 @@ class TaylorSeer:
                 break
         return dY_current

-
-    def approximate_value(self):
+    def approximate_value(self) -> torch.Tensor:
         # TODO: Custom Triton/CUDA kernel for better performance,
         # especially for large n_derivatives.
         elapsed = self.current_step - self.last_non_approximated_step
@@ -71,12 +72,10 @@ class TaylorSeer:
                 break
         return output

-    @torch.compiler.disable
     def mark_step_begin(self):
         self.current_step += 1

-
-    def update(self, Y):
+    def update(self, Y: torch.Tensor):
         # Directly call this method will ingnore the warmup
         # policy and force full computation.
         # Assume warmup steps is 3, and n_derivatives is 3.
@@ -94,8 +93,7 @@ class TaylorSeer:
         self.state["dY_current"] = self.approximate_derivative(Y)
         self.last_non_approximated_step = self.current_step

-
-    def step(self, Y):
+    def step(self, Y: torch.Tensor):
         self.mark_step_begin()
         if self.should_compute_full():
             self.update(Y)
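approximate_derivative() and approximate_value() implement the Taylor expansion quoted in the comments, Y(t) = Y(0) + dY(0)/dt * t + ... + d^nY(0)/dt^n * t^n / n!, using finite differences over the window between fully computed steps. A standalone sketch of the extrapolation step (not the library's exact code):

    import math
    from typing import List, Optional
    import torch

    def taylor_extrapolate(derivatives: List[Optional[torch.Tensor]], elapsed: int) -> torch.Tensor:
        # derivatives[i] is the i-th finite-difference derivative at the last fully
        # computed step; elapsed = current_step - last_non_approximated_step.
        output = torch.zeros_like(derivatives[0])
        for i, dY in enumerate(derivatives):
            if dY is None:
                break
            output = output + dY * (elapsed ** i) / math.factorial(i)
        return output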
@@ -24,14 +24,14 @@ def enable_cache(
     max_continuous_cached_steps: int = -1,
     residual_diff_threshold: float = 0.08,
     # Cache CFG or not
-    enable_spearate_cfg: bool = None,
+    enable_separate_cfg: bool = None,
     cfg_compute_first: bool = False,
     cfg_diff_compute_separate: bool = True,
     # Hybird TaylorSeer
     enable_taylorseer: bool = False,
     enable_encoder_taylorseer: bool = False,
     taylorseer_cache_type: str = "residual",
-    taylorseer_order: int =
+    taylorseer_order: int = 1,
     **other_cache_context_kwargs,
 ) -> Union[
     DiffusionPipeline,
@@ -70,15 +70,15 @@ def enable_cache(
         residual_diff_threshold (`float`, *required*, defaults to 0.08):
             he value of residual diff threshold, a higher value leads to faster performance at the
             cost of lower precision.
-        enable_spearate_cfg (`bool`, *required*, defaults to None):
+        enable_separate_cfg (`bool`, *required*, defaults to None):
             Whether to do separate cfg or not, such as Wan 2.1, Qwen-Image. For model that fused CFG
-            and non-CFG into single forward step, should set enable_spearate_cfg as False, for example:
+            and non-CFG into single forward step, should set enable_separate_cfg as False, for example:
             CogVideoX, HunyuanVideo, Mochi, etc.
         cfg_compute_first (`bool`, *required*, defaults to False):
             Compute cfg forward first or not, default False, namely, 0, 2, 4, ..., -> non-CFG step;
             1, 3, 5, ... -> CFG step.
         cfg_diff_compute_separate (`bool`, *required*, defaults to True):
-            Compute spearate diff values for CFG and non-CFG step, default True. If False, we will
+            Compute separate diff values for CFG and non-CFG step, default True. If False, we will
             use the computed diff from current non-CFG transformer step for current CFG step.
         enable_taylorseer (`bool`, *required*, defaults to False):
             Enable the hybird TaylorSeer for hidden_states or not. We have supported the
@@ -91,10 +91,10 @@ def enable_cache(
             Enable the hybird TaylorSeer for encoder_hidden_states or not.
         taylorseer_cache_type (`str`, *required*, defaults to `residual`):
             The TaylorSeer implemented in cache-dit supports both `hidden_states` and `residual` as cache type.
-        taylorseer_order (`int`, *required*, defaults to
+        taylorseer_order (`int`, *required*, defaults to 1):
             The order of taylorseer, higher values of n_derivatives will lead to longer computation time,
-
-
+            the recommended value is 1 or 2.
+        other_cache_context_kwargs: (`dict`, *optional*, defaults to {})
             Other cache context kwargs, please check https://github.com/vipshop/cache-dit/blob/main/src/cache_dit/cache_factory/cache_contexts/cache_context.py
             for more details.

@@ -123,7 +123,7 @@ def enable_cache(
         max_continuous_cached_steps
     )
     cache_context_kwargs["residual_diff_threshold"] = residual_diff_threshold
-    cache_context_kwargs["enable_spearate_cfg"] = enable_spearate_cfg
+    cache_context_kwargs["enable_separate_cfg"] = enable_separate_cfg
    cache_context_kwargs["cfg_compute_first"] = cfg_compute_first
    cache_context_kwargs["cfg_diff_compute_separate"] = (
        cfg_diff_compute_separate
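Putting the renamed keyword and the new taylorseer_order default together, a hedged usage sketch (assuming enable_cache is re-exported at the cache_dit package level; the model id and positional pipeline argument are placeholders):

    import cache_dit
    from diffusers import DiffusionPipeline

    pipe = DiffusionPipeline.from_pretrained("some/dit-pipeline")  # placeholder id

    cache_dit.enable_cache(
        pipe,
        residual_diff_threshold=0.08,
        enable_separate_cfg=True,         # True for Wan 2.1 / Qwen-Image style separate-CFG pipelines
        enable_taylorseer=True,
        taylorseer_cache_type="residual",
        taylorseer_order=1,               # new default in 0.2.34
    )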
@@ -1,4 +1,5 @@
 from cache_dit.cache_factory.patch_functors.functor_base import PatchFunctor
+from cache_dit.cache_factory.patch_functors.functor_dit import DiTPatchFunctor
 from cache_dit.cache_factory.patch_functors.functor_flux import FluxPatchFunctor
 from cache_dit.cache_factory.patch_functors.functor_chroma import (
     ChromaPatchFunctor,
|