cache-dit 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cache_dit/_version.py +2 -2
- cache_dit/cache_factory/dual_block_cache/cache_context.py +322 -69
- cache_dit/cache_factory/dual_block_cache/diffusers_adapters/__init__.py +0 -2
- cache_dit/cache_factory/dual_block_cache/diffusers_adapters/cogvideox.py +0 -2
- cache_dit/cache_factory/dual_block_cache/diffusers_adapters/flux.py +0 -2
- cache_dit/cache_factory/dual_block_cache/diffusers_adapters/hunyuan_video.py +0 -1
- cache_dit/cache_factory/dual_block_cache/diffusers_adapters/mochi.py +0 -1
- cache_dit/cache_factory/dual_block_cache/diffusers_adapters/wan.py +0 -2
- cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/__init__.py +0 -2
- cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/cogvideox.py +0 -2
- cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/flux.py +0 -2
- cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/hunyuan_video.py +0 -1
- cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/mochi.py +0 -2
- cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/wan.py +1 -3
- cache_dit/cache_factory/first_block_cache/cache_context.py +3 -0
- cache_dit/cache_factory/taylorseer.py +30 -0
- {cache_dit-0.2.0.dist-info → cache_dit-0.2.2.dist-info}/METADATA +88 -36
- {cache_dit-0.2.0.dist-info → cache_dit-0.2.2.dist-info}/RECORD +21 -21
- {cache_dit-0.2.0.dist-info → cache_dit-0.2.2.dist-info}/WHEEL +0 -0
- {cache_dit-0.2.0.dist-info → cache_dit-0.2.2.dist-info}/licenses/LICENSE +0 -0
- {cache_dit-0.2.0.dist-info → cache_dit-0.2.2.dist-info}/top_level.txt +0 -0
cache_dit/_version.py  CHANGED

cache_dit/cache_factory/dual_block_cache/cache_context.py  CHANGED

@@ -9,6 +9,7 @@ from typing import Any, DefaultDict, Dict, List, Optional, Union
 import torch
 
 import cache_dit.primitives as DP
+from cache_dit.cache_factory.taylorseer import TaylorSeer
 from cache_dit.logger import init_logger
 
 logger = init_logger(__name__)
@@ -60,7 +61,55 @@ class DBCacheContext:
     residual_diffs: DefaultDict[str, float] = dataclasses.field(
         default_factory=lambda: defaultdict(float),
     )
+    # Support TaylorSeers in Dual Block Cache
+    # Title: From Reusing to Forecasting: Accelerating Diffusion Models with TaylorSeers
+    # Url: https://arxiv.org/pdf/2503.06923
+    enable_taylorseer: bool = False
+    enable_encoder_taylorseer: bool = False
+    # NOTE: use residual cache for taylorseer may incur precision loss
+    taylorseer_cache_type: str = "hidden_states"  # residual or hidden_states
+    taylorseer_kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict)
+    taylorseer: Optional[TaylorSeer] = None
+    encoder_tarlorseer: Optional[TaylorSeer] = None
+    alter_taylorseer: Optional[TaylorSeer] = None
+    alter_encoder_taylorseer: Optional[TaylorSeer] = None
+
+    # TODO: Support SLG in Dual Block Cache
+    # Skip Layer Guidance, SLG
+    # https://github.com/huggingface/candle/issues/2588
+    slg_layers: Optional[List[int]] = None
+    slg_start: float = 0.0
+    slg_end: float = 0.1
 
+    @torch.compiler.disable
+    def __post_init__(self):
+
+        if "warmup_steps" not in self.taylorseer_kwargs:
+            # If warmup_steps is not set in taylorseer_kwargs,
+            # set the same as warmup_steps for DBCache
+            self.taylorseer_kwargs["warmup_steps"] = (
+                self.warmup_steps if self.warmup_steps > 0 else 1
+            )
+
+        # Only set n_derivatives as 2 or 3, which is enough for most cases.
+        if "n_derivatives" not in self.taylorseer_kwargs:
+            self.taylorseer_kwargs["n_derivatives"] = max(
+                2, min(3, self.taylorseer_kwargs["warmup_steps"])
+            )
+
+        if self.enable_taylorseer:
+            self.taylorseer = TaylorSeer(**self.taylorseer_kwargs)
+            if self.enable_alter_cache:
+                self.alter_taylorseer = TaylorSeer(**self.taylorseer_kwargs)
+
+        if self.enable_encoder_taylorseer:
+            self.encoder_tarlorseer = TaylorSeer(**self.taylorseer_kwargs)
+            if self.enable_alter_cache:
+                self.alter_encoder_taylorseer = TaylorSeer(
+                    **self.taylorseer_kwargs
+                )
+
+    @torch.compiler.disable
     def get_incremental_name(self, name=None):
         if name is None:
             name = "default"
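The `__post_init__` hook above derives the TaylorSeer settings from the DBCache configuration when they are not given explicitly. A minimal standalone sketch of that defaulting rule, using a plain dict in place of the real dataclass (the value passed below is illustrative):

```python
# Sketch of the defaulting logic added in __post_init__ (illustrative only).
def resolve_taylorseer_kwargs(taylorseer_kwargs: dict, warmup_steps: int) -> dict:
    kwargs = dict(taylorseer_kwargs)  # do not mutate the caller's dict
    # warmup_steps falls back to the DBCache warmup_steps (at least 1).
    kwargs.setdefault("warmup_steps", warmup_steps if warmup_steps > 0 else 1)
    # n_derivatives is clamped to the 2..3 range, bounded by warmup_steps.
    kwargs.setdefault("n_derivatives", max(2, min(3, kwargs["warmup_steps"])))
    return kwargs

print(resolve_taylorseer_kwargs({}, warmup_steps=8))
# {'warmup_steps': 8, 'n_derivatives': 3}
```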
@@ -68,9 +117,11 @@ class DBCacheContext:
         self.incremental_name_counters[name] += 1
         return f"{name}_{idx}"
 
+    @torch.compiler.disable
     def reset_incremental_names(self):
         self.incremental_name_counters.clear()
 
+    @torch.compiler.disable
     def get_residual_diff_threshold(self):
         if self.enable_alter_cache:
             residual_diff_threshold = self.alter_residual_diff_threshold
@@ -83,25 +134,30 @@ class DBCacheContext:
             residual_diff_threshold = residual_diff_threshold.item()
         return residual_diff_threshold
 
+    @torch.compiler.disable
     def get_buffer(self, name):
         if self.enable_alter_cache and self.is_alter_cache:
             name = f"{name}_alter"
         return self.buffers.get(name)
 
+    @torch.compiler.disable
     def set_buffer(self, name, buffer):
         if self.enable_alter_cache and self.is_alter_cache:
             name = f"{name}_alter"
         self.buffers[name] = buffer
 
+    @torch.compiler.disable
     def remove_buffer(self, name):
         if self.enable_alter_cache and self.is_alter_cache:
             name = f"{name}_alter"
         if name in self.buffers:
             del self.buffers[name]
 
+    @torch.compiler.disable
     def clear_buffers(self):
         self.buffers.clear()
 
+    @torch.compiler.disable
     def mark_step_begin(self):
         if not self.enable_alter_cache:
             self.executed_steps += 1
@@ -116,25 +172,53 @@ class DBCacheContext:
         self.cached_steps.clear()
         self.residual_diffs.clear()
         self.reset_incremental_names()
+        # Reset the TaylorSeers cache at the beginning of each inference.
+        # reset_cache will set the current step to -1 for TaylorSeer,
+        if self.enable_taylorseer or self.enable_encoder_taylorseer:
+            taylorseer, encoder_taylorseer = self.get_taylorseers()
+            if taylorseer is not None:
+                taylorseer.reset_cache()
+            if encoder_taylorseer is not None:
+                encoder_taylorseer.reset_cache()
+
+        # mark_step_begin of TaylorSeer must be called after the cache is reset.
+        if self.enable_taylorseer or self.enable_encoder_taylorseer:
+            taylorseer, encoder_taylorseer = self.get_taylorseers()
+            if taylorseer is not None:
+                taylorseer.mark_step_begin()
+            if encoder_taylorseer is not None:
+                encoder_taylorseer.mark_step_begin()
 
+    @torch.compiler.disable
+    def get_taylorseers(self):
+        if self.enable_alter_cache and self.is_alter_cache:
+            return self.alter_taylorseer, self.alter_encoder_taylorseer
+        return self.taylorseer, self.encoder_tarlorseer
+
+    @torch.compiler.disable
     def add_residual_diff(self, diff):
         step = str(self.get_current_step())
         if step not in self.residual_diffs:
             # Only add the diff if it is not already recorded for this step
             self.residual_diffs[step] = diff
 
+    @torch.compiler.disable
     def get_residual_diffs(self):
         return self.residual_diffs.copy()
 
+    @torch.compiler.disable
     def add_cached_step(self):
         self.cached_steps.append(self.get_current_step())
 
+    @torch.compiler.disable
     def get_cached_steps(self):
         return self.cached_steps.copy()
 
+    @torch.compiler.disable
     def get_current_step(self):
         return self.executed_steps - 1
 
+    @torch.compiler.disable
     def is_in_warmup(self):
         return self.get_current_step() < self.warmup_steps
 
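`mark_step_begin` ties the TaylorSeer lifecycle to the denoising loop: the predictor is reset once per inference run and then advanced once per step before any cache lookup. A minimal sketch of that call order, using a stand-in TaylorSeer-like object (the `seer` name and step count are illustrative):

```python
# Illustrative call order per inference run (assumes a TaylorSeer-like object `seer`).
def run_inference(seer, num_steps: int = 28) -> None:
    seer.reset_cache()            # once per inference: current_step goes back to -1
    for _ in range(num_steps):
        seer.mark_step_begin()    # advance the step counter before any cache access
        ...                       # compute or approximate the features for this step
```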
@@ -216,6 +300,50 @@ def get_residual_diffs():
     return cache_context.get_residual_diffs()
 
 
+@torch.compiler.disable
+def is_taylorseer_enabled():
+    cache_context = get_current_cache_context()
+    assert cache_context is not None, "cache_context must be set before"
+    return cache_context.enable_taylorseer
+
+
+@torch.compiler.disable
+def is_encoder_taylorseer_enabled():
+    cache_context = get_current_cache_context()
+    assert cache_context is not None, "cache_context must be set before"
+    return cache_context.enable_encoder_taylorseer
+
+
+@torch.compiler.disable
+def get_taylorseers():
+    cache_context = get_current_cache_context()
+    assert cache_context is not None, "cache_context must be set before"
+    return cache_context.get_taylorseers()
+
+
+@torch.compiler.disable
+def is_taylorseer_cache_residual():
+    cache_context = get_current_cache_context()
+    assert cache_context is not None, "cache_context must be set before"
+    return cache_context.taylorseer_cache_type == "residual"
+
+
+@torch.compiler.disable
+def is_cache_residual():
+    if is_taylorseer_enabled():
+        # residual or hidden_states
+        return is_taylorseer_cache_residual()
+    return True
+
+
+@torch.compiler.disable
+def is_encoder_cache_residual():
+    if is_encoder_taylorseer_enabled():
+        # residual or hidden_states
+        return is_taylorseer_cache_residual()
+    return True
+
+
 @torch.compiler.disable
 def is_alter_cache_enabled():
     cache_context = get_current_cache_context()
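The helpers above decide whether the Bn cache stores residuals or full hidden states: plain DBCache always caches residuals, while TaylorSeer makes the choice configurable through `taylorseer_cache_type`. A condensed standalone restatement of the `is_cache_residual()` decision (illustrative only):

```python
# Standalone restatement of the is_cache_residual() decision (illustrative).
def cache_residual(enable_taylorseer: bool, taylorseer_cache_type: str) -> bool:
    if enable_taylorseer:
        # With TaylorSeer the cache type is configurable.
        return taylorseer_cache_type == "residual"
    # Without TaylorSeer, DBCache always caches residuals.
    return True

assert cache_residual(False, "hidden_states") is True
assert cache_residual(True, "hidden_states") is False
assert cache_residual(True, "residual") is True
```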
@@ -367,16 +495,21 @@ def collect_cache_kwargs(default_attrs: dict, **kwargs):
         for attr in cache_attrs
     }
 
+    def _safe_set_sequence_field(
+        field_name: str,
+        default_value: Any = None,
+    ):
+        if field_name not in cache_kwargs:
+            cache_kwargs[field_name] = kwargs.pop(
+                field_name,
+                default_value,
+            )
+
     # Manually set sequence fields, namely, Fn_compute_blocks_ids
     # and Bn_compute_blocks_ids, which are lists or sets.
-    cache_kwargs["Fn_compute_blocks_ids"] = kwargs.pop(
-        "Fn_compute_blocks_ids",
-        [],
-    )
-    cache_kwargs["Bn_compute_blocks_ids"] = kwargs.pop(
-        "Bn_compute_blocks_ids",
-        [],
-    )
+    _safe_set_sequence_field("Fn_compute_blocks_ids", [])
+    _safe_set_sequence_field("Bn_compute_blocks_ids", [])
+    _safe_set_sequence_field("taylorseer_kwargs", {})
 
     assert default_attrs is not None, "default_attrs must be set before"
     for attr in cache_attrs:
@@ -471,6 +604,7 @@ def are_two_tensors_similar(
 @torch.compiler.disable
 def set_Fn_buffer(buffer: torch.Tensor, prefix: str = "Fn"):
     # Set hidden_states or residual for Fn blocks.
+    # This buffer is only use for L1 diff calculation.
     downsample_factor = get_downsample_factor()
     if downsample_factor > 1:
         buffer = buffer[..., ::downsample_factor]
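The Fn buffer feeds the residual-diff check that decides whether a step can be cached. The exact metric lives in `are_two_tensors_similar`, which this diff does not expand, so the mean-relative-L1 formulation below is given purely as an illustrative assumption:

```python
import torch

# Illustrative relative L1 diff between the previous and current Fn residuals.
# This is an assumption about the metric; see are_two_tensors_similar in the
# package for the authoritative implementation.
def relative_l1_diff(prev: torch.Tensor, curr: torch.Tensor) -> float:
    return ((curr - prev).abs().mean() / (prev.abs().mean() + 1e-6)).item()

prev = torch.randn(2, 16, 64)
curr = prev + 0.01 * torch.randn_like(prev)
print(relative_l1_diff(prev, curr))  # small value -> step could be cached
```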
@@ -497,22 +631,79 @@ def get_Fn_encoder_buffer(prefix: str = "Fn"):
 @torch.compiler.disable
 def set_Bn_buffer(buffer: torch.Tensor, prefix: str = "Bn"):
     # Set hidden_states or residual for Bn blocks.
-    set_buffer(f"{prefix}_buffer", buffer)
+    # This buffer is use for hidden states approximation.
+    if is_taylorseer_enabled():
+        # taylorseer, encoder_taylorseer
+        taylorseer, _ = get_taylorseers()
+        if taylorseer is not None:
+            # Use TaylorSeer to update the buffer
+            taylorseer.update(buffer)
+        else:
+            if logger.isEnabledFor(logging.DEBUG):
+                logger.debug(
+                    "TaylorSeer is enabled but not set in the cache context. "
+                    "Falling back to default buffer retrieval."
+                )
+            set_buffer(f"{prefix}_buffer", buffer)
+    else:
+        set_buffer(f"{prefix}_buffer", buffer)
 
 
 @torch.compiler.disable
 def get_Bn_buffer(prefix: str = "Bn"):
-    return get_buffer(f"{prefix}_buffer")
+    if is_taylorseer_enabled():
+        taylorseer, _ = get_taylorseers()
+        if taylorseer is not None:
+            return taylorseer.approximate_value()
+        else:
+            if logger.isEnabledFor(logging.DEBUG):
+                logger.debug(
+                    "TaylorSeer is enabled but not set in the cache context. "
+                    "Falling back to default buffer retrieval."
+                )
+            # Fallback to default buffer retrieval
+            return get_buffer(f"{prefix}_buffer")
+    else:
+        return get_buffer(f"{prefix}_buffer")
 
 
 @torch.compiler.disable
 def set_Bn_encoder_buffer(buffer: torch.Tensor, prefix: str = "Bn"):
-    set_buffer(f"{prefix}_encoder_buffer", buffer)
+    # This buffer is use for encoder hidden states approximation.
+    if is_encoder_taylorseer_enabled():
+        # taylorseer, encoder_taylorseer
+        _, encoder_taylorseer = get_taylorseers()
+        if encoder_taylorseer is not None:
+            # Use TaylorSeer to update the buffer
+            encoder_taylorseer.update(buffer)
+        else:
+            if logger.isEnabledFor(logging.DEBUG):
+                logger.debug(
+                    "TaylorSeer is enabled but not set in the cache context. "
+                    "Falling back to default buffer retrieval."
+                )
+            set_buffer(f"{prefix}_encoder_buffer", buffer)
+    else:
+        set_buffer(f"{prefix}_encoder_buffer", buffer)
 
 
 @torch.compiler.disable
 def get_Bn_encoder_buffer(prefix: str = "Bn"):
-    return get_buffer(f"{prefix}_encoder_buffer")
+    if is_encoder_taylorseer_enabled():
+        _, encoder_taylorseer = get_taylorseers()
+        if encoder_taylorseer is not None:
+            # Use TaylorSeer to approximate the value
+            return encoder_taylorseer.approximate_value()
+        else:
+            if logger.isEnabledFor(logging.DEBUG):
+                logger.debug(
+                    "TaylorSeer is enabled but not set in the cache context. "
+                    "Falling back to default buffer retrieval."
+                )
+            # Fallback to default buffer retrieval
+            return get_buffer(f"{prefix}_encoder_buffer")
+    else:
+        return get_buffer(f"{prefix}_encoder_buffer")
 
 
 @torch.compiler.disable
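With TaylorSeer enabled, `set_Bn_buffer` feeds every fully computed Bn state into `TaylorSeer.update`, and `get_Bn_buffer` answers cached steps from `TaylorSeer.approximate_value` instead of a stored tensor. A minimal sketch of that write/read pattern (the `seer` object, shapes and step split are illustrative):

```python
import torch

# Illustrative compute-vs-cache loop around a TaylorSeer-like predictor `seer`.
def run_bn_blocks(seer, num_steps: int, cached_steps: set) -> list:
    outputs = []
    for step in range(num_steps):
        seer.mark_step_begin()
        if step in cached_steps:
            hidden = seer.approximate_value()   # cached step: Taylor extrapolation
        else:
            hidden = torch.randn(2, 16, 64)     # stand-in for the real Bn output
            seer.update(hidden)                 # compute step: refresh derivatives
        outputs.append(hidden)
    return outputs
```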
@@ -520,29 +711,38 @@ def apply_hidden_states_residual(
     hidden_states: torch.Tensor,
     encoder_hidden_states: torch.Tensor,
     prefix: str = "Bn",
+    encoder_prefix: str = "Bn_encoder",
 ):
     # Allow Bn and Fn prefix to be used for residual cache.
     if "Bn" in prefix:
-        hidden_states_residual = get_Bn_buffer(prefix)
+        hidden_states_prev = get_Bn_buffer(prefix)
     else:
-        hidden_states_residual = get_Fn_buffer(prefix)
+        hidden_states_prev = get_Fn_buffer(prefix)
 
-    assert (
-        hidden_states_residual is not None
-    ), f"{prefix}_buffer must be set before"
-    hidden_states = hidden_states_residual + hidden_states
+    assert hidden_states_prev is not None, f"{prefix}_buffer must be set before"
 
-    if "Bn" in prefix:
-        encoder_hidden_states_residual = get_Bn_encoder_buffer(prefix)
+    if is_cache_residual():
+        hidden_states = hidden_states_prev + hidden_states
     else:
-        encoder_hidden_states_residual = get_Fn_encoder_buffer(prefix)
+        # If cache is not residual, we use the hidden states directly
+        hidden_states = hidden_states_prev
+
+    if "Bn" in encoder_prefix:
+        encoder_hidden_states_prev = get_Bn_encoder_buffer(encoder_prefix)
+    else:
+        encoder_hidden_states_prev = get_Fn_encoder_buffer(encoder_prefix)
 
     assert (
-        encoder_hidden_states_residual is not None
+        encoder_hidden_states_prev is not None
     ), f"{prefix}_encoder_buffer must be set before"
-    encoder_hidden_states = (
-        encoder_hidden_states_residual + encoder_hidden_states
-    )
+
+    if is_encoder_cache_residual():
+        encoder_hidden_states = (
+            encoder_hidden_states_prev + encoder_hidden_states
+        )
+    else:
+        # If encoder cache is not residual, we use the encoder hidden states directly
+        encoder_hidden_states = encoder_hidden_states_prev
 
     hidden_states = hidden_states.contiguous()
     encoder_hidden_states = encoder_hidden_states.contiguous()
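`apply_hidden_states_residual` now covers both cache types: a residual cache is added onto the freshly computed Fn output, while a hidden-states cache (the TaylorSeer path) replaces it outright. A toy sketch of the two branches (tensors and the `residual` flag are illustrative):

```python
import torch

# Toy illustration of the two cache-application modes used above.
def apply_cached(hidden_states: torch.Tensor,
                 cached: torch.Tensor,
                 residual: bool) -> torch.Tensor:
    if residual:
        # Residual cache: add the cached delta onto the current Fn output.
        return (cached + hidden_states).contiguous()
    # Hidden-states cache (TaylorSeer): use the approximated states directly.
    return cached.contiguous()

x = torch.randn(2, 16, 64)
delta = torch.randn_like(x)
assert apply_cached(x, delta, residual=True).shape == x.shape
```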
@@ -674,11 +874,22 @@ class DBCachedTransformerBlocks(torch.nn.Module):
 
         torch._dynamo.graph_break()
         if can_use_cache:
+            torch._dynamo.graph_break()
             add_cached_step()
             del Fn_hidden_states_residual
             hidden_states, encoder_hidden_states = apply_hidden_states_residual(
-                hidden_states,
+                hidden_states,
+                encoder_hidden_states,
+                prefix=(
+                    "Bn_residual" if is_cache_residual() else "Bn_hidden_states"
+                ),
+                encoder_prefix=(
+                    "Bn_residual"
+                    if is_encoder_cache_residual()
+                    else "Bn_hidden_states"
+                ),
             )
+            torch._dynamo.graph_break()
             # Call last `n` blocks to further process the hidden states
             # for higher precision.
             hidden_states, encoder_hidden_states = (
@@ -690,26 +901,48 @@ class DBCachedTransformerBlocks(torch.nn.Module):
                 )
             )
         else:
+            torch._dynamo.graph_break()
             set_Fn_buffer(Fn_hidden_states_residual, prefix="Fn_residual")
             if is_l1_diff_enabled():
                 # for hidden states L1 diff
                 set_Fn_buffer(hidden_states, "Fn_hidden_states")
             del Fn_hidden_states_residual
+            torch._dynamo.graph_break()
             (
                 hidden_states,
                 encoder_hidden_states,
                 hidden_states_residual,
                 encoder_hidden_states_residual,
-            ) = self.
+            ) = self.call_Mn_transformer_blocks(  # middle
                 hidden_states,
                 encoder_hidden_states,
                 *args,
                 **kwargs,
             )
-
-
-
-
+            torch._dynamo.graph_break()
+            if is_cache_residual():
+                set_Bn_buffer(
+                    hidden_states_residual,
+                    prefix="Bn_residual",
+                )
+            else:
+                # TaylorSeer
+                set_Bn_buffer(
+                    hidden_states,
+                    prefix="Bn_hidden_states",
+                )
+            if is_encoder_cache_residual():
+                set_Bn_encoder_buffer(
+                    encoder_hidden_states_residual,
+                    prefix="Bn_residual",
+                )
+            else:
+                # TaylorSeer
+                set_Bn_encoder_buffer(
+                    encoder_hidden_states,
+                    prefix="Bn_hidden_states",
+                )
+            torch._dynamo.graph_break()
             # Call last `n` blocks to further process the hidden states
             # for higher precision.
             hidden_states, encoder_hidden_states = (
@@ -759,45 +992,35 @@ class DBCachedTransformerBlocks(torch.nn.Module):
         selected_Fn_transformer_blocks = self.transformer_blocks[
             : Fn_compute_blocks()
         ]
-        # Skip the blocks if they are not in the Fn_compute_blocks_ids.
-        # WARN: DON'T set len(Fn_compute_blocks_ids) > 0 NOW, still have
-        # some precision issues. We don't know whether a step should be
-        # cached or not before the first Fn blocks are processed.
-        if len(Fn_compute_blocks_ids()) > 0:
-            selected_Fn_transformer_blocks = [
-                selected_Fn_transformer_blocks[i]
-                for i in Fn_compute_blocks_ids()
-                if i < len(selected_Fn_transformer_blocks)
-            ]
         return selected_Fn_transformer_blocks
 
     @torch.compiler.disable
-    def
+    def _Mn_single_transformer_blocks(self):  # middle blocks
         # M(N-2n): transformer_blocks [n,...] + single_transformer_blocks [0,...,N-n]
-
+        selected_Mn_single_transformer_blocks = []
         if self.single_transformer_blocks is not None:
             if Bn_compute_blocks() == 0:  # WARN: x[:-0] = []
-
+                selected_Mn_single_transformer_blocks = (
                     self.single_transformer_blocks
                 )
             else:
-
+                selected_Mn_single_transformer_blocks = (
                     self.single_transformer_blocks[: -Bn_compute_blocks()]
                 )
-        return
+        return selected_Mn_single_transformer_blocks
 
     @torch.compiler.disable
-    def
+    def _Mn_transformer_blocks(self):  # middle blocks
         # M(N-2n): only transformer_blocks [n,...,N-n], middle
         if Bn_compute_blocks() == 0:  # WARN: x[:-0] = []
-
+            selected_Mn_transformer_blocks = self.transformer_blocks[
                 Fn_compute_blocks() :
             ]
         else:
-
+            selected_Mn_transformer_blocks = self.transformer_blocks[
                 Fn_compute_blocks() : -Bn_compute_blocks()
             ]
-        return
+        return selected_Mn_transformer_blocks
 
     @torch.compiler.disable
     def _Bn_single_transformer_blocks(self):
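The helpers above split the transformer stack into the first Fn blocks, the middle Mn blocks and the last Bn blocks. A compact sketch of that partition on a plain list (block counts chosen for illustration, using a 19-block stack):

```python
# Illustrative Fn / Mn / Bn partition of a block list, mirroring the slicing above.
def partition_blocks(blocks: list, fn: int, bn: int):
    fn_blocks = blocks[:fn]
    mn_blocks = blocks[fn:] if bn == 0 else blocks[fn:-bn]  # note: x[:-0] == []
    bn_blocks = blocks[-bn:] if bn > 0 else []
    return fn_blocks, mn_blocks, bn_blocks

blocks = list(range(19))
fn_blocks, mn_blocks, bn_blocks = partition_blocks(blocks, fn=8, bn=8)
assert len(fn_blocks) == 8 and len(mn_blocks) == 3 and len(bn_blocks) == 8
```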
@@ -845,7 +1068,7 @@ class DBCachedTransformerBlocks(torch.nn.Module):
 
         return hidden_states, encoder_hidden_states
 
-    def
+    def call_Mn_transformer_blocks(
         self,
         hidden_states: torch.Tensor,
         encoder_hidden_states: torch.Tensor,
@@ -873,7 +1096,7 @@ class DBCachedTransformerBlocks(torch.nn.Module):
             hidden_states = torch.cat(
                 [encoder_hidden_states, hidden_states], dim=1
             )
-            for block in self.
+            for block in self._Mn_single_transformer_blocks():
                 hidden_states = block(
                     hidden_states,
                     *args,
@@ -887,7 +1110,7 @@ class DBCachedTransformerBlocks(torch.nn.Module):
                 dim=1,
             )
         else:
-            for block in self.
+            for block in self._Mn_transformer_blocks():
                 hidden_states = block(
                     hidden_states,
                     encoder_hidden_states,
@@ -1016,7 +1239,9 @@ class DBCachedTransformerBlocks(torch.nn.Module):
 
     def _compute_and_cache_single_transformer_block(
         self,
-
+        # Block index in the transformer blocks
+        # Bn: 8, block_id should be in [0, 8)
+        block_id: int,
         # Helper inputs for hidden states split and reshape
         original_hidden_states: torch.Tensor,
         original_encoder_hidden_states: torch.Tensor,
@@ -1042,7 +1267,7 @@ class DBCachedTransformerBlocks(torch.nn.Module):
             )
             # Cache residuals for the non-compute Bn blocks for
             # subsequent cache steps.
-            if
+            if block_id not in Bn_compute_blocks_ids():
                 Bn_i_hidden_states = hidden_states
                 (
                     Bn_i_hidden_states_residual,
@@ -1057,16 +1282,20 @@ class DBCachedTransformerBlocks(torch.nn.Module):
                 # Save original_hidden_states for diff calculation.
                 set_Bn_buffer(
                     Bn_i_original_hidden_states,
-                    prefix=f"Bn_{
+                    prefix=f"Bn_{block_id}_single_original",
+                )
+                set_Bn_encoder_buffer(
+                    Bn_i_original_hidden_states,
+                    prefix=f"Bn_{block_id}_single_original",
                 )
 
                 set_Bn_buffer(
                     Bn_i_hidden_states_residual,
-                    prefix=f"Bn_{
+                    prefix=f"Bn_{block_id}_single_residual",
                 )
                 set_Bn_encoder_buffer(
                     Bn_i_encoder_hidden_states_residual,
-                    prefix=f"Bn_{
+                    prefix=f"Bn_{block_id}_single_residual",
                 )
                 del Bn_i_hidden_states
                 del Bn_i_hidden_states_residual
@@ -1077,7 +1306,7 @@ class DBCachedTransformerBlocks(torch.nn.Module):
         else:
             # Cache steps: Reuse the cached residuals.
             # Check if the block is in the Bn_compute_blocks_ids.
-            if
+            if block_id in Bn_compute_blocks_ids():
                 hidden_states = block(
                     hidden_states,
                     *args,
@@ -1091,7 +1320,7 @@ class DBCachedTransformerBlocks(torch.nn.Module):
                     hidden_states,  # curr step
                     parallelized=self._is_parallelized(),
                     threshold=non_compute_blocks_diff_threshold(),
-                    prefix=f"Bn_{
+                    prefix=f"Bn_{block_id}_single_original",  # prev step
                 ):
                     Bn_i_original_hidden_states = hidden_states
                     (
@@ -1106,7 +1335,16 @@ class DBCachedTransformerBlocks(torch.nn.Module):
                         apply_hidden_states_residual(
                             Bn_i_original_hidden_states,
                             Bn_i_original_encoder_hidden_states,
-                            prefix=
+                            prefix=(
+                                f"Bn_{block_id}_single_residual"
+                                if is_cache_residual()
+                                else f"Bn_{block_id}_single_original"
+                            ),
+                            encoder_prefix=(
+                                f"Bn_{block_id}_single_residual"
+                                if is_encoder_cache_residual()
+                                else f"Bn_{block_id}_single_original"
+                            ),
                         )
                     )
                     hidden_states = torch.cat(
@@ -1125,7 +1363,9 @@ class DBCachedTransformerBlocks(torch.nn.Module):
 
     def _compute_and_cache_transformer_block(
         self,
-
+        # Block index in the transformer blocks
+        # Bn: 8, block_id should be in [0, 8)
+        block_id: int,
         # Below are the inputs to the block
         block,  # The transformer block to be executed
         hidden_states: torch.Tensor,
@@ -1158,7 +1398,7 @@ class DBCachedTransformerBlocks(torch.nn.Module):
             )
             # Cache residuals for the non-compute Bn blocks for
             # subsequent cache steps.
-            if
+            if block_id not in Bn_compute_blocks_ids():
                 Bn_i_hidden_states_residual = (
                     hidden_states - Bn_i_original_hidden_states
                 )
@@ -1169,16 +1409,20 @@ class DBCachedTransformerBlocks(torch.nn.Module):
                 # Save original_hidden_states for diff calculation.
                 set_Bn_buffer(
                     Bn_i_original_hidden_states,
-                    prefix=f"Bn_{
+                    prefix=f"Bn_{block_id}_original",
+                )
+                set_Bn_encoder_buffer(
+                    Bn_i_original_encoder_hidden_states,
+                    prefix=f"Bn_{block_id}_original",
                 )
 
                 set_Bn_buffer(
                     Bn_i_hidden_states_residual,
-                    prefix=f"Bn_{
+                    prefix=f"Bn_{block_id}_residual",
                 )
                 set_Bn_encoder_buffer(
                     Bn_i_encoder_hidden_states_residual,
-                    prefix=f"Bn_{
+                    prefix=f"Bn_{block_id}_residual",
                 )
                 del Bn_i_hidden_states_residual
                 del Bn_i_encoder_hidden_states_residual
@@ -1189,7 +1433,7 @@ class DBCachedTransformerBlocks(torch.nn.Module):
         else:
             # Cache steps: Reuse the cached residuals.
             # Check if the block is in the Bn_compute_blocks_ids.
-            if
+            if block_id in Bn_compute_blocks_ids():
                 hidden_states = block(
                     hidden_states,
                     encoder_hidden_states,
@@ -1211,13 +1455,22 @@ class DBCachedTransformerBlocks(torch.nn.Module):
                     hidden_states,  # curr step
                     parallelized=self._is_parallelized(),
                     threshold=non_compute_blocks_diff_threshold(),
-                    prefix=f"Bn_{
+                    prefix=f"Bn_{block_id}_original",  # prev step
                 ):
                     hidden_states, encoder_hidden_states = (
                         apply_hidden_states_residual(
                             hidden_states,
                             encoder_hidden_states,
-                            prefix=
+                            prefix=(
+                                f"Bn_{block_id}_residual"
+                                if is_cache_residual()
+                                else f"Bn_{block_id}_original"
+                            ),
+                            encoder_prefix=(
+                                f"Bn_{block_id}_residual"
+                                if is_encoder_cache_residual()
+                                else f"Bn_{block_id}_original"
+                            ),
                         )
                     )
                 else:
cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/wan.py  CHANGED

@@ -1,5 +1,3 @@
-# Adapted from: https://github.com/chengzeyi/ParaAttention/tree/main/src/para_attn/first_block_cache/wan.py
-
 import functools
 import unittest
 
@@ -56,7 +54,7 @@ def apply_cache_on_pipe(
     shallow_patch: bool = False,
     residual_diff_threshold=0.03,
     downsample_factor=1,
-    # SLG is not supported in WAN with
+    # SLG is not supported in WAN with DBPrune yet
     # slg_layers=None,
     # slg_start: float = 0.0,
     # slg_end: float = 0.1,
cache_dit/cache_factory/first_block_cache/cache_context.py  CHANGED

@@ -370,6 +370,9 @@ def apply_prev_hidden_states_residual(
         hidden_states = hidden_states_residual + hidden_states
 
         hidden_states = hidden_states.contiguous()
+        # NOTE: We should also support taylorseer for
+        # encoder_hidden_states approximation. Please
+        # use DBCache instead.
     else:
         hidden_states_residual = get_hidden_states_residual()
         assert (
cache_dit/cache_factory/taylorseer.py  CHANGED

@@ -1,5 +1,6 @@
 # Adapted from: https://github.com/chengzeyi/ParaAttention/blob/main/src/para_attn/first_block_cache/taylorseer.py
 import math
+import torch
 
 
 class TaylorSeer:
@@ -17,6 +18,7 @@ class TaylorSeer:
         self.compute_step_map = compute_step_map
         self.reset_cache()
 
+    @torch.compiler.disable
     def reset_cache(self):
         self.state = {
             "dY_prev": [None] * self.ORDER,
@@ -25,6 +27,7 @@ class TaylorSeer:
         self.current_step = -1
         self.last_non_approximated_step = -1
 
+    @torch.compiler.disable
    def should_compute_full(self, step=None):
         step = self.current_step if step is None else step
         if self.compute_step_map is not None:
@@ -36,7 +39,15 @@ class TaylorSeer:
                 return True
         return False
 
+    @torch.compiler.disable
     def approximate_derivative(self, Y):
+        # n-th order Taylor expansion:
+        # Y(t) = Y(0) + dY(0)/dt * t + d^2Y(0)/dt^2 * t^2 / 2!
+        #        + ... + d^nY(0)/dt^n * t^n / n!
+        # reference: https://github.com/Shenyi-Z/TaylorSeer
+        # TaylorSeer-FLUX/src/flux/taylor_utils/__init__.py
+        # TODO: Custom Triton/CUDA kernel for better performance,
+        # especially for large n_derivatives.
         dY_current = [None] * self.ORDER
         dY_current[0] = Y
         window = self.current_step - self.last_non_approximated_step
@@ -49,7 +60,10 @@ class TaylorSeer:
                 break
         return dY_current
 
+    @torch.compiler.disable
     def approximate_value(self):
+        # TODO: Custom Triton/CUDA kernel for better performance,
+        # especially for large n_derivatives.
         elapsed = self.current_step - self.last_non_approximated_step
         output = 0
         for i, derivative in enumerate(self.state["dY_current"]):
@@ -59,14 +73,30 @@ class TaylorSeer:
                 break
         return output
 
+    @torch.compiler.disable
     def mark_step_begin(self):
         self.current_step += 1
 
+    @torch.compiler.disable
     def update(self, Y):
+        # Directly call this method will ingnore the warmup
+        # policy and force full computation.
+        # Assume warmup steps is 3, and n_derivatives is 3.
+        # step 0: dY_prev    = [None, None,   None,    None   ]
+        #         dY_current = [Y0,   None,   None,    None   ]
+        # step 1: dY_prev    = [Y0,   None,   None,    None   ]
+        #         dY_current = [Y1,   dY1,    None,    None   ]
+        # step 2: dY_prev    = [Y1,   dY1,    None,    None   ]
+        #         dY_current = [Y2,   dY2/Y1, dY2/dY1, None   ]
+        # step 3: dY_prev    = [Y2,   dY2/Y1, dY2/dY1, None   ],
+        #         dY_current = [Y3,   dY3/Y2, dY3/dY2, dY3/dY1]
+        # step 4: dY_prev    = [Y3,   dY3/Y2, dY3/dY2, dY3/dY1]
+        #         dY_current = [Y4,   dY4/Y3, dY4/dY3, dY4/dY2]
         self.state["dY_prev"] = self.state["dY_current"]
         self.state["dY_current"] = self.approximate_derivative(Y)
         self.last_non_approximated_step = self.current_step
 
+    @torch.compiler.disable
     def step(self, Y):
         self.mark_step_begin()
         if self.should_compute_full():
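The `update`/`approximate_value` pair implements a finite-difference Taylor predictor: derivatives are rebuilt from consecutive fully computed features, and skipped steps are extrapolated with the truncated series. A scalar toy sketch of the same idea (values, order and step spacing are illustrative, not the class's exact bookkeeping):

```python
# Toy first-order Taylor extrapolation from finite differences (illustrative).
def taylor_extrapolate(y_prev: float, y_curr: float, window: int, elapsed: int) -> float:
    dy = (y_curr - y_prev) / window   # finite-difference derivative estimate
    return y_curr + dy * elapsed      # truncated Taylor series: Y + dY * t

# Fully computed at two steps, then predict a later step without computing it.
y0, y1 = 1.00, 1.10
print(taylor_extrapolate(y0, y1, window=1, elapsed=2))  # -> 1.30
```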
{cache_dit-0.2.0.dist-info → cache_dit-0.2.2.dist-info}/METADATA  CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cache_dit
-Version: 0.2.0
+Version: 0.2.2
 Summary: 🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration Toolbox for Diffusion Transformers
 Author: DefTruth, vipshop.com, etc.
 Maintainer: DefTruth, vipshop.com, etc
@@ -37,40 +37,31 @@ Dynamic: requires-python
 <p align="center">
 <h2>🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration <br>Toolbox for Diffusion Transformers</h2>
 </p>
-<img src=https://github.com/vipshop/cache-dit/raw/main/assets/cache-dit.png >
+<img src=https://github.com/vipshop/cache-dit/raw/main/assets/cache-dit-v1.png >
 <div align='center'>
 <img src=https://img.shields.io/badge/Language-Python-brightgreen.svg >
 <img src=https://img.shields.io/badge/PRs-welcome-9cf.svg >
 <img src=https://img.shields.io/badge/PyPI-pass-brightgreen.svg >
 <img src=https://static.pepy.tech/badge/cache-dit >
 <img src=https://img.shields.io/badge/Python-3.10|3.11|3.12-9cf.svg >
-<img src=https://img.shields.io/badge/Release-v0.2.0-brightgreen.svg >
+<img src=https://img.shields.io/badge/Release-v0.2.2-brightgreen.svg >
 </div>
 <p align="center">
-DeepCache is for UNet not DiT. Most DiT cache speedups are complex and not training-free. CacheDiT <br>
-</p>
-<p align="center">
-<h4> 🔥Supported Models🔥</h4>
-<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀FLUX.1</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
-<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀Mochi</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
-<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀CogVideoX</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
-<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀CogVideoX1.5</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
-<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀Wan2.1</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
-<a href=https://github.com/vipshop/cache-dit/raw/main/examples> <b>🚀HunyuanVideo</b>: ✔️DBCache, ✔️DBPrune, ✔️FBCache🔥</a> <br>
+DeepCache is for UNet not DiT. Most DiT cache speedups are complex and not training-free. CacheDiT offers <br>a set of training-free cache accelerators for DiT: <b>🔥<a href="#dbcache">DBCache</a>, <a href="#dbprune">DBPrune</a>, <a href="#taylorseer">TaylorSeer</a>, <a href="#fbcache">FBCache</a></b>, etc🔥
 </p>
 </div>
 
-
-
-<
-
-
+<div align="center">
+<p align="center">
+<b>♥️ Please consider to leave a ⭐️ Star to support us ~ ♥️</b>
+</p>
+</div>
 
 ## 🤗 Introduction
 
 <div align="center">
 <p align="center">
-<h3>🔥
+<h3>🔥DBCache: Dual Block Caching for Diffusion Transformers</h3>
 </p>
 </div>
 
@@ -86,9 +77,9 @@ The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi
 |:---:|:---:|:---:|:---:|:---:|:---:|
 |24.85s|15.59s|8.58s|15.41s|15.11s|17.74s|
 |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.08_S11.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.2_S19.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F8B8S1_R0.15_S15.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F12B12S4_R0.2_S16.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F16B16S4_R0.2_S13.png width=105px>|
-|**Baseline(L20x1)**|**F1B0 (0.08)**|**F8B8 (0.12)**|**F8B12 (0.
+|**Baseline(L20x1)**|**F1B0 (0.08)**|**F8B8 (0.12)**|**F8B12 (0.12)**|**F8B16 (0.20)**|**F8B20 (0.20)**|
 |27.85s|6.04s|5.88s|5.77s|6.01s|6.20s|
-|<img src=https://github.com/
+|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_NONE_R0.08.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F1B0_R0.08.png width=105px> |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B8_R0.12.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B12_R0.12.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B16_R0.2.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/TEXTURE_DBCACHE_F8B20_R0.2.png width=105px>|
 
 <div align="center">
 <p align="center">
@@ -100,7 +91,7 @@ These case studies demonstrate that even with relatively high thresholds (such a
 
 <div align="center">
 <p align="center">
-<h3>🔥
+<h3>🔥DBPrune: Dynamic Block Prune with Residual Caching</h3>
 </p>
 </div>
 
@@ -119,11 +110,11 @@ These case studies demonstrate that even with relatively high thresholds (such a
 
 <div align="center">
 <p align="center">
-<h3>🔥
+<h3>🔥Context Parallelism and Torch Compile</h3>
 </p>
 </div>
 
-Moreover, **CacheDiT** are **plug-and-play** solutions that works hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can easily tap into its **Context Parallelism** features for distributed inference.
+Moreover, **CacheDiT** are **plug-and-play** solutions that works hand-in-hand with [ParaAttention](https://github.com/chengzeyi/ParaAttention). Users can easily tap into its **Context Parallelism** features for distributed inference. CacheDiT is designed to work compatibly with **torch.compile.** You can easily use CacheDiT with torch.compile to further achieve a better performance.
 
 <div align="center">
 <p align="center">
@@ -137,12 +128,6 @@ Moreover, **CacheDiT** are **plug-and-play** solutions that works hand-in-hand w
 |+L20x4:7.75s|6.62s|6.03s|5.81s|5.24s|3.93s|
 |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_NONE_R0.08_S0_T20.43s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.03_P24.0_T16.25s.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.04_P34.6_T14.12s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.045_P38.2_T13.41s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.055_P45.1_T12.00s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBPRUNE_F1B0_R0.2_P59.5_T8.86s.png width=105px>|
 
-<div align="center">
-<p align="center">
-<b>♥️ Please consider to leave a ⭐️ Star to support us ~ ♥️</b>
-</p>
-</div>
-
 ## ©️Citations
 
 ```BibTeX
@@ -155,12 +140,20 @@ Moreover, **CacheDiT** are **plug-and-play** solutions that works hand-in-hand w
 }
 ```
 
+## 👋Reference
+
+<div id="reference"></div>
+
+The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi/ParaAttention/tree/main/src/para_attn/first_block_cache). Special thanks to their excellent work!
+
 ## 📖Contents
 
 <div id="contents"></div>
 
 - [⚙️Installation](#️installation)
+- [🔥Supported Models](#supported)
 - [⚡️Dual Block Cache](#dbcache)
+- [🔥Hybrid TaylorSeer](#taylorseer)
 - [🎉First Block Cache](#fbcache)
 - [⚡️Dynamic Block Prune](#dbprune)
 - [🎉Context Parallelism](#context-parallelism)
@@ -183,16 +176,31 @@ Or you can install the latest develop version from GitHub:
 pip3 install git+https://github.com/vipshop/cache-dit.git
 ```
 
+## 🔥Supported Models
+
+<div id="supported"></div>
+
+- [🚀FLUX.1](https://github.com/vipshop/cache-dit/raw/main/examples)
+- [🚀Mochi](https://github.com/vipshop/cache-dit/raw/main/examples)
+- [🚀CogVideoX](https://github.com/vipshop/cache-dit/raw/main/examples)
+- [🚀CogVideoX1.5](https://github.com/vipshop/cache-dit/raw/main/examples)
+- [🚀Wan2.1](https://github.com/vipshop/cache-dit/raw/main/examples)
+- [🚀HunyuanVideo](https://github.com/vipshop/cache-dit/raw/main/examples)
+
+
 ## ⚡️DBCache: Dual Block Cache
 
 <div id="dbcache"></div>
 
-
+
 
 **DBCache** provides configurable parameters for custom optimization, enabling a balanced trade-off between performance and precision:
 
 - **Fn**: Specifies that DBCache uses the **first n** Transformer blocks to fit the information at time step t, enabling the calculation of a more stable L1 diff and delivering more accurate information to subsequent blocks.
 - **Bn**: Further fuses approximate information in the **last n** Transformer blocks to enhance prediction accuracy. These blocks act as an auto-scaler for approximate hidden states that use residual cache.
+
+
+
 - **warmup_steps**: (default: 0) DBCache does not apply the caching strategy when the number of running steps is less than or equal to this value, ensuring the model sufficiently learns basic features during warmup.
 - **max_cached_steps**: (default: -1) DBCache disables the caching strategy when the previous cached steps exceed this value to prevent precision degradation.
 - **residual_diff_threshold**: The value of residual diff threshold, a higher value leads to faster performance at the cost of lower precision.
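These parameters are passed to `apply_cache_on_pipe` as a plain options dict; the `apply_cache_on_pipe`/`CacheType` imports and the `**cache_options` call appear as context lines later in this README. The sketch below is a hedged usage example with illustrative F8B8 values, and the exact option key names should be treated as assumptions checked against the package's own snippets:

```python
import torch
from diffusers import FluxPipeline
from cache_dit.cache_factory import apply_cache_on_pipe, CacheType

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

# Illustrative DBCache F8B8 configuration; thresholds and steps are example values.
cache_options = {
    "cache_type": CacheType.DBCache,
    "warmup_steps": 8,
    "max_cached_steps": -1,          # -1 means no limit
    "Fn_compute_blocks": 8,          # Fn: first n blocks
    "Bn_compute_blocks": 8,          # Bn: last n blocks
    "residual_diff_threshold": 0.12,
}

apply_cache_on_pipe(pipe, **cache_options)
image = pipe("A cat holding a sign that says hello world", num_inference_steps=28).images[0]
```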
@@ -248,11 +256,50 @@ cache_options = {
 |24.85s|15.59s|8.58s|15.41s|15.11s|17.74s|
 |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.08_S11.png width=105px> | <img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F1B0S1_R0.2_S19.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F8B8S1_R0.15_S15.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F12B12S4_R0.2_S16.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/DBCACHE_F16B16S4_R0.2_S13.png width=105px>|
 
+## 🔥Hybrid TaylorSeer
+
+<div id="taylorseer"></div>
+
+We have supported the [TaylorSeers: From Reusing to Forecasting: Accelerating Diffusion Models with TaylorSeers](https://arxiv.org/pdf/2503.06923) algorithm to further improve the precision of DBCache in cases where the cached steps are large, namely, **Hybrid TaylorSeer + DBCache**. At timesteps with significant intervals, the feature similarity in diffusion models decreases substantially, significantly harming the generation quality.
+
+$$
+\mathcal{F}\_{\text {pred }, m}\left(x_{t-k}^l\right)=\mathcal{F}\left(x_t^l\right)+\sum_{i=1}^m \frac{\Delta^i \mathcal{F}\left(x_t^l\right)}{i!\cdot N^i}(-k)^i
+$$
+
+**TaylorSeer** employs a differential method to approximate the higher-order derivatives of features and predict features in future timesteps with Taylor series expansion. The TaylorSeer implemented in CacheDiT supports both hidden states and residual cache types. That is $\mathcal{F}\_{\text {pred }, m}\left(x_{t-k}^l\right)$ can be a residual cache or a hidden-state cache.
+
+```python
+cache_options = {
+    # TaylorSeer options
+    "enable_taylorseer": True,
+    "enable_encoder_taylorseer": True,
+    # Taylorseer cache type cache be hidden_states or residual.
+    "taylorseer_cache_type": "residual",
+    # Higher values of n_derivatives will lead to longer
+    # computation time but may improve precision significantly.
+    "taylorseer_kwargs": {
+        "n_derivatives": 2,  # default is 2.
+    },
+    "warmup_steps": 3,  # n_derivatives + 1
+    "residual_diff_threshold": 0.12,
+}
+```
+<div align="center">
+<p align="center">
+<b>DBCache F1B0 + TaylorSeer</b>, L20x1, Steps: 28, <br>"A cat holding a sign that says hello world with complex background"
+</p>
+</div>
+
+|Baseline(L20x1)|F1B0 (0.12)|+TaylorSeer|F1B0 (0.15)|+TaylorSeer|+compile|
+|:---:|:---:|:---:|:---:|:---:|:---:|
+|24.85s|12.85s|12.86s|10.27s|10.28s|8.48s|
+|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.12_S14_T12.85s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.12_S14_T12.86s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.15_S17_T10.27s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T10.28s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T8.48s.png width=105px>|
+
 ## 🎉FBCache: First Block Cache
 
 <div id="fbcache"></div>
 
-
+
 
 **DBCache** is a more general cache algorithm than **FBCache**. When Fn=1 and Bn=0, DBCache behaves identically to FBCache. Therefore, you can either use the original FBCache implementation directly or configure **DBCache** with **F1B0** settings to achieve the same functionality.
@@ -286,7 +333,7 @@ apply_cache_on_pipe(pipe, **cache_options)
 
 <div id="dbprune"></div>
 
-
+
 
 We have further implemented a new **Dynamic Block Prune** algorithm based on **Residual Caching** for Diffusion Transformers, which is referred to as **DBPrune**. DBPrune caches each block's hidden states and residuals, then dynamically prunes blocks during inference by computing the L1 distance between previous hidden states. When a block is pruned, its output is approximated using the cached residuals. DBPrune is currently in the experimental phase, and we kindly invite you to stay tuned for upcoming updates.
@@ -340,6 +387,9 @@ cache_options = {
 apply_cache_on_pipe(pipe, **cache_options)
 ```
 
+> [!Important]
+> Please note that for GPUs with lower VRAM, DBPrune may not be suitable for use on video DiTs, as it caches the hidden states and residuals of each block, leading to higher GPU memory requirements. In such cases, please use DBCache, which only caches the hidden states and residuals of 2 blocks.
+
 <div align="center">
 <p align="center">
 DBPrune, <b> L20x1 </b>, Steps: 28, "A cat holding a sign that says hello world with complex background"
@@ -370,7 +420,7 @@ from para_attn.context_parallel import init_context_parallel_mesh
 from para_attn.context_parallel.diffusers_adapters import parallelize_pipe
 from cache_dit.cache_factory import apply_cache_on_pipe, CacheType
 
-
+# Init distributed process group
 dist.init_process_group()
 torch.cuda.set_device(dist.get_rank())
 
@@ -417,14 +467,16 @@ torch._dynamo.config.recompile_limit = 96  # default is 8
 torch._dynamo.config.accumulated_recompile_limit = 2048  # default is 256
 ```
 
+Please check [bench.py](./bench/bench.py) for more details.
+
 ## 👋Contribute
 <div id="contribute"></div>
 
-How to contribute? Star ⭐️ this repo to support us or check [CONTRIBUTE.md](
+How to contribute? Star ⭐️ this repo to support us or check [CONTRIBUTE.md](https://github.com/vipshop/cache-dit/raw/main/CONTRIBUTE.md).
 
 ## ©️License
 
 <div id="license"></div>
 
 
-We have followed the original License from [ParaAttention](https://github.com/chengzeyi/ParaAttention), please check [LICENSE](
+We have followed the original License from [ParaAttention](https://github.com/chengzeyi/ParaAttention), please check [LICENSE](https://github.com/vipshop/cache-dit/raw/main/LICENSE) for more details.
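The README snippet above raises the torch._dynamo recompile limits so that DBCache's graph breaks do not exhaust them. A minimal, hedged sketch of combining CacheDiT with torch.compile (pipeline choice and option values are illustrative):

```python
import torch
from diffusers import FluxPipeline
from cache_dit.cache_factory import apply_cache_on_pipe, CacheType

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")
apply_cache_on_pipe(pipe, **{"cache_type": CacheType.DBCache, "residual_diff_threshold": 0.12})

# DBCache inserts graph breaks, so raise the recompile limits as the README suggests.
torch._dynamo.config.recompile_limit = 96
torch._dynamo.config.accumulated_recompile_limit = 2048
pipe.transformer = torch.compile(pipe.transformer)

image = pipe("A cat holding a sign that says hello world", num_inference_steps=28).images[0]
```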
{cache_dit-0.2.0.dist-info → cache_dit-0.2.2.dist-info}/RECORD  CHANGED

@@ -1,36 +1,36 @@
 cache_dit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cache_dit/_version.py,sha256=
+cache_dit/_version.py,sha256=OjGGK5TcHVG44Y62aAqeJH4CskkZoY9ydbHOtCDew50,511
 cache_dit/logger.py,sha256=dKfNe_RRk9HJwfgHGeRR1f0LbskJpKdGmISCbL9roQs,3443
 cache_dit/primitives.py,sha256=A2iG9YLot3gOsZSPp-_gyjqjLgJvWQRx8aitD4JQ23Y,3877
 cache_dit/cache_factory/__init__.py,sha256=5RNuhWakvvqrOV4vkqrEBA7d-V1LwcNSsjtW14mkqK8,5255
-cache_dit/cache_factory/taylorseer.py,sha256=
+cache_dit/cache_factory/taylorseer.py,sha256=G3Bu7tPqKP1zt4pzN4gJ1T4VJobyaktC3zNSR1MACqc,4005
 cache_dit/cache_factory/utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cache_dit/cache_factory/dual_block_cache/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cache_dit/cache_factory/dual_block_cache/cache_context.py,sha256=
-cache_dit/cache_factory/dual_block_cache/diffusers_adapters/__init__.py,sha256=
-cache_dit/cache_factory/dual_block_cache/diffusers_adapters/cogvideox.py,sha256=
-cache_dit/cache_factory/dual_block_cache/diffusers_adapters/flux.py,sha256=
-cache_dit/cache_factory/dual_block_cache/diffusers_adapters/hunyuan_video.py,sha256=
-cache_dit/cache_factory/dual_block_cache/diffusers_adapters/mochi.py,sha256=
-cache_dit/cache_factory/dual_block_cache/diffusers_adapters/wan.py,sha256=
+cache_dit/cache_factory/dual_block_cache/cache_context.py,sha256=ip3bY79lEGo2xDcGZuRxDu065q1eYabXLJUV8BsfhvM,59698
+cache_dit/cache_factory/dual_block_cache/diffusers_adapters/__init__.py,sha256=krNAICf-aS3JLmSG8vOB9tpLa04uYRcABsC8PMbVUKY,1870
+cache_dit/cache_factory/dual_block_cache/diffusers_adapters/cogvideox.py,sha256=fibkeU-FHa30BNT-uPV2Eqcd5IRli07EKb25tMDp23c,2270
+cache_dit/cache_factory/dual_block_cache/diffusers_adapters/flux.py,sha256=fddSpTHXU24COMGAY-Z21EmHHAEArZBv_-XLRFD6ADU,2625
+cache_dit/cache_factory/dual_block_cache/diffusers_adapters/hunyuan_video.py,sha256=wcZdBhjUB8WSfz40A268BtSe3nr_hRsIi2BNlg1FHRU,9965
+cache_dit/cache_factory/dual_block_cache/diffusers_adapters/mochi.py,sha256=Cmy0KHRDgwXqtmqfkrr7kw0CP6CmkSnuz29gDHcD6sQ,2262
+cache_dit/cache_factory/dual_block_cache/diffusers_adapters/wan.py,sha256=oOb-NNesuyBAMwpRbFDG93TCwffLO7-TXZ4bvEtvGJc,2604
 cache_dit/cache_factory/dynamic_block_prune/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cache_dit/cache_factory/dynamic_block_prune/prune_context.py,sha256=YRDwZ_16yjThpgVgDv6YaIB4QCE9nEkE-MOru0jOd50,35026
-cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/__init__.py,sha256=
-cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/cogvideox.py,sha256=
-cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/flux.py,sha256=
-cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/hunyuan_video.py,sha256=
-cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/mochi.py,sha256=
-cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/wan.py,sha256=
+cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/__init__.py,sha256=hVBTXj9MMGFGVezT3j8MntFRBiphSaUL4YhSOd8JtuY,1870
+cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/cogvideox.py,sha256=KP8NxtHAKzzBOoX0lhvlMgY_5dmP4Z3T5TOfwl4SSyg,2273
+cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/flux.py,sha256=kCB7lL4OIq8TZn-baMIF8D_PVPTFW60omCMVQCb8ebs,2628
+cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/hunyuan_video.py,sha256=xAkd40BGsfuCKdW3Abrx35VwgZQg4CZFz13P4VY71eY,9968
+cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/mochi.py,sha256=zXgoRDDjus3a2WSjtNh4ERtQp20ceb6nzohHMDlo2zY,2265
+cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/wan.py,sha256=PA7nuLgfAelnaI8usQx0Kxi8XATzMapyR1WndEdFoZA,2604
 cache_dit/cache_factory/first_block_cache/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cache_dit/cache_factory/first_block_cache/cache_context.py,sha256=
+cache_dit/cache_factory/first_block_cache/cache_context.py,sha256=oeOmVDho8aJa86p8LGACAWltu6Fe4chOW2OW8aPtB5c,23643
 cache_dit/cache_factory/first_block_cache/diffusers_adapters/__init__.py,sha256=-FFgA2MoudEo7uDacg4aWgm1KwfLZFsEDTVxatgbq9M,2146
 cache_dit/cache_factory/first_block_cache/diffusers_adapters/cogvideox.py,sha256=qO5CWyurtwW30mvOe6cxeQPTSXLDlPJcezm72zEjDq8,2375
 cache_dit/cache_factory/first_block_cache/diffusers_adapters/flux.py,sha256=Dcd4OzABCtyQCZNX2KNnUTdVoO1E1ApM7P8gcVYzcK0,2733
 cache_dit/cache_factory/first_block_cache/diffusers_adapters/hunyuan_video.py,sha256=OL7W4ukYlZz0IDmBR1zVV6XT3Mgciglj9Hqzv1wUAkQ,10092
 cache_dit/cache_factory/first_block_cache/diffusers_adapters/mochi.py,sha256=lQTClo52OwPbNEE4jiBZQhfC7hbtYqnYIABp_vbm_dk,2363
 cache_dit/cache_factory/first_block_cache/diffusers_adapters/wan.py,sha256=dBNzHBECAuTTA1a7kLdvZL20YzaKTAS3iciVLzKKEWA,2638
-cache_dit-0.2.
-cache_dit-0.2.
-cache_dit-0.2.
-cache_dit-0.2.
-cache_dit-0.2.
+cache_dit-0.2.2.dist-info/licenses/LICENSE,sha256=Dqb07Ik2dV41s9nIdMUbiRWEfDqo7-dQeRiY7kPO8PE,3769
+cache_dit-0.2.2.dist-info/METADATA,sha256=-4Sl6vCYl5bPBQVtlHGhHvEz0Nn-o3R5mYArGARsjLU,24521
+cache_dit-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+cache_dit-0.2.2.dist-info/top_level.txt,sha256=ZJDydonLEhujzz0FOkVbO-BqfzO9d_VqRHmZU-3MOZo,10
+cache_dit-0.2.2.dist-info/RECORD,,