PyPI - cache-dit - Versions diffs - 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl - Mend

cache-dit 0.2.3 (py3-none-any.whl) → 0.2.5 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cache-dit might be problematic. Click here for more details.

Files changed (9) hide show

cache_dit/_version.py CHANGED Viewed

@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '0.2.3'
-__version_tuple__ = version_tuple = (0, 2, 3)
+__version__ = version = '0.2.5'
+__version_tuple__ = version_tuple = (0, 2, 5)

cache_dit/cache_factory/dual_block_cache/cache_context.py CHANGED Viewed

@@ -49,18 +49,18 @@ class DBCacheContext:
     # Other settings
     downsample_factor: int = 1
-    num_inference_steps: int = -1
+    num_inference_steps: int = -1  # un-used now
     warmup_steps: int = 0  # DON'T Cache in warmup steps
     # DON'T Cache if the number of cached steps >= max_cached_steps
-    max_cached_steps: int = -1
+    max_cached_steps: int = -1  # for both CFG and non-CFG
     # Statistics for both alter cache and non-alter cache
     # Record the steps that have been cached, both alter cache and non-alter cache
-    executed_steps: int = 0  # cache + non-cache steps
-    cached_steps: List[int] = dataclasses.field(default_factory=list)
-    residual_diffs: DefaultDict[str, float] = dataclasses.field(
-        default_factory=lambda: defaultdict(float),
-    )
+    executed_steps: int = 0  # cache + non-cache steps pippeline
+    # steps for transformer, for CFG, transformer_executed_steps will
+    # be double of executed_steps.
+    transformer_executed_steps: int = 0
     # Support TaylorSeers in Dual Block Cache
     # Title: From Reusing to Forecasting: Accelerating Diffusion Models with TaylorSeers
     # Url: https://arxiv.org/pdf/2503.06923
@@ -71,8 +71,31 @@ class DBCacheContext:
     taylorseer_kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict)
     taylorseer: Optional[TaylorSeer] = None
     encoder_tarlorseer: Optional[TaylorSeer] = None
-    alter_taylorseer: Optional[TaylorSeer] = None
-    alter_encoder_taylorseer: Optional[TaylorSeer] = None
+    # Support do_separate_classifier_free_guidance for models such as Wan 2.1.
+    # For models that fuse CFG and non-CFG into a single forward step,
+    # set do_separate_classifier_free_guidance to False, for
+    # example: CogVideoX, HunyuanVideo, Mochi.
+    do_separate_classifier_free_guidance: bool = False
+    # Compute cfg forward first or not, default False, namely,
+    # 0, 2, 4, ..., -> non-CFG step; 1, 3, 5, ... -> CFG step.
+    cfg_compute_first: bool = False
+    # Compute separate diff values for the CFG and non-CFG steps
+    # (default: True). If False, the diff computed in the current
+    # non-CFG transformer step is reused for the current CFG step.
+    cfg_diff_compute_separate: bool = True
+    cfg_taylorseer: Optional[TaylorSeer] = None
+    cfg_encoder_taylorseer: Optional[TaylorSeer] = None
+    # CFG & non-CFG cached steps
+    cached_steps: List[int] = dataclasses.field(default_factory=list)
+    residual_diffs: DefaultDict[str, float] = dataclasses.field(
+        default_factory=lambda: defaultdict(float),
+    )
+    cfg_cached_steps: List[int] = dataclasses.field(default_factory=list)
+    cfg_residual_diffs: DefaultDict[str, float] = dataclasses.field(
+        default_factory=lambda: defaultdict(float),
+    )
     # TODO: Support SLG in Dual Block Cache
     # Skip Layer Guidance, SLG
@@ -83,6 +106,17 @@ class DBCacheContext:
     @torch.compiler.disable
     def __post_init__(self):
+        # Some checks for settings
+        if self.do_separate_classifier_free_guidance:
+            assert self.enable_alter_cache is False, (
+                "enable_alter_cache must set as False if "
+                "do_separate_classifier_free_guidance is enabled."
+            )
+            if self.cfg_diff_compute_separate:
+                assert self.cfg_compute_first is False, (
+                    "cfg_compute_first must set as False if "
+                    "cfg_diff_compute_separate is enabled."
+                )
         if "warmup_steps" not in self.taylorseer_kwargs:
             # If warmup_steps is not set in taylorseer_kwargs,
@@ -99,13 +133,13 @@ class DBCacheContext:
         if self.enable_taylorseer:
             self.taylorseer = TaylorSeer(**self.taylorseer_kwargs)
-            if self.enable_alter_cache:
-                self.alter_taylorseer = TaylorSeer(**self.taylorseer_kwargs)
+            if self.do_separate_classifier_free_guidance:
+                self.cfg_taylorseer = TaylorSeer(**self.taylorseer_kwargs)
         if self.enable_encoder_taylorseer:
             self.encoder_tarlorseer = TaylorSeer(**self.taylorseer_kwargs)
-            if self.enable_alter_cache:
-                self.alter_encoder_taylorseer = TaylorSeer(
+            if self.do_separate_classifier_free_guidance:
+                self.cfg_encoder_taylorseer = TaylorSeer(
                     **self.taylorseer_kwargs
                 )
@@ -159,18 +193,34 @@ class DBCacheContext:
     @torch.compiler.disable
     def mark_step_begin(self):
-        if not self.enable_alter_cache:
+        # Always increase transformer executed steps
+        # incr    step: prev 0 -> 1; prev 1 -> 2
+        # current step: incr step - 1
+        self.transformer_executed_steps += 1
+        if not self.do_separate_classifier_free_guidance:
             self.executed_steps += 1
         else:
-            self.executed_steps += 1
+            # 0,1 -> 0 + 1, 2,3 -> 1 + 1, ...
+            if not self.cfg_compute_first:
+                if not self.is_separate_classifier_free_guidance_step():
+                    # transformer step: 0,2,4,...
+                    self.executed_steps += 1
+            else:
+                if self.is_separate_classifier_free_guidance_step():
+                    # transformer step: 0,2,4,...
+                    self.executed_steps += 1
+        if not self.enable_alter_cache:
             # 0 F 1 T 2 F 3 T 4 F 5 T ...
             self.is_alter_cache = not self.is_alter_cache
         # Reset the cached steps and residual diffs at the beginning
         # of each inference.
-        if self.get_current_step() == 0:
+        if self.get_current_transformer_step() == 0:
             self.cached_steps.clear()
             self.residual_diffs.clear()
+            self.cfg_cached_steps.clear()
+            self.cfg_residual_diffs.clear()
             self.reset_incremental_names()
             # Reset the TaylorSeers cache at the beginning of each inference.
             # reset_cache will set the current step to -1 for TaylorSeer,
@@ -180,44 +230,100 @@ class DBCacheContext:
                     taylorseer.reset_cache()
                 if encoder_taylorseer is not None:
                     encoder_taylorseer.reset_cache()
+                cfg_taylorseer, cfg_encoder_taylorseer = (
+                    self.get_cfg_taylorseers()
+                )
+                if cfg_taylorseer is not None:
+                    cfg_taylorseer.reset_cache()
+                if cfg_encoder_taylorseer is not None:
+                    cfg_encoder_taylorseer.reset_cache()
         # mark_step_begin of TaylorSeer must be called after the cache is reset.
         if self.enable_taylorseer or self.enable_encoder_taylorseer:
-            taylorseer, encoder_taylorseer = self.get_taylorseers()
-            if taylorseer is not None:
-                taylorseer.mark_step_begin()
-            if encoder_taylorseer is not None:
-                encoder_taylorseer.mark_step_begin()
+            if self.do_separate_classifier_free_guidance:
+                # Assume non-CFG steps: 0, 2, 4, 6, ...
+                if not self.is_separate_classifier_free_guidance_step():
+                    taylorseer, encoder_taylorseer = self.get_taylorseers()
+                    if taylorseer is not None:
+                        taylorseer.mark_step_begin()
+                    if encoder_taylorseer is not None:
+                        encoder_taylorseer.mark_step_begin()
+                else:
+                    cfg_taylorseer, cfg_encoder_taylorseer = (
+                        self.get_cfg_taylorseers()
+                    )
+                    if cfg_taylorseer is not None:
+                        cfg_taylorseer.mark_step_begin()
+                    if cfg_encoder_taylorseer is not None:
+                        cfg_encoder_taylorseer.mark_step_begin()
+            else:
+                taylorseer, encoder_taylorseer = self.get_taylorseers()
+                if taylorseer is not None:
+                    taylorseer.mark_step_begin()
+                if encoder_taylorseer is not None:
+                    encoder_taylorseer.mark_step_begin()
     @torch.compiler.disable
     def get_taylorseers(self):
-        if self.enable_alter_cache and self.is_alter_cache:
-            return self.alter_taylorseer, self.alter_encoder_taylorseer
         return self.taylorseer, self.encoder_tarlorseer
+    @torch.compiler.disable
+    def get_cfg_taylorseers(self):
+        return self.cfg_taylorseer, self.cfg_encoder_taylorseer
     @torch.compiler.disable
     def add_residual_diff(self, diff):
+        # step: executed_steps - 1, not transformer_steps - 1
         step = str(self.get_current_step())
-        if step not in self.residual_diffs:
-            # Only add the diff if it is not already recorded for this step
-            self.residual_diffs[step] = diff
+        # Only add the diff if it is not already recorded for this step
+        if not self.is_separate_classifier_free_guidance_step():
+            if step not in self.residual_diffs:
+                self.residual_diffs[step] = diff
+        else:
+            if step not in self.cfg_residual_diffs:
+                self.cfg_residual_diffs[step] = diff
     @torch.compiler.disable
     def get_residual_diffs(self):
         return self.residual_diffs.copy()
+    @torch.compiler.disable
+    def get_cfg_residual_diffs(self):
+        return self.cfg_residual_diffs.copy()
     @torch.compiler.disable
     def add_cached_step(self):
-        self.cached_steps.append(self.get_current_step())
+        if not self.is_separate_classifier_free_guidance_step():
+            self.cached_steps.append(self.get_current_step())
+        else:
+            self.cfg_cached_steps.append(self.get_current_step())
     @torch.compiler.disable
     def get_cached_steps(self):
         return self.cached_steps.copy()
+    @torch.compiler.disable
+    def get_cfg_cached_steps(self):
+        return self.cfg_cached_steps.copy()
     @torch.compiler.disable
     def get_current_step(self):
         return self.executed_steps - 1
+    @torch.compiler.disable
+    def get_current_transformer_step(self):
+        return self.transformer_executed_steps - 1
+    @torch.compiler.disable
+    def is_separate_classifier_free_guidance_step(self):
+        if not self.do_separate_classifier_free_guidance:
+            return False
+        if self.cfg_compute_first:
+            # CFG steps: 0, 2, 4, 6, ...
+            return self.get_current_transformer_step() % 2 == 0
+        # CFG steps: 1, 3, 5, 7, ...
+        return self.get_current_transformer_step() % 2 != 0
     @torch.compiler.disable
     def is_in_warmup(self):
         return self.get_current_step() < self.warmup_steps
@@ -265,6 +371,35 @@ def get_current_step():
     return cache_context.get_current_step()
+@torch.compiler.disable
+def get_current_step_residual_diff():
+    cache_context = get_current_cache_context()
+    assert cache_context is not None, "cache_context must be set before"
+    step = str(get_current_step())
+    residual_diffs = get_residual_diffs()
+    if step in residual_diffs:
+        return residual_diffs[step]
+    return None
+@torch.compiler.disable
+def get_current_step_cfg_residual_diff():
+    cache_context = get_current_cache_context()
+    assert cache_context is not None, "cache_context must be set before"
+    step = str(get_current_step())
+    cfg_residual_diffs = get_cfg_residual_diffs()
+    if step in cfg_residual_diffs:
+        return cfg_residual_diffs[step]
+    return None
+@torch.compiler.disable
+def get_current_transformer_step():
+    cache_context = get_current_cache_context()
+    assert cache_context is not None, "cache_context must be set before"
+    return cache_context.get_current_transformer_step()
 @torch.compiler.disable
 def get_cached_steps():
     cache_context = get_current_cache_context()
@@ -272,6 +407,13 @@ def get_cached_steps():
     return cache_context.get_cached_steps()
+@torch.compiler.disable
+def get_cfg_cached_steps():
+    cache_context = get_current_cache_context()
+    assert cache_context is not None, "cache_context must be set before"
+    return cache_context.get_cfg_cached_steps()
 @torch.compiler.disable
 def get_max_cached_steps():
     cache_context = get_current_cache_context()
@@ -300,6 +442,13 @@ def get_residual_diffs():
     return cache_context.get_residual_diffs()
+@torch.compiler.disable
+def get_cfg_residual_diffs():
+    cache_context = get_current_cache_context()
+    assert cache_context is not None, "cache_context must be set before"
+    return cache_context.get_cfg_residual_diffs()
 @torch.compiler.disable
 def is_taylorseer_enabled():
     cache_context = get_current_cache_context()
@@ -321,6 +470,13 @@ def get_taylorseers():
     return cache_context.get_taylorseers()
+@torch.compiler.disable
+def get_cfg_taylorseers():
+    cache_context = get_current_cache_context()
+    assert cache_context is not None, "cache_context must be set before"
+    return cache_context.get_cfg_taylorseers()
 @torch.compiler.disable
 def is_taylorseer_cache_residual():
     cache_context = get_current_cache_context()
@@ -459,6 +615,27 @@ def Bn_compute_blocks_ids():
     return cache_context.Bn_compute_blocks_ids
+@torch.compiler.disable
+def do_separate_classifier_free_guidance():
+    cache_context = get_current_cache_context()
+    assert cache_context is not None, "cache_context must be set before"
+    return cache_context.do_separate_classifier_free_guidance
+@torch.compiler.disable
+def is_separate_classifier_free_guidance_step():
+    cache_context = get_current_cache_context()
+    assert cache_context is not None, "cache_context must be set before"
+    return cache_context.is_separate_classifier_free_guidance_step()
+@torch.compiler.disable
+def cfg_diff_compute_separate():
+    cache_context = get_current_cache_context()
+    assert cache_context is not None, "cache_context must be set before"
+    return cache_context.cfg_diff_compute_separate
 _current_cache_context: DBCacheContext = None
@@ -559,38 +736,49 @@ def are_two_tensors_similar(
         add_residual_diff(-2.0)
         return False
-    # Find the most significant token through t1 and t2, and
-    # consider the diff of the significant token. The more significant,
-    # the more important.
-    condition_thresh = get_important_condition_threshold()
-    if condition_thresh > 0.0:
-        raw_diff = (t1 - t2).abs()  # [B, seq_len, d]
-        token_m_df = raw_diff.mean(dim=-1)  # [B, seq_len]
-        token_m_t1 = t1.abs().mean(dim=-1)  # [B, seq_len]
-        # D = (t1 - t2) / t1 = 1 - (t2 / t1), if D = 0, then t1 = t2.
-        token_diff = token_m_df / token_m_t1  # [B, seq_len]
-        condition = token_diff > condition_thresh  # [B, seq_len]
-        if condition.sum() > 0:
-            condition = condition.unsqueeze(-1)  # [B, seq_len, 1]
-            condition = condition.expand_as(raw_diff)  # [B, seq_len, d]
-            mean_diff = raw_diff[condition].mean()
-            mean_t1 = t1[condition].abs().mean()
+    if all(
+        (
+            do_separate_classifier_free_guidance(),
+            is_separate_classifier_free_guidance_step(),
+            not cfg_diff_compute_separate(),
+            get_current_step_residual_diff() is not None,
+        )
+    ):
+        # Reuse computed diff value from non-CFG step
+        diff = get_current_step_residual_diff()
+    else:
+        # Find the most significant token through t1 and t2, and
+        # consider the diff of the significant token. The more significant,
+        # the more important.
+        condition_thresh = get_important_condition_threshold()
+        if condition_thresh > 0.0:
+            raw_diff = (t1 - t2).abs()  # [B, seq_len, d]
+            token_m_df = raw_diff.mean(dim=-1)  # [B, seq_len]
+            token_m_t1 = t1.abs().mean(dim=-1)  # [B, seq_len]
+            # D = (t1 - t2) / t1 = 1 - (t2 / t1), if D = 0, then t1 = t2.
+            token_diff = token_m_df / token_m_t1  # [B, seq_len]
+            condition = token_diff > condition_thresh  # [B, seq_len]
+            if condition.sum() > 0:
+                condition = condition.unsqueeze(-1)  # [B, seq_len, 1]
+                condition = condition.expand_as(raw_diff)  # [B, seq_len, d]
+                mean_diff = raw_diff[condition].mean()
+                mean_t1 = t1[condition].abs().mean()
+            else:
+                mean_diff = (t1 - t2).abs().mean()
+                mean_t1 = t1.abs().mean()
         else:
+            # Use the mean of the absolute difference of the tensors
             mean_diff = (t1 - t2).abs().mean()
             mean_t1 = t1.abs().mean()
-    else:
-        # Use the mean of the absolute difference of the tensors
-        mean_diff = (t1 - t2).abs().mean()
-        mean_t1 = t1.abs().mean()
-    if parallelized:
-        mean_diff = DP.all_reduce_sync(mean_diff, "avg")
-        mean_t1 = DP.all_reduce_sync(mean_t1, "avg")
+        if parallelized:
+            mean_diff = DP.all_reduce_sync(mean_diff, "avg")
+            mean_t1 = DP.all_reduce_sync(mean_t1, "avg")
-    # D = (t1 - t2) / t1 = 1 - (t2 / t1), if D = 0, then t1 = t2.
-    # Further, if we assume that (H(t,  0) - H(t-1,0)) ~ 0, then,
-    # H(t-1,n) ~ H(t  ,n), which means the hidden states are similar.
-    diff = (mean_diff / mean_t1).item()
+        # D = (t1 - t2) / t1 = 1 - (t2 / t1), if D = 0, then t1 = t2.
+        # Further, if we assume that (H(t,  0) - H(t-1,0)) ~ 0, then,
+        # H(t-1,n) ~ H(t  ,n), which means the hidden states are similar.
+        diff = (mean_diff / mean_t1).item()
     if logger.isEnabledFor(logging.DEBUG):
         logger.debug(f"{prefix}, diff: {diff:.6f}, threshold: {threshold:.6f}")
@@ -600,6 +788,26 @@ def are_two_tensors_similar(
     return diff < threshold
+@torch.compiler.disable
+def _debugging_set_buffer(prefix):
+    if logger.isEnabledFor(logging.DEBUG):
+        logger.debug(
+            f"set {prefix}, "
+            f"transformer step: {get_current_transformer_step()}, "
+            f"executed step: {get_current_step()}"
+        )
+@torch.compiler.disable
+def _debugging_get_buffer(prefix):
+    if logger.isEnabledFor(logging.DEBUG):
+        logger.debug(
+            f"get {prefix}, "
+            f"transformer step: {get_current_transformer_step()}, "
+            f"executed step: {get_current_step()}"
+        )
 # Fn buffers
 @torch.compiler.disable
 def set_Fn_buffer(buffer: torch.Tensor, prefix: str = "Fn"):
@@ -609,21 +817,39 @@ def set_Fn_buffer(buffer: torch.Tensor, prefix: str = "Fn"):
     if downsample_factor > 1:
         buffer = buffer[..., ::downsample_factor]
         buffer = buffer.contiguous()
-    set_buffer(f"{prefix}_buffer", buffer)
+    if is_separate_classifier_free_guidance_step():
+        _debugging_set_buffer(f"{prefix}_buffer_cfg")
+        set_buffer(f"{prefix}_buffer_cfg", buffer)
+    else:
+        _debugging_set_buffer(f"{prefix}_buffer")
+        set_buffer(f"{prefix}_buffer", buffer)
 @torch.compiler.disable
 def get_Fn_buffer(prefix: str = "Fn"):
+    if is_separate_classifier_free_guidance_step():
+        _debugging_get_buffer(f"{prefix}_buffer_cfg")
+        return get_buffer(f"{prefix}_buffer_cfg")
+    _debugging_get_buffer(f"{prefix}_buffer")
     return get_buffer(f"{prefix}_buffer")
 @torch.compiler.disable
 def set_Fn_encoder_buffer(buffer: torch.Tensor, prefix: str = "Fn"):
-    set_buffer(f"{prefix}_encoder_buffer", buffer)
+    if is_separate_classifier_free_guidance_step():
+        _debugging_set_buffer(f"{prefix}_encoder_buffer_cfg")
+        set_buffer(f"{prefix}_encoder_buffer_cfg", buffer)
+    else:
+        _debugging_set_buffer(f"{prefix}_encoder_buffer")
+        set_buffer(f"{prefix}_encoder_buffer", buffer)
 @torch.compiler.disable
 def get_Fn_encoder_buffer(prefix: str = "Fn"):
+    if is_separate_classifier_free_guidance_step():
+        _debugging_get_buffer(f"{prefix}_encoder_buffer_cfg")
+        return get_buffer(f"{prefix}_encoder_buffer_cfg")
+    _debugging_get_buffer(f"{prefix}_encoder_buffer")
     return get_buffer(f"{prefix}_encoder_buffer")
@@ -634,7 +860,11 @@ def set_Bn_buffer(buffer: torch.Tensor, prefix: str = "Bn"):
     # This buffer is use for hidden states approximation.
     if is_taylorseer_enabled():
         # taylorseer, encoder_taylorseer
-        taylorseer, _ = get_taylorseers()
+        if is_separate_classifier_free_guidance_step():
+            taylorseer, _ = get_cfg_taylorseers()
+        else:
+            taylorseer, _ = get_taylorseers()
         if taylorseer is not None:
             # Use TaylorSeer to update the buffer
             taylorseer.update(buffer)
@@ -644,15 +874,30 @@ def set_Bn_buffer(buffer: torch.Tensor, prefix: str = "Bn"):
                     "TaylorSeer is enabled but not set in the cache context. "
                     "Falling back to default buffer retrieval."
                 )
-            set_buffer(f"{prefix}_buffer", buffer)
+            if is_separate_classifier_free_guidance_step():
+                _debugging_set_buffer(f"{prefix}_buffer_cfg")
+                set_buffer(f"{prefix}_buffer_cfg", buffer)
+            else:
+                _debugging_set_buffer(f"{prefix}_buffer")
+                set_buffer(f"{prefix}_buffer", buffer)
     else:
-        set_buffer(f"{prefix}_buffer", buffer)
+        if is_separate_classifier_free_guidance_step():
+            _debugging_set_buffer(f"{prefix}_buffer_cfg")
+            set_buffer(f"{prefix}_buffer_cfg", buffer)
+        else:
+            _debugging_set_buffer(f"{prefix}_buffer")
+            set_buffer(f"{prefix}_buffer", buffer)
 @torch.compiler.disable
 def get_Bn_buffer(prefix: str = "Bn"):
     if is_taylorseer_enabled():
-        taylorseer, _ = get_taylorseers()
+        # taylorseer, encoder_taylorseer
+        if is_separate_classifier_free_guidance_step():
+            taylorseer, _ = get_cfg_taylorseers()
+        else:
+            taylorseer, _ = get_taylorseers()
         if taylorseer is not None:
             return taylorseer.approximate_value()
         else:
@@ -662,8 +907,16 @@ def get_Bn_buffer(prefix: str = "Bn"):
                     "Falling back to default buffer retrieval."
                 )
             # Fallback to default buffer retrieval
+            if is_separate_classifier_free_guidance_step():
+                _debugging_get_buffer(f"{prefix}_buffer_cfg")
+                return get_buffer(f"{prefix}_buffer_cfg")
+            _debugging_get_buffer(f"{prefix}_buffer")
             return get_buffer(f"{prefix}_buffer")
     else:
+        if is_separate_classifier_free_guidance_step():
+            _debugging_get_buffer(f"{prefix}_buffer_cfg")
+            return get_buffer(f"{prefix}_buffer_cfg")
+        _debugging_get_buffer(f"{prefix}_buffer")
         return get_buffer(f"{prefix}_buffer")
@@ -672,7 +925,11 @@ def set_Bn_encoder_buffer(buffer: torch.Tensor, prefix: str = "Bn"):
     # This buffer is use for encoder hidden states approximation.
     if is_encoder_taylorseer_enabled():
         # taylorseer, encoder_taylorseer
-        _, encoder_taylorseer = get_taylorseers()
+        if is_separate_classifier_free_guidance_step():
+            _, encoder_taylorseer = get_cfg_taylorseers()
+        else:
+            _, encoder_taylorseer = get_taylorseers()
         if encoder_taylorseer is not None:
             # Use TaylorSeer to update the buffer
             encoder_taylorseer.update(buffer)
@@ -682,15 +939,29 @@ def set_Bn_encoder_buffer(buffer: torch.Tensor, prefix: str = "Bn"):
                     "TaylorSeer is enabled but not set in the cache context. "
                     "Falling back to default buffer retrieval."
                 )
-            set_buffer(f"{prefix}_encoder_buffer", buffer)
+            if is_separate_classifier_free_guidance_step():
+                _debugging_set_buffer(f"{prefix}_encoder_buffer_cfg")
+                set_buffer(f"{prefix}_encoder_buffer_cfg", buffer)
+            else:
+                _debugging_set_buffer(f"{prefix}_encoder_buffer")
+                set_buffer(f"{prefix}_encoder_buffer", buffer)
     else:
-        set_buffer(f"{prefix}_encoder_buffer", buffer)
+        if is_separate_classifier_free_guidance_step():
+            _debugging_set_buffer(f"{prefix}_encoder_buffer_cfg")
+            set_buffer(f"{prefix}_encoder_buffer_cfg", buffer)
+        else:
+            _debugging_set_buffer(f"{prefix}_encoder_buffer")
+            set_buffer(f"{prefix}_encoder_buffer", buffer)
 @torch.compiler.disable
 def get_Bn_encoder_buffer(prefix: str = "Bn"):
     if is_encoder_taylorseer_enabled():
-        _, encoder_taylorseer = get_taylorseers()
+        if is_separate_classifier_free_guidance_step():
+            _, encoder_taylorseer = get_cfg_taylorseers()
+        else:
+            _, encoder_taylorseer = get_taylorseers()
         if encoder_taylorseer is not None:
             # Use TaylorSeer to approximate the value
             return encoder_taylorseer.approximate_value()
@@ -701,8 +972,16 @@ def get_Bn_encoder_buffer(prefix: str = "Bn"):
                     "Falling back to default buffer retrieval."
                 )
             # Fallback to default buffer retrieval
+            if is_separate_classifier_free_guidance_step():
+                _debugging_get_buffer(f"{prefix}_encoder_buffer_cfg")
+                return get_buffer(f"{prefix}_encoder_buffer_cfg")
+            _debugging_get_buffer(f"{prefix}_encoder_buffer")
             return get_buffer(f"{prefix}_encoder_buffer")
     else:
+        if is_separate_classifier_free_guidance_step():
+            _debugging_get_buffer(f"{prefix}_encoder_buffer_cfg")
+            return get_buffer(f"{prefix}_encoder_buffer_cfg")
+        _debugging_get_buffer(f"{prefix}_encoder_buffer")
         return get_buffer(f"{prefix}_encoder_buffer")
@@ -766,8 +1045,13 @@ def get_can_use_cache(
 ):
     if is_in_warmup():
         return False
-    cached_steps = get_cached_steps()
     max_cached_steps = get_max_cached_steps()
+    if not is_separate_classifier_free_guidance_step():
+        cached_steps = get_cached_steps()
+    else:
+        cached_steps = get_cfg_cached_steps()
     if max_cached_steps >= 0 and (len(cached_steps) >= max_cached_steps):
         if logger.isEnabledFor(logging.DEBUG):
             logger.debug(
@@ -775,10 +1059,12 @@ def get_can_use_cache(
                 "cannot use cache."
             )
         return False
     if threshold is None or threshold <= 0.0:
         threshold = get_residual_diff_threshold()
     if threshold <= 0.0:
         return False
     downsample_factor = get_downsample_factor()
     if downsample_factor > 1 and "Bn" not in prefix:
         states_tensor = states_tensor[..., ::downsample_factor]
@@ -982,7 +1268,9 @@ class DBCachedTransformerBlocks(torch.nn.Module):
         # Check if the current step is in cache steps.
         # If so, we can skip some Bn blocks and directly
         # use the cached values.
-        return get_current_step() in get_cached_steps()
+        return (get_current_step() in get_cached_steps()) or (
+            get_current_step() in get_cfg_cached_steps()
+        )
     @torch.compiler.disable
     def _Fn_transformer_blocks(self):
@@ -1601,3 +1889,5 @@ def patch_cached_stats(
     # TODO: Patch more cached stats to the transformer
     transformer._cached_steps = get_cached_steps()
     transformer._residual_diffs = get_residual_diffs()
+    transformer._cfg_cached_steps = get_cfg_cached_steps()
+    transformer._cfg_residual_diffs = get_cfg_residual_diffs()

cache_dit/cache_factory/dual_block_cache/diffusers_adapters/wan.py CHANGED Viewed

@@ -70,7 +70,7 @@ def apply_db_cache_on_pipe(
             # "slg_layers": slg_layers,
             # "slg_start": slg_start,
             # "slg_end": slg_end,
-            "num_inference_steps": kwargs.get("num_inference_steps", 50),
+            # "num_inference_steps": kwargs.get("num_inference_steps", 50),
             "warmup_steps": warmup_steps,
             "max_cached_steps": max_cached_steps,
         },

cache_dit/cache_factory/first_block_cache/cache_context.py CHANGED Viewed

@@ -370,8 +370,8 @@ def apply_prev_hidden_states_residual(
         hidden_states = hidden_states_residual + hidden_states
         hidden_states = hidden_states.contiguous()
-        # NOTE: We should also support taylorseer for
-        # encoder_hidden_states approximation. Please
+        # NOTE: We should also support taylorseer for
+        # encoder_hidden_states approximation. Please
         # use DBCache instead.
     else:
         hidden_states_residual = get_hidden_states_residual()

{cache_dit-0.2.3.dist-info → cache_dit-0.2.5.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cache_dit
-Version: 0.2.3
+Version: 0.2.5
 Summary: 🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration Toolbox for Diffusion Transformers
 Author: DefTruth, vipshop.com, etc.
 Maintainer: DefTruth, vipshop.com, etc
@@ -44,7 +44,7 @@ Dynamic: requires-python
       <img src=https://img.shields.io/badge/PyPI-pass-brightgreen.svg >
       <img src=https://static.pepy.tech/badge/cache-dit >
       <img src=https://img.shields.io/badge/Python-3.10|3.11|3.12-9cf.svg >
-      <img src=https://img.shields.io/badge/Release-v0.2.2-brightgreen.svg >
+      <img src=https://img.shields.io/badge/Release-v0.2-brightgreen.svg >
  </div>
   <p align="center">
     DeepCache is for UNet not DiT. Most DiT cache speedups are complex and not training-free. CacheDiT offers <br>a set of training-free cache accelerators for DiT: <b>🔥<a href="#dbcache">DBCache</a>, <a href="#dbprune">DBPrune</a>, <a href="#taylorseer">TaylorSeer</a>, <a href="#fbcache">FBCache</a></b>, etc🔥
@@ -154,6 +154,7 @@ The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi
 - [🔥Supported Models](#supported)
 - [⚡️Dual Block Cache](#dbcache)
 - [🔥Hybrid TaylorSeer](#taylorseer)
+- [⚡️Hybrid Cache CFG](#cfg)
 - [🎉First Block Cache](#fbcache)
 - [⚡️Dynamic Block Prune](#dbprune)
 - [🎉Context Parallelism](#context-parallelism)
@@ -168,7 +169,7 @@ The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi
 You can install the stable release of `cache-dit` from PyPI:
 ```bash
-pip3 install cache-dit
+pip3 install -U cache-dit
 ```
 Or you can install the latest develop version from GitHub:
@@ -180,11 +181,13 @@ pip3 install git+https://github.com/vipshop/cache-dit.git
 <div id="supported"></div>
-- [🚀FLUX.1](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀Mochi](https://github.com/vipshop/cache-dit/raw/main/examples)
+- [🚀FLUX.1-dev](https://github.com/vipshop/cache-dit/raw/main/examples)
+- [🚀FLUX.1-Fill-dev](https://github.com/vipshop/cache-dit/raw/main/examples)
+- [🚀mochi-1-preview](https://github.com/vipshop/cache-dit/raw/main/examples)
 - [🚀CogVideoX](https://github.com/vipshop/cache-dit/raw/main/examples)
 - [🚀CogVideoX1.5](https://github.com/vipshop/cache-dit/raw/main/examples)
-- [🚀Wan2.1](https://github.com/vipshop/cache-dit/raw/main/examples)
+- [🚀Wan2.1-T2V](https://github.com/vipshop/cache-dit/raw/main/examples)
+- [🚀Wan2.1-FLF2V](https://github.com/vipshop/cache-dit/raw/main/examples)
 - [🚀HunyuanVideo](https://github.com/vipshop/cache-dit/raw/main/examples)
@@ -280,7 +283,7 @@ cache_options = {
     "taylorseer_kwargs": {
         "n_derivatives": 2, # default is 2.
     },
-    "warmup_steps": 3, # n_derivatives + 1
+    "warmup_steps": 3, # prefer: >= n_derivatives + 1
     "residual_diff_threshold": 0.12,
 }
 ```
@@ -299,6 +302,30 @@ cache_options = {
 |24.85s|12.85s|12.86s|10.27s|10.28s|8.48s|
 |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.12_S14_T12.85s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.12_S14_T12.86s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.15_S17_T10.27s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T10.28s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T8.48s.png width=105px>|
+## ⚡️Hybrid Cache CFG
+<div id="cfg"></div>
+CacheDiT supports caching for **CFG (classifier-free guidance)**. For models that fuse CFG and non-CFG into a single forward step, or models that do not include CFG in the forward step, please set the `do_separate_classifier_free_guidance` param to **False (default)**. Otherwise, set it to True. For example:
+```python
+cache_options = {
+    # CFG: classifier free guidance or not
+    # For models that fuse CFG and non-CFG into a single forward step,
+    # set do_separate_classifier_free_guidance to False.
+    # For example, set it to True for Wan 2.1 and to False
+    # for FLUX.1, HunyuanVideo, CogVideoX, Mochi.
+    "do_separate_classifier_free_guidance": True,  # Wan 2.1
+    # Whether to compute the CFG forward first. Default False, i.e.
+    # steps 0, 2, 4, ... are non-CFG steps; 1, 3, 5, ... are CFG steps.
+    "cfg_compute_first": False,
+    # Compute separate diff values for CFG and non-CFG steps,
+    # default True. If False, we will use the computed diff from
+    # current non-CFG transformer step for current CFG step.
+    "cfg_diff_compute_separate": True,
+}
+```
 ## 🎉FBCache: First Block Cache
 <div id="fbcache"></div>

{cache_dit-0.2.3.dist-info → cache_dit-0.2.5.dist-info}/RECORD RENAMED Viewed

@@ -1,18 +1,18 @@
 cache_dit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cache_dit/_version.py,sha256=wD8hnA5gV5UmPkQnpT3xR6V2csgj9K5NEADogbLK79M,511
+cache_dit/_version.py,sha256=N3oBwJUFmS-AwCjqOcSlRW4GvSq-uJJMaBvoGfv1-hM,511
 cache_dit/logger.py,sha256=0zsu42hN-3-rgGC_C29ms1IvVpV4_b4_SwJCKSenxBE,4304
 cache_dit/primitives.py,sha256=A2iG9YLot3gOsZSPp-_gyjqjLgJvWQRx8aitD4JQ23Y,3877
 cache_dit/cache_factory/__init__.py,sha256=5RNuhWakvvqrOV4vkqrEBA7d-V1LwcNSsjtW14mkqK8,5255
 cache_dit/cache_factory/taylorseer.py,sha256=LKSNo2ode69EVo9xrxjxAMEjz0yDGiGADeDYnEqddA8,3987
 cache_dit/cache_factory/utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cache_dit/cache_factory/dual_block_cache/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cache_dit/cache_factory/dual_block_cache/cache_context.py,sha256=EV0REfwLuXTqnkZ9vD2nFFjRceRLrXhl1b1SO5N4os8,59272
+cache_dit/cache_factory/dual_block_cache/cache_context.py,sha256=wE_xYp7DRbgB-fD8dpr75o4Cvvl2s-jnT2fRyqWm_RM,71286
 cache_dit/cache_factory/dual_block_cache/diffusers_adapters/__init__.py,sha256=krNAICf-aS3JLmSG8vOB9tpLa04uYRcABsC8PMbVUKY,1870
 cache_dit/cache_factory/dual_block_cache/diffusers_adapters/cogvideox.py,sha256=fibkeU-FHa30BNT-uPV2Eqcd5IRli07EKb25tMDp23c,2270
 cache_dit/cache_factory/dual_block_cache/diffusers_adapters/flux.py,sha256=fddSpTHXU24COMGAY-Z21EmHHAEArZBv_-XLRFD6ADU,2625
 cache_dit/cache_factory/dual_block_cache/diffusers_adapters/hunyuan_video.py,sha256=wcZdBhjUB8WSfz40A268BtSe3nr_hRsIi2BNlg1FHRU,9965
 cache_dit/cache_factory/dual_block_cache/diffusers_adapters/mochi.py,sha256=Cmy0KHRDgwXqtmqfkrr7kw0CP6CmkSnuz29gDHcD6sQ,2262
-cache_dit/cache_factory/dual_block_cache/diffusers_adapters/wan.py,sha256=kNoCZshNUxgwS9Di84Mz8Js1QAcs_U665x6wcSKYE2A,2594
+cache_dit/cache_factory/dual_block_cache/diffusers_adapters/wan.py,sha256=EREHM5E1wxnL-uRXRAEege4HXraRp1oD_r1Zx4CsiKk,2596
 cache_dit/cache_factory/dynamic_block_prune/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cache_dit/cache_factory/dynamic_block_prune/prune_context.py,sha256=so1wGdb8W0ATwrjv7E5IEZLPcobybaY1HJa6hBYlOOQ,34698
 cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/__init__.py,sha256=hVBTXj9MMGFGVezT3j8MntFRBiphSaUL4YhSOd8JtuY,1870
@@ -22,7 +22,7 @@ cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/hunyuan_video.py,
 cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/mochi.py,sha256=zXgoRDDjus3a2WSjtNh4ERtQp20ceb6nzohHMDlo2zY,2265
 cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/wan.py,sha256=PA7nuLgfAelnaI8usQx0Kxi8XATzMapyR1WndEdFoZA,2604
 cache_dit/cache_factory/first_block_cache/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cache_dit/cache_factory/first_block_cache/cache_context.py,sha256=NeAfDJlJVVUAL4btax5_iOLTuue1x4qeXwk0pM-QH28,23219
+cache_dit/cache_factory/first_block_cache/cache_context.py,sha256=tTPwhPLEA7LqGupps1Zy2MycCtLzs22wsW0yUhiiF-U,23217
 cache_dit/cache_factory/first_block_cache/diffusers_adapters/__init__.py,sha256=-FFgA2MoudEo7uDacg4aWgm1KwfLZFsEDTVxatgbq9M,2146
 cache_dit/cache_factory/first_block_cache/diffusers_adapters/cogvideox.py,sha256=qO5CWyurtwW30mvOe6cxeQPTSXLDlPJcezm72zEjDq8,2375
 cache_dit/cache_factory/first_block_cache/diffusers_adapters/flux.py,sha256=Dcd4OzABCtyQCZNX2KNnUTdVoO1E1ApM7P8gcVYzcK0,2733
@@ -33,8 +33,8 @@ cache_dit/compile/__init__.py,sha256=DfMdPleFFGADXLsr7zXui8BTz_y9futY6rNmNdh9y7k
 cache_dit/compile/utils.py,sha256=KU60xc474Anbj7Y_FLRFmNxEjVYLLXkhbtCLXO7o_Tc,3699
 cache_dit/custom_ops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cache_dit/custom_ops/triton_taylorseer.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cache_dit-0.2.3.dist-info/licenses/LICENSE,sha256=Dqb07Ik2dV41s9nIdMUbiRWEfDqo7-dQeRiY7kPO8PE,3769
-cache_dit-0.2.3.dist-info/METADATA,sha256=XH7Wn7GdTRth6g0yAY5heArGqXmrrC7CflOJ8ZXH-1k,24867
-cache_dit-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-cache_dit-0.2.3.dist-info/top_level.txt,sha256=ZJDydonLEhujzz0FOkVbO-BqfzO9d_VqRHmZU-3MOZo,10
-cache_dit-0.2.3.dist-info/RECORD,,
+cache_dit-0.2.5.dist-info/licenses/LICENSE,sha256=Dqb07Ik2dV41s9nIdMUbiRWEfDqo7-dQeRiY7kPO8PE,3769
+cache_dit-0.2.5.dist-info/METADATA,sha256=J37Waq-cMbuFfTrngXuxqouXpjHK9qhR_MZHlE2odmY,26249
+cache_dit-0.2.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+cache_dit-0.2.5.dist-info/top_level.txt,sha256=ZJDydonLEhujzz0FOkVbO-BqfzO9d_VqRHmZU-3MOZo,10
+cache_dit-0.2.5.dist-info/RECORD,,

{cache_dit-0.2.3.dist-info → cache_dit-0.2.5.dist-info}/WHEEL RENAMED Viewed

File without changes

{cache_dit-0.2.3.dist-info → cache_dit-0.2.5.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{cache_dit-0.2.3.dist-info → cache_dit-0.2.5.dist-info}/top_level.txt RENAMED Viewed

File without changes

cache-dit 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

Potentially problematic release.

cache-dit 0.2.3py3-none-any.whl → 0.2.5py3-none-any.whl