cache-dit 0.2.23__py3-none-any.whl → 0.2.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


cache_dit/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.2.23'
-__version_tuple__ = version_tuple = (0, 2, 23)
+__version__ = version = '0.2.24'
+__version_tuple__ = version_tuple = (0, 2, 24)
 
 __commit_id__ = commit_id = None
cache_dit/cache_factory/cache_adapters.py CHANGED
@@ -5,7 +5,7 @@ import unittest
 import functools
 import dataclasses
 
-from typing import Any, Tuple, List
+from typing import Any, Tuple, List, Optional
 from contextlib import ExitStack
 from diffusers import DiffusionPipeline
 from cache_dit.cache_factory.patch.flux import (
@@ -40,6 +40,7 @@ class BlockAdapter:
             "layers",
         ]
     )
+    check_prefixes: bool = True
     allow_suffixes: List[str] = dataclasses.field(
         default_factory=lambda: ["TransformerBlock"]
     )
@@ -48,8 +49,24 @@ class BlockAdapter:
         default="max", metadata={"allowed_values": ["max", "min"]}
     )
 
+    def __post_init__(self):
+        self.maybe_apply_patch()
+
+    def maybe_apply_patch(self):
+        # Handle some special cases, specifically transformers that have
+        # different forward patterns between single_transformer_blocks
+        # and transformer_blocks, such as Flux (diffusers < 0.35.0).
+        if self.transformer.__class__.__name__.startswith("Flux"):
+            self.transformer = maybe_patch_flux_transformer(
+                self.transformer,
+                blocks=self.blocks,
+            )
+
     @staticmethod
-    def auto_block_adapter(adapter: "BlockAdapter") -> "BlockAdapter":
+    def auto_block_adapter(
+        adapter: "BlockAdapter",
+        forward_pattern: Optional[ForwardPattern] = None,
+    ) -> "BlockAdapter":
         assert adapter.auto, (
             "Please manually set `auto` to True, or, manually "
             "set all the transformer blocks configuration."
@@ -66,8 +83,10 @@ class BlockAdapter:
             transformer=transformer,
             allow_prefixes=adapter.allow_prefixes,
             allow_suffixes=adapter.allow_suffixes,
+            check_prefixes=adapter.check_prefixes,
             check_suffixes=adapter.check_suffixes,
             blocks_policy=adapter.blocks_policy,
+            forward_pattern=forward_pattern,
         )
 
         return BlockAdapter(
@@ -87,6 +106,8 @@ class BlockAdapter:
             and isinstance(adapter.blocks, torch.nn.ModuleList)
         ):
             return True
+
+        logger.warning("Check block adapter failed!")
         return False
 
     @staticmethod
@@ -101,24 +122,30 @@ class BlockAdapter:
         allow_suffixes: List[str] = [
             "TransformerBlock",
         ],
+        check_prefixes: bool = True,
         check_suffixes: bool = False,
         **kwargs,
     ) -> Tuple[torch.nn.ModuleList, str]:
+        # Check prefixes
+        if check_prefixes:
+            blocks_names = []
+            for attr_name in dir(transformer):
+                for prefix in allow_prefixes:
+                    if attr_name.startswith(prefix):
+                        blocks_names.append(attr_name)
+        else:
+            blocks_names = dir(transformer)
 
-        blocks_names = []
-        for attr_name in dir(transformer):
-            for prefix in allow_prefixes:
-                if attr_name.startswith(prefix):
-                    blocks_names.append(attr_name)
-
-        # Type check
+        # Check ModuleList
         valid_names = []
         valid_count = []
+        forward_pattern = kwargs.get("forward_pattern", None)
         for blocks_name in blocks_names:
             if blocks := getattr(transformer, blocks_name, None):
                 if isinstance(blocks, torch.nn.ModuleList):
                     block = blocks[0]
                     block_cls_name = block.__class__.__name__
+                    # Check suffixes
                     if isinstance(block, torch.nn.Module) and (
                         any(
                             (
@@ -128,8 +155,18 @@ class BlockAdapter:
                         )
                         or (not check_suffixes)
                     ):
-                        valid_names.append(blocks_name)
-                        valid_count.append(len(blocks))
+                        # May check forward pattern
+                        if forward_pattern is not None:
+                            if BlockAdapter.match_blocks_pattern(
+                                blocks,
+                                forward_pattern,
+                                logging=False,
+                            ):
+                                valid_names.append(blocks_name)
+                                valid_count.append(len(blocks))
+                        else:
+                            valid_names.append(blocks_name)
+                            valid_count.append(len(blocks))
 
         if not valid_names:
             raise ValueError(
@@ -139,6 +176,7 @@ class BlockAdapter:
         final_name = valid_names[0]
         final_count = valid_count[0]
         block_policy = kwargs.get("blocks_policy", "max")
+
         for blocks_name, count in zip(valid_names, valid_count):
             blocks = getattr(transformer, blocks_name)
             logger.info(
@@ -165,6 +203,67 @@ class BlockAdapter:
 
         return final_blocks, final_name
 
+    @staticmethod
+    def match_block_pattern(
+        block: torch.nn.Module,
+        forward_pattern: ForwardPattern,
+    ) -> bool:
+        assert (
+            forward_pattern.Supported
+            and forward_pattern in ForwardPattern.supported_patterns()
+        ), f"Pattern {forward_pattern} is not support now!"
+
+        forward_parameters = set(
+            inspect.signature(block.forward).parameters.keys()
+        )
+        num_outputs = str(
+            inspect.signature(block.forward).return_annotation
+        ).count("torch.Tensor")
+
+        in_matched = True
+        out_matched = True
+        if num_outputs > 0 and len(forward_pattern.Out) != num_outputs:
+            # output pattern not match
+            out_matched = False
+
+        for required_param in forward_pattern.In:
+            if required_param not in forward_parameters:
+                in_matched = False
+
+        return in_matched and out_matched
+
+    @staticmethod
+    def match_blocks_pattern(
+        transformer_blocks: torch.nn.ModuleList,
+        forward_pattern: ForwardPattern,
+        logging: bool = True,
+    ) -> bool:
+        assert (
+            forward_pattern.Supported
+            and forward_pattern in ForwardPattern.supported_patterns()
+        ), f"Pattern {forward_pattern} is not support now!"
+
+        assert isinstance(transformer_blocks, torch.nn.ModuleList)
+
+        pattern_matched_states = []
+        for block in transformer_blocks:
+            pattern_matched_states.append(
+                BlockAdapter.match_block_pattern(
+                    block,
+                    forward_pattern,
+                )
+            )
+
+        pattern_matched = all(pattern_matched_states)  # all block match
+        if pattern_matched and logging:
+            block_cls_name = transformer_blocks[0].__class__.__name__
+            logger.info(
+                f"Match Block Forward Pattern: {block_cls_name}, {forward_pattern}"
+                f"\nIN:{forward_pattern.In}, OUT:{forward_pattern.Out})"
+            )
+
+        return pattern_matched
+
 
 @dataclasses.dataclass
 class UnifiedCacheParams:
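
The two new static methods above decide pattern compatibility purely from `inspect.signature` of each block's `forward`: the pattern's required parameter names must be present, and the number of `torch.Tensor` mentions in the return annotation must match the pattern's output count. A minimal standalone sketch of that check (the `DummyBlock` class and the pattern constants below are hypothetical, not part of cache-dit):

```python
import inspect

import torch

# Hypothetical stand-ins for a ForwardPattern's In/Out definitions.
PATTERN_IN = ("hidden_states", "encoder_hidden_states")
PATTERN_OUT_LEN = 2


class DummyBlock(torch.nn.Module):
    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        return hidden_states, encoder_hidden_states


def matches_pattern(block: torch.nn.Module) -> bool:
    sig = inspect.signature(block.forward)
    params = set(sig.parameters.keys())
    # Same heuristic as match_block_pattern: count "torch.Tensor" in the
    # stringified return annotation to estimate the number of outputs.
    num_outputs = str(sig.return_annotation).count("torch.Tensor")
    in_matched = all(name in params for name in PATTERN_IN)
    out_matched = num_outputs == 0 or num_outputs == PATTERN_OUT_LEN
    return in_matched and out_matched


print(matches_pattern(DummyBlock()))  # True
```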
@@ -463,19 +562,42 @@ class UnifiedCacheAdapter:
     ) -> DiffusionPipeline:
 
         if block_adapter.auto:
-            block_adapter = BlockAdapter.auto_block_adapter(block_adapter)
+            block_adapter = BlockAdapter.auto_block_adapter(
+                block_adapter,
+                forward_pattern,
+            )
 
         if BlockAdapter.check_block_adapter(block_adapter):
-            assert isinstance(block_adapter.blocks, torch.nn.ModuleList)
             # Apply cache on pipeline: wrap cache context
-            cls.create_context(block_adapter.pipe, **cache_context_kwargs)
+            cls.create_context(
+                block_adapter.pipe,
+                **cache_context_kwargs,
+            )
             # Apply cache on transformer: mock cached transformer blocks
             cls.mock_blocks(
                 block_adapter,
                 forward_pattern=forward_pattern,
             )
+            cls.patch_params(
+                block_adapter,
+                forward_pattern=forward_pattern,
+                **cache_context_kwargs,
+            )
         return block_adapter.pipe
 
+    @classmethod
+    def patch_params(
+        cls,
+        block_adapter: BlockAdapter,
+        forward_pattern: ForwardPattern = None,
+        **cache_context_kwargs,
+    ):
+        block_adapter.transformer._forward_pattern = forward_pattern
+        block_adapter.transformer._cache_context_kwargs = cache_context_kwargs
+        block_adapter.pipe.__class__._cache_context_kwargs = (
+            cache_context_kwargs
+        )
+
     @classmethod
     def has_separate_cfg(
         cls,
@@ -534,7 +656,6 @@ class UnifiedCacheAdapter:
 
         pipe.__class__.__call__ = new_call
         pipe.__class__._is_cached = True
-        pipe.__class__._cache_options = cache_kwargs
         return pipe
 
     @classmethod
@@ -544,28 +665,11 @@ class UnifiedCacheAdapter:
         forward_pattern: ForwardPattern = ForwardPattern.Pattern_0,
     ) -> torch.nn.Module:
 
-        if (
-            block_adapter.transformer is None
-            or block_adapter.blocks_name is None
-            or block_adapter.blocks is None
-        ):
-            assert block_adapter.auto, (
-                "Please manually set `auto` to True, or, "
-                "manually set transformer blocks configuration."
-            )
-
         if getattr(block_adapter.transformer, "_is_cached", False):
             return block_adapter.transformer
 
-        # Firstly, process some specificial cases (TODO: more patches)
-        if block_adapter.transformer.__class__.__name__.startswith("Flux"):
-            block_adapter.transformer = maybe_patch_flux_transformer(
-                block_adapter.transformer,
-                blocks=block_adapter.blocks,
-            )
-
         # Check block forward pattern matching
-        assert cls.match_pattern(
+        assert BlockAdapter.match_blocks_pattern(
             block_adapter.blocks,
             forward_pattern=forward_pattern,
         ), (
@@ -615,46 +719,3 @@ class UnifiedCacheAdapter:
         block_adapter.transformer._is_cached = True
 
         return block_adapter.transformer
-
-    @classmethod
-    def match_pattern(
-        cls,
-        transformer_blocks: torch.nn.ModuleList,
-        forward_pattern: ForwardPattern = ForwardPattern.Pattern_0,
-    ) -> bool:
-        pattern_matched_states = []
-
-        assert (
-            forward_pattern.Supported
-            and forward_pattern in ForwardPattern.supported_patterns()
-        ), f"Pattern {forward_pattern} is not support now!"
-
-        for block in transformer_blocks:
-            forward_parameters = set(
-                inspect.signature(block.forward).parameters.keys()
-            )
-            num_outputs = str(
-                inspect.signature(block.forward).return_annotation
-            ).count("torch.Tensor")
-
-            in_matched = True
-            out_matched = True
-            if num_outputs > 0 and len(forward_pattern.Out) != num_outputs:
-                # output pattern not match
-                out_matched = False
-
-            for required_param in forward_pattern.In:
-                if required_param not in forward_parameters:
-                    in_matched = False
-
-            pattern_matched_states.append(in_matched and out_matched)
-
-        pattern_matched = all(pattern_matched_states)  # all block match
-        if pattern_matched:
-            block_cls_name = transformer_blocks[0].__class__.__name__
-            logger.info(
-                f"Match Block Forward Pattern: {block_cls_name}, {forward_pattern}"
-                f"\nIN:{forward_pattern.In}, OUT:{forward_pattern.Out})"
-            )
-
-        return pattern_matched
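
With the pattern check moved onto `BlockAdapter` and `auto_block_adapter` now accepting a `forward_pattern`, automatic block discovery can reject candidate `ModuleList`s whose blocks do not match the declared pattern. A hedged usage sketch in the spirit of the README's BlockAdapter example (the pipeline id and the top-level import paths are assumptions):

```python
import cache_dit
from cache_dit import BlockAdapter, ForwardPattern  # assumed re-exports
from diffusers import DiffusionPipeline

# Illustrative pipeline; any Diffusers pipeline with a transformer backbone.
pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image")

# auto=True lets BlockAdapter.find_blocks discover the transformer blocks;
# the forward_pattern is forwarded so non-matching block lists are skipped.
cache_dit.enable_cache(
    BlockAdapter(pipe=pipe, auto=True),
    forward_pattern=ForwardPattern.Pattern_1,
)

output = pipe(prompt="a cup of coffee on a wooden desk")
```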
cache_dit/cache_factory/cache_context.py CHANGED
@@ -5,8 +5,8 @@ from collections import defaultdict
 from typing import Any, DefaultDict, Dict, List, Optional, Union, Tuple
 
 import torch
+import torch.distributed as dist
 
-import cache_dit.primitives as primitives
 from cache_dit.cache_factory.taylorseer import TaylorSeer
 from cache_dit.logger import init_logger
 
@@ -47,10 +47,11 @@ class DBCacheContext:
 
     # Other settings
     downsample_factor: int = 1
-    num_inference_steps: int = -1  # un-used now
-    warmup_steps: int = 0  # DON'T Cache in warmup steps
+    num_inference_steps: int = -1  # for future use
+    max_warmup_steps: int = 0  # DON'T Cache in warmup steps
     # DON'T Cache if the number of cached steps >= max_cached_steps
     max_cached_steps: int = -1  # for both CFG and non-CFG
+    max_continuous_cached_steps: int = -1  # the max continuous cached steps
 
     # Record the steps that have been cached, both cached and non-cache
     executed_steps: int = 0  # cache + non-cache steps of the pipeline
@@ -89,10 +90,12 @@ class DBCacheContext:
     residual_diffs: DefaultDict[str, float] = dataclasses.field(
         default_factory=lambda: defaultdict(float),
     )
+    continuous_cached_steps: int = 0
     cfg_cached_steps: List[int] = dataclasses.field(default_factory=list)
     cfg_residual_diffs: DefaultDict[str, float] = dataclasses.field(
         default_factory=lambda: defaultdict(float),
     )
+    cfg_continuous_cached_steps: int = 0
 
     @torch.compiler.disable
     def __post_init__(self):
@@ -108,17 +111,17 @@ class DBCacheContext:
                 "cfg_diff_compute_separate is enabled."
             )
 
-        if "warmup_steps" not in self.taylorseer_kwargs:
-            # If warmup_steps is not set in taylorseer_kwargs,
-            # set the same as warmup_steps for DBCache
-            self.taylorseer_kwargs["warmup_steps"] = (
-                self.warmup_steps if self.warmup_steps > 0 else 1
+        if "max_warmup_steps" not in self.taylorseer_kwargs:
+            # If max_warmup_steps is not set in taylorseer_kwargs,
+            # set the same as max_warmup_steps for DBCache
+            self.taylorseer_kwargs["max_warmup_steps"] = (
+                self.max_warmup_steps if self.max_warmup_steps > 0 else 1
             )
 
         # Only set n_derivatives as 2 or 3, which is enough for most cases.
         if "n_derivatives" not in self.taylorseer_kwargs:
             self.taylorseer_kwargs["n_derivatives"] = max(
-                2, min(3, self.taylorseer_kwargs["warmup_steps"])
+                2, min(3, self.taylorseer_kwargs["max_warmup_steps"])
             )
 
         if self.enable_taylorseer:
@@ -268,10 +271,31 @@ class DBCacheContext:
 
     @torch.compiler.disable
     def add_cached_step(self):
+        curr_cached_step = self.get_current_step()
         if not self.is_separate_cfg_step():
-            self.cached_steps.append(self.get_current_step())
+            if self.cached_steps:
+                prev_cached_step = self.cached_steps[-1]
+                if curr_cached_step - prev_cached_step == 1:
+                    if self.continuous_cached_steps == 0:
+                        self.continuous_cached_steps += 2
+                    else:
+                        self.continuous_cached_steps += 1
+            else:
+                self.continuous_cached_steps += 1
+
+            self.cached_steps.append(curr_cached_step)
         else:
-            self.cfg_cached_steps.append(self.get_current_step())
+            if self.cfg_cached_steps:
+                prev_cfg_cached_step = self.cfg_cached_steps[-1]
+                if curr_cached_step - prev_cfg_cached_step == 1:
+                    if self.cfg_continuous_cached_steps == 0:
+                        self.cfg_continuous_cached_steps += 2
+                    else:
+                        self.cfg_continuous_cached_steps += 1
+            else:
+                self.cfg_continuous_cached_steps += 1
+
+            self.cfg_cached_steps.append(curr_cached_step)
 
     @torch.compiler.disable
     def get_cached_steps(self):
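
The run-length bookkeeping in `add_cached_step` is easier to follow in isolation: the first cached step seen contributes 1, a cached step adjacent to the previous one extends the run (adding 2 when the counter had been reset to 0, otherwise 1), and a non-adjacent cached step leaves the counter alone until `get_can_use_cache` resets it. A small standalone sketch of that rule (hypothetical helper, not part of cache-dit):

```python
def replay_continuous_counter(cached_steps: list[int]) -> int:
    """Replay the continuous-cached-steps counter for a sequence of cached
    step indices, mirroring DBCacheContext.add_cached_step."""
    history: list[int] = []
    continuous = 0
    for step in cached_steps:
        if history:
            if step - history[-1] == 1:
                # Adjacent to the previous cached step: extend the run.
                continuous += 2 if continuous == 0 else 1
        else:
            # Very first cached step.
            continuous += 1
        history.append(step)
    return continuous


print(replay_continuous_counter([3, 4, 5]))  # 3
print(replay_continuous_counter([3, 5, 6]))  # 2
```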
@@ -301,7 +325,7 @@ class DBCacheContext:
 
     @torch.compiler.disable
     def is_in_warmup(self):
-        return self.get_current_step() < self.warmup_steps
+        return self.get_current_step() < self.max_warmup_steps
 
 
 @torch.compiler.disable
@@ -396,6 +420,27 @@ def get_max_cached_steps():
     return cache_context.max_cached_steps
 
 
+@torch.compiler.disable
+def get_max_continuous_cached_steps():
+    cache_context = get_current_cache_context()
+    assert cache_context is not None, "cache_context must be set before"
+    return cache_context.max_continuous_cached_steps
+
+
+@torch.compiler.disable
+def get_continuous_cached_steps():
+    cache_context = get_current_cache_context()
+    assert cache_context is not None, "cache_context must be set before"
+    return cache_context.continuous_cached_steps
+
+
+@torch.compiler.disable
+def get_cfg_continuous_cached_steps():
+    cache_context = get_current_cache_context()
+    assert cache_context is not None, "cache_context must be set before"
+    return cache_context.cfg_continuous_cached_steps
+
+
 @torch.compiler.disable
 def add_cached_step():
     cache_context = get_current_cache_context()
@@ -744,8 +789,8 @@ def are_two_tensors_similar(
     mean_t1 = t1.abs().mean()
 
     if parallelized:
-        mean_diff = primitives.all_reduce_sync(mean_diff, "avg")
-        mean_t1 = primitives.all_reduce_sync(mean_t1, "avg")
+        dist.all_reduce(mean_diff, op=dist.ReduceOp.AVG)
+        dist.all_reduce(mean_t1, op=dist.ReduceOp.AVG)
 
     # D = (t1 - t2) / t1 = 1 - (t2 / t1), if D = 0, then t1 = t2.
     # Further, if we assume that (H(t, 0) - H(t-1,0)) ~ 0, then,
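
Dropping `cache_dit.primitives` in favor of `torch.distributed` keeps only the one collective the cache logic needs. A hedged sketch of the parallelized branch shown above (assumes the default process group is already initialized and that the backend supports `ReduceOp.AVG`, e.g. NCCL):

```python
import torch
import torch.distributed as dist


def distributed_relative_diff(t1: torch.Tensor, t2: torch.Tensor) -> torch.Tensor:
    # Mirrors the parallelized branch of are_two_tensors_similar: average the
    # local statistics across ranks before forming the relative difference.
    mean_diff = (t1 - t2).abs().mean()
    mean_t1 = t1.abs().mean()
    if dist.is_available() and dist.is_initialized():
        dist.all_reduce(mean_diff, op=dist.ReduceOp.AVG)  # in-place average
        dist.all_reduce(mean_t1, op=dist.ReduceOp.AVG)
    return mean_diff / mean_t1
```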
@@ -1020,6 +1065,7 @@ def get_can_use_cache(
     if is_in_warmup():
         return False
 
+    # max cached steps
     max_cached_steps = get_max_cached_steps()
     if not is_separate_cfg_step():
         cached_steps = get_cached_steps()
@@ -1030,8 +1076,32 @@ def get_can_use_cache(
         if logger.isEnabledFor(logging.DEBUG):
             logger.debug(
                 f"{prefix}, max_cached_steps reached: {max_cached_steps}, "
-                "cannot use cache."
+                "can not use cache."
+            )
+        return False
+
+    # max continuous cached steps
+    max_continuous_cached_steps = get_max_continuous_cached_steps()
+    if not is_separate_cfg_step():
+        continuous_cached_steps = get_continuous_cached_steps()
+    else:
+        continuous_cached_steps = get_cfg_continuous_cached_steps()
+
+    if max_continuous_cached_steps >= 0 and (
+        continuous_cached_steps >= max_continuous_cached_steps
+    ):
+        if logger.isEnabledFor(logging.DEBUG):
+            logger.debug(
+                f"{prefix}, max_continuous_cached_steps "
+                f"reached: {max_continuous_cached_steps}, "
+                "can not use cache."
             )
+        # reset continuous cached steps stats
+        cache_context = get_current_cache_context()
+        if not is_separate_cfg_step():
+            cache_context.continuous_cached_steps = 0
+        else:
+            cache_context.cfg_continuous_cached_steps = 0
         return False
 
     if threshold is None or threshold <= 0.0:
cache_dit/cache_factory/cache_interface.py CHANGED
@@ -16,8 +16,9 @@ def enable_cache(
     # Cache context kwargs
     Fn_compute_blocks: int = 8,
    Bn_compute_blocks: int = 0,
-    warmup_steps: int = 8,
+    max_warmup_steps: int = 8,
     max_cached_steps: int = -1,
+    max_continuous_cached_steps: int = -1,
     residual_diff_threshold: float = 0.08,
     # Cache CFG or not
     do_separate_cfg: bool = False,
@@ -54,12 +55,15 @@ def enable_cache(
         Further fuses approximate information in the **last n** Transformer blocks to enhance
         prediction accuracy. These blocks act as an auto-scaler for approximate hidden states
         that use residual cache.
-    warmup_steps (`int`, *required*, defaults to 8):
+    max_warmup_steps (`int`, *required*, defaults to 8):
         DBCache does not apply the caching strategy when the number of running steps is less than
         or equal to this value, ensuring the model sufficiently learns basic features during warmup.
     max_cached_steps (`int`, *required*, defaults to -1):
         DBCache disables the caching strategy when the previous cached steps exceed this value to
         prevent precision degradation.
+    max_continuous_cached_steps (`int`, *required*, defaults to -1):
+        DBCache disables the caching strategy when the previous continuous cached steps exceed this value to
+        prevent precision degradation.
     residual_diff_threshold (`float`, *required*, defaults to 0.08):
         The value of the residual diff threshold; a higher value leads to faster performance at the
         cost of lower precision.
@@ -106,8 +110,11 @@ def enable_cache(
     cache_context_kwargs["cache_type"] = CacheType.DBCache
     cache_context_kwargs["Fn_compute_blocks"] = Fn_compute_blocks
     cache_context_kwargs["Bn_compute_blocks"] = Bn_compute_blocks
-    cache_context_kwargs["warmup_steps"] = warmup_steps
+    cache_context_kwargs["max_warmup_steps"] = max_warmup_steps
     cache_context_kwargs["max_cached_steps"] = max_cached_steps
+    cache_context_kwargs["max_continuous_cached_steps"] = (
+        max_continuous_cached_steps
+    )
     cache_context_kwargs["residual_diff_threshold"] = residual_diff_threshold
     cache_context_kwargs["do_separate_cfg"] = do_separate_cfg
     cache_context_kwargs["cfg_compute_first"] = cfg_compute_first
cache_dit/cache_factory/taylorseer.py CHANGED
@@ -6,13 +6,13 @@ class TaylorSeer:
     def __init__(
         self,
         n_derivatives=2,
-        warmup_steps=1,
+        max_warmup_steps=1,
         skip_interval_steps=1,
         compute_step_map=None,
     ):
         self.n_derivatives = n_derivatives
         self.ORDER = n_derivatives + 1
-        self.warmup_steps = warmup_steps
+        self.max_warmup_steps = max_warmup_steps
         self.skip_interval_steps = skip_interval_steps
         self.compute_step_map = compute_step_map
         self.reset_cache()
@@ -32,8 +32,9 @@ class TaylorSeer:
         if self.compute_step_map is not None:
             return self.compute_step_map[step]
         if (
-            step < self.warmup_steps
-            or (step - self.warmup_steps + 1) % self.skip_interval_steps == 0
+            step < self.max_warmup_steps
+            or (step - self.max_warmup_steps + 1) % self.skip_interval_steps
+            == 0
         ):
             return True
         return False
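
The renamed parameter leaves the schedule unchanged: every step below `max_warmup_steps` is a full compute, and afterwards every `skip_interval_steps`-th step is. A standalone sketch of that condition (reimplemented here for illustration, not imported from TaylorSeer):

```python
def is_full_compute_step(
    step: int, max_warmup_steps: int = 3, skip_interval_steps: int = 2
) -> bool:
    # Same condition as the diff above, minus the compute_step_map shortcut.
    return (
        step < max_warmup_steps
        or (step - max_warmup_steps + 1) % skip_interval_steps == 0
    )


print([s for s in range(10) if is_full_compute_step(s)])  # [0, 1, 2, 4, 6, 8]
```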
cache_dit/cache_factory/utils.py CHANGED
@@ -9,7 +9,7 @@ def load_cache_options_from_yaml(yaml_file_path):
 
         required_keys = [
             "cache_type",
-            "warmup_steps",
+            "max_warmup_steps",
             "max_cached_steps",
             "Fn_compute_blocks",
             "Bn_compute_blocks",
cache_dit/utils.py CHANGED
@@ -27,22 +27,26 @@ class CacheStats:
 
 
 def summary(
-    pipe: DiffusionPipeline, details: bool = False, logging: bool = True
-):
+    pipe_or_transformer: DiffusionPipeline | torch.nn.Module,
+    details: bool = False,
+    logging: bool = True,
+) -> CacheStats:
     cache_stats = CacheStats()
-    pipe_cls_name = pipe.__class__.__name__
+    cls_name = pipe_or_transformer.__class__.__name__
+    if isinstance(pipe_or_transformer, DiffusionPipeline):
+        transformer = pipe_or_transformer.transformer
+    else:
+        transformer = pipe_or_transformer
 
-    if hasattr(pipe, "_cache_options"):
-        cache_options = pipe._cache_options
+    if hasattr(transformer, "_cache_context_kwargs"):
+        cache_options = transformer._cache_context_kwargs
         cache_stats.cache_options = cache_options
         if logging:
-            print(f"\n🤗Cache Options: {pipe_cls_name}\n\n{cache_options}")
+            print(f"\n🤗Cache Options: {cls_name}\n\n{cache_options}")
 
-    if hasattr(pipe.transformer, "_cached_steps"):
-        cached_steps: list[int] = pipe.transformer._cached_steps
-        residual_diffs: dict[str, float] = dict(
-            pipe.transformer._residual_diffs
-        )
+    if hasattr(transformer, "_cached_steps"):
+        cached_steps: list[int] = transformer._cached_steps
+        residual_diffs: dict[str, float] = dict(transformer._residual_diffs)
         cache_stats.cached_steps = cached_steps
         cache_stats.residual_diffs = residual_diffs
 
@@ -57,7 +61,7 @@ def summary(
             qmax = np.max(diffs_values)
 
             print(
-                f"\n⚡️Cache Steps and Residual Diffs Statistics: {pipe_cls_name}\n"
+                f"\n⚡️Cache Steps and Residual Diffs Statistics: {cls_name}\n"
             )
 
             print(
@@ -74,9 +78,7 @@ def summary(
             print("")
 
             if details:
-                print(
-                    f"📚Cache Steps and Residual Diffs Details: {pipe_cls_name}\n"
-                )
+                print(f"📚Cache Steps and Residual Diffs Details: {cls_name}\n")
                 pprint(
                     f"Cache Steps: {len(cached_steps)}, {cached_steps}",
                 )
@@ -85,10 +87,10 @@ def summary(
                     compact=True,
                 )
 
-    if hasattr(pipe.transformer, "_cfg_cached_steps"):
-        cfg_cached_steps: list[int] = pipe.transformer._cfg_cached_steps
+    if hasattr(transformer, "_cfg_cached_steps"):
+        cfg_cached_steps: list[int] = transformer._cfg_cached_steps
         cfg_residual_diffs: dict[str, float] = dict(
-            pipe.transformer._cfg_residual_diffs
+            transformer._cfg_residual_diffs
         )
         cache_stats.cfg_cached_steps = cfg_cached_steps
         cache_stats.cfg_residual_diffs = cfg_residual_diffs
@@ -104,7 +106,7 @@ def summary(
             qmax = np.max(cfg_diffs_values)
 
             print(
-                f"\n⚡️CFG Cache Steps and Residual Diffs Statistics: {pipe_cls_name}\n"
+                f"\n⚡️CFG Cache Steps and Residual Diffs Statistics: {cls_name}\n"
             )
 
             print(
@@ -122,7 +124,7 @@ def summary(
 
             if details:
                 print(
-                    f"📚CFG Cache Steps and Residual Diffs Details: {pipe_cls_name}\n"
+                    f"📚CFG Cache Steps and Residual Diffs Details: {cls_name}\n"
                 )
                 pprint(
                     f"CFG Cache Steps: {len(cfg_cached_steps)}, {cfg_cached_steps}",
@@ -149,9 +151,10 @@ def strify(pipe_or_stats: DiffusionPipeline | CacheStats):
 
     cache_type_str = (
         f"DBCACHE_F{cache_options['Fn_compute_blocks']}"
-        f"B{cache_options['Bn_compute_blocks']}"
-        f"W{cache_options['warmup_steps']}"
+        f"B{cache_options['Bn_compute_blocks']}_"
+        f"W{cache_options['max_warmup_steps']}"
         f"M{max(0, cache_options['max_cached_steps'])}"
+        f"MC{max(0, cache_options['max_continuous_cached_steps'])}_"
        f"T{int(cache_options['enable_taylorseer'])}"
         f"O{cache_options['taylorseer_kwargs']['n_derivatives']}_"
         f"R{cache_options['residual_diff_threshold']}_"
cache_dit-0.2.24.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cache_dit
-Version: 0.2.23
+Version: 0.2.24
 Summary: 🤗 CacheDiT: An Unified and Training-free Cache Acceleration Toolbox for Diffusion Transformers
 Author: DefTruth, vipshop.com, etc.
 Maintainer: DefTruth, vipshop.com, etc
@@ -59,12 +59,13 @@ Dynamic: requires-python
 </p>
 <p align="center">
 🎉Now, <b>cache-dit</b> covers <b>Most</b> mainstream <b>Diffusers'</b> Pipelines</b>🎉<br>
-🔥<b><a href="#supported">Qwen-Image</a> | <a href="#supported">FLUX.1</a> | <a href="#supported">Wan 2.1</a> | <a href="#supported"> ... </a> | <a href="#supported">CogVideoX</a></b>🔥
+🔥<b><a href="#supported">Qwen-Image</a> | <a href="#supported">FLUX.1</a> | <a href="#supported">Wan 2.1/2.2</a> | <a href="#supported"> ... </a> | <a href="#supported">CogVideoX</a></b>🔥
 </p>
 </div>
 
 ## 🔥News
 
+- [2025-08-26] 🎉[**Wan2.2**](https://github.com/Wan-Video) **1.5x⚡️** speedup! Please check [run_wan_2.2.py](./examples/run_wan_2.2.py) as an example.
 - [2025-08-19] 🔥[**Qwen-Image-Edit**](https://github.com/QwenLM/Qwen-Image) **2x⚡️** speedup! Check example [run_qwen_image_edit.py](./examples/run_qwen_image_edit.py).
 - [2025-08-12] 🎉First caching mechanism in [QwenLM/Qwen-Image](https://github.com/QwenLM/Qwen-Image) with **[cache-dit](https://github.com/vipshop/cache-dit)**, check the [PR](https://github.com/QwenLM/Qwen-Image/pull/61).
 - [2025-08-11] 🔥[**Qwen-Image**](https://github.com/QwenLM/Qwen-Image) **1.8x⚡️** speedup! Please refer [run_qwen_image.py](./examples/run_qwen_image.py) as an example.
@@ -119,6 +120,7 @@ Currently, **cache-dit** library supports almost **Any** Diffusion Transformers
 - [🚀FLUX.1-Kontext-dev](https://github.com/vipshop/cache-dit/raw/main/examples)
 - [🚀CogVideoX](https://github.com/vipshop/cache-dit/raw/main/examples)
 - [🚀CogVideoX1.5](https://github.com/vipshop/cache-dit/raw/main/examples)
+- [🚀Wan2.2-T2V](https://github.com/vipshop/cache-dit/raw/main/examples)
 - [🚀Wan2.1-T2V](https://github.com/vipshop/cache-dit/raw/main/examples)
 - [🚀Wan2.1-FLF2V](https://github.com/vipshop/cache-dit/raw/main/examples)
 - [🚀HunyuanVideo](https://github.com/vipshop/cache-dit/raw/main/examples)
@@ -166,7 +168,7 @@ cache_dit.enable_cache(pipe)
 output = pipe(...)
 ```
 
-### 🔥BlockAdapter: Cache Acceleration for Custom Diffusion Models
+### 🔥Automatic Block Adapter
 
 But in some cases, you may have a **modified** Diffusion Pipeline or Transformer that is not located in the diffusers library or not officially supported by **cache-dit** at this time. The **BlockAdapter** can help you solve these problems. Please refer to [Qwen-Image w/ BlockAdapter](./examples/run_qwen_image_adapter.py) as an example.
 
@@ -181,7 +183,7 @@ cache_dit.enable_cache(
     forward_pattern=ForwardPattern.Pattern_1,
 )
 
-# Or, manualy setup transformer configurations.
+# Or, manually setup transformer configurations.
 cache_dit.enable_cache(
     BlockAdapter(
         pipe=pipe, # Qwen-Image, etc.
@@ -238,7 +240,7 @@ cache_dit.enable_cache(pipe)
 # Custom options, F8B8, higher precision
 cache_dit.enable_cache(
     pipe,
-    warmup_steps=8,  # steps do not cache
+    max_warmup_steps=8,  # steps do not cache
     max_cached_steps=-1,  # -1 means no limit
     Fn_compute_blocks=8,  # Fn, F8, etc.
     Bn_compute_blocks=8,  # Bn, B8, etc.
@@ -297,7 +299,7 @@ cache_dit.enable_cache(
     taylorseer_kwargs={
         "n_derivatives": 2,  # default is 2.
     },
-    warmup_steps=3,  # prefer: >= n_derivatives + 1
+    max_warmup_steps=3,  # prefer: >= n_derivatives + 1
     residual_diff_threshold=0.12
 )
 ```
cache_dit-0.2.24.dist-info/RECORD CHANGED
@@ -1,18 +1,17 @@
 cache_dit/__init__.py,sha256=KwhX9NfYkWSvDFuuUVeVjcuiZiGS_22y386l8j4afMo,905
-cache_dit/_version.py,sha256=6GZdGbiFdhndXqR5oFLOd8VGzUvRkESP-NyStAZWYUw,706
+cache_dit/_version.py,sha256=AZPr2DJJAwMsYN7GLT_kjMvP33B8Rgy4O_7h4o_T_88,706
 cache_dit/logger.py,sha256=0zsu42hN-3-rgGC_C29ms1IvVpV4_b4_SwJCKSenxBE,4304
-cache_dit/primitives.py,sha256=A2iG9YLot3gOsZSPp-_gyjqjLgJvWQRx8aitD4JQ23Y,3877
-cache_dit/utils.py,sha256=3UgVhfmTFG28w6CV-Rfxp5u1uzLrRozocHwLCTGiQ5M,5865
+cache_dit/utils.py,sha256=kzwF98nzfzIFHSLtCx7Vq4a9aTW42lY-Bth7Oi4jAhg,6083
 cache_dit/cache_factory/.gitignore,sha256=5Cb-qT9wsTUoMJ7vACDF7ZcLpAXhi5v-xdcWSRit988,23
 cache_dit/cache_factory/__init__.py,sha256=evWenCin1kuBGa6W5BCKMrDZc1C1R2uVPSg0BjXgdXE,499
-cache_dit/cache_factory/cache_adapters.py,sha256=QTSwjdCmHDeF80TLp6D3KhzQS_oMPna0_bESgJBrdkg,23978
+cache_dit/cache_factory/cache_adapters.py,sha256=Yugqljm9tm615srM2BGQlR_tA0QiZo3PbLPceObh4dQ,25988
 cache_dit/cache_factory/cache_blocks.py,sha256=ZeazBsYvLIjI5M_OnLL2xP2W7zMeM0rxVfBBwIVHBRs,18661
-cache_dit/cache_factory/cache_context.py,sha256=4thx9NYxVaYZ_Nr2quUVE8bsNmTsXhZK0F960rccOc8,39000
-cache_dit/cache_factory/cache_interface.py,sha256=PohG_2oy747O37YSsWz_DwxxTXN7ORhQatyEbg_6fQs,8045
+cache_dit/cache_factory/cache_context.py,sha256=Cexr1_uwEkX7v8gB7DSyhCX0SI2dqS_e_ccTR16G2es,41738
+cache_dit/cache_factory/cache_interface.py,sha256=ri8wAxmHOsDW8c6qYP6VquOJQaTSXuOchWXG3PdcYQM,8434
 cache_dit/cache_factory/cache_types.py,sha256=FIFa6ZBfvvSMMHyBBhvarvgg2Y2wbRgITcG_uGylGe0,991
 cache_dit/cache_factory/forward_pattern.py,sha256=B2YeqV2t_zo2Ar8m7qimPBjwQgoXHGp2grPZmEAhi8s,1286
-cache_dit/cache_factory/taylorseer.py,sha256=WeK2WlAJa4Px_pnAKokmnZXeqQYylQkPw4-EDqBIqeQ,3770
-cache_dit/cache_factory/utils.py,sha256=YGtn02O3fVlrfQ32gGV4WAtTRvzzwSXNxzP_FmnE2Uk,1867
+cache_dit/cache_factory/taylorseer.py,sha256=etSUIZzDvqW3ScKCbccTPcFaSmxV1T-xAXdk-p3e3wk,3802
+cache_dit/cache_factory/utils.py,sha256=XkVM9AXcB9zYq8-S8QKAsGz80r3tA6U3lBNGDGeHOe4,1871
 cache_dit/cache_factory/patch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cache_dit/cache_factory/patch/flux.py,sha256=iNQ-1RlOgXupZ4uPiEvJ__Ro6vKT_fOKja9JrpMrO78,8998
 cache_dit/compile/__init__.py,sha256=FcTVzCeyypl-mxlc59_ehHL3lBNiDAFsXuRoJ-5Cfi0,56
@@ -25,9 +24,9 @@ cache_dit/metrics/fid.py,sha256=9Ivtazl6mW0Bon2VXa-Ia5Xj2ewxRD3V1Qkd69zYM3Y,1706
 cache_dit/metrics/inception.py,sha256=pBVe2X6ylLPIXTG4-GWDM9DWnCviMJbJ45R3ulhktR0,12759
 cache_dit/metrics/lpips.py,sha256=I2qCNi6qJh5TRsaIsdxO0WoRX1DN7U_H3zS0oCSahYM,1032
 cache_dit/metrics/metrics.py,sha256=8jvM1sF-nDxUuwCRy44QEoo4dYVLCQVh1QyAMs4eaQY,27840
-cache_dit-0.2.23.dist-info/licenses/LICENSE,sha256=Dqb07Ik2dV41s9nIdMUbiRWEfDqo7-dQeRiY7kPO8PE,3769
-cache_dit-0.2.23.dist-info/METADATA,sha256=Dq2f8TlyTmv36otIJ2F-fRGkJlZmpW2SY6O14P2AYKo,19772
-cache_dit-0.2.23.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-cache_dit-0.2.23.dist-info/entry_points.txt,sha256=FX2gysXaZx6NeK1iCLMcIdP8Q4_qikkIHtEmi3oWn8o,65
-cache_dit-0.2.23.dist-info/top_level.txt,sha256=ZJDydonLEhujzz0FOkVbO-BqfzO9d_VqRHmZU-3MOZo,10
-cache_dit-0.2.23.dist-info/RECORD,,
+cache_dit-0.2.24.dist-info/licenses/LICENSE,sha256=Dqb07Ik2dV41s9nIdMUbiRWEfDqo7-dQeRiY7kPO8PE,3769
+cache_dit-0.2.24.dist-info/METADATA,sha256=zq_bGjQ_X--m1njAbOob--MwOpTDlUlAzZ3u_MiNiFM,19977
+cache_dit-0.2.24.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+cache_dit-0.2.24.dist-info/entry_points.txt,sha256=FX2gysXaZx6NeK1iCLMcIdP8Q4_qikkIHtEmi3oWn8o,65
+cache_dit-0.2.24.dist-info/top_level.txt,sha256=ZJDydonLEhujzz0FOkVbO-BqfzO9d_VqRHmZU-3MOZo,10
+cache_dit-0.2.24.dist-info/RECORD,,
cache_dit/primitives.py DELETED
@@ -1,152 +0,0 @@
-# Adapted from: https://github.com/chengzeyi/ParaAttention/blob/main/src/para_attn/primitives.py
-
-from typing import List, Optional, Tuple, Union
-
-import torch
-import torch.distributed as dist
-
-if dist.is_available():
-    import torch.distributed._functional_collectives as ft_c
-    import torch.distributed.distributed_c10d as c10d
-else:
-    ft_c = None
-    c10d = None
-
-
-def get_group(group=None):
-    if group is None:
-        group = c10d._get_default_group()
-
-    if isinstance(group, dist.ProcessGroup):
-        pg: Union[dist.ProcessGroup, List[dist.ProcessGroup]] = group
-    else:
-        pg = group.get_group()
-
-    return pg
-
-
-def get_world_size(group=None):
-    pg = get_group(group)
-    return dist.get_world_size(pg)
-
-
-def get_rank(group=None):
-    pg = get_group(group)
-    return dist.get_rank(pg)
-
-
-def _maybe_wait(tensor: torch.Tensor) -> torch.Tensor:
-    """
-    When tracing the code, the result tensor is not an AsyncCollectiveTensor,
-    so we cannot call ``wait()``.
-    """
-    if isinstance(tensor, ft_c.AsyncCollectiveTensor):
-        return tensor.wait()
-    return tensor
-
-
-def all_gather_tensor_sync(x, *args, group=None, **kwargs):
-    group = get_group(group)
-    x_shape = x.shape
-    x = x.flatten()
-    x_numel = x.numel()
-    x = ft_c.all_gather_tensor(x, *args, group=group, **kwargs)
-    x = _maybe_wait(x)
-    x_shape = list(x_shape)
-    x_shape[0] *= x.numel() // x_numel
-    x = x.reshape(x_shape)
-    return x
-
-
-def all_gather_tensor_autograd_sync(x, *args, group=None, **kwargs):
-    group = get_group(group)
-    x_shape = x.shape
-    x = x.flatten()
-    x_numel = x.numel()
-    x = ft_c.all_gather_tensor_autograd(x, *args, group=group, **kwargs)
-    x = _maybe_wait(x)
-    x_shape = list(x_shape)
-    x_shape[0] *= x.numel() // x_numel
-    x = x.reshape(x_shape)
-    return x
-
-
-def all_to_all_single_sync(x, *args, **kwargs):
-    x_shape = x.shape
-    x = x.flatten()
-    x = ft_c.all_to_all_single(x, *args, **kwargs)
-    x = _maybe_wait(x)
-    x = x.reshape(x_shape)
-    return x
-
-
-def all_to_all_single_autograd_sync(x, *args, **kwargs):
-    x_shape = x.shape
-    x = x.flatten()
-    x = ft_c.all_to_all_single_autograd(x, *args, **kwargs)
-    x = _maybe_wait(x)
-    x = x.reshape(x_shape)
-    return x
-
-
-def all_reduce_sync(x, *args, group=None, **kwargs):
-    group = get_group(group)
-    x = ft_c.all_reduce(x, *args, group=group, **kwargs)
-    x = _maybe_wait(x)
-    return x
-
-
-def get_buffer(
-    shape_or_tensor: Union[Tuple[int], torch.Tensor],
-    *,
-    repeats: int = 1,
-    dim: int = 0,
-    dtype: Optional[torch.dtype] = None,
-    device: Optional[torch.device] = None,
-    group=None,
-) -> torch.Tensor:
-    if repeats is None:
-        repeats = get_world_size(group)
-
-    if isinstance(shape_or_tensor, torch.Tensor):
-        shape = shape_or_tensor.shape
-        dtype = shape_or_tensor.dtype
-        device = shape_or_tensor.device
-
-    assert dtype is not None
-    assert device is not None
-
-    shape = list(shape)
-    if repeats > 1:
-        shape[dim] *= repeats
-
-    buffer = torch.empty(shape, dtype=dtype, device=device)
-    return buffer
-
-
-def get_assigned_chunk(
-    tensor: torch.Tensor,
-    dim: int = 0,
-    idx: Optional[int] = None,
-    group=None,
-) -> torch.Tensor:
-    if idx is None:
-        idx = get_rank(group)
-    world_size = get_world_size(group)
-    total_size = tensor.shape[dim]
-    assert (
-        total_size % world_size == 0
-    ), f"tensor.shape[{dim}]={total_size} is not divisible by world_size={world_size}"
-    return tensor.chunk(world_size, dim=dim)[idx]
-
-
-def get_complete_tensor(
-    tensor: torch.Tensor,
-    *,
-    dim: int = 0,
-    group=None,
-) -> torch.Tensor:
-    tensor = tensor.transpose(0, dim).contiguous()
-    output_tensor = all_gather_tensor_sync(tensor, gather_dim=0, group=group)
-    output_tensor = output_tensor.transpose(0, dim)
-    return output_tensor