cache_dit-1.0.1-py3-none-any.whl → cache_dit-1.0.2-py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their public registry and is provided for informational purposes only.
- cache_dit/_version.py +2 -2
- cache_dit/cache_factory/block_adapters/__init__.py +24 -8
- cache_dit/cache_factory/cache_adapters/cache_adapter.py +10 -6
- cache_dit/cache_factory/cache_blocks/pattern_3_4_5.py +99 -34
- cache_dit/cache_factory/cache_blocks/pattern_base.py +80 -35
- cache_dit/cache_factory/cache_contexts/__init__.py +1 -0
- cache_dit/cache_factory/cache_contexts/cache_manager.py +9 -2
- cache_dit/cache_factory/cache_interface.py +38 -13
- cache_dit/cache_factory/patch_functors/__init__.py +3 -0
- cache_dit/cache_factory/patch_functors/functor_qwen_image_controlnet.py +263 -0
- cache_dit/quantize/quantize_ao.py +4 -0
- {cache_dit-1.0.1.dist-info → cache_dit-1.0.2.dist-info}/METADATA +69 -63
- {cache_dit-1.0.1.dist-info → cache_dit-1.0.2.dist-info}/RECORD +17 -16
- {cache_dit-1.0.1.dist-info → cache_dit-1.0.2.dist-info}/WHEEL +0 -0
- {cache_dit-1.0.1.dist-info → cache_dit-1.0.2.dist-info}/entry_points.txt +0 -0
- {cache_dit-1.0.1.dist-info → cache_dit-1.0.2.dist-info}/licenses/LICENSE +0 -0
- {cache_dit-1.0.1.dist-info → cache_dit-1.0.2.dist-info}/top_level.txt +0 -0
cache_dit/_version.py
CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '1.0.1'
-__version_tuple__ = version_tuple = (1, 0, 1)
+__version__ = version = '1.0.2'
+__version_tuple__ = version_tuple = (1, 0, 2)
 
 __commit_id__ = commit_id = None

cache_dit/cache_factory/block_adapters/__init__.py
CHANGED

@@ -143,14 +143,30 @@ def qwenimage_adapter(pipe, **kwargs) -> BlockAdapter:
     from diffusers import QwenImageTransformer2DModel
 
     assert isinstance(pipe.transformer, QwenImageTransformer2DModel)
-    return BlockAdapter(
-        pipe=pipe,
-        transformer=pipe.transformer,
-        blocks=pipe.transformer.transformer_blocks,
-        forward_pattern=ForwardPattern.Pattern_1,
-        has_separate_cfg=True,
-        **kwargs,
-    )
+
+    pipe_cls_name: str = pipe.__class__.__name__
+    if pipe_cls_name.startswith("QwenImageControlNet"):
+        from cache_dit.cache_factory.patch_functors import (
+            QwenImageControlNetPatchFunctor,
+        )
+
+        return BlockAdapter(
+            pipe=pipe,
+            transformer=pipe.transformer,
+            blocks=pipe.transformer.transformer_blocks,
+            forward_pattern=ForwardPattern.Pattern_1,
+            patch_functor=QwenImageControlNetPatchFunctor(),
+            has_separate_cfg=True,
+        )
+    else:
+        return BlockAdapter(
+            pipe=pipe,
+            transformer=pipe.transformer,
+            blocks=pipe.transformer.transformer_blocks,
+            forward_pattern=ForwardPattern.Pattern_1,
+            has_separate_cfg=True,
+            **kwargs,
+        )
 
 
 @BlockAdapterRegistry.register("LTX")
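
The adapter registration above is what `cache_dit.enable_cache(...)` consults for Qwen-Image pipelines. A hedged illustration only: the fragment assumes `pipe` is an already-built QwenImage ControlNet pipeline, i.e. any class whose name starts with "QwenImageControlNet", such as one built from the InstantX/Qwen-Image-ControlNet-Inpainting weights mentioned in the README later in this diff.

```python
import cache_dit

# `pipe` is an assumed, pre-constructed QwenImage ControlNet pipeline.
# The registered qwenimage_adapter detects the "QwenImageControlNet" class-name
# prefix and routes it through QwenImageControlNetPatchFunctor before caching.
cache_dit.enable_cache(pipe)
output = pipe(...)  # then call the pipeline as usual
```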

cache_dit/cache_factory/cache_adapters/cache_adapter.py
CHANGED

@@ -14,10 +14,6 @@ from cache_dit.cache_factory.cache_contexts import CachedContextManager
 from cache_dit.cache_factory.cache_contexts import BasicCacheConfig
 from cache_dit.cache_factory.cache_contexts import CalibratorConfig
 from cache_dit.cache_factory.cache_blocks import CachedBlocks
-from cache_dit.cache_factory.cache_blocks import (
-    patch_cached_stats,
-    remove_cached_stats,
-)
 from cache_dit.logger import init_logger
 
 logger = init_logger(__name__)
@@ -167,7 +163,7 @@ class CachedAdapter:
         cls,
         block_adapter: BlockAdapter,
         **cache_context_kwargs,
-    ) ->
+    ) -> Tuple[List[str], List[Dict[str, Any]]]:
 
         BlockAdapter.assert_normalized(block_adapter)
 
@@ -221,7 +217,7 @@ class CachedAdapter:
 
         cls.apply_params_hooks(block_adapter, contexts_kwargs)
 
-        return
+        return flatten_contexts, contexts_kwargs
 
     @classmethod
     def modify_context_params(
@@ -470,6 +466,10 @@ class CachedAdapter:
         cls,
         block_adapter: BlockAdapter,
     ):
+        from cache_dit.cache_factory.cache_blocks import (
+            patch_cached_stats,
+        )
+
         cache_manager = block_adapter.pipe._cache_manager
 
         for i in range(len(block_adapter.transformer)):
@@ -557,6 +557,10 @@ class CachedAdapter:
         )
 
         # release stats hooks
+        from cache_dit.cache_factory.cache_blocks import (
+            remove_cached_stats,
+        )
+
         cls.release_hooks(
             pipe_or_adapter,
             remove_cached_stats,

cache_dit/cache_factory/cache_blocks/pattern_3_4_5.py
CHANGED

@@ -1,6 +1,9 @@
 import torch
 
 from cache_dit.cache_factory import ForwardPattern
+from cache_dit.cache_factory.cache_contexts.cache_manager import (
+    CacheNotExistError,
+)
 from cache_dit.cache_factory.cache_blocks.pattern_base import (
     CachedBlocks_Pattern_Base,
 )
@@ -16,6 +19,70 @@ class CachedBlocks_Pattern_3_4_5(CachedBlocks_Pattern_Base):
         ForwardPattern.Pattern_5,
     ]
 
+    def call_blocks(
+        self,
+        hidden_states: torch.Tensor,
+        *args,
+        **kwargs,
+    ):
+        # Call all blocks to process the hidden states without cache.
+        new_encoder_hidden_states = None
+        for block in self.transformer_blocks:
+            hidden_states = block(
+                hidden_states,
+                *args,
+                **kwargs,
+            )
+            hidden_states, new_encoder_hidden_states = self._process_outputs(
+                hidden_states
+            )
+
+        return hidden_states, new_encoder_hidden_states
+
+    @torch.compiler.disable
+    def _process_outputs(
+        self, hidden_states: torch.Tensor | tuple
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        # Process the outputs for the block.
+        new_encoder_hidden_states = None
+        if not isinstance(hidden_states, torch.Tensor):  # Pattern 4, 5
+            if len(hidden_states) == 2:
+                if isinstance(hidden_states[1], torch.Tensor):
+                    hidden_states, new_encoder_hidden_states = hidden_states
+                    if not self.forward_pattern.Return_H_First:
+                        hidden_states, new_encoder_hidden_states = (
+                            new_encoder_hidden_states,
+                            hidden_states,
+                        )
+                elif isinstance(hidden_states[0], torch.Tensor):
+                    hidden_states = hidden_states[0]
+                else:
+                    raise ValueError("Unexpected hidden_states format.")
+            else:
+                assert (
+                    len(hidden_states) == 1
+                ), f"Unexpected output length: {len(hidden_states)}"
+                hidden_states = hidden_states[0]
+        return hidden_states, new_encoder_hidden_states
+
+    @torch.compiler.disable
+    def _forward_outputs(
+        self,
+        hidden_states: torch.Tensor,
+        new_encoder_hidden_states: torch.Tensor | None,
+    ) -> (
+        torch.Tensor
+        | tuple[torch.Tensor, torch.Tensor]
+        | tuple[torch.Tensor, None]
+    ):
+        if self.forward_pattern.Return_H_Only:
+            return hidden_states
+        else:
+            if self.forward_pattern.Return_H_First:
+                return (hidden_states, new_encoder_hidden_states)
+            else:
+                return (new_encoder_hidden_states, hidden_states)
+
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -23,8 +90,19 @@ class CachedBlocks_Pattern_3_4_5(CachedBlocks_Pattern_Base):
         **kwargs,
     ):
         # Use it's own cache context.
-        self.cache_manager.set_context(self.cache_context)
-        self._check_cache_params()
+        try:
+            self.cache_manager.set_context(self.cache_context)
+            self._check_cache_params()
+        except CacheNotExistError as e:
+            logger.warning(f"Cache context not exist: {e}, skip cache.")
+            hidden_states, new_encoder_hidden_states = self.call_blocks(
+                hidden_states,
+                *args,
+                **kwargs,
+            )
+            return self._forward_outputs(
+                hidden_states, new_encoder_hidden_states
+            )
 
         original_hidden_states = hidden_states
         # Call first `n` blocks to process the hidden states for
@@ -35,7 +113,9 @@ class CachedBlocks_Pattern_3_4_5(CachedBlocks_Pattern_Base):
             **kwargs,
         )
 
-        Fn_hidden_states_residual = hidden_states - original_hidden_states
+        Fn_hidden_states_residual = hidden_states - original_hidden_states.to(
+            hidden_states.device
+        )
         del original_hidden_states
 
         self.cache_manager.mark_step_begin()
@@ -147,15 +227,7 @@ class CachedBlocks_Pattern_3_4_5(CachedBlocks_Pattern_Base):
 
         torch._dynamo.graph_break()
 
-        return (
-            hidden_states
-            if self.forward_pattern.Return_H_Only
-            else (
-                (hidden_states, new_encoder_hidden_states)
-                if self.forward_pattern.Return_H_First
-                else (new_encoder_hidden_states, hidden_states)
-            )
-        )
+        return self._forward_outputs(hidden_states, new_encoder_hidden_states)
 
     def call_Fn_blocks(
         self,
@@ -170,13 +242,9 @@ class CachedBlocks_Pattern_3_4_5(CachedBlocks_Pattern_Base):
             *args,
             **kwargs,
         )
-        if not isinstance(hidden_states, torch.Tensor):
-            hidden_states, new_encoder_hidden_states = hidden_states
-            if not self.forward_pattern.Return_H_First:
-                hidden_states, new_encoder_hidden_states = (
-                    new_encoder_hidden_states,
-                    hidden_states,
-                )
+        hidden_states, new_encoder_hidden_states = self._process_outputs(
+            hidden_states
+        )
 
         return hidden_states, new_encoder_hidden_states
 
@@ -194,16 +262,16 @@ class CachedBlocks_Pattern_3_4_5(CachedBlocks_Pattern_Base):
             *args,
             **kwargs,
         )
-        if not isinstance(hidden_states, torch.Tensor):
-            hidden_states, new_encoder_hidden_states = hidden_states
-            if not self.forward_pattern.Return_H_First:
-                hidden_states, new_encoder_hidden_states = (
-                    new_encoder_hidden_states,
-                    hidden_states,
-                )
+
+        hidden_states, new_encoder_hidden_states = self._process_outputs(
+            hidden_states
+        )
+
         # compute hidden_states residual
         hidden_states = hidden_states.contiguous()
-        hidden_states_residual = hidden_states - original_hidden_states
+        hidden_states_residual = hidden_states - original_hidden_states.to(
+            hidden_states.device
+        )
 
         return (
             hidden_states,
@@ -227,12 +295,9 @@ class CachedBlocks_Pattern_3_4_5(CachedBlocks_Pattern_Base):
             *args,
             **kwargs,
         )
-        if not isinstance(hidden_states, torch.Tensor):
-            hidden_states, new_encoder_hidden_states = hidden_states
-            if not self.forward_pattern.Return_H_First:
-                hidden_states, new_encoder_hidden_states = (
-                    new_encoder_hidden_states,
-                    hidden_states,
-                )
+
+        hidden_states, new_encoder_hidden_states = self._process_outputs(
+            hidden_states
+        )
 
         return hidden_states, new_encoder_hidden_states

cache_dit/cache_factory/cache_blocks/pattern_base.py
CHANGED

@@ -1,12 +1,11 @@
 import inspect
-import asyncio
 import torch
 import torch.distributed as dist
 
-from typing import List
 from cache_dit.cache_factory.cache_contexts.cache_context import CachedContext
 from cache_dit.cache_factory.cache_contexts.cache_manager import (
     CachedContextManager,
+    CacheNotExistError,
 )
 from cache_dit.cache_factory import ForwardPattern
 from cache_dit.logger import init_logger
@@ -47,7 +46,6 @@ class CachedBlocks_Pattern_Base(torch.nn.Module):
         self.cache_prefix = cache_prefix
         self.cache_context = cache_context
         self.cache_manager = cache_manager
-        self.pending_tasks: List[asyncio.Task] = []
 
         self._check_forward_pattern()
         logger.info(
@@ -111,6 +109,62 @@ class CachedBlocks_Pattern_Base(torch.nn.Module):
             f"the number of transformer blocks {len(self.transformer_blocks)}"
         )
 
+    def call_blocks(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        *args,
+        **kwargs,
+    ):
+        # Call all blocks to process the hidden states without cache.
+        for block in self.transformer_blocks:
+            hidden_states = block(
+                hidden_states,
+                encoder_hidden_states,
+                *args,
+                **kwargs,
+            )
+            if not isinstance(hidden_states, torch.Tensor):
+                hidden_states, encoder_hidden_states = hidden_states
+                if not self.forward_pattern.Return_H_First:
+                    hidden_states, encoder_hidden_states = (
+                        encoder_hidden_states,
+                        hidden_states,
+                    )
+
+        return hidden_states, encoder_hidden_states
+
+    @torch.compiler.disable
+    def _process_outputs(
+        self,
+        hidden_states: torch.Tensor | tuple,
+        encoder_hidden_states: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        if not isinstance(hidden_states, torch.Tensor):
+            hidden_states, encoder_hidden_states = hidden_states
+            if not self.forward_pattern.Return_H_First:
+                hidden_states, encoder_hidden_states = (
+                    encoder_hidden_states,
+                    hidden_states,
+                )
+        return hidden_states, encoder_hidden_states
+
+    @torch.compiler.disable
+    def _forward_outputs(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None] | torch.Tensor:
+        return (
+            hidden_states
+            if self.forward_pattern.Return_H_Only
+            else (
+                (hidden_states, encoder_hidden_states)
+                if self.forward_pattern.Return_H_First
+                else (encoder_hidden_states, hidden_states)
+            )
+        )
+
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -119,8 +173,19 @@ class CachedBlocks_Pattern_Base(torch.nn.Module):
         **kwargs,
     ):
         # Use it's own cache context.
-        self.cache_manager.set_context(self.cache_context)
-        self._check_cache_params()
+        try:
+            self.cache_manager.set_context(self.cache_context)
+            self._check_cache_params()
+        except CacheNotExistError as e:
+            logger.warning(f"Cache context not exist: {e}, skip cache.")
+            # Call all blocks to process the hidden states.
+            hidden_states, encoder_hidden_states = self.call_blocks(
+                hidden_states,
+                encoder_hidden_states,
+                *args,
+                **kwargs,
+            )
+            return self._forward_outputs(hidden_states, encoder_hidden_states)
 
         original_hidden_states = hidden_states
         # Call first `n` blocks to process the hidden states for
@@ -239,15 +304,7 @@ class CachedBlocks_Pattern_Base(torch.nn.Module):
         # patch cached stats for blocks or remove it.
         torch._dynamo.graph_break()
 
-        return (
-            hidden_states
-            if self.forward_pattern.Return_H_Only
-            else (
-                (hidden_states, encoder_hidden_states)
-                if self.forward_pattern.Return_H_First
-                else (encoder_hidden_states, hidden_states)
-            )
-        )
+        return self._forward_outputs(hidden_states, encoder_hidden_states)
 
     @torch.compiler.disable
     def _is_parallelized(self):
@@ -322,13 +379,9 @@ class CachedBlocks_Pattern_Base(torch.nn.Module):
             *args,
             **kwargs,
         )
-        if not isinstance(hidden_states, torch.Tensor):
-            hidden_states, encoder_hidden_states = hidden_states
-            if not self.forward_pattern.Return_H_First:
-                hidden_states, encoder_hidden_states = (
-                    encoder_hidden_states,
-                    hidden_states,
-                )
+        hidden_states, encoder_hidden_states = self._process_outputs(
+            hidden_states, encoder_hidden_states
+        )
 
         return hidden_states, encoder_hidden_states
 
@@ -348,13 +401,9 @@ class CachedBlocks_Pattern_Base(torch.nn.Module):
             *args,
             **kwargs,
         )
-        if not isinstance(hidden_states, torch.Tensor):
-            hidden_states, encoder_hidden_states = hidden_states
-            if not self.forward_pattern.Return_H_First:
-                hidden_states, encoder_hidden_states = (
-                    encoder_hidden_states,
-                    hidden_states,
-                )
+        hidden_states, encoder_hidden_states = self._process_outputs(
+            hidden_states, encoder_hidden_states
+        )
 
         # compute hidden_states residual
         hidden_states = hidden_states.contiguous()
@@ -396,12 +445,8 @@ class CachedBlocks_Pattern_Base(torch.nn.Module):
             *args,
             **kwargs,
         )
-        if not isinstance(hidden_states, torch.Tensor):
-            hidden_states, encoder_hidden_states = hidden_states
-            if not self.forward_pattern.Return_H_First:
-                hidden_states, encoder_hidden_states = (
-                    encoder_hidden_states,
-                    hidden_states,
-                )
+        hidden_states, encoder_hidden_states = self._process_outputs(
+            hidden_states, encoder_hidden_states
+        )
 
         return hidden_states, encoder_hidden_states

cache_dit/cache_factory/cache_contexts/cache_manager.py
CHANGED

@@ -14,6 +14,10 @@ from cache_dit.logger import init_logger
 logger = init_logger(__name__)
 
 
+class CacheNotExistError(Exception):
+    pass
+
+
 class CachedContextManager:
     # Each Pipeline should have it's own context manager instance.
 
@@ -27,16 +31,19 @@ class CachedContextManager:
             self._cached_context_manager[_context.name] = _context
         return _context
 
-    def set_context(self, cached_context: CachedContext | str):
+    def set_context(self, cached_context: CachedContext | str) -> CachedContext:
         if isinstance(cached_context, CachedContext):
             self._current_context = cached_context
         else:
+            if cached_context not in self._cached_context_manager:
+                raise CacheNotExistError("Context not exist!")
             self._current_context = self._cached_context_manager[cached_context]
+        return self._current_context
 
     def get_context(self, name: str = None) -> CachedContext:
         if name is not None:
             if name not in self._cached_context_manager:
-                raise
+                raise CacheNotExistError("Context not exist!")
             return self._cached_context_manager[name]
         return self._current_context
 
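
The new `CacheNotExistError` is what lets the cached block wrappers above fall back to plain execution instead of crashing when a named context was never registered. A minimal, hypothetical sketch of that pattern (it assumes `cache_manager` is an existing `CachedContextManager` instance and uses a made-up context name):

```python
from cache_dit.cache_factory.cache_contexts.cache_manager import CacheNotExistError

try:
    # "missing-context" was never created, so set_context() raises.
    cache_manager.set_context("missing-context")
except CacheNotExistError:
    # Fall back to running the transformer blocks without caching,
    # mirroring what CachedBlocks_Pattern_Base.forward() now does.
    pass
```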

cache_dit/cache_factory/cache_interface.py
CHANGED

@@ -38,23 +38,43 @@ def enable_cache(
     BlockAdapter,
 ]:
     r"""
-
-
-
-
-
+    The `enable_cache` function serves as a unified caching interface designed to optimize the performance
+    of diffusion transformer models by implementing an intelligent caching mechanism known as `DBCache`.
+    This API is engineered to be compatible with nearly `all` diffusion transformer architectures that
+    feature transformer blocks adhering to standard input-output patterns, eliminating the need for
+    architecture-specific modifications.
+
+    By strategically caching intermediate outputs of transformer blocks during the diffusion process,
+    `DBCache` significantly reduces redundant computations without compromising generation quality.
+    The caching mechanism works by tracking residual differences between consecutive steps, allowing
+    the model to reuse previously computed features when these differences fall below a configurable
+    threshold. This approach maintains a balance between computational efficiency and output precision.
+
+    The default configuration (`F8B0, 8 warmup steps, unlimited cached steps`) is carefully tuned to
+    provide an optimal tradeoff for most common use cases. The "F8B0" configuration indicates that
+    the first 8 transformer blocks are used to compute stable feature differences, while no final
+    blocks are employed for additional fusion. The warmup phase ensures the model establishes
+    sufficient feature representation before caching begins, preventing potential degradation of
+    output quality.
+
+    This function seamlessly integrates with both standard diffusion pipelines and custom block
+    adapters, making it versatile for various deployment scenarios—from research prototyping to
+    production environments where inference speed is critical. By abstracting the complexity of
+    caching logic behind a simple interface, it enables developers to enhance model performance
+    with minimal code changes.
 
     Args:
         pipe_or_adapter (`DiffusionPipeline` or `BlockAdapter`, *required*):
             The standard Diffusion Pipeline or custom BlockAdapter (from cache-dit or user-defined).
             For example: cache_dit.enable_cache(FluxPipeline(...)). Please check https://github.com/vipshop/cache-dit/blob/main/docs/BlockAdapter.md
             for the usgae of BlockAdapter.
+
         cache_config (`BasicCacheConfig`, *required*, defaults to BasicCacheConfig()):
             Basic DBCache config for cache context, defaults to BasicCacheConfig(). The configurable params listed belows:
             Fn_compute_blocks: (`int`, *required*, defaults to 8):
-                Specifies that `DBCache` uses the
-
-
+                Specifies that `DBCache` uses the**first n**Transformer blocks to fit the information at time step t,
+                enabling the calculation of a more stable L1 difference and delivering more accurate information
+                to subsequent blocks. Please check https://github.com/vipshop/cache-dit/blob/main/docs/DBCache.md
                 for more details of DBCache.
             Bn_compute_blocks: (`int`, *required*, defaults to 0):
                 Further fuses approximate information in the **last n** Transformer blocks to enhance
@@ -77,14 +97,18 @@ def enable_cache(
                 and non-CFG into single forward step, should set enable_separate_cfg as False, for example:
                 CogVideoX, HunyuanVideo, Mochi, etc.
             cfg_compute_first (`bool`, *required*, defaults to False):
-
+                Whether to compute cfg forward first, default is False, meaning:
+                0, 2, 4, ..., -> non-CFG step;
                 1, 3, 5, ... -> CFG step.
             cfg_diff_compute_separate (`bool`, *required*, defaults to True):
-
-                use the computed
+                Whether to compute separate difference values for CFG and non-CFG steps, default is True.
+                If False, we will use the computed difference from the current non-CFG transformer step
+                for the current CFG step.
+
         calibrator_config (`CalibratorConfig`, *optional*, defaults to None):
-            Config for calibrator
-            with specific calibrator, such as taylorseer, foca, and so on.
+            Config for calibrator. If calibrator_config is not None, it means the user wants to use DBCache
+            with a specific calibrator, such as taylorseer, foca, and so on.
+
         params_modifiers ('ParamsModifier', *optional*, defaults to None):
             Modify cache context params for specific blocks. The configurable params listed belows:
             cache_config (`BasicCacheConfig`, *required*, defaults to BasicCacheConfig()):
@@ -93,6 +117,7 @@ def enable_cache(
                 The same as 'calibrator_config' param in cache_dit.enable_cache() interface.
             **kwargs: (`dict`, *optional*, defaults to {}):
                 The same as 'kwargs' param in cache_dit.enable_cache() interface.
+
         kwargs (`dict`, *optional*, defaults to {})
             Other cache context kwargs, please check https://github.com/vipshop/cache-dit/blob/main/src/cache_dit/cache_factory/cache_contexts/cache_context.py
             for more details.
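
Since the expanded docstring spells out the default F8B0 scheme (the first 8 blocks fitted, no trailing fusion blocks, 8 warmup steps), a short usage sketch may help. The `BasicCacheConfig` field names follow the parameter list above and the pipeline id comes from the README quoted later in this diff; treat the exact call as illustrative rather than canonical:

```python
import cache_dit
from diffusers import DiffusionPipeline
from cache_dit.cache_factory.cache_contexts import BasicCacheConfig

pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image")

# Spell out the documented "F8B0" default explicitly: the first 8 transformer
# blocks compute the residual difference, no trailing blocks are fused.
cache_dit.enable_cache(
    pipe,
    cache_config=BasicCacheConfig(Fn_compute_blocks=8, Bn_compute_blocks=0),
)
output = pipe("a cup of coffee on a wooden table")  # illustrative prompt
```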

cache_dit/cache_factory/patch_functors/__init__.py
CHANGED

@@ -10,3 +10,6 @@ from cache_dit.cache_factory.patch_functors.functor_hidream import (
 from cache_dit.cache_factory.patch_functors.functor_hunyuan_dit import (
     HunyuanDiTPatchFunctor,
 )
+from cache_dit.cache_factory.patch_functors.functor_qwen_image_controlnet import (
+    QwenImageControlNetPatchFunctor,
+)

cache_dit/cache_factory/patch_functors/functor_qwen_image_controlnet.py
ADDED

@@ -0,0 +1,263 @@
+import torch
+import numpy as np
+from typing import Tuple, Optional, Dict, Any, Union, List
+from diffusers import QwenImageTransformer2DModel
+from diffusers.models.transformers.transformer_qwenimage import (
+    QwenImageTransformerBlock,
+    Transformer2DModelOutput,
+)
+from diffusers.utils import (
+    USE_PEFT_BACKEND,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from cache_dit.cache_factory.patch_functors.functor_base import (
+    PatchFunctor,
+)
+from cache_dit.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+class QwenImageControlNetPatchFunctor(PatchFunctor):
+
+    def apply(
+        self,
+        transformer: QwenImageTransformer2DModel,
+        **kwargs,
+    ) -> QwenImageTransformer2DModel:
+        if hasattr(transformer, "_is_patched"):
+            return transformer
+
+        is_patched = False
+
+        _index_block = 0
+        _num_blocks = len(transformer.transformer_blocks)
+        for block in transformer.transformer_blocks:
+            assert isinstance(block, QwenImageTransformerBlock)
+            block._index_block = _index_block
+            block._num_blocks = _num_blocks
+            block.forward = __patch_block_forward__.__get__(block)
+            _index_block += 1
+
+        is_patched = True
+        cls_name = transformer.__class__.__name__
+
+        if is_patched:
+            logger.warning(f"Patched {cls_name} for cache-dit.")
+            assert not getattr(transformer, "_is_parallelized", False), (
+                "Please call `cache_dit.enable_cache` before Parallelize, "
+                "the __patch_transformer_forward__ will overwrite the "
+                "parallized forward and cause a downgrade of performance."
+            )
+            transformer.forward = __patch_transformer_forward__.__get__(
+                transformer
+            )
+
+        transformer._is_patched = is_patched  # True or False
+
+        logger.info(
+            f"Applied {self.__class__.__name__} for {cls_name}, "
+            f"Patch: {is_patched}."
+        )
+
+        return transformer
+
+
+def __patch_block_forward__(
+    self: QwenImageTransformerBlock,
+    hidden_states: torch.Tensor,
+    encoder_hidden_states: torch.Tensor,
+    encoder_hidden_states_mask: torch.Tensor,
+    temb: torch.Tensor,
+    image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+    joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+    controlnet_block_samples: Optional[List[torch.Tensor]] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # Get modulation parameters for both streams
+    img_mod_params = self.img_mod(temb)  # [B, 6*dim]
+    txt_mod_params = self.txt_mod(temb)  # [B, 6*dim]
+
+    # Split modulation parameters for norm1 and norm2
+    img_mod1, img_mod2 = img_mod_params.chunk(2, dim=-1)  # Each [B, 3*dim]
+    txt_mod1, txt_mod2 = txt_mod_params.chunk(2, dim=-1)  # Each [B, 3*dim]
+
+    # Process image stream - norm1 + modulation
+    img_normed = self.img_norm1(hidden_states)
+    img_modulated, img_gate1 = self._modulate(img_normed, img_mod1)
+
+    # Process text stream - norm1 + modulation
+    txt_normed = self.txt_norm1(encoder_hidden_states)
+    txt_modulated, txt_gate1 = self._modulate(txt_normed, txt_mod1)
+
+    # Use QwenAttnProcessor2_0 for joint attention computation
+    # This directly implements the DoubleStreamLayerMegatron logic:
+    # 1. Computes QKV for both streams
+    # 2. Applies QK normalization and RoPE
+    # 3. Concatenates and runs joint attention
+    # 4. Splits results back to separate streams
+    joint_attention_kwargs = joint_attention_kwargs or {}
+    attn_output = self.attn(
+        hidden_states=img_modulated,  # Image stream (will be processed as "sample")
+        encoder_hidden_states=txt_modulated,  # Text stream (will be processed as "context")
+        encoder_hidden_states_mask=encoder_hidden_states_mask,
+        image_rotary_emb=image_rotary_emb,
+        **joint_attention_kwargs,
+    )
+
+    # QwenAttnProcessor2_0 returns (img_output, txt_output) when encoder_hidden_states is provided
+    img_attn_output, txt_attn_output = attn_output
+
+    # Apply attention gates and add residual (like in Megatron)
+    hidden_states = hidden_states + img_gate1 * img_attn_output
+    encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn_output
+
+    # Process image stream - norm2 + MLP
+    img_normed2 = self.img_norm2(hidden_states)
+    img_modulated2, img_gate2 = self._modulate(img_normed2, img_mod2)
+    img_mlp_output = self.img_mlp(img_modulated2)
+    hidden_states = hidden_states + img_gate2 * img_mlp_output
+
+    # Process text stream - norm2 + MLP
+    txt_normed2 = self.txt_norm2(encoder_hidden_states)
+    txt_modulated2, txt_gate2 = self._modulate(txt_normed2, txt_mod2)
+    txt_mlp_output = self.txt_mlp(txt_modulated2)
+    encoder_hidden_states = encoder_hidden_states + txt_gate2 * txt_mlp_output
+
+    # Clip to prevent overflow for fp16
+    if encoder_hidden_states.dtype == torch.float16:
+        encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
+    if hidden_states.dtype == torch.float16:
+        hidden_states = hidden_states.clip(-65504, 65504)
+
+    if controlnet_block_samples is not None:
+        # Add ControlNet conditioning
+        num_blocks = self._num_blocks
+        index_block = self._index_block
+        interval_control = num_blocks / len(controlnet_block_samples)
+        interval_control = int(np.ceil(interval_control))
+        hidden_states = (
+            hidden_states
+            + controlnet_block_samples[index_block // interval_control]
+        )
+
+    return encoder_hidden_states, hidden_states
+
+
+def __patch_transformer_forward__(
+    self: QwenImageTransformer2DModel,
+    hidden_states: torch.Tensor,
+    encoder_hidden_states: torch.Tensor = None,
+    encoder_hidden_states_mask: torch.Tensor = None,
+    timestep: torch.LongTensor = None,
+    img_shapes: Optional[List[Tuple[int, int, int]]] = None,
+    txt_seq_lens: Optional[List[int]] = None,
+    guidance: torch.Tensor = None,  # TODO: this should probably be removed
+    attention_kwargs: Optional[Dict[str, Any]] = None,
+    controlnet_block_samples=None,
+    return_dict: bool = True,
+) -> Union[torch.Tensor, Transformer2DModelOutput]:
+    """
+    The [`QwenTransformer2DModel`] forward method.
+
+    Args:
+        hidden_states (`torch.Tensor` of shape `(batch_size, image_sequence_length, in_channels)`):
+            Input `hidden_states`.
+        encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
+            Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
+        encoder_hidden_states_mask (`torch.Tensor` of shape `(batch_size, text_sequence_length)`):
+            Mask of the input conditions.
+        timestep ( `torch.LongTensor`):
+            Used to indicate denoising step.
+        attention_kwargs (`dict`, *optional*):
+            A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+            `self.processor` in
+            [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+        return_dict (`bool`, *optional*, defaults to `True`):
+            Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
+            tuple.
+
+    Returns:
+        If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+        `tuple` where the first element is the sample tensor.
+    """
+    if attention_kwargs is not None:
+        attention_kwargs = attention_kwargs.copy()
+        lora_scale = attention_kwargs.pop("scale", 1.0)
+    else:
+        lora_scale = 1.0
+
+    if USE_PEFT_BACKEND:
+        # weight the lora layers by setting `lora_scale` for each PEFT layer
+        scale_lora_layers(self, lora_scale)
+    else:
+        if (
+            attention_kwargs is not None
+            and attention_kwargs.get("scale", None) is not None
+        ):
+            logger.warning(
+                "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
+            )
+
+    hidden_states = self.img_in(hidden_states)
+
+    timestep = timestep.to(hidden_states.dtype)
+    encoder_hidden_states = self.txt_norm(encoder_hidden_states)
+    encoder_hidden_states = self.txt_in(encoder_hidden_states)
+
+    if guidance is not None:
+        guidance = guidance.to(hidden_states.dtype) * 1000
+
+    temb = (
+        self.time_text_embed(timestep, hidden_states)
+        if guidance is None
+        else self.time_text_embed(timestep, guidance, hidden_states)
+    )
+
+    image_rotary_emb = self.pos_embed(
+        img_shapes, txt_seq_lens, device=hidden_states.device
+    )
+
+    for index_block, block in enumerate(self.transformer_blocks):
+        if torch.is_grad_enabled() and self.gradient_checkpointing:
+            encoder_hidden_states, hidden_states = (
+                self._gradient_checkpointing_func(
+                    block,
+                    hidden_states,
+                    encoder_hidden_states,
+                    encoder_hidden_states_mask,
+                    temb,
+                    image_rotary_emb,
+                    controlnet_block_samples,
+                )
+            )
+
+        else:
+            encoder_hidden_states, hidden_states = block(
+                hidden_states=hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_hidden_states_mask=encoder_hidden_states_mask,
+                temb=temb,
+                image_rotary_emb=image_rotary_emb,
+                controlnet_block_samples=controlnet_block_samples,
+                joint_attention_kwargs=attention_kwargs,
+            )
+
+    # # controlnet residual
+    # if controlnet_block_samples is not None:
+    #     interval_control = len(self.transformer_blocks) / len(controlnet_block_samples)
+    #     interval_control = int(np.ceil(interval_control))
+    #     hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]
+
+    # Use only the image part (hidden_states) from the dual-stream blocks
+    hidden_states = self.norm_out(hidden_states, temb)
+    output = self.proj_out(hidden_states)
+
+    if USE_PEFT_BACKEND:
+        # remove `lora_scale` from each PEFT layer
+        unscale_lora_layers(self, lora_scale)
+
+    if not return_dict:
+        return (output,)
+
+    return Transformer2DModelOutput(sample=output)
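
This functor is normally applied for you by the ControlNet branch of `qwenimage_adapter` shown earlier; a hypothetical sketch of invoking it directly, assuming `pipe` is an existing QwenImage ControlNet pipeline:

```python
from cache_dit.cache_factory.patch_functors import QwenImageControlNetPatchFunctor

functor = QwenImageControlNetPatchFunctor()
# apply() rebinds each block's forward and the transformer's forward so that
# controlnet_block_samples are added per block, then marks the module with
# `_is_patched`; a second call returns early and is effectively a no-op.
pipe.transformer = functor.apply(pipe.transformer)
```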

cache_dit/quantize/quantize_ao.py
CHANGED

@@ -182,12 +182,16 @@ def quantize_ao(
     force_empty_cache()
 
     logger.info(
+        f"Quantized Module: {module.__class__.__name__:>5}\n"
         f"Quantized Method: {quant_type:>5}\n"
         f"Quantized Linear Layers: {num_quant_linear:>5}\n"
         f"Skipped Linear Layers: {num_skip_linear:>5}\n"
         f"Total Linear Layers: {num_linear_layers:>5}\n"
         f"Total (all) Layers: {num_layers:>5}"
     )
+
+    module._quantize_type = quant_type
+    module._is_quantized = True
     return module
 
 
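
The two new attributes give callers a cheap way to tell whether a module has already gone through `quantize_ao` and with which method. A small, hypothetical check built on them (`transformer` is assumed to be a module previously returned by `quantize_ao`):

```python
# Markers set by quantize_ao() in cache-dit 1.0.2.
if getattr(transformer, "_is_quantized", False):
    print(f"already quantized via: {transformer._quantize_type}")
else:
    print("not quantized yet")
```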
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cache_dit
|
|
3
|
-
Version: 1.0.
|
|
3
|
+
Version: 1.0.2
|
|
4
4
|
Summary: A Unified, Flexible and Training-free Cache Acceleration Framework for 🤗Diffusers.
|
|
5
5
|
Author: DefTruth, vipshop.com, etc.
|
|
6
6
|
Maintainer: DefTruth, vipshop.com, etc
|
|
@@ -63,73 +63,17 @@ Dynamic: requires-python
|
|
|
63
63
|
</div>
|
|
64
64
|
<p align="center">
|
|
65
65
|
🎉Now, <b>cache-dit</b> covers almost <b>All</b> Diffusers' <b>DiT</b> Pipelines🎉<br>
|
|
66
|
-
🔥<a href="./examples/">Qwen-Image</a> | <a href="./examples/">
|
|
67
|
-
🔥<a href="./examples/">
|
|
68
|
-
🔥<a href="./examples/">
|
|
69
|
-
🔥<a href="./examples/">
|
|
70
|
-
🔥<a href="./examples/">
|
|
66
|
+
🔥<a href="./examples/pipeline">Qwen-Image</a> | <a href="./examples/pipeline">Qwen-Image-Edit</a> | <a href="./examples/pipeline">Qwen-Image-Edit-Plus </a> 🔥<br>
|
|
67
|
+
🔥<a href="./examples/pipeline">FLUX.1</a> | <a href="./examples/pipeline">Qwen-Image-Lightning 4/8 Steps</a> | <a href="./examples/pipeline"> Wan 2.1 </a> | <a href="./examples/pipeline"> Wan 2.2 </a>🔥<br>
|
|
68
|
+
🔥<a href="./examples/pipeline">HunyuanImage-2.1</a> | <a href="./examples/pipeline">HunyuanVideo</a> | <a href="./examples/pipeline">HunyuanDiT</a> | <a href="./examples/pipeline">HiDream</a> | <a href="./examples/pipeline">AuraFlow</a>🔥<br>
|
|
69
|
+
🔥<a href="./examples/pipeline">CogView3Plus</a> | <a href="./examples/pipeline">CogView4</a> | <a href="./examples/pipeline">LTXVideo</a> | <a href="./examples/pipeline">CogVideoX</a> | <a href="./examples/">CogVideoX 1.5</a> | <a href="./examples/">ConsisID</a>🔥<br>
|
|
70
|
+
🔥<a href="./examples/pipeline">Cosmos</a> | <a href="./examples/pipeline">SkyReelsV2</a> | <a href="./examples/pipeline">VisualCloze</a> | <a href="./examples/pipeline">OmniGen 1/2</a> | <a href="./examples/pipeline">Lumina 1/2</a> | <a href="./examples/pipeline">PixArt</a>🔥<br>
|
|
71
|
+
🔥<a href="./examples/pipeline">Chroma</a> | <a href="./examples/pipeline">Sana</a> | <a href="./examples/pipeline">Allegro</a> | <a href="./examples/pipeline">Mochi</a> | <a href="./examples/pipeline">SD 3/3.5</a> | <a href="./examples/pipeline">Amused</a> | <a href="./examples/pipeline"> ... </a> | <a href="./examples/pipeline">DiT-XL</a>🔥
|
|
71
72
|
<br>♥️ Please consider to leave a <b>⭐️ Star</b> to support us ~ ♥️
|
|
72
73
|
</p>
|
|
73
74
|
</div>
|
|
74
75
|
|
|
75
|
-
## 🔥Hightlight <a href="https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit"><img src=https://img.shields.io/badge/🤗Diffusers-ecosystem-yellow.svg ></a>
|
|
76
|
-
|
|
77
76
|
<div align='center'>
|
|
78
|
-
<details>
|
|
79
|
-
<summary> 🔥<b>Click</b> here to show <b>Important News</b>: First API-stable (v1.0.0) Release🔥 </summary>
|
|
80
|
-
|
|
81
|
-
2025.09.25: 🎉The **first API-stable version (v1.0.0)** of cache-dit has finally been released!<br>
|
|
82
|
-
2025.09.25: 🔥**cache-dit** has joined the Diffusers community ecosystem: <a href="https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit"><img src=https://img.shields.io/badge/🤗Diffusers-ecosystem-yellow.svg ></a><br>
|
|
83
|
-
2025.09.10: 🎉Day 1 support [**HunyuanImage-2.1**](https://github.com/Tencent-Hunyuan/HunyuanImage-2.1) with **1.7x↑🎉** speedup! Check this [example](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_hunyuan_image_2.1.py).<br>
|
|
84
|
-
2025.09.08: 🔥[**Qwen-Image-Lightning**](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image_lightning.py) **7.1/3.5 steps🎉** inference with **[DBCache: F16B16](https://github.com/vipshop/cache-dit)**.<br>
|
|
85
|
-
2025.09.03: 🎉[**Wan2.2-MoE**](https://github.com/Wan-Video) **2.4x↑🎉** speedup! Please refer to [run_wan_2.2.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_wan_2.2.py) as an example.<br>
|
|
86
|
-
2025.08.12: 🎉First caching mechanism in [QwenLM/Qwen-Image](https://github.com/QwenLM/Qwen-Image) with **[cache-dit](https://github.com/vipshop/cache-dit)**, check this [PR](https://github.com/QwenLM/Qwen-Image/pull/61).<br>
|
|
87
|
-
2025.08.11: 🔥[**Qwen-Image**](https://github.com/QwenLM/Qwen-Image) **1.8x↑🎉** speedup! Please refer to [run_qwen_image.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image.py) as an example.<br>
|
|
88
|
-
2025.09.08: 🎉First caching mechanism in [Wan2.2](https://github.com/Wan-Video/Wan2.2) with **[cache-dit](https://github.com/vipshop/cache-dit)**, check this [PR](https://github.com/Wan-Video/Wan2.2/pull/127) for more details.<br>
|
|
89
|
-
2025.09.08: 🎉First caching mechanism in [Qwen-Image-Lightning](https://github.com/ModelTC/Qwen-Image-Lightning) with **[cache-dit](https://github.com/vipshop/cache-dit)**, check this [PR](https://github.com/ModelTC/Qwen-Image-Lightning/pull/35).<br>
|
|
90
|
-
2025.08.19: 🔥[**Qwen-Image-Edit**](https://github.com/QwenLM/Qwen-Image) **2x↑🎉** speedup! Check the example: [run_qwen_image_edit.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image_edit.py).<br>
|
|
91
|
-
2025.09.01: 📚[**Hybird Forward Pattern**](#unified) is supported! Please check [FLUX.1-dev](https://github.com/vipshop/cache-dit/blob/main/examples/run_flux_adapter.py) as an example.<br>
|
|
92
|
-
2025.08.10: 🔥[**FLUX.1-Kontext-dev**](https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev) is supported! Please refer [run_flux_kontext.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_flux_kontext.py) as an example.<br>
|
|
93
|
-
2025.07.18: 🎉First caching mechanism in [🤗huggingface/flux-fast](https://github.com/huggingface/flux-fast) with **[cache-dit](https://github.com/vipshop/cache-dit)**, check the [PR](https://github.com/huggingface/flux-fast/pull/13).<br>
|
|
94
|
-
2025.07.13: 🎉[**FLUX.1-dev**](https://github.com/xlite-dev/flux-faster) **3.3x↑🎉** speedup! NVIDIA L20 with **[cache-dit](https://github.com/vipshop/cache-dit)** + **compile + FP8 DQ**.<br>
|
|
95
|
-
|
|
96
|
-
</details>
|
|
97
|
-
</div>
|
|
98
|
-
|
|
99
|
-
We are excited to announce that the **first API-stable version (v1.0.0)** of cache-dit has finally been released!
|
|
100
|
-
**[cache-dit](https://github.com/vipshop/cache-dit)** is a **Unified**, **Flexible**, and **Training-free** cache acceleration framework for 🤗 Diffusers, enabling cache acceleration with just **one line** of code. Key features include **Unified Cache APIs**, **Forward Pattern Matching**, **Automatic Block Adapter**, **Hybrid Forward Pattern**, **DBCache**, **TaylorSeer Calibrator**, and **Cache CFG**.
|
|
101
|
-
|
|
102
|
-
```bash
|
|
103
|
-
pip3 install -U cache-dit # pip3 install git+https://github.com/vipshop/cache-dit.git
|
|
104
|
-
```
|
|
105
|
-
You can install the stable release of cache-dit from PyPI, or the latest development version from GitHub. Then try ♥️ Cache Acceleration with just **one line** of code ~ ♥️
|
|
106
|
-
```python
|
|
107
|
-
>>> import cache_dit
|
|
108
|
-
>>> from diffusers import DiffusionPipeline
|
|
109
|
-
>>> pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image") # Can be any diffusion pipeline
|
|
110
|
-
>>> cache_dit.enable_cache(pipe) # One-line code with default cache options.
|
|
111
|
-
>>> output = pipe(...) # Just call the pipe as normal.
|
|
112
|
-
>>> stats = cache_dit.summary(pipe) # Then, get the summary of cache acceleration stats.
|
|
113
|
-
>>> cache_dit.disable_cache(pipe) # Disable cache and run original pipe.
|
|
114
|
-
```
|
|
115
|
-
|
|
116
|
-
### 📚Core Features
|
|
117
|
-
|
|
118
|
-
- **[🎉Full 🤗Diffusers Support](./docs/User_Guide.md#supported-pipelines)**: Notably, **[cache-dit](https://github.com/vipshop/cache-dit)** now supports nearly **all** of Diffusers' **DiT-based** pipelines, such as Qwen-Image, FLUX.1, Qwen-Image-Lightning, HunyuanImage-2.1, HunyuanVideo, HunyuanDiT, Wan 2.1/2.2, HiDream, AuraFlow, CogView3Plus, CogView4, LTXVideo, CogVideoX 1.5, ConsisID, SkyReelsV2, VisualCloze, OmniGen, Lumina, PixArt, Chroma, Sana, Allegro, Mochi, SD 3.5, Amused, and DiT-XL.
|
|
119
|
-
- **[🎉Extremely Easy to Use](./docs/User_Guide.md#unified-cache-apis)**: In most cases, you only need **one line** of code: `cache_dit.enable_cache(...)`. After calling this API, just use the pipeline as normal.
|
|
120
|
-
- **[🎉Easy New Model Integration](./docs/User_Guide.md#automatic-block-adapter)**: Features like **Unified Cache APIs**, **Forward Pattern Matching**, **Automatic Block Adapter**, **Hybrid Forward Pattern**, and **Patch Functor** make it highly functional and flexible. For example, we achieved 🎉 Day 1 support for [HunyuanImage-2.1](https://github.com/Tencent-Hunyuan/HunyuanImage-2.1) with 1.7x speedup w/o precision loss—even before it was available in the Diffusers library.
|
|
121
|
-
- **[🎉State-of-the-Art Performance](./bench/)**: Compared with algorithms including Δ-DiT, Chipmunk, FORA, DuCa, TaylorSeer and FoCa, cache-dit achieves the best accuracy when the speedup ratio is below 4x.
|
|
122
|
-
- **[🎉Support for 4/8-Steps Distilled Models](./bench/)**: Surprisingly, cache-dit's **DBCache** works for extremely few-step distilled models—something many other methods fail to do.
|
|
123
|
-
- **[🎉Compatibility with Other Optimizations](./docs/User_Guide.md#️torch-compile)**: Designed to work seamlessly with torch.compile, model CPU offload, sequential CPU offload, group offloading, etc.
|
|
124
|
-
- **[🎉Hybrid Cache Acceleration](./docs/User_Guide.md#taylorseer-calibrator)**: Now supports hybrid **DBCache + Calibrator** schemes (e.g., DBCache + TaylorSeerCalibrator). DBCache acts as the **Indicator** to decide *when* to cache, while the Calibrator decides *how* to cache. More mainstream cache acceleration algorithms (e.g., FoCa) will be supported in the future, along with additional benchmarks—stay tuned for updates!
|
|
125
|
-
- **[🤗Diffusers Ecosystem Integration](https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit)**: 🔥**cache-dit** has joined the Diffusers community ecosystem as the **first** DiT-specific cache acceleration framework! Check out the documentation here: <a href="https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit"><img src=https://img.shields.io/badge/🤗Diffusers-ecosystem-yellow.svg ></a>
|
|
126
|
-
|
|
127
|
-

|
|
128
|
-
|
|
129
|
-
<details align='center'>
|
|
130
|
-
<summary>🔥<b>Click</b> here to show many <b>Image/Video</b> cases🔥</summary>
|
|
131
|
-
|
|
132
|
-
<div align='center'>
|
|
133
77
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/gifs/wan2.2.C0_Q0_NONE.gif width=124px>
|
|
134
78
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/gifs/wan2.2.C1_Q0_DBCACHE_F1B0_W2M8MC2_T1O2_R0.08.gif width=124px>
|
|
135
79
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/gifs/hunyuan_video.C0_L0_Q0_NONE.gif width=126px>
|
|
@@ -150,6 +94,12 @@ You can install the stable release of cache-dit from PyPI, or the latest develop
|
|
|
150
94
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/qwen-image-edit.C0_L0_Q0_DBCACHE_F8B0_W8M0MC0_T0O2_R0.08_S18.png width=125px>
|
|
151
95
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/qwen-image-edit.C0_L0_Q0_DBCACHE_F1B0_W8M0MC2_T0O2_R0.12_S24.png width=125px>
|
|
152
96
|
<p><b>🔥Qwen-Image-Edit</b> | Input w/o Edit | Baseline | <a href="https://github.com/vipshop/cache-dit">+cache-dit</a>:1.6x↑🎉 | 1.9x↑🎉 </p>
|
|
97
|
+
</div>
|
|
98
|
+
|
|
99
|
+
<details align='center'>
|
|
100
|
+
<summary>🔥<b>Click</b> here to show many <b>Image/Video</b> cases🔥</summary>
|
|
101
|
+
|
|
102
|
+
<div align='center'>
|
|
153
103
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext-cat.C0_L0_Q0_NONE.png width=100px>
|
|
154
104
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_NONE.png width=100px>
|
|
155
105
|
<img src=https://github.com/vipshop/cache-dit/raw/main/assets/flux-kontext.C0_L0_Q0_DBCACHE_F8B0_W8M0MC0_T0O2_R0.08_S10.png width=100px>
|
|
@@ -218,6 +168,62 @@ You can install the stable release of cache-dit from PyPI, or the latest develop
|
|
|
218
168
|
|
|
219
169
|
</details>
|
|
220
170
|
|
|
171
|
+
## 🔥Hightlight <a href="https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit"><img src=https://img.shields.io/badge/🤗Diffusers-ecosystem-yellow.svg ></a>
|
|
172
|
+
|
|
173
|
+
We are excited to announce that the **first API-stable version (v1.0.0)** of cache-dit has finally been released!
|
|
174
|
+
**[cache-dit](https://github.com/vipshop/cache-dit)** is a **Unified**, **Flexible**, and **Training-free** cache acceleration framework for 🤗 Diffusers, enabling cache acceleration with just **one line** of code. Key features include **Unified Cache APIs**, **Forward Pattern Matching**, **Automatic Block Adapter**, **Hybrid Forward Pattern**, **DBCache**, **TaylorSeer Calibrator**, and **Cache CFG**.
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
pip3 install -U cache-dit # pip3 install git+https://github.com/vipshop/cache-dit.git
|
|
178
|
+
```
|
|
179
|
+
You can install the stable release of cache-dit from PyPI, or the latest development version from GitHub. Then try ♥️ Cache Acceleration with just **one line** of code ~ ♥️
|
|
180
|
+
```python
|
|
181
|
+
>>> import cache_dit
|
|
182
|
+
>>> from diffusers import DiffusionPipeline
|
|
183
|
+
>>> pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image") # Can be any diffusion pipeline
|
|
184
|
+
>>> cache_dit.enable_cache(pipe) # One-line code with default cache options.
|
|
185
|
+
>>> output = pipe(...) # Just call the pipe as normal.
|
|
186
|
+
>>> stats = cache_dit.summary(pipe) # Then, get the summary of cache acceleration stats.
|
|
187
|
+
>>> cache_dit.disable_cache(pipe) # Disable cache and run original pipe.
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
### 📚Core Features
|
|
191
|
+
|
|
192
|
+
- **[🎉Full 🤗Diffusers Support](./docs/User_Guide.md#supported-pipelines)**: Notably, **[cache-dit](https://github.com/vipshop/cache-dit)** now supports nearly **all** of Diffusers' **DiT-based** pipelines, such as Qwen-Image, FLUX.1, Qwen-Image-Lightning, HunyuanImage-2.1, HunyuanVideo, HunyuanDiT, Wan 2.1/2.2, HiDream, AuraFlow, CogView3Plus, CogView4, LTXVideo, CogVideoX 1.5, ConsisID, SkyReelsV2, VisualCloze, OmniGen, Lumina, PixArt, Chroma, Sana, Allegro, Mochi, SD 3.5, Amused, and DiT-XL.
|
|
193
|
+
- **[🎉Extremely Easy to Use](./docs/User_Guide.md#unified-cache-apis)**: In most cases, you only need **one line** of code: `cache_dit.enable_cache(...)`. After calling this API, just use the pipeline as normal.
|
|
194
|
+
- **[🎉Easy New Model Integration](./docs/User_Guide.md#automatic-block-adapter)**: Features like **Unified Cache APIs**, **Forward Pattern Matching**, **Automatic Block Adapter**, **Hybrid Forward Pattern**, and **Patch Functor** make it highly functional and flexible. For example, we achieved 🎉 Day 1 support for [HunyuanImage-2.1](https://github.com/Tencent-Hunyuan/HunyuanImage-2.1) with 1.7x speedup w/o precision loss—even before it was available in the Diffusers library.
|
|
195
|
+
- **[🎉State-of-the-Art Performance](./bench/)**: Compared with algorithms including Δ-DiT, Chipmunk, FORA, DuCa, TaylorSeer and FoCa, cache-dit achieves the best accuracy when the speedup ratio is below 4x.
|
|
196
|
+
- **[🎉Support for 4/8-Steps Distilled Models](./bench/)**: Surprisingly, cache-dit's **DBCache** works for extremely few-step distilled models—something many other methods fail to do.
|
|
197
|
+
- **[🎉Compatibility with Other Optimizations](./docs/User_Guide.md#️torch-compile)**: Designed to work seamlessly with torch.compile, model CPU offload, sequential CPU offload, group offloading, etc.
|
|
198
|
+
- **[🎉Hybrid Cache Acceleration](./docs/User_Guide.md#taylorseer-calibrator)**: Now supports hybrid **DBCache + Calibrator** schemes (e.g., DBCache + TaylorSeerCalibrator). DBCache acts as the **Indicator** to decide *when* to cache, while the Calibrator decides *how* to cache. More mainstream cache acceleration algorithms (e.g., FoCa) will be supported in the future, along with additional benchmarks—stay tuned for updates!
|
|
199
|
+
- **[🤗Diffusers Ecosystem Integration](https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit)**: 🔥**cache-dit** has joined the Diffusers community ecosystem as the **first** DiT-specific cache acceleration framework! Check out the documentation here: <a href="https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit"><img src=https://img.shields.io/badge/🤗Diffusers-ecosystem-yellow.svg ></a>
|
|
200
|
+
|
|
201
|
+

|
|
202
|
+
|
|
203
|
+
## 🔥Important News

- 2025.10.10: 🔥[**Qwen-Image-ControlNet-Inpainting**](https://huggingface.co/InstantX/Qwen-Image-ControlNet-Inpainting) **2.3x↑🎉** speedup! Check the [example](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image_controlnet_inpaint.py).
- 2025.09.26: 🔥[**Qwen-Image-Edit-Plus (2509)**](https://github.com/QwenLM/Qwen-Image) **2.1x↑🎉** speedup! Check the [example](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image_edit_plus.py).
- 2025.09.25: 🎉The **first API-stable version (v1.0.0)** of cache-dit has finally been released!
- 2025.09.25: 🔥**cache-dit** has joined the Diffusers community ecosystem: <a href="https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit"><img src=https://img.shields.io/badge/🤗Diffusers-ecosystem-yellow.svg ></a>
- 2025.09.10: 🎉Day-1 support for [**HunyuanImage-2.1**](https://github.com/Tencent-Hunyuan/HunyuanImage-2.1) with a **1.7x↑🎉** speedup! Check this [example](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_hunyuan_image_2.1.py).
- 2025.09.08: 🔥[**Qwen-Image-Lightning**](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image_lightning.py) **7.1/3.5 steps🎉** inference with **[DBCache: F16B16](https://github.com/vipshop/cache-dit)**.
- 2025.09.03: 🎉[**Wan2.2-MoE**](https://github.com/Wan-Video) **2.4x↑🎉** speedup! See [run_wan_2.2.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_wan_2.2.py) for an example.
- 2025.08.19: 🔥[**Qwen-Image-Edit**](https://github.com/QwenLM/Qwen-Image) **2x↑🎉** speedup! Check the example: [run_qwen_image_edit.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image_edit.py).
- 2025.08.11: 🔥[**Qwen-Image**](https://github.com/QwenLM/Qwen-Image) **1.8x↑🎉** speedup! See [run_qwen_image.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image.py) for an example.

<details>
<summary>Previous News</summary>

- 2025.09.08: 🎉First caching mechanism in [Wan2.2](https://github.com/Wan-Video/Wan2.2) with **[cache-dit](https://github.com/vipshop/cache-dit)**; check this [PR](https://github.com/Wan-Video/Wan2.2/pull/127) for details.
- 2025.09.08: 🎉First caching mechanism in [Qwen-Image-Lightning](https://github.com/ModelTC/Qwen-Image-Lightning) with **[cache-dit](https://github.com/vipshop/cache-dit)**; check this [PR](https://github.com/ModelTC/Qwen-Image-Lightning/pull/35).
- 2025.08.12: 🎉First caching mechanism in [QwenLM/Qwen-Image](https://github.com/QwenLM/Qwen-Image) with **[cache-dit](https://github.com/vipshop/cache-dit)**; check this [PR](https://github.com/QwenLM/Qwen-Image/pull/61).
- 2025.08.10: 🔥[**FLUX.1-Kontext-dev**](https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev) is supported! See [run_flux_kontext.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_flux_kontext.py) for an example.
- 2025.07.18: 🎉First caching mechanism in [🤗huggingface/flux-fast](https://github.com/huggingface/flux-fast) with **[cache-dit](https://github.com/vipshop/cache-dit)**; check this [PR](https://github.com/huggingface/flux-fast/pull/13).
- 2025.07.13: 🎉[**FLUX.1-dev**](https://github.com/xlite-dev/flux-faster) **3.3x↑🎉** speedup on an NVIDIA L20 with **[cache-dit](https://github.com/vipshop/cache-dit)** + **compile + FP8 DQ**.

</details>

## 📚User Guide
<div id="user-guide"></div>
@@ -1,39 +1,40 @@
cache_dit/__init__.py,sha256=sHRg0swXZZiw6lvSQ53fcVtN9JRayx0az2lXAz5OOGI,1510
-cache_dit/_version.py,sha256=
+cache_dit/_version.py,sha256=ZTgKq8LPNy3l9uR2ke-VtLhvvl5l71frQ9wO76n1L5k,704
cache_dit/logger.py,sha256=0zsu42hN-3-rgGC_C29ms1IvVpV4_b4_SwJCKSenxBE,4304
cache_dit/utils.py,sha256=AyYRwi5XBxYBH4GaXxOxv9-X24Te_IYOYwh54t_1d3A,10674
cache_dit/cache_factory/.gitignore,sha256=5Cb-qT9wsTUoMJ7vACDF7ZcLpAXhi5v-xdcWSRit988,23
cache_dit/cache_factory/__init__.py,sha256=vy9I6Ofkj9jWeUoOvh-cY5a9QlDDKfj2FVPlVTf7BeA,1390
-cache_dit/cache_factory/cache_interface.py,sha256=
+cache_dit/cache_factory/cache_interface.py,sha256=KseSPyZ9D3m6pmpE7k-uYr0wfBI-hhscG1Nw54GCHxk,12316
cache_dit/cache_factory/cache_types.py,sha256=ooukxQRG55uTLmaZ0SKw6gIeY6SQHhMxkbv55uj2Sqk,991
cache_dit/cache_factory/forward_pattern.py,sha256=FumlCuZ-TSmSYH0hGBHctSJ-oGLCftdZjLygqhsmdR4,2258
cache_dit/cache_factory/params_modifier.py,sha256=zYJJsInTYCaYHBZ7mZJOP-PZnkSg3iN1WPewNOayXos,3628
cache_dit/cache_factory/utils.py,sha256=XkVM9AXcB9zYq8-S8QKAsGz80r3tA6U3lBNGDGeHOe4,1871
-cache_dit/cache_factory/block_adapters/__init__.py,sha256=
+cache_dit/cache_factory/block_adapters/__init__.py,sha256=vM3aDMzPY79Tw4L0hlV2PdA3MFYomnf0eo0BGBo9P78,18087
cache_dit/cache_factory/block_adapters/block_adapters.py,sha256=2TVK_KqiYXC7AKZ2s07fzdOzUoeUBc9P1SzQtLVzhf4,22249
cache_dit/cache_factory/block_adapters/block_registers.py,sha256=2L7QeM4ygnaKQpC9PoJod0QRYyxidUKU2AYpysDCUwE,2572
cache_dit/cache_factory/cache_adapters/__init__.py,sha256=py71WGD3JztQ1uk6qdLVbzYcQ1rvqFidNNaQYo7tqTo,79
-cache_dit/cache_factory/cache_adapters/cache_adapter.py,sha256=
+cache_dit/cache_factory/cache_adapters/cache_adapter.py,sha256=HTyZdspd34G6QiJ2qPNoLmGwcxmAnCwpAf91NTIQtl4,21442
cache_dit/cache_factory/cache_blocks/__init__.py,sha256=mivvm8YOfqT7YHs8y_MzGOGztPw8LxAqKGXuSRXxCv0,3032
cache_dit/cache_factory/cache_blocks/offload_utils.py,sha256=wusgcqaCrwEjvv7Guy-6VXhNOgPPUrBV2sSVuRmGuvo,3513
cache_dit/cache_factory/cache_blocks/pattern_0_1_2.py,sha256=ElMps6_7uI74tSF9GDR_dEI0bZEhdzcepM29xFWnYo8,428
-cache_dit/cache_factory/cache_blocks/pattern_3_4_5.py,sha256=
-cache_dit/cache_factory/cache_blocks/pattern_base.py,sha256=
+cache_dit/cache_factory/cache_blocks/pattern_3_4_5.py,sha256=rfq5-WEt-ErY28vcB4ur9E-uCb6BKP0S8v5lTw61ROk,10555
+cache_dit/cache_factory/cache_blocks/pattern_base.py,sha256=StNW2PyDiXEIxZd30byPUrZZ8jgSiuC_yrly2w7X2LQ,16176
cache_dit/cache_factory/cache_blocks/pattern_utils.py,sha256=dGOC1tMMOvcbvEgx44eTESKn_jsv-0RZ3tRHPa3wmQ4,1315
-cache_dit/cache_factory/cache_contexts/__init__.py,sha256=
+cache_dit/cache_factory/cache_contexts/__init__.py,sha256=N3SxFnluXk5q09nhSqKIJCVzEGWzySJWm-vic6dH79E,412
cache_dit/cache_factory/cache_contexts/cache_context.py,sha256=3EhaMCz3VUQ_NF81VgYwWoSEGIvhScPxPYhjL1OcgxE,15240
-cache_dit/cache_factory/cache_contexts/cache_manager.py,sha256=
+cache_dit/cache_factory/cache_contexts/cache_manager.py,sha256=X99XnmiY-Us8D2pqJGPKxWcXAhQQpk3xdEWOOOYXIZ4,30465
cache_dit/cache_factory/cache_contexts/calibrators/__init__.py,sha256=mzYXO8tbytGpJJ9rpPu20kMoj1Iu_7Ym9tjfzV8rA98,5574
cache_dit/cache_factory/cache_contexts/calibrators/base.py,sha256=mn6ZBkChGpGwN5csrHTUGMoX6BBPvqHXSLbIExiW-EU,748
cache_dit/cache_factory/cache_contexts/calibrators/foca.py,sha256=nhHGs_hxwW1M942BQDMJb9-9IuHdnOxp774Jrna1bJI,891
cache_dit/cache_factory/cache_contexts/calibrators/taylorseer.py,sha256=aGxr9SpytYznTepDWGPAxWDnuVMSuNyn6uNXnLh2acQ,4001
-cache_dit/cache_factory/patch_functors/__init__.py,sha256=
+cache_dit/cache_factory/patch_functors/__init__.py,sha256=IJZrvSkeHbR_xW-6IzY7sqEhApBsOfPyorQGJutvWH0,652
cache_dit/cache_factory/patch_functors/functor_base.py,sha256=Ahk0fTfrHgNdEl-9JSkACvfyyv9G-Ei5OSz7XBIlX5o,357
cache_dit/cache_factory/patch_functors/functor_chroma.py,sha256=xD0Q96VArp1vYBLQ0pcjRIyFB1i_Y7muZ2q07Hz8Oqs,13430
cache_dit/cache_factory/patch_functors/functor_dit.py,sha256=SDjhzCWa6PoFNN4_upoQEf6DHvW1yJ7zuXMS2VvyJco,3904
cache_dit/cache_factory/patch_functors/functor_flux.py,sha256=UMkyuEYjO7UO_zmXi9Djd-nD-XMgCUgE-qkYA3plWSM,9559
cache_dit/cache_factory/patch_functors/functor_hidream.py,sha256=inf4T5UcIa06zVsoLWCNJbb1bEDmGeBGSyC7OL1zpuc,15309
cache_dit/cache_factory/patch_functors/functor_hunyuan_dit.py,sha256=iSo5dD5uKnjQQeysDUIkKt0wdnK5bzXTc_F_lfHG70w,6401
+cache_dit/cache_factory/patch_functors/functor_qwen_image_controlnet.py,sha256=D5i1Rrq1FQ49liupLcV2DW04moBqLnW9TICzfnMMzIU,10519
cache_dit/compile/__init__.py,sha256=FcTVzCeyypl-mxlc59_ehHL3lBNiDAFsXuRoJ-5Cfi0,56
cache_dit/compile/utils.py,sha256=nN2OIrSdwRR5zGxJinKDqb07pXpvTNTF3g_OgLkeeBU,3858
cache_dit/custom_ops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -47,11 +48,11 @@ cache_dit/metrics/inception.py,sha256=pBVe2X6ylLPIXTG4-GWDM9DWnCviMJbJ45R3ulhktR
cache_dit/metrics/lpips.py,sha256=hrHrmdM-f2B4TKDs0xLqJO5JFaYcCjq2qNIR8oCrVkc,811
cache_dit/metrics/metrics.py,sha256=AZbQyoavE-djvyRUZ_EfCIrWSQbiWQFo7n2dhn7XptE,40466
cache_dit/quantize/__init__.py,sha256=kWYoMAyZgBXu9BJlZjTQ0dRffW9GqeeY9_iTkXrb70A,59
-cache_dit/quantize/quantize_ao.py,sha256=
+cache_dit/quantize/quantize_ao.py,sha256=Pr3u3Qr6qLvFkd8k-_rfcz4Mkjlg36U9BHG2t6Bl-6M,6301
cache_dit/quantize/quantize_interface.py,sha256=2s_R7xPSKuJeFpEGeLwRxnq_CqJcBG3a3lzyW5wh-UM,1241
-cache_dit-1.0.
-cache_dit-1.0.
-cache_dit-1.0.
-cache_dit-1.0.
-cache_dit-1.0.
-cache_dit-1.0.
+cache_dit-1.0.2.dist-info/licenses/LICENSE,sha256=Dqb07Ik2dV41s9nIdMUbiRWEfDqo7-dQeRiY7kPO8PE,3769
+cache_dit-1.0.2.dist-info/METADATA,sha256=E6MkP_T9cwJEbqWE1DIRVkQLI7wLWr5zryY2poWgkyw,26766
+cache_dit-1.0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+cache_dit-1.0.2.dist-info/entry_points.txt,sha256=FX2gysXaZx6NeK1iCLMcIdP8Q4_qikkIHtEmi3oWn8o,65
+cache_dit-1.0.2.dist-info/top_level.txt,sha256=ZJDydonLEhujzz0FOkVbO-BqfzO9d_VqRHmZU-3MOZo,10
+cache_dit-1.0.2.dist-info/RECORD,,