cache-dit 1.0.5__py3-none-any.whl → 1.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of cache-dit might be problematic.
- cache_dit/__init__.py +5 -3
- cache_dit/_version.py +2 -2
- cache_dit/cache_factory/block_adapters/__init__.py +44 -0
- cache_dit/cache_factory/block_adapters/block_registers.py +1 -0
- cache_dit/cache_factory/cache_adapters/cache_adapter.py +12 -0
- cache_dit/cache_factory/cache_blocks/pattern_base.py +3 -8
- cache_dit/cache_factory/cache_interface.py +35 -1
- cache_dit/parallelism/__init__.py +3 -0
- cache_dit/parallelism/backends/parallel_difffusers.py +73 -0
- cache_dit/parallelism/parallel_backend.py +18 -0
- cache_dit/parallelism/parallel_config.py +47 -0
- cache_dit/parallelism/parallel_interface.py +62 -0
- cache_dit/quantize/quantize_ao.py +27 -64
- cache_dit/utils.py +25 -1
- {cache_dit-1.0.5.dist-info → cache_dit-1.0.7.dist-info}/METADATA +15 -10
- {cache_dit-1.0.5.dist-info → cache_dit-1.0.7.dist-info}/RECORD +20 -15
- {cache_dit-1.0.5.dist-info → cache_dit-1.0.7.dist-info}/WHEEL +0 -0
- {cache_dit-1.0.5.dist-info → cache_dit-1.0.7.dist-info}/entry_points.txt +0 -0
- {cache_dit-1.0.5.dist-info → cache_dit-1.0.7.dist-info}/licenses/LICENSE +0 -0
- {cache_dit-1.0.5.dist-info → cache_dit-1.0.7.dist-info}/top_level.txt +0 -0
cache_dit/__init__.py
CHANGED
@@ -4,8 +4,7 @@ except ImportError:
     __version__ = "unknown version"
     version_tuple = (0, 0, "unknown version")
 
-
-from cache_dit.utils import strify
+
 from cache_dit.utils import disable_print
 from cache_dit.logger import init_logger
 from cache_dit.cache_factory import load_options
@@ -28,7 +27,10 @@ from cache_dit.cache_factory import supported_pipelines
 from cache_dit.cache_factory import get_adapter
 from cache_dit.compile import set_compile_configs
 from cache_dit.quantize import quantize
-
+from cache_dit.parallelism import ParallelismBackend
+from cache_dit.parallelism import ParallelismConfig
+from cache_dit.utils import summary
+from cache_dit.utils import strify
 
 NONE = CacheType.NONE
 DBCache = CacheType.DBCache
cache_dit/_version.py
CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '1.0.5'
-__version_tuple__ = version_tuple = (1, 0, 5)
+__version__ = version = '1.0.7'
+__version_tuple__ = version_tuple = (1, 0, 7)
 
 __commit_id__ = commit_id = None
cache_dit/cache_factory/block_adapters/__init__.py
CHANGED
@@ -577,3 +577,47 @@ def hunyuanditpag_adapter(pipe, **kwargs) -> BlockAdapter:
         patch_functor=HunyuanDiTPatchFunctor(),
         **kwargs,
     )
+
+
+@BlockAdapterRegistry.register("Kandinsky5")
+def kandinsky5_adapter(pipe, **kwargs) -> BlockAdapter:
+    try:
+        from diffusers import Kandinsky5Transformer3DModel
+
+        assert isinstance(pipe.transformer, Kandinsky5Transformer3DModel)
+        return BlockAdapter(
+            pipe=pipe,
+            transformer=pipe.transformer,
+            blocks=pipe.transformer.visual_transformer_blocks,
+            forward_pattern=ForwardPattern.Pattern_3,  # or Pattern_2
+            has_separate_cfg=True,
+            check_forward_pattern=False,
+            check_num_outputs=False,
+            **kwargs,
+        )
+    except ImportError:
+        raise ImportError(
+            "Kandinsky5Transformer3DModel is not available in the current diffusers version. "
+            "Please upgrade diffusers>=0.36.dev0 to use this adapter."
+        )
+
+
+@BlockAdapterRegistry.register("PRX")
+def prx_adapter(pipe, **kwargs) -> BlockAdapter:
+    try:
+        from diffusers import PRXTransformer2DModel
+
+        assert isinstance(pipe.transformer, PRXTransformer2DModel)
+        return BlockAdapter(
+            pipe=pipe,
+            transformer=pipe.transformer,
+            blocks=pipe.transformer.blocks,
+            forward_pattern=ForwardPattern.Pattern_3,
+            check_num_outputs=False,
+            **kwargs,
+        )
+    except ImportError:
+        raise ImportError(
+            "PRXTransformer2DModel is not available in the current diffusers version. "
+            "Please upgrade diffusers>=0.36.dev0 to use this adapter."
+        )
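Note: the two adapters above register Kandinsky5 and PRX transformers with `BlockAdapterRegistry`, so `cache_dit.enable_cache()` can resolve them automatically. A minimal usage sketch follows; the checkpoint id and pipeline loading are illustrative assumptions, the diff only confirms the transformer classes and the diffusers>=0.36.dev0 requirement.

```python
# Sketch: one-line cache enabling for a pipeline whose transformer is
# Kandinsky5Transformer3DModel (or PRXTransformer2DModel). The model id is hypothetical.
import torch
from diffusers import DiffusionPipeline
import cache_dit

pipe = DiffusionPipeline.from_pretrained(
    "your/kandinsky5-checkpoint",  # hypothetical checkpoint id
    torch_dtype=torch.bfloat16,
).to("cuda")

# The registered "Kandinsky5" adapter supplies the blocks, forward pattern and CFG handling.
cache_dit.enable_cache(pipe)
```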
cache_dit/cache_factory/cache_adapters/cache_adapter.py
CHANGED
@@ -614,6 +614,18 @@ class CachedAdapter:
            pipe_or_adapter, remove_stats, remove_stats, remove_stats
        )
 
+        # maybe release parallelism stats
+        from cache_dit.parallelism.parallel_interface import (
+            remove_parallelism_stats,
+        )
+
+        cls.release_hooks(
+            pipe_or_adapter,
+            remove_parallelism_stats,
+            remove_parallelism_stats,
+            remove_parallelism_stats,
+        )
+
     @classmethod
     def release_hooks(
         cls,
cache_dit/cache_factory/cache_blocks/pattern_base.py
CHANGED
@@ -139,14 +139,9 @@ class CachedBlocks_Pattern_Base(torch.nn.Module):
                *args,
                **kwargs,
            )
-
-            hidden_states, encoder_hidden_states
-
-            hidden_states, encoder_hidden_states = (
-                encoder_hidden_states,
-                hidden_states,
-            )
-
+            hidden_states, encoder_hidden_states = self._process_block_outputs(
+                hidden_states, encoder_hidden_states
+            )
         return hidden_states, encoder_hidden_states
 
     @torch.compiler.disable
cache_dit/cache_factory/cache_interface.py
CHANGED
@@ -9,6 +9,8 @@ from cache_dit.cache_factory.cache_contexts import DBCacheConfig
 from cache_dit.cache_factory.cache_contexts import DBPruneConfig
 from cache_dit.cache_factory.cache_contexts import CalibratorConfig
 from cache_dit.cache_factory.params_modifier import ParamsModifier
+from cache_dit.parallelism import ParallelismConfig
+from cache_dit.parallelism import enable_parallelism
 
 from cache_dit.logger import init_logger
 
@@ -37,6 +39,8 @@ def enable_cache(
            List[List[ParamsModifier]],
        ]
    ] = None,
+    # Config for Parallelism
+    parallelism_config: Optional[ParallelismConfig] = None,
    # Other cache context kwargs: Deprecated cache kwargs
    **kwargs,
 ) -> Union[
@@ -127,6 +131,15 @@
    **kwargs: (`dict`, *optional*, defaults to {}):
        The same as 'kwargs' param in cache_dit.enable_cache() interface.
 
+    parallelism_config (`ParallelismConfig`, *optional*, defaults to None):
+        Config for Parallelism. If parallelism_config is not None, it means the user wants to enable
+        parallelism for cache-dit. Please check https://github.com/vipshop/cache-dit/blob/main/src/cache_dit/parallelism/parallel_config.py
+        for more details of ParallelismConfig.
+        ulysses_size: (`int`, *optional*, defaults to None):
+            The size of Ulysses cluster. If ulysses_size is not None, enable Ulysses style parallelism.
+        ring_size: (`int`, *optional*, defaults to None):
+            The size of ring for ring parallelism. If ring_size is not None, enable ring attention.
+
    kwargs (`dict`, *optional*, defaults to {})
        Other cache context kwargs, please check https://github.com/vipshop/cache-dit/blob/main/src/cache_dit/cache_factory/cache_contexts/cache_context.py
        for more details.
@@ -214,7 +227,7 @@
        context_kwargs["params_modifiers"] = params_modifiers
 
    if isinstance(pipe_or_adapter, (DiffusionPipeline, BlockAdapter)):
-
+        pipe_or_adapter = CachedAdapter.apply(
            pipe_or_adapter,
            **context_kwargs,
        )
@@ -225,6 +238,27 @@
            "for the 1's position param: pipe_or_adapter"
        )
 
+    # NOTE: Users should always enable parallelism after applying
+    # cache to avoid hooks conflict.
+    if parallelism_config is not None:
+        assert isinstance(
+            parallelism_config, ParallelismConfig
+        ), "parallelism_config should be of type ParallelismConfig."
+        if isinstance(pipe_or_adapter, DiffusionPipeline):
+            transformer = pipe_or_adapter.transformer
+        else:
+            assert BlockAdapter.assert_normalized(pipe_or_adapter)
+            assert (
+                len(BlockAdapter.flatten(pipe_or_adapter.transformer)) == 1
+            ), (
+                "Only single transformer is supported to enable parallelism "
+                "currently for BlockAdapter."
+            )
+            transformer = BlockAdapter.flatten(pipe_or_adapter.transformer)[0]
+        # Enable parallelism for the transformer inplace
+        transformer = enable_parallelism(transformer, parallelism_config)
+    return pipe_or_adapter
+
 
 def disable_cache(
    pipe_or_adapter: Union[
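Note: with the new `parallelism_config` argument, caching hooks are applied first and `enable_parallelism()` is then called in place on the single transformer. A hedged usage sketch, assuming a standard diffusers pipeline with one `.transformer` and a multi-process launch (e.g. torchrun); cache settings are left at their defaults.

```python
# Sketch: hybrid cache + context parallelism in one enable_cache() call.
import cache_dit
from cache_dit import ParallelismBackend, ParallelismConfig

cache_dit.enable_cache(
    pipe,  # a loaded DiffusionPipeline
    parallelism_config=ParallelismConfig(
        backend=ParallelismBackend.NATIVE_DIFFUSER,  # the default backend
        ulysses_size=2,  # Ulysses-style context parallelism over 2 ranks
    ),
)
```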
cache_dit/parallelism/backends/parallel_difffusers.py
ADDED
@@ -0,0 +1,73 @@
+import torch
+
+from typing import Optional
+from cache_dit.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+try:
+    from diffusers import ContextParallelConfig
+
+    def native_diffusers_parallelism_available() -> bool:
+        return True
+
+except ImportError:
+    ContextParallelConfig = None
+
+    def native_diffusers_parallelism_available() -> bool:
+        return False
+
+
+from diffusers.models.modeling_utils import ModelMixin
+from cache_dit.parallelism.parallel_backend import ParallelismBackend
+from cache_dit.parallelism.parallel_config import ParallelismConfig
+
+
+def maybe_enable_parallelism(
+    transformer: torch.nn.Module,
+    parallelism_config: Optional[ParallelismConfig],
+) -> torch.nn.Module:
+    assert isinstance(transformer, ModelMixin), (
+        "transformer must be an instance of diffusers' ModelMixin, "
+        f"but got {type(transformer)}"
+    )
+    if parallelism_config is None:
+        return transformer
+
+    assert isinstance(parallelism_config, ParallelismConfig), (
+        "parallelism_config must be an instance of ParallelismConfig"
+        f" but got {type(parallelism_config)}"
+    )
+
+    if (
+        parallelism_config.backend == ParallelismBackend.NATIVE_DIFFUSER
+        and native_diffusers_parallelism_available()
+    ):
+        cp_config = None
+        if (
+            parallelism_config.ulysses_size is not None
+            or parallelism_config.ring_size is not None
+        ):
+            cp_config = ContextParallelConfig(
+                ulysses_degree=parallelism_config.ulysses_size,
+                ring_degree=parallelism_config.ring_size,
+            )
+        if cp_config is not None:
+            if hasattr(transformer, "enable_parallelism"):
+                if hasattr(transformer, "set_attention_backend"):
+                    # Now only _native_cudnn is supported for parallelism
+                    # issue: https://github.com/huggingface/diffusers/pull/12443
+                    transformer.set_attention_backend("_native_cudnn")
+                    logger.warning(
+                        "Set attention backend to _native_cudnn for parallelism because of "
+                        "the issue: https://github.com/huggingface/diffusers/pull/12443"
+                    )
+
+                transformer.enable_parallelism(config=cp_config)
+            else:
+                raise ValueError(
+                    f"{transformer.__class__.__name__} does not support context parallelism."
+                )
+
+    return transformer
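Note: this backend simply maps `ParallelismConfig.ulysses_size`/`ring_size` onto diffusers' `ContextParallelConfig` and calls `transformer.enable_parallelism(...)`, pinning the attention backend to `_native_cudnn`. A sketch of calling it directly, assuming diffusers>=0.36.dev0 and an already-initialized distributed launch (e.g. torchrun) so the parallel group can actually be formed:

```python
# Sketch: using the Native_Diffuser backend on a bare transformer module.
from cache_dit.parallelism.backends.parallel_difffusers import (
    maybe_enable_parallelism,
)
from cache_dit.parallelism.parallel_config import ParallelismConfig

transformer = maybe_enable_parallelism(
    pipe.transformer,                   # any diffusers ModelMixin transformer
    ParallelismConfig(ulysses_size=2),  # or ring_size=..., or both
)
```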
cache_dit/parallelism/parallel_backend.py
ADDED
@@ -0,0 +1,18 @@
+from enum import Enum
+
+
+class ParallelismBackend(Enum):
+    NATIVE_DIFFUSER = "Native_Diffuser"
+    NATIVE_PYTORCH = "Native_PyTorch"
+    NONE = "None"
+
+    @classmethod
+    def is_supported(cls, backend: "ParallelismBackend") -> bool:
+        # Now, only Native_Diffuser backend is supported
+        if backend in [cls.NATIVE_DIFFUSER]:
+            try:
+                import diffusers  # noqa: F401
+            except ImportError:
+                return False
+            return True
+        return False
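Note: `is_supported` currently reports True only for the `NATIVE_DIFFUSER` backend, and only when diffusers can be imported; for example:

```python
# Sketch: probing backend availability before building a ParallelismConfig.
from cache_dit.parallelism.parallel_backend import ParallelismBackend

print(ParallelismBackend.is_supported(ParallelismBackend.NATIVE_DIFFUSER))  # True iff diffusers imports
print(ParallelismBackend.is_supported(ParallelismBackend.NATIVE_PYTORCH))   # False for now
```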
cache_dit/parallelism/parallel_config.py
ADDED
@@ -0,0 +1,47 @@
+import dataclasses
+from cache_dit.parallelism.parallel_backend import ParallelismBackend
+from cache_dit.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+@dataclasses.dataclass
+class ParallelismConfig:
+    # Parallelism backend, defaults to NATIVE_DIFFUSER
+    backend: ParallelismBackend = ParallelismBackend.NATIVE_DIFFUSER
+    # Context parallelism config
+    # ulysses_size (`int`, *optional*):
+    #   The degree of ulysses parallelism.
+    ulysses_size: int = None
+    # ring_size (`int`, *optional*):
+    #   The degree of ring parallelism.
+    ring_size: int = None
+    # Tensor parallelism config
+    # tp_size (`int`, *optional*):
+    #   The degree of tensor parallelism.
+    tp_size: int = None
+
+    def __post_init__(self):
+        assert ParallelismBackend.is_supported(self.backend), (
+            f"Parallel backend {self.backend} is not supported. "
+            f"Please make sure the required packages are installed."
+        )
+        assert self.tp_size is None, "Tensor parallelism is not supported yet."
+
+    def strify(self, details: bool = False) -> str:
+        if details:
+            return (
+                f"ParallelismConfig(backend={self.backend}, "
+                f"ulysses_size={self.ulysses_size}, "
+                f"ring_size={self.ring_size}, "
+                f"tp_size={self.tp_size})"
+            )
+        else:
+            parallel_str = ""
+            if self.ulysses_size is not None:
+                parallel_str += f"Ulysses{self.ulysses_size}"
+            if self.ring_size is not None:
+                parallel_str += f"Ring{self.ring_size}"
+            if self.tp_size is not None:
+                parallel_str += f"TP{self.tp_size}"
+            return parallel_str
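Note: `strify()` is what the stats utilities use to label a run; per the logic above it produces, for example:

```python
# Sketch: the short and detailed string forms produced by ParallelismConfig.strify().
from cache_dit.parallelism.parallel_config import ParallelismConfig

cfg = ParallelismConfig(ulysses_size=2, ring_size=2)
print(cfg.strify())      # "Ulysses2Ring2"
print(cfg.strify(True))  # roughly "ParallelismConfig(backend=..., ulysses_size=2, ring_size=2, tp_size=None)"
```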
cache_dit/parallelism/parallel_interface.py
ADDED
@@ -0,0 +1,62 @@
+import torch
+from cache_dit.parallelism.parallel_backend import ParallelismBackend
+from cache_dit.parallelism.parallel_config import ParallelismConfig
+from cache_dit.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def enable_parallelism(
+    transformer: torch.nn.Module,
+    parallelism_config: ParallelismConfig,
+) -> torch.nn.Module:
+    assert isinstance(transformer, torch.nn.Module), (
+        "transformer must be an instance of torch.nn.Module, "
+        f"but got {type(transformer)}"
+    )
+    if getattr(transformer, "_is_parallelized", False):
+        logger.warning(
+            "The transformer is already parallelized. "
+            "Skipping parallelism enabling."
+        )
+        return transformer
+
+    if parallelism_config.backend == ParallelismBackend.NATIVE_DIFFUSER:
+        from cache_dit.parallelism.backends.parallel_difffusers import (
+            maybe_enable_parallelism,
+            native_diffusers_parallelism_available,
+        )
+
+        assert (
+            native_diffusers_parallelism_available()
+        ), "Please install diffusers>=0.36.dev0 to use Native_Diffuser backend."
+        transformer = maybe_enable_parallelism(
+            transformer,
+            parallelism_config,
+        )
+    else:
+        raise ValueError(
+            f"Parallel backend {parallelism_config.backend} is not supported yet."
+        )
+
+    transformer._is_parallelized = True  # type: ignore[attr-defined]
+    transformer._parallelism_config = parallelism_config  # type: ignore[attr-defined]
+    logger.info(f"Enabled parallelism: {parallelism_config.strify(True)}")
+    return transformer
+
+
+def remove_parallelism_stats(
+    transformer: torch.nn.Module,
+) -> torch.nn.Module:
+    if not getattr(transformer, "_is_parallelized", False):
+        logger.warning(
+            "The transformer is not parallelized. "
+            "Skipping removing parallelism."
+        )
+        return transformer
+
+    if hasattr(transformer, "_is_parallelized"):
+        del transformer._is_parallelized  # type: ignore[attr-defined]
+    if hasattr(transformer, "_parallelism_config"):
+        del transformer._parallelism_config  # type: ignore[attr-defined]
+    return transformer
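Note: `enable_parallelism()` tags the module with `_is_parallelized` and `_parallelism_config` so that summaries and the cache release hooks (see cache_adapter.py above) can find them later; `remove_parallelism_stats()` simply clears those tags. A sketch, under the same distributed-launch assumption as before:

```python
# Sketch: the bookkeeping attributes set and cleared by the parallel interface.
from cache_dit.parallelism.parallel_interface import (
    enable_parallelism,
    remove_parallelism_stats,
)
from cache_dit.parallelism.parallel_config import ParallelismConfig

transformer = enable_parallelism(pipe.transformer, ParallelismConfig(ring_size=2))
print(transformer._is_parallelized)              # True
print(transformer._parallelism_config.strify())  # "Ring2"

remove_parallelism_stats(transformer)            # removes both attributes again
```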
cache_dit/quantize/quantize_ao.py
CHANGED
@@ -80,26 +80,19 @@ def quantize_ao(
 
            return False
 
-    def
+    def _quant_config():
        try:
            if quant_type == "fp8_w8a8_dq":
-
-
-
-
-
-                    )
-                except ImportError:
-                    from torchao.quantization import (
-                        Float8DynamicActivationFloat8WeightConfig as float8_dynamic_activation_float8_weight,
-                        PerTensor,
-                        PerRow,
-                    )
+                from torchao.quantization import (
+                    Float8DynamicActivationFloat8WeightConfig,
+                    PerTensor,
+                    PerRow,
+                )
 
                if per_row:  # Ensure bfloat16
                    module.to(torch.bfloat16)
 
-
+                quant_config = Float8DynamicActivationFloat8WeightConfig(
                    weight_dtype=kwargs.get(
                        "weight_dtype",
                        torch.float8_e4m3fn,
@@ -116,14 +109,9 @@
                )
 
            elif quant_type == "fp8_w8a16_wo":
-
-                    from torchao.quantization import float8_weight_only
-                except ImportError:
-                    from torchao.quantization import (
-                        Float8WeightOnlyConfig as float8_weight_only,
-                    )
+                from torchao.quantization import Float8WeightOnlyConfig
 
-
+                quant_config = Float8WeightOnlyConfig(
                    weight_dtype=kwargs.get(
                        "weight_dtype",
                        torch.float8_e4m3fn,
@@ -131,69 +119,44 @@
                )
 
            elif quant_type == "int8_w8a8_dq":
-
-
-
-                    )
-                except ImportError:
-                    from torchao.quantization import (
-                        Int8DynamicActivationInt8WeightConfig as int8_dynamic_activation_int8_weight,
-                    )
+                from torchao.quantization import (
+                    Int8DynamicActivationInt8WeightConfig,
+                )
 
-
+                quant_config = Int8DynamicActivationInt8WeightConfig()
 
            elif quant_type == "int8_w8a16_wo":
 
-
-                    from torchao.quantization import int8_weight_only
-                except ImportError:
-                    from torchao.quantization import (
-                        Int8WeightOnlyConfig as int8_weight_only,
-                    )
+                from torchao.quantization import Int8WeightOnlyConfig
 
-
+                quant_config = Int8WeightOnlyConfig(
                    # group_size is None -> per_channel, else per group
                    group_size=kwargs.get("group_size", None),
                )
 
            elif quant_type == "int4_w4a8_dq":
 
-
-
-
-                    )
-                except ImportError:
-                    from torchao.quantization import (
-                        Int8DynamicActivationInt4WeightConfig as int8_dynamic_activation_int4_weight,
-                    )
+                from torchao.quantization import (
+                    Int8DynamicActivationInt4WeightConfig,
+                )
 
-
+                quant_config = Int8DynamicActivationInt4WeightConfig(
                    group_size=kwargs.get("group_size", 32),
                )
 
            elif quant_type == "int4_w4a4_dq":
 
-
-
-
-                    )
-                except ImportError:
-                    from torchao.quantization import (
-                        Int4DynamicActivationInt4WeightConfig as int4_dynamic_activation_int4_weight,
-                    )
+                from torchao.quantization import (
+                    Int4DynamicActivationInt4WeightConfig,
+                )
 
-
+                quant_config = Int4DynamicActivationInt4WeightConfig()
 
            elif quant_type == "int4_w4a16_wo":
 
-
-                    from torchao.quantization import int4_weight_only
-                except ImportError:
-                    from torchao.quantization import (
-                        Int4WeightOnlyConfig as int4_weight_only,
-                    )
+                from torchao.quantization import Int4WeightOnlyConfig
 
-
+                quant_config = Int4WeightOnlyConfig(
                    group_size=kwargs.get("group_size", 32),
                )
 
@@ -209,13 +172,13 @@
                )
            raise e
 
-        return
+        return quant_config
 
    from torchao.quantization import quantize_
 
    quantize_(
        module,
-
+        _quant_config(),
        filter_fn=_filter_fn if filter_fn is None else filter_fn,
        device=kwargs.get("device", None),
    )
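Note: the refactor drops the old lowercase torchao quantizer helpers (and their Config-class import fallbacks) and always uses torchao's config classes, collected in a single `_quant_config()` helper that is handed to `quantize_`. A sketch of what the `fp8_w8a8_dq` branch resolves to; the `granularity=PerRow()` argument is an assumption about how the imported PerRow/PerTensor markers are applied, since the diff truncates that part.

```python
# Sketch: the torchao config-class API now used by quantize_ao for fp8_w8a8_dq.
import torch
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    PerRow,
    quantize_,
)

module = pipe.transformer.to(torch.bfloat16)  # per-row FP8 expects bfloat16 weights
quantize_(
    module,
    Float8DynamicActivationFloat8WeightConfig(
        weight_dtype=torch.float8_e4m3fn,
        granularity=PerRow(),  # assumption: PerRow/PerTensor select the scaling granularity
    ),
)
```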
cache_dit/utils.py
CHANGED
@@ -13,6 +13,7 @@ from cache_dit.cache_factory import CacheType
 from cache_dit.cache_factory import BlockAdapter
 from cache_dit.cache_factory import BasicCacheConfig
 from cache_dit.cache_factory import CalibratorConfig
+from cache_dit.parallelism import ParallelismConfig
 from cache_dit.logger import init_logger
 
 
@@ -55,6 +56,8 @@ class CacheStats:
    cfg_pruned_blocks: list[int] = dataclasses.field(default_factory=list)
    cfg_actual_blocks: list[int] = dataclasses.field(default_factory=list)
    cfg_pruned_ratio: float = None
+    # Parallelism Stats
+    parallelism_config: ParallelismConfig = None
 
 
 def summary(
@@ -180,6 +183,8 @@ def strify(
    cached_steps = None
    cache_type = cache_options.get("cache_type", CacheType.NONE)
 
+    stats = None
+
    if cache_type == CacheType.NONE:
        return "NONE"
    else:
@@ -213,7 +218,15 @@
            return calibrator_config.strify()
        return "T0O0"
 
-
+    def parallelism_str():
+        if stats is None:
+            return ""
+        parallelism_config: ParallelismConfig = stats.parallelism_config
+        if parallelism_config is not None:
+            return f"_{parallelism_config.strify()}"
+        return ""
+
+    cache_type_str = f"{cache_str()}_{calibrator_str()}{parallelism_str()}"
 
    if cached_steps:
        cache_type_str += f"_S{cached_steps}"
@@ -252,6 +265,17 @@ def _summary(
        if logging:
            logger.warning(f"Can't find Context Options for: {cls_name}")
 
+    if hasattr(module, "_parallelism_config"):
+        parallelism_config: ParallelismConfig = module._parallelism_config
+        cache_stats.parallelism_config = parallelism_config
+        if logging:
+            print(
+                f"\n🤖Parallelism Config: {cls_name}\n\n{parallelism_config.strify(True)}"
+            )
+    else:
+        if logging:
+            logger.warning(f"Can't find Parallelism Config for: {cls_name}")
+
    if hasattr(module, "_cached_steps"):
        cached_steps: list[int] = module._cached_steps
        residual_diffs: dict[str, list | float] = dict(module._residual_diffs)
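Note: summaries now pick up `_parallelism_config` from the transformer, and `strify()` appends the parallelism tag to the cache string. A small sketch, assuming the top-level `cache_dit.summary()`/`cache_dit.strify()` helpers are called on the cached pipeline (their exact signatures are not shown in this diff):

```python
# Sketch: inspecting parallelism stats after enable_cache(..., parallelism_config=...).
import cache_dit

cache_dit.summary(pipe)        # also prints the "🤖Parallelism Config" section
print(cache_dit.strify(pipe))  # cache/calibrator tag with a suffix such as "_Ulysses2"
```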
{cache_dit-1.0.5.dist-info → cache_dit-1.0.7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cache_dit
-Version: 1.0.5
+Version: 1.0.7
 Summary: A Unified, Flexible and Training-free Cache Acceleration Framework for 🤗Diffusers.
 Author: DefTruth, vipshop.com, etc.
 Maintainer: DefTruth, vipshop.com, etc
@@ -45,13 +45,15 @@ Dynamic: provides-extra
 Dynamic: requires-dist
 Dynamic: requires-python
 
-
+📚English | <a href="./README_CN.md">📚中文阅读 </a>
 
 <div align="center">
  <img src=https://github.com/vipshop/cache-dit/raw/main/assets/cache-dit-logo.png height="120">
 <p align="center">
 A <b>Unified</b>, Flexible and Training-free <b>Cache Acceleration</b> Framework for <b>🤗Diffusers</b> <br>
-♥️ Cache Acceleration with <b>One-line</b> Code ~ ♥️
+♥️ Cache Acceleration with <b>One-line</b> Code ~ ♥️ <br>
+🔥<b><a href="./docs/User_Guide.md">DBCache</a> | <a href="./docs/User_Guide.md">DBPrune</a> | <a href="./docs/User_Guide.md">Hybrid TaylorSeer</a> | <a href="./docs/User_Guide.md">Hybrid Cache CFG</a></b>🔥 <br>
+🔥<b><a href="./docs/User_Guide.md">Hybrid Context Paralleism</a> | <a href="./docs/User_Guide.md">Diffusers Native</a> | <a href="./docs/User_Guide.md">SOTA</a></b>🔥
 </p>
 <div align='center'>
 <img src=https://img.shields.io/badge/Language-Python-brightgreen.svg >
@@ -194,7 +196,7 @@ You can install the stable release of cache-dit from PyPI, or the latest develop
 - **[🎉Easy New Model Integration](./docs/User_Guide.md#automatic-block-adapter)**: Features like **Unified Cache APIs**, **Forward Pattern Matching**, **Automatic Block Adapter**, **Hybrid Forward Pattern**, and **Patch Functor** make it highly functional and flexible. For example, we achieved 🎉 Day 1 support for [HunyuanImage-2.1](https://github.com/Tencent-Hunyuan/HunyuanImage-2.1) with 1.7x speedup w/o precision loss—even before it was available in the Diffusers library.
 - **[🎉State-of-the-Art Performance](./bench/)**: Compared with algorithms including Δ-DiT, Chipmunk, FORA, DuCa, TaylorSeer and FoCa, cache-dit achieved the **SOTA** performance w/ **7.4x↑🎉** speedup on ClipScore!
 - **[🎉Support for 4/8-Steps Distilled Models](./bench/)**: Surprisingly, cache-dit's **DBCache** works for extremely few-step distilled models—something many other methods fail to do.
-- **[🎉Compatibility with Other Optimizations](./docs/User_Guide.md#️torch-compile)**: Designed to work seamlessly with torch.compile,
+- **[🎉Compatibility with Other Optimizations](./docs/User_Guide.md#️torch-compile)**: Designed to work seamlessly with torch.compile, Quantization ([torchao](./examples/quantize/), [🔥nunchaku](./examples/quantize/)), CPU or Sequential Offloading, **[🔥Context Parallelism](./docs/User_Guide.md/#️hybrid-context-parallelism)**, Tensor Parallelism, etc.
 - **[🎉Hybrid Cache Acceleration](./docs/User_Guide.md#taylorseer-calibrator)**: Now supports hybrid **Block-wise Cache + Calibrator** schemes (e.g., DBCache or DBPrune + TaylorSeerCalibrator). DBCache or DBPrune acts as the **Indicator** to decide *when* to cache, while the Calibrator decides *how* to cache. More mainstream cache acceleration algorithms (e.g., FoCa) will be supported in the future, along with additional benchmarks—stay tuned for updates!
 - **[🤗Diffusers Ecosystem Integration](https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit)**: 🔥**cache-dit** has joined the Diffusers community ecosystem as the **first** DiT-specific cache acceleration framework! Check out the documentation here: <a href="https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit"><img src=https://img.shields.io/badge/🤗Diffusers-ecosystem-yellow.svg ></a>
@@ -202,12 +204,12 @@ You can install the stable release of cache-dit from PyPI, or the latest develop
 
 ## 🔥Important News
 
+- 2025.10.20: 🔥Now cache-dit supported the **[Hybrid Cache + Context Parallelism](./docs/User_Guide.md/#️hybrid-context-parallelism)** scheme!🔥
+- 2025.10.16: 🎉cache-dit + [**🔥nunchaku 4-bits**](https://github.com/nunchaku-tech/nunchaku) supported: [Qwen-Image-Lightning 4/8 steps](./examples/quantize/).
 - 2025.10.15: 🎉cache-dit now supported [**🔥nunchaku**](https://github.com/nunchaku-tech/nunchaku): Qwen-Image/FLUX.1 [4-bits examples](./examples/quantize/)
 - 2025.10.13: 🎉cache-dit achieved the **SOTA** performance w/ **7.4x↑🎉** speedup on ClipScore!
 - 2025.10.10: 🔥[**Qwen-Image-ControlNet-Inpainting**](https://huggingface.co/InstantX/Qwen-Image-ControlNet-Inpainting) **2.3x↑🎉** speedup! Check the [example](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image_controlnet_inpaint.py).
 - 2025.09.26: 🔥[**Qwen-Image-Edit-Plus(2509)**](https://github.com/QwenLM/Qwen-Image) **2.1x↑🎉** speedup! Please check the [example](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image_edit_plus.py).
-- 2025.09.25: 🎉The **first API-stable version (v1.0.0)** of cache-dit has finally been released!
-- 2025.09.25: 🔥**cache-dit** has joined the Diffusers community ecosystem: <a href="https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit"><img src=https://img.shields.io/badge/🤗Diffusers-ecosystem-yellow.svg ></a>
 - 2025.09.10: 🎉Day 1 support [**HunyuanImage-2.1**](https://github.com/Tencent-Hunyuan/HunyuanImage-2.1) with **1.7x↑🎉** speedup! Check this [example](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_hunyuan_image_2.1.py).
 - 2025.09.08: 🔥[**Qwen-Image-Lightning**](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image_lightning.py) **7.1/3.5 steps🎉** inference with **[DBCache: F16B16](https://github.com/vipshop/cache-dit)**.
 - 2025.09.03: 🎉[**Wan2.2-MoE**](https://github.com/Wan-Video) **2.4x↑🎉** speedup! Please refer to [run_wan_2.2.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_wan_2.2.py) as an example.
@@ -217,6 +219,8 @@ You can install the stable release of cache-dit from PyPI, or the latest develop
 <details>
 <summary>Previous News</summary>
 
+- 2025.09.25: 🎉The **first API-stable version (v1.0.0)** of cache-dit has finally been released!
+- 2025.09.25: 🔥**cache-dit** has joined the Diffusers community ecosystem: <a href="https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit"><img src=https://img.shields.io/badge/🤗Diffusers-ecosystem-yellow.svg ></a>
 - 2025.09.08: 🎉First caching mechanism in [Wan2.2](https://github.com/Wan-Video/Wan2.2) with **[cache-dit](https://github.com/vipshop/cache-dit)**, check this [PR](https://github.com/Wan-Video/Wan2.2/pull/127) for more details.
 - 2025.09.08: 🎉First caching mechanism in [Qwen-Image-Lightning](https://github.com/ModelTC/Qwen-Image-Lightning) with **[cache-dit](https://github.com/vipshop/cache-dit)**, check this [PR](https://github.com/ModelTC/Qwen-Image-Lightning/pull/35).
 - 2025.08.10: 🔥[**FLUX.1-Kontext-dev**](https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev) is supported! Please refer [run_flux_kontext.py](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_flux_kontext.py) as an example.
@@ -239,14 +243,15 @@ For more advanced features such as **Unified Cache APIs**, **Forward Pattern Mat
 - [📚Forward Pattern Matching](./docs/User_Guide.md#forward-pattern-matching)
 - [📚Cache with One-line Code](./docs/User_Guide.md#%EF%B8%8Fcache-acceleration-with-one-line-code)
 - [🔥Automatic Block Adapter](./docs/User_Guide.md#automatic-block-adapter)
-- [📚
+- [📚Hybrid Forward Pattern](./docs/User_Guide.md#hybrid-forward-pattern)
 - [📚Implement Patch Functor](./docs/User_Guide.md#implement-patch-functor)
 - [🤖Cache Acceleration Stats](./docs/User_Guide.md#cache-acceleration-stats-summary)
 - [⚡️DBCache: Dual Block Cache](./docs/User_Guide.md#️dbcache-dual-block-cache)
 - [⚡️DBPrune: Dynamic Block Prune](./docs/User_Guide.md#️dbprune-dynamic-block-prune)
-- [🔥TaylorSeer Calibrator](./docs/User_Guide.md#taylorseer-calibrator)
 - [⚡️Hybrid Cache CFG](./docs/User_Guide.md#️hybrid-cache-cfg)
-- [
+- [🔥Hybrid TaylorSeer Calibrator](./docs/User_Guide.md#taylorseer-calibrator)
+- [⚡️Hybrid Context Parallelism](./docs/User_Guide.md#context-paralleism)
+- [🛠Metrics Command Line](./docs/User_Guide.md#metrics-cli)
 - [⚙️Torch Compile](./docs/User_Guide.md#️torch-compile)
 - [📚API Documents](./docs/User_Guide.md#api-documentation)
@@ -268,7 +273,7 @@ How to contribute? Star ⭐️ this repo to support us or check [CONTRIBUTE.md](
 
 ## 🎉Projects Using CacheDiT
 
-Here is a curated list of open-source projects integrating **CacheDiT**, including popular repositories like [jetson-containers](https://github.com/dusty-nv/jetson-containers/blob/master/packages/diffusion/cache_edit/build.sh) , [flux-fast](https://github.com/huggingface/flux-fast) , and [sdnext](https://github.com/vladmandic/sdnext/
+Here is a curated list of open-source projects integrating **CacheDiT**, including popular repositories like [jetson-containers](https://github.com/dusty-nv/jetson-containers/blob/master/packages/diffusion/cache_edit/build.sh) , [flux-fast](https://github.com/huggingface/flux-fast) , and [sdnext](https://github.com/vladmandic/sdnext/discussions/4269) . **CacheDiT** has also been **recommended** by [Wan2.2](https://github.com/Wan-Video/Wan2.2) , [Qwen-Image-Lightning](https://github.com/ModelTC/Qwen-Image-Lightning) , [Qwen-Image](https://github.com/QwenLM/Qwen-Image) , and <a href="https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit"><img src="https://img.shields.io/badge/🤗Diffusers-ecosystem-yellow.svg"></a> , among others. We would be grateful if you could let us know if you have used CacheDiT.
 
 ## ©️Acknowledgements
 
{cache_dit-1.0.5.dist-info → cache_dit-1.0.7.dist-info}/RECORD
CHANGED
@@ -1,24 +1,24 @@
-cache_dit/__init__.py,sha256=
-cache_dit/_version.py,sha256=
+cache_dit/__init__.py,sha256=HZb04M7AHCfk9DaEAGApGJ2lCM-rsP6pbsNQxsQudi0,1743
+cache_dit/_version.py,sha256=xUX1oSOk6hTPREy9SfhUBjaOBMJucMgoQViQ3e2Ce9A,704
 cache_dit/logger.py,sha256=0zsu42hN-3-rgGC_C29ms1IvVpV4_b4_SwJCKSenxBE,4304
-cache_dit/utils.py,sha256=
+cache_dit/utils.py,sha256=3NcEb324fNY0NYnrBTjsLURKQuckKeFe3V9Dfc_g4sc,17851
 cache_dit/cache_factory/.gitignore,sha256=5Cb-qT9wsTUoMJ7vACDF7ZcLpAXhi5v-xdcWSRit988,23
 cache_dit/cache_factory/__init__.py,sha256=5UjrpxLVlmjHttTL0O14fD5oU5uKI3FKYevL613ibFQ,1848
-cache_dit/cache_factory/cache_interface.py,sha256=
+cache_dit/cache_factory/cache_interface.py,sha256=244uTVx83hpCpbCDgEOydi5HqG7hKHHzEoz1ApJW6lI,14627
 cache_dit/cache_factory/cache_types.py,sha256=QnWfaS52UOXQtnoCUOwwz4ziY0dyBta6vQ6hvgtdV44,1404
 cache_dit/cache_factory/forward_pattern.py,sha256=FumlCuZ-TSmSYH0hGBHctSJ-oGLCftdZjLygqhsmdR4,2258
 cache_dit/cache_factory/params_modifier.py,sha256=2T98IbepAolWW6GwQsqUDsRzu0k65vo7BOrN3V8mKog,3606
 cache_dit/cache_factory/utils.py,sha256=S3SD6Zhexzhkqnmfo830v6oNLm8stZe32nF4VdxD_bA,2497
-cache_dit/cache_factory/block_adapters/__init__.py,sha256=
+cache_dit/cache_factory/block_adapters/__init__.py,sha256=eeBcWUMIvS-x3GcD1LNesW2SuB9V5mtwG9MoUBWHsL8,19765
 cache_dit/cache_factory/block_adapters/block_adapters.py,sha256=2TVK_KqiYXC7AKZ2s07fzdOzUoeUBc9P1SzQtLVzhf4,22249
-cache_dit/cache_factory/block_adapters/block_registers.py,sha256=
+cache_dit/cache_factory/block_adapters/block_registers.py,sha256=KU0cqtLYRlij2WvuQ6erqZbxUWkb6DjvmY_sB3o_fQM,2594
 cache_dit/cache_factory/cache_adapters/__init__.py,sha256=py71WGD3JztQ1uk6qdLVbzYcQ1rvqFidNNaQYo7tqTo,79
-cache_dit/cache_factory/cache_adapters/cache_adapter.py,sha256
+cache_dit/cache_factory/cache_adapters/cache_adapter.py,sha256=WYrgV3DKxOxttl-wEKymyKIB1Po0eW73Q2_vOlGEKdQ,24080
 cache_dit/cache_factory/cache_blocks/__init__.py,sha256=cpxzmDcUhbXcReHqaKSnWyEEbIg1H91Pz5hE3z9Xj3k,9984
 cache_dit/cache_factory/cache_blocks/offload_utils.py,sha256=wusgcqaCrwEjvv7Guy-6VXhNOgPPUrBV2sSVuRmGuvo,3513
 cache_dit/cache_factory/cache_blocks/pattern_0_1_2.py,sha256=j4bTafqU5DLQhzP_X5XwOk-QUVLWkGrX-Q6JZvBGHh0,666
 cache_dit/cache_factory/cache_blocks/pattern_3_4_5.py,sha256=2qPnXVZwpQIm2oJ-Yrn3Avqi3BcXtE2133jPIL_LhK8,19595
-cache_dit/cache_factory/cache_blocks/pattern_base.py,sha256=
+cache_dit/cache_factory/cache_blocks/pattern_base.py,sha256=uNcPZU8b8i_-re_X1xBHkSDQSacQO7Fa69vjbfAYxOA,25275
 cache_dit/cache_factory/cache_blocks/pattern_utils.py,sha256=qOxoVTlYPQzPMrR06-7_Ce_lwNg6n5pt1KQrvxzAJhE,3124
 cache_dit/cache_factory/cache_contexts/__init__.py,sha256=7uY8fX9uhpC71VNm1HH4aDIicYn-dD3kRpPQhvc9-EI,853
 cache_dit/cache_factory/cache_contexts/cache_config.py,sha256=G0PVWgckDqeyARc72Ne_0lRtO_LftsOeMERRhbh2gCA,5739
@@ -52,12 +52,17 @@ cache_dit/metrics/image_reward.py,sha256=N8HalJo1T1js0dsNb2V1KRv4kIdcm3nhx7iOXJu
 cache_dit/metrics/inception.py,sha256=pBVe2X6ylLPIXTG4-GWDM9DWnCviMJbJ45R3ulhktR0,12759
 cache_dit/metrics/lpips.py,sha256=hrHrmdM-f2B4TKDs0xLqJO5JFaYcCjq2qNIR8oCrVkc,811
 cache_dit/metrics/metrics.py,sha256=AZbQyoavE-djvyRUZ_EfCIrWSQbiWQFo7n2dhn7XptE,40466
+cache_dit/parallelism/__init__.py,sha256=dheBG5_TZCuwctviMslpAEgB-B3N8F816bE51qsw_fU,210
+cache_dit/parallelism/parallel_backend.py,sha256=js1soTMenLeAyPMsBgdI3gWcdXoqjWgBD-PuFEywMr0,508
+cache_dit/parallelism/parallel_config.py,sha256=bu24sRSzJMmH7FZqzUPTcT6tAzQ20-FAqAEvGV3Q1Fw,1733
+cache_dit/parallelism/parallel_interface.py,sha256=tsiIdHosTmRbeRg0z9q0eMQlx-7vefmSIlc56OWnuMg,2205
+cache_dit/parallelism/backends/parallel_difffusers.py,sha256=YQkCJ1yq1OomZLyRLtGMaPSNWbDeAWGx9XuObVJ_85I,2499
 cache_dit/quantize/__init__.py,sha256=kWYoMAyZgBXu9BJlZjTQ0dRffW9GqeeY9_iTkXrb70A,59
-cache_dit/quantize/quantize_ao.py,sha256=
+cache_dit/quantize/quantize_ao.py,sha256=bbEUwsrMp3bMuRw8qJZREIvCHaJRQoZyfMjlu4ImRMI,6315
 cache_dit/quantize/quantize_interface.py,sha256=2s_R7xPSKuJeFpEGeLwRxnq_CqJcBG3a3lzyW5wh-UM,1241
-cache_dit-1.0.
-cache_dit-1.0.
-cache_dit-1.0.
-cache_dit-1.0.
-cache_dit-1.0.
-cache_dit-1.0.
+cache_dit-1.0.7.dist-info/licenses/LICENSE,sha256=Dqb07Ik2dV41s9nIdMUbiRWEfDqo7-dQeRiY7kPO8PE,3769
+cache_dit-1.0.7.dist-info/METADATA,sha256=I0Vb-ZqUHblKOWwXyCyZVfcllq1lLm7ML2X7U6TJs4s,29475
+cache_dit-1.0.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+cache_dit-1.0.7.dist-info/entry_points.txt,sha256=FX2gysXaZx6NeK1iCLMcIdP8Q4_qikkIHtEmi3oWn8o,65
+cache_dit-1.0.7.dist-info/top_level.txt,sha256=ZJDydonLEhujzz0FOkVbO-BqfzO9d_VqRHmZU-3MOZo,10
+cache_dit-1.0.7.dist-info/RECORD,,
{cache_dit-1.0.5.dist-info → cache_dit-1.0.7.dist-info}/WHEEL
File without changes
{cache_dit-1.0.5.dist-info → cache_dit-1.0.7.dist-info}/entry_points.txt
File without changes
{cache_dit-1.0.5.dist-info → cache_dit-1.0.7.dist-info}/licenses/LICENSE
File without changes
{cache_dit-1.0.5.dist-info → cache_dit-1.0.7.dist-info}/top_level.txt
File without changes