cache-dit 1.0.4__py3-none-any.whl → 1.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


cache_dit/__init__.py CHANGED
@@ -4,8 +4,7 @@ except ImportError:
     __version__ = "unknown version"
     version_tuple = (0, 0, "unknown version")

-from cache_dit.utils import summary
-from cache_dit.utils import strify
+
 from cache_dit.utils import disable_print
 from cache_dit.logger import init_logger
 from cache_dit.cache_factory import load_options
@@ -28,7 +27,10 @@ from cache_dit.cache_factory import supported_pipelines
 from cache_dit.cache_factory import get_adapter
 from cache_dit.compile import set_compile_configs
 from cache_dit.quantize import quantize
-
+from cache_dit.parallelism import ParallelismBackend
+from cache_dit.parallelism import ParallelismConfig
+from cache_dit.utils import summary
+from cache_dit.utils import strify

 NONE = CacheType.NONE
 DBCache = CacheType.DBCache
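The net effect on the public API: `summary` and `strify` are still exported, just later in the module, and two parallelism names join the package root. A quick smoke test of the 1.0.6 surface (a sketch, assuming diffusers is installed so the parallelism modules import cleanly):

```python
import cache_dit

# Unchanged helpers, now re-exported after the cache/quantize/parallelism imports.
assert callable(cache_dit.summary) and callable(cache_dit.strify)

# New in 1.0.6: parallelism types at the package root.
from cache_dit import ParallelismBackend, ParallelismConfig

print(ParallelismBackend.NATIVE_DIFFUSER.value)  # "Native_Diffuser"
```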
cache_dit/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID

-__version__ = version = '1.0.4'
-__version_tuple__ = version_tuple = (1, 0, 4)
+__version__ = version = '1.0.6'
+__version_tuple__ = version_tuple = (1, 0, 6)

 __commit_id__ = commit_id = None
cache_dit/cache_factory/block_adapters/__init__.py CHANGED
@@ -12,7 +12,10 @@ def flux_adapter(pipe, **kwargs) -> BlockAdapter:
     from cache_dit.utils import is_diffusers_at_least_0_3_5

     assert isinstance(pipe.transformer, FluxTransformer2DModel)
-    if is_diffusers_at_least_0_3_5():
+    transformer_cls_name: str = pipe.transformer.__class__.__name__
+    if is_diffusers_at_least_0_3_5() and not transformer_cls_name.startswith(
+        "Nunchaku"
+    ):
         return BlockAdapter(
             pipe=pipe,
             transformer=pipe.transformer,
cache_dit/cache_factory/cache_adapters/cache_adapter.py CHANGED
@@ -1,3 +1,4 @@
+import copy
 import torch
 import unittest
 import functools
@@ -197,7 +198,6 @@ class CachedAdapter:
         flatten_contexts, contexts_kwargs = cls.modify_context_params(
             block_adapter, **context_kwargs
         )
-
         original_call = block_adapter.pipe.__class__.__call__

         @functools.wraps(original_call)
@@ -238,7 +238,7 @@ class CachedAdapter:
             block_adapter.unique_blocks_name
         )
         contexts_kwargs = [
-            context_kwargs.copy()
+            copy.deepcopy(context_kwargs)  # must deep copy
            for _ in range(
                len(flatten_contexts),
            )
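The `copy.deepcopy` here is load-bearing: the context kwargs now carry mutable config objects, and a shallow `dict.copy()` would leave every per-blocks context sharing the same `cache_config` instance, so a later per-context override would silently apply to all of them. A self-contained sketch of the failure mode, using a stand-in dataclass rather than the real `BasicCacheConfig`:

```python
import copy
import dataclasses

@dataclasses.dataclass
class Cfg:  # stand-in for BasicCacheConfig
    Fn: int = 8

kwargs = {"cache_config": Cfg()}

shallow = [kwargs.copy() for _ in range(2)]
shallow[0]["cache_config"].Fn = 4           # mutates the shared Cfg instance
assert shallow[1]["cache_config"].Fn == 4   # the override leaked across contexts

deep = [copy.deepcopy(kwargs) for _ in range(2)]
deep[0]["cache_config"].Fn = 4
assert deep[1]["cache_config"].Fn == 8      # isolated, hence "must deep copy"
```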
@@ -259,9 +259,41 @@ class CachedAdapter:
         for i in range(
             min(len(contexts_kwargs), len(flatten_modifiers)),
         ):
-            contexts_kwargs[i].update(
-                flatten_modifiers[i]._context_kwargs,
-            )
+            if "cache_config" in flatten_modifiers[i]._context_kwargs:
+                modifier_cache_config = flatten_modifiers[
+                    i
+                ]._context_kwargs.get("cache_config", None)
+                modifier_calibrator_config = flatten_modifiers[
+                    i
+                ]._context_kwargs.get("calibrator_config", None)
+                if modifier_cache_config is not None:
+                    assert isinstance(
+                        modifier_cache_config, BasicCacheConfig
+                    ), (
+                        f"cache_config must be BasicCacheConfig, but got "
+                        f"{type(modifier_cache_config)}."
+                    )
+                    contexts_kwargs[i]["cache_config"].update(
+                        **modifier_cache_config.as_dict()
+                    )
+                if modifier_calibrator_config is not None:
+                    assert isinstance(
+                        modifier_calibrator_config, CalibratorConfig
+                    ), (
+                        f"calibrator_config must be CalibratorConfig, but got "
+                        f"{type(modifier_calibrator_config)}."
+                    )
+                    if (
+                        contexts_kwargs[i].get("calibrator_config", None)
+                        is None
+                    ):
+                        contexts_kwargs[i][
+                            "calibrator_config"
+                        ] = modifier_calibrator_config
+                    else:
+                        contexts_kwargs[i]["calibrator_config"].update(
+                            **modifier_calibrator_config.as_dict()
+                        )
             cls._config_messages(**contexts_kwargs[i])

         return flatten_contexts, contexts_kwargs
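Instead of a blind `dict.update`, a modifier's `cache_config` is now merged field-by-field into the existing config, and since `update()` skips `None` values (see the `cache_config.py` hunk below), only the fields a modifier actually sets override the base. The new `empty()`/`reset()` helpers exist for exactly this pattern. A hedged sketch, assuming `BasicCacheConfig` and `ParamsModifier` are importable from the package root as in the project README, and using `max_warmup_steps` as an illustrative field name:

```python
from cache_dit import BasicCacheConfig, ParamsModifier

# reset() nulls every field and then applies only the named overrides, so the
# field-wise merge above leaves all other base-config values untouched.
override = BasicCacheConfig().reset(max_warmup_steps=8)

params_modifiers = [
    [],                                       # transformer 0: base config as-is
    [ParamsModifier(cache_config=override)],  # transformer 1: one-field override
]
```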
@@ -582,6 +614,18 @@ class CachedAdapter:
             pipe_or_adapter, remove_stats, remove_stats, remove_stats
         )

+        # maybe release parallelism stats
+        from cache_dit.parallelism.parallel_interface import (
+            remove_parallelism_stats,
+        )
+
+        cls.release_hooks(
+            pipe_or_adapter,
+            remove_parallelism_stats,
+            remove_parallelism_stats,
+            remove_parallelism_stats,
+        )
+
     @classmethod
     def release_hooks(
         cls,
cache_dit/cache_factory/cache_contexts/cache_config.py CHANGED
@@ -60,9 +60,25 @@ class BasicCacheConfig:
     def update(self, **kwargs) -> "BasicCacheConfig":
         for key, value in kwargs.items():
             if hasattr(self, key):
-                setattr(self, key, value)
+                if value is not None:
+                    setattr(self, key, value)
         return self

+    def empty(self, **kwargs) -> "BasicCacheConfig":
+        # Set all fields to None
+        for field in dataclasses.fields(self):
+            if hasattr(self, field.name):
+                setattr(self, field.name, None)
+        if kwargs:
+            self.update(**kwargs)
+        return self
+
+    def reset(self, **kwargs) -> "BasicCacheConfig":
+        return self.empty(**kwargs)
+
+    def as_dict(self) -> dict:
+        return dataclasses.asdict(self)
+
     def strify(self) -> str:
         return (
             f"{self.cache_type}_"
cache_dit/cache_factory/cache_contexts/calibrators/__init__.py CHANGED
@@ -45,6 +45,28 @@ class CalibratorConfig:
     def to_kwargs(self) -> Dict:
         return self.calibrator_kwargs.copy()

+    def as_dict(self) -> dict:
+        return dataclasses.asdict(self)
+
+    def update(self, **kwargs) -> "CalibratorConfig":
+        for key, value in kwargs.items():
+            if hasattr(self, key):
+                if value is not None:
+                    setattr(self, key, value)
+        return self
+
+    def empty(self, **kwargs) -> "CalibratorConfig":
+        # Set all fields to None
+        for field in dataclasses.fields(self):
+            if hasattr(self, field.name):
+                setattr(self, field.name, None)
+        if kwargs:
+            self.update(**kwargs)
+        return self
+
+    def reset(self, **kwargs) -> "CalibratorConfig":
+        return self.empty(**kwargs)
+

 @dataclasses.dataclass
 class TaylorSeerCalibratorConfig(CalibratorConfig):
cache_dit/cache_factory/cache_contexts/prune_config.py CHANGED
@@ -50,12 +50,6 @@ class DBPruneConfig(BasicCacheConfig):
     # to at least 2 to reduce the VRAM usage of the calibrator.
     force_reduce_calibrator_vram: bool = False

-    def update(self, **kwargs) -> "DBPruneConfig":
-        for key, value in kwargs.items():
-            if hasattr(self, key):
-                setattr(self, key, value)
-        return self
-
     def strify(self) -> str:
         return (
             f"{self.cache_type}_"
cache_dit/cache_factory/cache_interface.py CHANGED
@@ -9,6 +9,8 @@ from cache_dit.cache_factory.cache_contexts import DBCacheConfig
 from cache_dit.cache_factory.cache_contexts import DBPruneConfig
 from cache_dit.cache_factory.cache_contexts import CalibratorConfig
 from cache_dit.cache_factory.params_modifier import ParamsModifier
+from cache_dit.parallelism import ParallelismConfig
+from cache_dit.parallelism import enable_parallelism

 from cache_dit.logger import init_logger

@@ -37,6 +39,8 @@ def enable_cache(
             List[List[ParamsModifier]],
         ]
     ] = None,
+    # Config for Parallelism
+    parallelism_config: Optional[ParallelismConfig] = None,
     # Other cache context kwargs: Deprecated cache kwargs
     **kwargs,
 ) -> Union[
@@ -127,6 +131,15 @@ def enable_cache(
         **kwargs: (`dict`, *optional*, defaults to {}):
             The same as 'kwargs' param in cache_dit.enable_cache() interface.

+        parallelism_config (`ParallelismConfig`, *optional*, defaults to None):
+            Config for Parallelism. If parallelism_config is not None, it means the user wants to enable
+            parallelism for cache-dit. Please check https://github.com/vipshop/cache-dit/blob/main/src/cache_dit/parallelism/parallel_config.py
+            for more details of ParallelismConfig.
+            ulysses_size: (`int`, *optional*, defaults to None):
+                The size of Ulysses cluster. If ulysses_size is not None, enable Ulysses style parallelism.
+            ring_size: (`int`, *optional*, defaults to None):
+                The size of ring for ring parallelism. If ring_size is not None, enable ring attention.
+
     kwargs (`dict`, *optional*, defaults to {})
         Other cache context kwargs, please check https://github.com/vipshop/cache-dit/blob/main/src/cache_dit/cache_factory/cache_contexts/cache_context.py
         for more details.
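Putting the docstring above into practice, a call might look like the sketch below. It must run under a distributed launcher such as torchrun so that a process group exists, and it needs a diffusers build that ships `ContextParallelConfig`; the pipeline and sizes are illustrative:

```python
# torchrun --nproc_per_node=4 run_flux_cp.py   (illustrative launch command)
import torch
import cache_dit
from diffusers import FluxPipeline

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

# One call applies the cache hooks first, then parallelizes the transformer,
# matching the "cache before parallelism" ordering noted in the code below.
cache_dit.enable_cache(
    pipe,
    parallelism_config=cache_dit.ParallelismConfig(ulysses_size=4),
)
```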
@@ -214,7 +227,7 @@ def enable_cache(
         context_kwargs["params_modifiers"] = params_modifiers

     if isinstance(pipe_or_adapter, (DiffusionPipeline, BlockAdapter)):
-        return CachedAdapter.apply(
+        pipe_or_adapter = CachedAdapter.apply(
             pipe_or_adapter,
             **context_kwargs,
         )
@@ -225,6 +238,27 @@ def enable_cache(
             "for the 1's position param: pipe_or_adapter"
         )

+    # NOTE: Users should always enable parallelism after applying
+    # cache to avoid hooks conflict.
+    if parallelism_config is not None:
+        assert isinstance(
+            parallelism_config, ParallelismConfig
+        ), "parallelism_config should be of type ParallelismConfig."
+        if isinstance(pipe_or_adapter, DiffusionPipeline):
+            transformer = pipe_or_adapter.transformer
+        else:
+            assert BlockAdapter.assert_normalized(pipe_or_adapter)
+            assert (
+                len(BlockAdapter.flatten(pipe_or_adapter.transformer)) == 1
+            ), (
+                "Only single transformer is supported to enable parallelism "
+                "currently for BlockAdapter."
+            )
+            transformer = BlockAdapter.flatten(pipe_or_adapter.transformer)[0]
+        # Enable parallelism for the transformer inplace
+        transformer = enable_parallelism(transformer, parallelism_config)
+    return pipe_or_adapter
+

 def disable_cache(
     pipe_or_adapter: Union[
cache_dit/parallelism/__init__.py ADDED
@@ -0,0 +1,3 @@
+from cache_dit.parallelism.parallel_backend import ParallelismBackend
+from cache_dit.parallelism.parallel_config import ParallelismConfig
+from cache_dit.parallelism.parallel_interface import enable_parallelism
cache_dit/parallelism/backends/parallel_difffusers.py ADDED
@@ -0,0 +1,56 @@
+import torch
+
+from typing import Optional
+
+try:
+    from diffusers import ContextParallelConfig
+
+    def native_diffusers_parallelism_available() -> bool:
+        return True
+
+except ImportError:
+    ContextParallelConfig = None
+
+    def native_diffusers_parallelism_available() -> bool:
+        return False
+
+
+from diffusers.models.modeling_utils import ModelMixin
+from cache_dit.parallelism.parallel_backend import ParallelismBackend
+from cache_dit.parallelism.parallel_config import ParallelismConfig
+
+
+def maybe_enable_parallelism(
+    transformer: torch.nn.Module,
+    parallelism_config: Optional[ParallelismConfig],
+) -> torch.nn.Module:
+    assert isinstance(transformer, ModelMixin)
+    if parallelism_config is None:
+        return transformer
+
+    if (
+        parallelism_config.backend == ParallelismBackend.NATIVE_DIFFUSER
+        and native_diffusers_parallelism_available()
+    ):
+        cp_config = None
+        if (
+            parallelism_config.ulysses_size is not None
+            or parallelism_config.ring_size is not None
+        ):
+            cp_config = ContextParallelConfig(
+                ulysses_degree=parallelism_config.ulysses_size,
+                ring_degree=parallelism_config.ring_size,
+            )
+        if cp_config is not None:
+            if hasattr(transformer, "enable_parallelism"):
+                if hasattr(transformer, "set_attention_backend"):  # type: ignore[attr-defined]
+                    # Now only _native_cudnn is supported for parallelism
+                    # issue: https://github.com/huggingface/diffusers/pull/12443
+                    transformer.set_attention_backend("_native_cudnn")  # type: ignore[attr-defined]
+                transformer.enable_parallelism(config=cp_config)
+            else:
+                raise ValueError(
+                    f"{transformer.__class__.__name__} does not support context parallelism."
+                )
+
+    return transformer
cache_dit/parallelism/parallel_backend.py ADDED
@@ -0,0 +1,18 @@
+from enum import Enum
+
+
+class ParallelismBackend(Enum):
+    NATIVE_DIFFUSER = "Native_Diffuser"
+    NATIVE_PYTORCH = "Native_PyTorch"
+    NONE = "None"
+
+    @classmethod
+    def is_supported(cls, backend: "ParallelismBackend") -> bool:
+        # Now, only Native_Diffuser backend is supported
+        if backend in [cls.NATIVE_DIFFUSER]:
+            try:
+                import diffusers  # noqa: F401
+            except ImportError:
+                return False
+            return True
+        return False
cache_dit/parallelism/parallel_config.py ADDED
@@ -0,0 +1,47 @@
+import dataclasses
+from cache_dit.parallelism.parallel_backend import ParallelismBackend
+from cache_dit.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+@dataclasses.dataclass
+class ParallelismConfig:
+    # Parallelism backend, defaults to NATIVE_DIFFUSER
+    backend: ParallelismBackend = ParallelismBackend.NATIVE_DIFFUSER
+    # Context parallelism config
+    # ulysses_size (`int`, *optional*):
+    #     The degree of ulysses parallelism.
+    ulysses_size: int = None
+    # ring_size (`int`, *optional*):
+    #     The degree of ring parallelism.
+    ring_size: int = None
+    # Tensor parallelism config
+    # tp_size (`int`, *optional*):
+    #     The degree of tensor parallelism.
+    tp_size: int = None
+
+    def __post_init__(self):
+        assert ParallelismBackend.is_supported(self.backend), (
+            f"Parallel backend {self.backend} is not supported. "
+            f"Please make sure the required packages are installed."
+        )
+        assert self.tp_size is None, "Tensor parallelism is not supported yet."
+
+    def strify(self, details: bool = False) -> str:
+        if details:
+            return (
+                f"ParallelismConfig(backend={self.backend}, "
+                f"ulysses_size={self.ulysses_size}, "
+                f"ring_size={self.ring_size}, "
+                f"tp_size={self.tp_size})"
+            )
+        else:
+            parallel_str = ""
+            if self.ulysses_size is not None:
+                parallel_str += f"Ulysses{self.ulysses_size}"
+            if self.ring_size is not None:
+                parallel_str += f"Ring{self.ring_size}"
+            if self.tp_size is not None:
+                parallel_str += f"TP{self.tp_size}"
+            return parallel_str
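Given the definitions above, the compact `strify()` form concatenates one tag per configured dimension; this is the suffix that later shows up in cache tags via `cache_dit.utils.strify` (see the utils.py hunks below). For example:

```python
from cache_dit.parallelism import ParallelismConfig

# __post_init__ runs the backend check, so diffusers must be importable here.
config = ParallelismConfig(ulysses_size=4, ring_size=2)
assert config.strify() == "Ulysses4Ring2"
assert "ulysses_size=4" in config.strify(details=True)
```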
cache_dit/parallelism/parallel_interface.py ADDED
@@ -0,0 +1,62 @@
+import torch
+from cache_dit.parallelism.parallel_backend import ParallelismBackend
+from cache_dit.parallelism.parallel_config import ParallelismConfig
+from cache_dit.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def enable_parallelism(
+    transformer: torch.nn.Module,
+    parallelism_config: ParallelismConfig,
+) -> torch.nn.Module:
+    assert isinstance(transformer, torch.nn.Module), (
+        "transformer must be an instance of torch.nn.Module, "
+        f"but got {type(transformer)}"
+    )
+    if getattr(transformer, "_is_parallelized", False):
+        logger.warning(
+            "The transformer is already parallelized. "
+            "Skipping parallelism enabling."
+        )
+        return transformer
+
+    if parallelism_config.backend == ParallelismBackend.NATIVE_DIFFUSER:
+        from cache_dit.parallelism.backends.parallel_difffusers import (
+            maybe_enable_parallelism,
+            native_diffusers_parallelism_available,
+        )
+
+        assert (
+            native_diffusers_parallelism_available()
+        ), "Please install diffusers>=0.36.dev0 to use Native_Diffuser backend."
+        transformer = maybe_enable_parallelism(
+            transformer,
+            parallelism_config,
+        )
+    else:
+        raise ValueError(
+            f"Parallel backend {parallelism_config.backend} is not supported yet."
+        )
+
+    transformer._is_parallelized = True  # type: ignore[attr-defined]
+    transformer._parallelism_config = parallelism_config  # type: ignore[attr-defined]
+    logger.info(f"Enabled parallelism: {parallelism_config.strify(True)}")
+    return transformer
+
+
+def remove_parallelism_stats(
+    transformer: torch.nn.Module,
+) -> torch.nn.Module:
+    if not getattr(transformer, "_is_parallelized", False):
+        logger.warning(
+            "The transformer is not parallelized. "
+            "Skipping removing parallelism."
+        )
+        return transformer
+
+    if hasattr(transformer, "_is_parallelized"):
+        del transformer._is_parallelized  # type: ignore[attr-defined]
+    if hasattr(transformer, "_parallelism_config"):
+        del transformer._parallelism_config  # type: ignore[attr-defined]
+    return transformer
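A minimal sketch of driving this interface directly, assuming a multi-GPU torchrun launch and a diffusers build new enough to provide context parallelism (the model choice is illustrative):

```python
# torchrun --nproc_per_node=2 parallel_demo.py   (illustrative launch command)
import torch
from diffusers import FluxPipeline
from cache_dit.parallelism import ParallelismConfig, enable_parallelism

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
)
pipe.transformer = enable_parallelism(
    pipe.transformer, ParallelismConfig(ulysses_size=2)
)

# Calling it twice is safe: the _is_parallelized flag short-circuits with a warning.
pipe.transformer = enable_parallelism(
    pipe.transformer, ParallelismConfig(ulysses_size=2)
)
```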
cache_dit/quantize/quantize_ao.py CHANGED
@@ -80,11 +80,11 @@ def quantize_ao(

         return False

-    def _quantization_fn():
+    def _quant_config():
        try:
            if quant_type == "fp8_w8a8_dq":
                from torchao.quantization import (
-                    float8_dynamic_activation_float8_weight,
+                    Float8DynamicActivationFloat8WeightConfig,
                    PerTensor,
                    PerRow,
                )
@@ -92,7 +92,7 @@ def quantize_ao(
                 if per_row:  # Ensure bfloat16
                     module.to(torch.bfloat16)

-                quantization_fn = float8_dynamic_activation_float8_weight(
+                quant_config = Float8DynamicActivationFloat8WeightConfig(
                     weight_dtype=kwargs.get(
                         "weight_dtype",
                         torch.float8_e4m3fn,
@@ -109,9 +109,9 @@ def quantize_ao(
                 )

             elif quant_type == "fp8_w8a16_wo":
-                from torchao.quantization import float8_weight_only
+                from torchao.quantization import Float8WeightOnlyConfig

-                quantization_fn = float8_weight_only(
+                quant_config = Float8WeightOnlyConfig(
                     weight_dtype=kwargs.get(
                         "weight_dtype",
                         torch.float8_e4m3fn,
@@ -120,39 +120,43 @@ def quantize_ao(

             elif quant_type == "int8_w8a8_dq":
                 from torchao.quantization import (
-                    int8_dynamic_activation_int8_weight,
+                    Int8DynamicActivationInt8WeightConfig,
                 )

-                quantization_fn = int8_dynamic_activation_int8_weight()
+                quant_config = Int8DynamicActivationInt8WeightConfig()

             elif quant_type == "int8_w8a16_wo":
-                from torchao.quantization import int8_weight_only

-                quantization_fn = int8_weight_only(
+                from torchao.quantization import Int8WeightOnlyConfig
+
+                quant_config = Int8WeightOnlyConfig(
                     # group_size is None -> per_channel, else per group
                     group_size=kwargs.get("group_size", None),
                 )

             elif quant_type == "int4_w4a8_dq":
+
                 from torchao.quantization import (
-                    int8_dynamic_activation_int4_weight,
+                    Int8DynamicActivationInt4WeightConfig,
                 )

-                quantization_fn = int8_dynamic_activation_int4_weight(
+                quant_config = Int8DynamicActivationInt4WeightConfig(
                     group_size=kwargs.get("group_size", 32),
                 )

             elif quant_type == "int4_w4a4_dq":
+
                 from torchao.quantization import (
-                    int4_dynamic_activation_int4_weight,
+                    Int4DynamicActivationInt4WeightConfig,
                 )

-                quantization_fn = int4_dynamic_activation_int4_weight()
+                quant_config = Int4DynamicActivationInt4WeightConfig()

             elif quant_type == "int4_w4a16_wo":
-                from torchao.quantization import int4_weight_only

-                quantization_fn = int4_weight_only(
+                from torchao.quantization import Int4WeightOnlyConfig
+
+                quant_config = Int4WeightOnlyConfig(
                     group_size=kwargs.get("group_size", 32),
                 )

@@ -168,13 +172,13 @@ def quantize_ao(
                 )
             raise e

-        return quantization_fn
+        return quant_config

    from torchao.quantization import quantize_

    quantize_(
        module,
-        _quantization_fn(),
+        _quant_config(),
        filter_fn=_filter_fn if filter_fn is None else filter_fn,
        device=kwargs.get("device", None),
    )
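These renames track torchao's migration from factory functions (`float8_weight_only(...)`) to `Config` classes (`Float8WeightOnlyConfig(...)`); the `quantize_` call pattern itself is unchanged. A minimal sketch of the new-style API on a bare module, assuming a recent torchao and a CUDA device:

```python
import torch
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    PerRow,
    quantize_,
)

# Per-row fp8 dynamic-activation / fp8-weight quantization of a single Linear.
linear = torch.nn.Linear(4096, 4096, dtype=torch.bfloat16, device="cuda")
quantize_(linear, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
```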
cache_dit/utils.py CHANGED
@@ -13,6 +13,7 @@ from cache_dit.cache_factory import CacheType
 from cache_dit.cache_factory import BlockAdapter
 from cache_dit.cache_factory import BasicCacheConfig
 from cache_dit.cache_factory import CalibratorConfig
+from cache_dit.parallelism import ParallelismConfig
 from cache_dit.logger import init_logger


@@ -55,6 +56,8 @@ class CacheStats:
     cfg_pruned_blocks: list[int] = dataclasses.field(default_factory=list)
     cfg_actual_blocks: list[int] = dataclasses.field(default_factory=list)
     cfg_pruned_ratio: float = None
+    # Parallelism Stats
+    parallelism_config: ParallelismConfig = None


 def summary(
@@ -213,7 +216,13 @@ def strify(
             return calibrator_config.strify()
         return "T0O0"

-    cache_type_str = f"{cache_str()}_{calibrator_str()}"
+    def parallelism_str():
+        parallelism_config: ParallelismConfig = stats.parallelism_config
+        if parallelism_config is not None:
+            return f"_{parallelism_config.strify()}"
+        return ""
+
+    cache_type_str = f"{cache_str()}_{calibrator_str()}{parallelism_str()}"

    if cached_steps:
        cache_type_str += f"_S{cached_steps}"
@@ -252,6 +261,17 @@ def _summary(
         if logging:
             logger.warning(f"Can't find Context Options for: {cls_name}")

+    if hasattr(module, "_parallelism_config"):
+        parallelism_config: ParallelismConfig = module._parallelism_config
+        cache_stats.parallelism_config = parallelism_config
+        if logging:
+            print(
+                f"\n🤖Parallelism Config: {cls_name}\n\n{parallelism_config.strify(True)}"
+            )
+    else:
+        if logging:
+            logger.warning(f"Can't find Parallelism Config for: {cls_name}")
+
    if hasattr(module, "_cached_steps"):
        cached_steps: list[int] = module._cached_steps
        residual_diffs: dict[str, list | float] = dict(module._residual_diffs)
{cache_dit-1.0.4.dist-info → cache_dit-1.0.6.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cache_dit
-Version: 1.0.4
+Version: 1.0.6
 Summary: A Unified, Flexible and Training-free Cache Acceleration Framework for 🤗Diffusers.
 Author: DefTruth, vipshop.com, etc.
 Maintainer: DefTruth, vipshop.com, etc
@@ -53,6 +53,10 @@ Dynamic: requires-python
 A <b>Unified</b>, Flexible and Training-free <b>Cache Acceleration</b> Framework for <b>🤗Diffusers</b> <br>
 ♥️ Cache Acceleration with <b>One-line</b> Code ~ ♥️
 </p>
+<p align="center">
+🔥<b><a href="./docs/User_Guide.md">DBCache</a> | <a href="./docs/User_Guide.md">DBPrune</a> | <a href="./docs/User_Guide.md">Hybrid TaylorSeer</a> | <a href="./docs/User_Guide.md">Hybrid Cache CFG</a></b>🔥 <br>
+🔥<b><a href="./docs/User_Guide.md">Hybrid Context Parallelism</a> | <a href="./docs/User_Guide.md">PyTorch Native</a> | <a href="./docs/User_Guide.md">SOTA</a></b>🔥
+</p>
 <div align='center'>
 <img src=https://img.shields.io/badge/Language-Python-brightgreen.svg >
 <img src=https://img.shields.io/badge/PRs-welcome-blue.svg >
@@ -194,7 +198,7 @@ You can install the stable release of cache-dit from PyPI, or the latest develop
 - **[🎉Easy New Model Integration](./docs/User_Guide.md#automatic-block-adapter)**: Features like **Unified Cache APIs**, **Forward Pattern Matching**, **Automatic Block Adapter**, **Hybrid Forward Pattern**, and **Patch Functor** make it highly functional and flexible. For example, we achieved 🎉 Day 1 support for [HunyuanImage-2.1](https://github.com/Tencent-Hunyuan/HunyuanImage-2.1) with 1.7x speedup w/o precision loss—even before it was available in the Diffusers library.
 - **[🎉State-of-the-Art Performance](./bench/)**: Compared with algorithms including Δ-DiT, Chipmunk, FORA, DuCa, TaylorSeer and FoCa, cache-dit achieved the **SOTA** performance w/ **7.4x↑🎉** speedup on ClipScore!
 - **[🎉Support for 4/8-Steps Distilled Models](./bench/)**: Surprisingly, cache-dit's **DBCache** works for extremely few-step distilled models—something many other methods fail to do.
-- **[🎉Compatibility with Other Optimizations](./docs/User_Guide.md#️torch-compile)**: Designed to work seamlessly with torch.compile, model CPU offload, sequential CPU offload, group offloading, etc.
+- **[🎉Compatibility with Other Optimizations](./docs/User_Guide.md#️torch-compile)**: Designed to work seamlessly with torch.compile, Offloading, Quantization([torchao](./examples/quantize/), [🔥nunchaku](./examples/quantize/)), [🔥Context Parallelism](./docs/User_Guide.md/#️hybrid-context-parallelism), etc.
 - **[🎉Hybrid Cache Acceleration](./docs/User_Guide.md#taylorseer-calibrator)**: Now supports hybrid **Block-wise Cache + Calibrator** schemes (e.g., DBCache or DBPrune + TaylorSeerCalibrator). DBCache or DBPrune acts as the **Indicator** to decide *when* to cache, while the Calibrator decides *how* to cache. More mainstream cache acceleration algorithms (e.g., FoCa) will be supported in the future, along with additional benchmarks—stay tuned for updates!
 - **[🤗Diffusers Ecosystem Integration](https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit)**: 🔥**cache-dit** has joined the Diffusers community ecosystem as the **first** DiT-specific cache acceleration framework! Check out the documentation here: <a href="https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit"><img src=https://img.shields.io/badge/🤗Diffusers-ecosystem-yellow.svg ></a>

@@ -202,6 +206,9 @@ You can install the stable release of cache-dit from PyPI, or the latest develop

 ## 🔥Important News

+- 2025.10.20: 🔥cache-dit now supports the [Hybrid Cache + Context Parallelism](./docs/User_Guide.md/#️hybrid-context-parallelism) scheme!🔥
+- 2025.10.16: 🎉cache-dit + [**🔥nunchaku 4-bits**](https://github.com/nunchaku-tech/nunchaku) supported: [Qwen-Image-Lightning 4/8 steps](./examples/quantize/).
+- 2025.10.15: 🎉cache-dit now supports [**🔥nunchaku**](https://github.com/nunchaku-tech/nunchaku): Qwen-Image/FLUX.1 [4-bits examples](./examples/quantize/).
 - 2025.10.13: 🎉cache-dit achieved the **SOTA** performance w/ **7.4x↑🎉** speedup on ClipScore!
 - 2025.10.10: 🔥[**Qwen-Image-ControlNet-Inpainting**](https://huggingface.co/InstantX/Qwen-Image-ControlNet-Inpainting) **2.3x↑🎉** speedup! Check the [example](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image_controlnet_inpaint.py).
 - 2025.09.26: 🔥[**Qwen-Image-Edit-Plus(2509)**](https://github.com/QwenLM/Qwen-Image) **2.1x↑🎉** speedup! Please check the [example](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_qwen_image_edit_plus.py).
@@ -243,9 +250,10 @@ For more advanced features such as **Unified Cache APIs**, **Forward Pattern Mat
 - [🤖Cache Acceleration Stats](./docs/User_Guide.md#cache-acceleration-stats-summary)
 - [⚡️DBCache: Dual Block Cache](./docs/User_Guide.md#️dbcache-dual-block-cache)
 - [⚡️DBPrune: Dynamic Block Prune](./docs/User_Guide.md#️dbprune-dynamic-block-prune)
-- [🔥TaylorSeer Calibrator](./docs/User_Guide.md#taylorseer-calibrator)
+- [🔥Hybrid TaylorSeer](./docs/User_Guide.md#taylorseer-calibrator)
 - [⚡️Hybrid Cache CFG](./docs/User_Guide.md#️hybrid-cache-cfg)
-- [🛠Metrics CLI](./docs/User_Guide.md#metrics-cli)
+- [⚡️Hybrid Context Parallelism](./docs/User_Guide.md#context-paralleism)
+- [🛠Metrics Command Line](./docs/User_Guide.md#metrics-cli)
 - [⚙️Torch Compile](./docs/User_Guide.md#️torch-compile)
 - [📚API Documents](./docs/User_Guide.md#api-documentation)

{cache_dit-1.0.4.dist-info → cache_dit-1.0.6.dist-info}/RECORD RENAMED
@@ -1,19 +1,19 @@
-cache_dit/__init__.py,sha256=JQLxwr5aqoMFp-BNR58J0i6NutbRmNXKsaRJKCZQDCg,1638
-cache_dit/_version.py,sha256=jp1Oow7okdi1HqeKIp8SmyysmUf-oq2X9syICfrgATI,704
+cache_dit/__init__.py,sha256=HZb04M7AHCfk9DaEAGApGJ2lCM-rsP6pbsNQxsQudi0,1743
+cache_dit/_version.py,sha256=r9csd7YQr6ubaa9S-K5iWXSr4c-RpLuQWy5uJw8f4MU,704
 cache_dit/logger.py,sha256=0zsu42hN-3-rgGC_C29ms1IvVpV4_b4_SwJCKSenxBE,4304
-cache_dit/utils.py,sha256=0YNFr84pxYoHOCZvnONKKXYN3PZY4kao9Tq2yEfHHR8,16986
+cache_dit/utils.py,sha256=sLJNMARd9a3dA9dmuD6pZZg5n5FslwUeAktfsL1eO4I,17781
 cache_dit/cache_factory/.gitignore,sha256=5Cb-qT9wsTUoMJ7vACDF7ZcLpAXhi5v-xdcWSRit988,23
 cache_dit/cache_factory/__init__.py,sha256=5UjrpxLVlmjHttTL0O14fD5oU5uKI3FKYevL613ibFQ,1848
-cache_dit/cache_factory/cache_interface.py,sha256=spiE7pWF80G3Y06_TKVvrmKufbAvQmyvshZZVsmb-nM,12714
+cache_dit/cache_factory/cache_interface.py,sha256=244uTVx83hpCpbCDgEOydi5HqG7hKHHzEoz1ApJW6lI,14627
 cache_dit/cache_factory/cache_types.py,sha256=QnWfaS52UOXQtnoCUOwwz4ziY0dyBta6vQ6hvgtdV44,1404
 cache_dit/cache_factory/forward_pattern.py,sha256=FumlCuZ-TSmSYH0hGBHctSJ-oGLCftdZjLygqhsmdR4,2258
 cache_dit/cache_factory/params_modifier.py,sha256=2T98IbepAolWW6GwQsqUDsRzu0k65vo7BOrN3V8mKog,3606
 cache_dit/cache_factory/utils.py,sha256=S3SD6Zhexzhkqnmfo830v6oNLm8stZe32nF4VdxD_bA,2497
-cache_dit/cache_factory/block_adapters/__init__.py,sha256=vM3aDMzPY79Tw4L0hlV2PdA3MFYomnf0eo0BGBo9P78,18087
+cache_dit/cache_factory/block_adapters/__init__.py,sha256=zs-cYacRL_hWlhUXmKc0TZNDAKzzWuznvHeuDpAmuwc,18221
 cache_dit/cache_factory/block_adapters/block_adapters.py,sha256=2TVK_KqiYXC7AKZ2s07fzdOzUoeUBc9P1SzQtLVzhf4,22249
 cache_dit/cache_factory/block_adapters/block_registers.py,sha256=2L7QeM4ygnaKQpC9PoJod0QRYyxidUKU2AYpysDCUwE,2572
 cache_dit/cache_factory/cache_adapters/__init__.py,sha256=py71WGD3JztQ1uk6qdLVbzYcQ1rvqFidNNaQYo7tqTo,79
-cache_dit/cache_factory/cache_adapters/cache_adapter.py,sha256=Za9HixVkEKldYzyDA57xvF91fm9dao2S-Fz5QBIT02M,22123
+cache_dit/cache_factory/cache_adapters/cache_adapter.py,sha256=WYrgV3DKxOxttl-wEKymyKIB1Po0eW73Q2_vOlGEKdQ,24080
 cache_dit/cache_factory/cache_blocks/__init__.py,sha256=cpxzmDcUhbXcReHqaKSnWyEEbIg1H91Pz5hE3z9Xj3k,9984
 cache_dit/cache_factory/cache_blocks/offload_utils.py,sha256=wusgcqaCrwEjvv7Guy-6VXhNOgPPUrBV2sSVuRmGuvo,3513
 cache_dit/cache_factory/cache_blocks/pattern_0_1_2.py,sha256=j4bTafqU5DLQhzP_X5XwOk-QUVLWkGrX-Q6JZvBGHh0,666
@@ -21,14 +21,14 @@ cache_dit/cache_factory/cache_blocks/pattern_3_4_5.py,sha256=2qPnXVZwpQIm2oJ-Yrn
 cache_dit/cache_factory/cache_blocks/pattern_base.py,sha256=9H87qBRpa6UWRkUKXLVO0_9NJgxCVKkFSzaQxM9YPw8,25487
 cache_dit/cache_factory/cache_blocks/pattern_utils.py,sha256=qOxoVTlYPQzPMrR06-7_Ce_lwNg6n5pt1KQrvxzAJhE,3124
 cache_dit/cache_factory/cache_contexts/__init__.py,sha256=7uY8fX9uhpC71VNm1HH4aDIicYn-dD3kRpPQhvc9-EI,853
-cache_dit/cache_factory/cache_contexts/cache_config.py,sha256=WBHU2XVuYSFUSkrrJk8c4952LTeqvgetdkdtch_uSmg,5238
+cache_dit/cache_factory/cache_contexts/cache_config.py,sha256=G0PVWgckDqeyARc72Ne_0lRtO_LftsOeMERRhbh2gCA,5739
 cache_dit/cache_factory/cache_contexts/cache_context.py,sha256=fjZMEHaT1DZvUKnzY41GP0Ep8tmPEZTOsCSvG-5it5k,11269
 cache_dit/cache_factory/cache_contexts/cache_manager.py,sha256=tKtP35GDwZDoxGrQ_Okg_enlh3L-t-iqpytx8TFO_fw,30519
 cache_dit/cache_factory/cache_contexts/context_manager.py,sha256=j5zP_kwZAKla3EXbfr6JKI1vIxZuUEbZVhAPrtC4COw,853
-cache_dit/cache_factory/cache_contexts/prune_config.py,sha256=efFO_tu6AFJxIDp0OxExWKPzOFj95-NSrLGXggimBMA,3407
+cache_dit/cache_factory/cache_contexts/prune_config.py,sha256=WMTh6zb480a0oJiYMlgI0cwCsDSVvs6UjyeJLiXbjP8,3216
 cache_dit/cache_factory/cache_contexts/prune_context.py,sha256=ywiT9P0w_GjIFLowzUDa6jhTohNsSGfTbanZcs9wMic,6359
 cache_dit/cache_factory/cache_contexts/prune_manager.py,sha256=rZG7HD9ATqgH4VZdMq1XtP_h2pokaotFOVx1svB3J7E,5478
-cache_dit/cache_factory/cache_contexts/calibrators/__init__.py,sha256=mzYXO8tbytGpJJ9rpPu20kMoj1Iu_7Ym9tjfzV8rA98,5574
+cache_dit/cache_factory/cache_contexts/calibrators/__init__.py,sha256=QTbyT8xcFEjfIp9xjbnsnlnVCNvMjUc20NjB0W-s95k,6269
 cache_dit/cache_factory/cache_contexts/calibrators/base.py,sha256=mn6ZBkChGpGwN5csrHTUGMoX6BBPvqHXSLbIExiW-EU,748
 cache_dit/cache_factory/cache_contexts/calibrators/foca.py,sha256=nhHGs_hxwW1M942BQDMJb9-9IuHdnOxp774Jrna1bJI,891
 cache_dit/cache_factory/cache_contexts/calibrators/taylorseer.py,sha256=l1QSNaBwtGtpZZFAgCE7Hu8Nf1oL4QAcYu7lShpFGyw,5850
@@ -52,12 +52,17 @@ cache_dit/metrics/image_reward.py,sha256=N8HalJo1T1js0dsNb2V1KRv4kIdcm3nhx7iOXJu
 cache_dit/metrics/inception.py,sha256=pBVe2X6ylLPIXTG4-GWDM9DWnCviMJbJ45R3ulhktR0,12759
 cache_dit/metrics/lpips.py,sha256=hrHrmdM-f2B4TKDs0xLqJO5JFaYcCjq2qNIR8oCrVkc,811
 cache_dit/metrics/metrics.py,sha256=AZbQyoavE-djvyRUZ_EfCIrWSQbiWQFo7n2dhn7XptE,40466
+cache_dit/parallelism/__init__.py,sha256=dheBG5_TZCuwctviMslpAEgB-B3N8F816bE51qsw_fU,210
+cache_dit/parallelism/parallel_backend.py,sha256=js1soTMenLeAyPMsBgdI3gWcdXoqjWgBD-PuFEywMr0,508
+cache_dit/parallelism/parallel_config.py,sha256=bu24sRSzJMmH7FZqzUPTcT6tAzQ20-FAqAEvGV3Q1Fw,1733
+cache_dit/parallelism/parallel_interface.py,sha256=tsiIdHosTmRbeRg0z9q0eMQlx-7vefmSIlc56OWnuMg,2205
+cache_dit/parallelism/backends/parallel_difffusers.py,sha256=i57yZzYc9kGPUjLXTzoAA4j7U8EtVIAJRK1exw30Voo,1939
 cache_dit/quantize/__init__.py,sha256=kWYoMAyZgBXu9BJlZjTQ0dRffW9GqeeY9_iTkXrb70A,59
-cache_dit/quantize/quantize_ao.py,sha256=Pr3u3Qr6qLvFkd8k-_rfcz4Mkjlg36U9BHG2t6Bl-6M,6301
+cache_dit/quantize/quantize_ao.py,sha256=bbEUwsrMp3bMuRw8qJZREIvCHaJRQoZyfMjlu4ImRMI,6315
 cache_dit/quantize/quantize_interface.py,sha256=2s_R7xPSKuJeFpEGeLwRxnq_CqJcBG3a3lzyW5wh-UM,1241
-cache_dit-1.0.4.dist-info/licenses/LICENSE,sha256=Dqb07Ik2dV41s9nIdMUbiRWEfDqo7-dQeRiY7kPO8PE,3769
-cache_dit-1.0.4.dist-info/METADATA,sha256=f04uCgApjgfHTC7Ll9aPejXCFFXbFTpTy-rjd5I_iwM,28376
-cache_dit-1.0.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-cache_dit-1.0.4.dist-info/entry_points.txt,sha256=FX2gysXaZx6NeK1iCLMcIdP8Q4_qikkIHtEmi3oWn8o,65
-cache_dit-1.0.4.dist-info/top_level.txt,sha256=ZJDydonLEhujzz0FOkVbO-BqfzO9d_VqRHmZU-3MOZo,10
-cache_dit-1.0.4.dist-info/RECORD,,
+cache_dit-1.0.6.dist-info/licenses/LICENSE,sha256=Dqb07Ik2dV41s9nIdMUbiRWEfDqo7-dQeRiY7kPO8PE,3769
+cache_dit-1.0.6.dist-info/METADATA,sha256=h_fb2Lf6XGsTofkmLJrsWh67H0LRvry1SDrMeQ9Uf5I,29476
+cache_dit-1.0.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+cache_dit-1.0.6.dist-info/entry_points.txt,sha256=FX2gysXaZx6NeK1iCLMcIdP8Q4_qikkIHtEmi3oWn8o,65
+cache_dit-1.0.6.dist-info/top_level.txt,sha256=ZJDydonLEhujzz0FOkVbO-BqfzO9d_VqRHmZU-3MOZo,10
+cache_dit-1.0.6.dist-info/RECORD,,