cache-dit 0.2.2__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cache_dit-0.2.2 → cache_dit-0.2.4}/PKG-INFO +20 -2
- {cache_dit-0.2.2 → cache_dit-0.2.4}/README.md +19 -1
- {cache_dit-0.2.2 → cache_dit-0.2.4}/bench/bench.py +15 -16
- cache_dit-0.2.4/examples/README.md +57 -0
- cache_dit-0.2.4/examples/run_cogvideox.py +142 -0
- cache_dit-0.2.4/examples/run_flux.py +96 -0
- cache_dit-0.2.4/examples/run_flux_fill.py +100 -0
- cache_dit-0.2.4/examples/run_hunyuan_video.py +145 -0
- cache_dit-0.2.4/examples/run_mochi.py +101 -0
- cache_dit-0.2.4/examples/run_wan.py +134 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/_version.py +2 -2
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dual_block_cache/cache_context.py +225 -51
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/wan.py +2 -2
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dynamic_block_prune/prune_context.py +23 -23
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/first_block_cache/cache_context.py +0 -11
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/taylorseer.py +1 -2
- cache_dit-0.2.4/src/cache_dit/compile/__init__.py +1 -0
- cache_dit-0.2.4/src/cache_dit/compile/utils.py +94 -0
- cache_dit-0.2.4/src/cache_dit/custom_ops/__init__.py +0 -0
- cache_dit-0.2.4/src/cache_dit/custom_ops/triton_taylorseer.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/logger.py +28 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit.egg-info/PKG-INFO +20 -2
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit.egg-info/SOURCES.txt +4 -2
- cache_dit-0.2.2/assets/DBCache.png +0 -0
- cache_dit-0.2.2/assets/cache-dit.png +0 -0
- cache_dit-0.2.2/examples/README.md +0 -45
- cache_dit-0.2.2/examples/run_cogvideox.py +0 -72
- cache_dit-0.2.2/examples/run_flux.py +0 -27
- cache_dit-0.2.2/examples/run_flux_fill.py +0 -32
- cache_dit-0.2.2/examples/run_hunyuan_video.py +0 -75
- cache_dit-0.2.2/examples/run_mochi.py +0 -32
- cache_dit-0.2.2/examples/run_wan.py +0 -63
- {cache_dit-0.2.2 → cache_dit-0.2.4}/.github/workflows/issue.yml +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/.gitignore +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/.pre-commit-config.yaml +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/CONTRIBUTE.md +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/LICENSE +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/MANIFEST.in +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F12B12S4_R0.2_S16.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F12B16S4_R0.08_S6.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F16B16S2_R0.2_S14.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F16B16S4_R0.2_S13.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F1B0S1_R0.08_S11.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F1B0S1_R0.2_S19.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F8B0S2_R0.12_S12.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F8B16S1_R0.2_S18.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F8B8S1_R0.08_S9.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F8B8S1_R0.12_S12.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBCACHE_F8B8S1_R0.15_S15.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.03_P24.0_T19.43s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.04_P34.6_T16.82s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.05_P38.3_T15.95s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.06_P45.2_T14.24s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.07_P52.3_T12.53s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.08_P52.4_T12.52s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.09_P59.2_T10.81s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.12_P59.5_T10.76s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.12_P63.0_T9.90s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.1_P62.8_T9.95s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.2_P59.5_T10.66s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/DBPRUNE_F1B0_R0.3_P63.1_T9.79s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/NONE_R0.08_S0.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/TEXTURE_DBCACHE_F1B0_R0.08.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/TEXTURE_DBCACHE_F8B12_R0.12.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/TEXTURE_DBCACHE_F8B16_R0.2.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/TEXTURE_DBCACHE_F8B20_R0.2.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/TEXTURE_DBCACHE_F8B8_R0.12.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/TEXTURE_NONE_R0.08.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.12_S14_T12.85s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.15_S17_T10.27s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.12_S14_T12.86s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T10.28s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C1_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T8.48s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C1_DBPRUNE_F1B0_R0.03_P24.0_T16.25s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C1_DBPRUNE_F1B0_R0.045_P38.2_T13.41s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C1_DBPRUNE_F1B0_R0.04_P34.6_T14.12s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C1_DBPRUNE_F1B0_R0.055_P45.1_T12.00s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C1_DBPRUNE_F1B0_R0.05_P41.6_T12.70s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C1_DBPRUNE_F1B0_R0.2_P59.5_T8.86s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C1_DBPRUNE_F8B8_R0.08_P23.1_T16.14s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U0_C1_NONE_R0.08_S0_T20.43s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.03_P27.3_T6.62s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.03_P27.3_T6.63s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.045_P38.2_T5.81s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.045_P38.2_T5.82s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.04_P34.6_T6.06s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.04_P34.6_T6.07s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.04_P34.6_T6.08s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.055_P45.1_T5.27s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.055_P45.1_T5.28s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.2_P59.5_T3.95s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_DBPRUNE_F1B0_R0.2_P59.5_T3.96s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_NONE_R0.08_S0_T7.78s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/U4_C1_NONE_R0.08_S0_T7.79s.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/cache-dit-v1.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/dbcache-fnbn-v1.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/dbcache-v1.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/dbprune-v1.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/assets/fbcache-v1.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/bench/.gitignore +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/docs/.gitignore +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/examples/.gitignore +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/examples/data/cup.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/examples/data/cup_mask.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/examples/requirements.txt +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/pyproject.toml +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/pytest.ini +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/requirements.txt +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/setup.cfg +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/setup.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/__init__.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/__init__.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dual_block_cache/__init__.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/__init__.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/cogvideox.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/flux.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/hunyuan_video.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dual_block_cache/diffusers_adapters/mochi.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dynamic_block_prune/__init__.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/__init__.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/cogvideox.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/flux.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/hunyuan_video.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/mochi.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/dynamic_block_prune/diffusers_adapters/wan.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/first_block_cache/__init__.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/__init__.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/cogvideox.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/flux.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/hunyuan_video.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/mochi.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/first_block_cache/diffusers_adapters/wan.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/cache_factory/utils.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit/primitives.py +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit.egg-info/dependency_links.txt +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit.egg-info/requires.txt +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/src/cache_dit.egg-info/top_level.txt +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/tests/.gitignore +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/tests/README.md +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/tests/taylorseer_approximation_order_2.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/tests/taylorseer_approximation_order_4.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/tests/taylorseer_approximation_test.png +0 -0
- {cache_dit-0.2.2 → cache_dit-0.2.4}/tests/test_taylorseer.py +0 -0

{cache_dit-0.2.2 → cache_dit-0.2.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cache_dit
-Version: 0.2.2
+Version: 0.2.4
 Summary: 🤗 CacheDiT: A Training-free and Easy-to-use Cache Acceleration Toolbox for Diffusion Transformers
 Author: DefTruth, vipshop.com, etc.
 Maintainer: DefTruth, vipshop.com, etc

@@ -154,6 +154,7 @@ The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi
 - [🔥Supported Models](#supported)
 - [⚡️Dual Block Cache](#dbcache)
 - [🔥Hybrid TaylorSeer](#taylorseer)
+- [⚡️Hybrid Cache CFG](#cfg)
 - [🎉First Block Cache](#fbcache)
 - [⚡️Dynamic Block Prune](#dbprune)
 - [🎉Context Parallelism](#context-parallelism)
@@ -283,7 +284,11 @@ cache_options = {
     "warmup_steps": 3, # n_derivatives + 1
     "residual_diff_threshold": 0.12,
 }
-```
+```
+
+> [!Important]
+> Please note that if you use TaylorSeer as the calibrator for approximating hidden states, the **Bn** param of DBCache can be set to **0**. In essence, DBCache's Bn also acts as a calibrator, so you can choose either Bn > 0 or TaylorSeer. We recommend the **TaylorSeer** + **DBCache FnB0** configuration scheme.
+
 <div align="center">
 <p align="center">
 <b>DBCache F1B0 + TaylorSeer</b>, L20x1, Steps: 28, <br>"A cat holding a sign that says hello world with complex background"
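For readers skimming this diff, the recommended **TaylorSeer** + **DBCache FnB0** combination from the note above can be sketched as follows. This is an illustrative assumption assembled from the option names that appear in this release, not a snippet taken from the package; the FLUX.1 pipeline choice and the threshold values are placeholders.

```python
# Hedged sketch: TaylorSeer as calibrator with DBCache FnB0 (Bn = 0).
# The pipeline choice and numeric values below are assumptions.
import torch
from diffusers import FluxPipeline
from cache_dit.cache_factory import apply_cache_on_pipe, CacheType

pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

apply_cache_on_pipe(
    pipe,
    cache_type=CacheType.DBCache,
    Fn_compute_blocks=8,       # Fn > 0: first blocks still computed each step
    Bn_compute_blocks=0,       # Bn = 0: TaylorSeer takes over as the calibrator
    residual_diff_threshold=0.12,
    warmup_steps=3,            # n_derivatives + 1
    enable_taylorseer=True,
    enable_encoder_taylorseer=True,
    taylorseer_cache_type="residual",
    taylorseer_kwargs={"n_derivatives": 2},
)
```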
@@ -295,6 +300,19 @@ cache_options = {
 |24.85s|12.85s|12.86s|10.27s|10.28s|8.48s|
 |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.12_S14_T12.85s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.12_S14_T12.86s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.15_S17_T10.27s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T10.28s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T8.48s.png width=105px>|
 
+## ⚡️Hybrid Cache CFG
+
+<div id="cfg"></div>
+
+CacheDiT supports caching for CFG (classifier-free guidance). For models that fuse CFG and non-CFG into a single forward step, or that do not use CFG at all, set the `do_separate_classifier_free_guidance` param to False; otherwise, set it to True. For example, Wan 2.1: True; FLUX.1, HunyuanVideo, CogVideoX, Mochi: False.
+
+```python
+cache_options = {
+    "do_separate_classifier_free_guidance": True,  # Wan 2.1
+    "cfg_compute_first": False,
+}
+```
+
 ## 🎉FBCache: First Block Cache
 
 <div id="fbcache"></div>
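As a usage illustration of the separate-CFG options above, a minimal sketch follows; it assumes `pipe` is an already-loaded Wan 2.1 diffusers pipeline, and the Fn/Bn/threshold values are illustrative rather than values taken from this diff.

```python
# Hedged sketch: enabling separate CFG caching for a Wan 2.1 pipeline.
# Assumes `pipe` is an already-constructed Wan 2.1 diffusers pipeline;
# Fn/Bn and the threshold are illustrative values only.
from cache_dit.cache_factory import apply_cache_on_pipe, CacheType

apply_cache_on_pipe(
    pipe,
    cache_type=CacheType.DBCache,
    Fn_compute_blocks=8,
    Bn_compute_blocks=0,
    residual_diff_threshold=0.08,
    # Wan 2.1 runs the CFG and non-CFG branches as separate forward steps,
    # so each branch keeps its own cache here.
    do_separate_classifier_free_guidance=True,
    cfg_compute_first=False,
)
```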

{cache_dit-0.2.2 → cache_dit-0.2.4}/README.md

@@ -119,6 +119,7 @@ The **CacheDiT** codebase is adapted from [FBCache](https://github.com/chengzeyi
 - [🔥Supported Models](#supported)
 - [⚡️Dual Block Cache](#dbcache)
 - [🔥Hybrid TaylorSeer](#taylorseer)
+- [⚡️Hybrid Cache CFG](#cfg)
 - [🎉First Block Cache](#fbcache)
 - [⚡️Dynamic Block Prune](#dbprune)
 - [🎉Context Parallelism](#context-parallelism)

@@ -248,7 +249,11 @@ cache_options = {
     "warmup_steps": 3, # n_derivatives + 1
     "residual_diff_threshold": 0.12,
 }
-```
+```
+
+> [!Important]
+> Please note that if you use TaylorSeer as the calibrator for approximating hidden states, the **Bn** param of DBCache can be set to **0**. In essence, DBCache's Bn also acts as a calibrator, so you can choose either Bn > 0 or TaylorSeer. We recommend the **TaylorSeer** + **DBCache FnB0** configuration scheme.
+
 <div align="center">
 <p align="center">
 <b>DBCache F1B0 + TaylorSeer</b>, L20x1, Steps: 28, <br>"A cat holding a sign that says hello world with complex background"

@@ -260,6 +265,19 @@ cache_options = {
 |24.85s|12.85s|12.86s|10.27s|10.28s|8.48s|
 |<img src=https://github.com/vipshop/cache-dit/raw/main/assets/NONE_R0.08_S0.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.12_S14_T12.85s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.12_S14_T12.86s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T0ET0_R0.15_S17_T10.27s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C0_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T10.28s.png width=105px>|<img src=https://github.com/vipshop/cache-dit/raw/main/assets/U0_C1_DBCACHE_F1B0S1W0T1ET1_R0.15_S17_T8.48s.png width=105px>|
 
+## ⚡️Hybrid Cache CFG
+
+<div id="cfg"></div>
+
+CacheDiT supports caching for CFG (classifier-free guidance). For models that fuse CFG and non-CFG into a single forward step, or that do not use CFG at all, set the `do_separate_classifier_free_guidance` param to False; otherwise, set it to True. For example, Wan 2.1: True; FLUX.1, HunyuanVideo, CogVideoX, Mochi: False.
+
+```python
+cache_options = {
+    "do_separate_classifier_free_guidance": True,  # Wan 2.1
+    "cfg_compute_first": False,
+}
+```
+
 ## 🎉FBCache: First Block Cache
 
 <div id="fbcache"></div>

{cache_dit-0.2.2 → cache_dit-0.2.4}/bench/bench.py

@@ -21,9 +21,6 @@ def get_args() -> argparse.ArgumentParser:
     parser.add_argument("--alter", action="store_true", default=False)
     parser.add_argument("--taylorseer", action="store_true", default=False)
     parser.add_argument("--taylorseer-order", "--order", type=int, default=2)
-    parser.add_argument(
-        "--encoder-taylorseer", action="store_true", default=False
-    )
     parser.add_argument("--l1-diff", action="store_true", default=False)
     parser.add_argument("--rdt", type=float, default=0.08)
     parser.add_argument("--Fn-compute-blocks", "--Fn", type=int, default=1)

@@ -32,9 +29,15 @@ def get_args() -> argparse.ArgumentParser:
     parser.add_argument("--warmup-steps", type=int, default=0)
     parser.add_argument("--max-cached-steps", type=int, default=-1)
     parser.add_argument("--max-pruned-steps", type=int, default=-1)
+    parser.add_argument("--gen-device", type=str, default="cuda")
     parser.add_argument("--ulysses", type=int, default=None)
     parser.add_argument("--compile", action="store_true", default=False)
-    parser.add_argument(
+    parser.add_argument(
+        "--force-compile-all",
+        "--compile-all",
+        action="store_true",
+        default=False,
+    )
     return parser.parse_args()
 
 

@@ -52,12 +55,7 @@ def get_cache_options(cache_type: CacheType, args: argparse.Namespace):
     elif cache_type == CacheType.DBCache:
         cache_options = {
             "cache_type": CacheType.DBCache,
-            "warmup_steps": (
-                # TaylorSeer needs at least order + 1 warmup steps
-                max(args.warmup_steps, args.taylorseer_order + 1)
-                if (args.taylorseer or args.encoder_taylorseer)
-                else args.warmup_steps
-            ),
+            "warmup_steps": args.warmup_steps,
             "max_cached_steps": args.max_cached_steps,  # -1 means no limit
             # Fn=1, Bn=0, means FB Cache, otherwise, Dual Block Cache
             "Fn_compute_blocks": args.Fn_compute_blocks,  # Fn, F8, etc.

@@ -81,7 +79,7 @@ def get_cache_options(cache_type: CacheType, args: argparse.Namespace):
             "important_condition_threshold": 0.00,
             # TaylorSeer options
             "enable_taylorseer": args.taylorseer,
-            "enable_encoder_taylorseer": args.encoder_taylorseer,
+            "enable_encoder_taylorseer": args.taylorseer,
             # TaylorSeer cache type can be hidden_states or residual
             "taylorseer_cache_type": "residual",
             "taylorseer_kwargs": {

@@ -90,7 +88,7 @@ def get_cache_options(cache_type: CacheType, args: argparse.Namespace):
         }
     elif cache_type == CacheType.DBPrune:
         assert (
-            args.taylorseer is False
+            args.taylorseer is False
         ), "DBPrune does not support TaylorSeer yet."
         cache_options = {
             "cache_type": CacheType.DBPrune,

@@ -122,7 +120,6 @@ def get_cache_options(cache_type: CacheType, args: argparse.Namespace):
             f"{cache_type_str}_F{args.Fn_compute_blocks}"
             f"B{args.Bn_compute_blocks}S{args.Bn_steps}"
             f"W{args.warmup_steps}T{int(args.taylorseer)}"
-            f"ET{int(args.encoder_taylorseer)}"
             f"O{args.taylorseer_order}"
         )
     elif cache_type == CacheType.DBPrune:

@@ -132,7 +129,7 @@ def get_cache_options(cache_type: CacheType, args: argparse.Namespace):
         )
     elif cache_type == CacheType.FBCache:
         cache_type_str = (
-            f"{cache_type_str}_W{args.warmup_steps}"
+            f"{cache_type_str}_W{args.warmup_steps}T{int(args.taylorseer)}"
         )
     return cache_options, cache_type_str
 

@@ -201,8 +198,10 @@ def main():
         "Only compile transformer blocks not the whole model "
         "for FluxTransformer2DModel to keep higher precision."
     )
-    if
-
+    if (
+        args.taylorseer_order <= 2
+        or not args.taylorseer
+        or args.force_compile_all
     ):
         # NOTE: Seems like compiling the whole transformer
         # will cause precision issues while using TaylorSeer
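The note above says the benchmark compiles only the transformer blocks, not the whole FluxTransformer2DModel, unless --force-compile-all is passed, to avoid precision issues with higher-order TaylorSeer. A rough, hypothetical sketch of what block-wise compilation can look like is shown below; it is not the code of the new `cache_dit/compile/utils.py` module, which this diff section does not show, and the helper name is invented for illustration.

```python
# Hypothetical illustration of block-wise compilation (assumed helper, not
# part of cache-dit): compile each transformer block individually instead of
# wrapping the whole model in torch.compile.
import torch


def compile_blocks_only(transformer: torch.nn.Module) -> None:
    # FluxTransformer2DModel exposes `transformer_blocks` and
    # `single_transformer_blocks`; other DiTs may name these differently.
    for name in ("transformer_blocks", "single_transformer_blocks"):
        blocks = getattr(transformer, name, None)
        if blocks is None:
            continue
        for i, block in enumerate(blocks):
            blocks[i] = torch.compile(block)


# Usage (sketch):
# compile_blocks_only(pipe.transformer)               # blocks only (default)
# pipe.transformer = torch.compile(pipe.transformer)  # whole model (--compile-all)
```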

cache_dit-0.2.4/examples/README.md (new file)

@@ -0,0 +1,57 @@
+# Examples for CacheDiT
+
+## Install requirements
+
+```bash
+pip3 install -r requirements.txt
+```
+
+## Run examples
+
+- FLUX.1-dev
+
+```bash
+python3 run_flux.py # baseline
+python3 run_flux.py --cache --Fn 8 --Bn 8
+python3 run_flux.py --cache --Fn 8 --Bn 0 --taylorseer
+```
+
+- FLUX.1-Fill-dev
+
+```bash
+python3 run_flux_fill.py # baseline
+python3 run_flux_fill.py --cache --Fn 8 --Bn 8
+python3 run_flux_fill.py --cache --Fn 8 --Bn 0 --taylorseer
+```
+
+- CogVideoX
+
+```bash
+python3 run_cogvideox.py # baseline
+python3 run_cogvideox.py --cache --Fn 8 --Bn 8
+python3 run_cogvideox.py --cache --Fn 8 --Bn 0 --taylorseer
+```
+
+- Wan2.1
+
+```bash
+python3 run_wan.py # baseline
+python3 run_wan.py --cache --Fn 8 --Bn 8
+python3 run_wan.py --cache --Fn 8 --Bn 0 --taylorseer
+```
+
+- Mochi
+
+```bash
+python3 run_mochi.py # baseline
+python3 run_mochi.py --cache --Fn 8 --Bn 8
+python3 run_mochi.py --cache --Fn 8 --Bn 0 --taylorseer
+```
+
+- HunyuanVideo
+
+```bash
+python3 run_hunyuan_video.py # baseline
+python3 run_hunyuan_video.py --cache --Fn 8 --Bn 8
+python3 run_hunyuan_video.py --cache --Fn 8 --Bn 0 --taylorseer
+```

cache_dit-0.2.4/examples/run_cogvideox.py (new file)

@@ -0,0 +1,142 @@
+import os
+import time
+import torch
+import argparse
+from diffusers.utils import export_to_video
+from diffusers import CogVideoXPipeline, AutoencoderKLCogVideoX
+from cache_dit.cache_factory import apply_cache_on_pipe, CacheType
+
+
+def get_args() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser()
+    # General arguments
+    parser.add_argument("--cache", action="store_true", default=False)
+    parser.add_argument("--taylorseer", action="store_true", default=False)
+    parser.add_argument("--taylorseer-order", "--order", type=int, default=2)
+    parser.add_argument("--Fn-compute-blocks", "--Fn", type=int, default=1)
+    parser.add_argument("--Bn-compute-blocks", "--Bn", type=int, default=0)
+    parser.add_argument("--rdt", type=float, default=0.08)
+    parser.add_argument("--warmup-steps", type=int, default=0)
+    return parser.parse_args()
+
+
+args = get_args()
+print(args)
+
+
+model_id = os.environ.get("COGVIDEOX_DIR", "THUDM/CogVideoX-5b")
+
+
+def is_cogvideox_1_5():
+    return "CogVideoX1.5" in model_id or "THUDM/CogVideoX1.5" in model_id
+
+
+def get_gpu_memory_in_gib():
+    if not torch.cuda.is_available():
+        return 0
+
+    try:
+        total_memory_bytes = torch.cuda.get_device_properties(
+            torch.cuda.current_device(),
+        ).total_memory
+        total_memory_gib = total_memory_bytes / (1024**3)
+        return int(total_memory_gib)
+    except Exception:
+        return 0
+
+
+pipe = CogVideoXPipeline.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+).to("cuda")
+
+
+if args.cache:
+    cache_options = {
+        "cache_type": CacheType.DBCache,
+        "warmup_steps": args.warmup_steps,
+        "max_cached_steps": -1,  # -1 means no limit
+        # Fn=1, Bn=0, means FB Cache, otherwise, Dual Block Cache
+        "Fn_compute_blocks": args.Fn_compute_blocks,  # Fn, F8, etc.
+        "Bn_compute_blocks": args.Bn_compute_blocks,  # Bn, B16, etc.
+        "residual_diff_threshold": args.rdt,
+        # relative token diff threshold, default is 0.0
+        "important_condition_threshold": 0.05,
+        # CFG: classifier free guidance or not
+        # CogVideoX fuses CFG and non-CFG into a single forward step,
+        # so we set do_separate_classifier_free_guidance to False.
+        "do_separate_classifier_free_guidance": False,
+        "cfg_compute_first": False,
+        "enable_taylorseer": args.taylorseer,
+        "enable_encoder_taylorseer": args.taylorseer,
+        # TaylorSeer cache type can be hidden_states or residual
+        "taylorseer_cache_type": "residual",
+        "taylorseer_kwargs": {
+            "n_derivatives": args.taylorseer_order,
+        },
+    }
+    cache_type_str = "DBCACHE"
+    cache_type_str = (
+        f"{cache_type_str}_F{args.Fn_compute_blocks}"
+        f"B{args.Bn_compute_blocks}W{args.warmup_steps}"
+        f"T{int(args.taylorseer)}O{args.taylorseer_order}"
+    )
+    print(f"cache options:\n{cache_options}")
+
+    apply_cache_on_pipe(pipe, **cache_options)
+else:
+    cache_type_str = "NONE"
+
+
+pipe.enable_model_cpu_offload()
+assert isinstance(pipe.vae, AutoencoderKLCogVideoX)  # enable type check for IDE
+pipe.vae.enable_slicing()
+pipe.vae.enable_tiling()
+
+start = time.time()
+prompt = (
+    "A panda, dressed in a small, red jacket and a tiny hat, "
+    "sits on a wooden stool in a serene bamboo forest. The "
+    "panda's fluffy paws strum a miniature acoustic guitar, "
+    "producing soft, melodic tunes. Nearby, a few other pandas "
+    "gather, watching curiously and some clapping in rhythm. "
+    "Sunlight filters through the tall bamboo, casting a gentle "
+    "glow on the scene. The panda's face is expressive, showing "
+    "concentration and joy as it plays. The background includes "
+    "a small, flowing stream and vibrant green foliage, enhancing "
+    "the peaceful and magical atmosphere of this unique musical "
+    "performance."
+)
+video = pipe(
+    prompt=prompt,
+    num_videos_per_prompt=1,
+    num_inference_steps=50,
+    num_frames=(
+        # Avoid OOM for CogVideoX1.5 model on 48GB GPU
+        16
+        if (is_cogvideox_1_5() and get_gpu_memory_in_gib() < 48)
+        else 49
+    ),
+    guidance_scale=6,
+    generator=torch.Generator("cpu").manual_seed(0),
+).frames[0]
+end = time.time()
+
+if hasattr(pipe.transformer, "_cached_steps"):
+    cached_steps = pipe.transformer._cached_steps
+    residual_diffs = pipe.transformer._residual_diffs
+    print(f"Cache Steps: {len(cached_steps)}, {cached_steps}")
+    print(f"Residual Diffs: {len(residual_diffs)}, {residual_diffs}")
+if hasattr(pipe.transformer, "_cfg_cached_steps"):
+    cfg_cached_steps = pipe.transformer._cfg_cached_steps
+    cfg_residual_diffs = pipe.transformer._cfg_residual_diffs
+    print(f"CFG Cache Steps: {len(cfg_cached_steps)}, {cfg_cached_steps} ")
+    print(
+        f"CFG Residual Diffs: {len(cfg_residual_diffs)}, {cfg_residual_diffs}"
+    )
+
+time_cost = end - start
+save_path = f"cogvideox.{cache_type_str}.mp4"
+print(f"Time cost: {time_cost:.2f}s")
+print(f"Saving video to {save_path}")
+export_to_video(video, save_path, fps=8)

cache_dit-0.2.4/examples/run_flux.py (new file)

@@ -0,0 +1,96 @@
+import os
+import time
+import torch
+import argparse
+from diffusers import FluxPipeline
+from cache_dit.cache_factory import apply_cache_on_pipe, CacheType
+
+
+def get_args() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser()
+    # General arguments
+    parser.add_argument("--cache", action="store_true", default=False)
+    parser.add_argument("--taylorseer", action="store_true", default=False)
+    parser.add_argument("--taylorseer-order", "--order", type=int, default=2)
+    parser.add_argument("--Fn-compute-blocks", "--Fn", type=int, default=1)
+    parser.add_argument("--Bn-compute-blocks", "--Bn", type=int, default=0)
+    parser.add_argument("--rdt", type=float, default=0.08)
+    parser.add_argument("--warmup-steps", type=int, default=0)
+    return parser.parse_args()
+
+
+args = get_args()
+print(args)
+
+
+pipe = FluxPipeline.from_pretrained(
+    os.environ.get(
+        "FLUX_DIR",
+        "black-forest-labs/FLUX.1-dev",
+    ),
+    torch_dtype=torch.bfloat16,
+).to("cuda")
+
+
+if args.cache:
+    cache_options = {
+        "cache_type": CacheType.DBCache,
+        "warmup_steps": args.warmup_steps,
+        "max_cached_steps": -1,  # -1 means no limit
+        # Fn=1, Bn=0, means FB Cache, otherwise, Dual Block Cache
+        "Fn_compute_blocks": args.Fn_compute_blocks,  # Fn, F8, etc.
+        "Bn_compute_blocks": args.Bn_compute_blocks,  # Bn, B16, etc.
+        "residual_diff_threshold": args.rdt,
+        # CFG: classifier free guidance or not
+        # FLUX.1-dev does not use CFG, so we set
+        # do_separate_classifier_free_guidance to False.
+        "do_separate_classifier_free_guidance": False,
+        "cfg_compute_first": False,
+        "enable_taylorseer": args.taylorseer,
+        "enable_encoder_taylorseer": args.taylorseer,
+        # TaylorSeer cache type can be hidden_states or residual
+        "taylorseer_cache_type": "residual",
+        "taylorseer_kwargs": {
+            "n_derivatives": args.taylorseer_order,
+        },
+    }
+    cache_type_str = "DBCACHE"
+    cache_type_str = (
+        f"{cache_type_str}_F{args.Fn_compute_blocks}"
+        f"B{args.Bn_compute_blocks}W{args.warmup_steps}"
+        f"T{int(args.taylorseer)}O{args.taylorseer_order}"
+    )
+    print(f"cache options:\n{cache_options}")
+
+    apply_cache_on_pipe(pipe, **cache_options)
+else:
+    cache_type_str = "NONE"
+
+
+start = time.time()
+image = pipe(
+    "A cat holding a sign that says hello world",
+    num_inference_steps=28,
+    generator=torch.Generator("cpu").manual_seed(0),
+).images[0]
+
+end = time.time()
+
+if hasattr(pipe.transformer, "_cached_steps"):
+    cached_steps = pipe.transformer._cached_steps
+    residual_diffs = pipe.transformer._residual_diffs
+    print(f"Cache Steps: {len(cached_steps)}, {cached_steps}")
+    print(f"Residual Diffs: {len(residual_diffs)}, {residual_diffs}")
+if hasattr(pipe.transformer, "_cfg_cached_steps"):
+    cfg_cached_steps = pipe.transformer._cfg_cached_steps
+    cfg_residual_diffs = pipe.transformer._cfg_residual_diffs
+    print(f"CFG Cache Steps: {len(cfg_cached_steps)}, {cfg_cached_steps} ")
+    print(
+        f"CFG Residual Diffs: {len(cfg_residual_diffs)}, {cfg_residual_diffs}"
+    )
+
+time_cost = end - start
+save_path = f"flux.{cache_type_str}.png"
+print(f"Time cost: {time_cost:.2f}s")
+print(f"Saving image to {save_path}")
+image.save(save_path)

cache_dit-0.2.4/examples/run_flux_fill.py (new file)

@@ -0,0 +1,100 @@
+import os
+import time
+import torch
+import argparse
+from diffusers import FluxFillPipeline
+from diffusers.utils import load_image
+from cache_dit.cache_factory import apply_cache_on_pipe, CacheType
+
+
+def get_args() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser()
+    # General arguments
+    parser.add_argument("--cache", action="store_true", default=False)
+    parser.add_argument("--taylorseer", action="store_true", default=False)
+    parser.add_argument("--taylorseer-order", "--order", type=int, default=2)
+    parser.add_argument("--Fn-compute-blocks", "--Fn", type=int, default=1)
+    parser.add_argument("--Bn-compute-blocks", "--Bn", type=int, default=0)
+    parser.add_argument("--rdt", type=float, default=0.08)
+    parser.add_argument("--warmup-steps", type=int, default=0)
+    return parser.parse_args()
+
+
+args = get_args()
+print(args)
+
+
+pipe = FluxFillPipeline.from_pretrained(
+    os.environ.get(
+        "FLUX_FILL_DIR",
+        "black-forest-labs/FLUX.1-Fill-dev",
+    ),
+    torch_dtype=torch.bfloat16,
+).to("cuda")
+
+
+if args.cache:
+    cache_options = {
+        "cache_type": CacheType.DBCache,
+        "warmup_steps": args.warmup_steps,
+        "max_cached_steps": -1,  # -1 means no limit
+        # Fn=1, Bn=0, means FB Cache, otherwise, Dual Block Cache
+        "Fn_compute_blocks": args.Fn_compute_blocks,  # Fn, F8, etc.
+        "Bn_compute_blocks": args.Bn_compute_blocks,  # Bn, B16, etc.
+        "residual_diff_threshold": args.rdt,
+        # CFG: classifier free guidance or not
+        # FLUX.1-dev does not use CFG, so we set
+        # do_separate_classifier_free_guidance to False.
+        "do_separate_classifier_free_guidance": False,
+        "cfg_compute_first": False,
+        "enable_taylorseer": args.taylorseer,
+        "enable_encoder_taylorseer": args.taylorseer,
+        # TaylorSeer cache type can be hidden_states or residual
+        "taylorseer_cache_type": "residual",
+        "taylorseer_kwargs": {
+            "n_derivatives": args.taylorseer_order,
+        },
+    }
+    cache_type_str = "DBCACHE"
+    cache_type_str = (
+        f"{cache_type_str}_F{args.Fn_compute_blocks}"
+        f"B{args.Bn_compute_blocks}W{args.warmup_steps}"
+        f"T{int(args.taylorseer)}O{args.taylorseer_order}"
+    )
+    print(f"cache options:\n{cache_options}")
+
+    apply_cache_on_pipe(pipe, **cache_options)
+else:
+    cache_type_str = "NONE"
+
+start = time.time()
+image = pipe(
+    prompt="a white paper cup",
+    image=load_image("data/cup.png"),
+    mask_image=load_image("data/cup_mask.png"),
+    guidance_scale=30,
+    num_inference_steps=28,
+    max_sequence_length=512,
+    generator=torch.Generator("cpu").manual_seed(0),
+).images[0]
+
+end = time.time()
+
+if hasattr(pipe.transformer, "_cached_steps"):
+    cached_steps = pipe.transformer._cached_steps
+    residual_diffs = pipe.transformer._residual_diffs
+    print(f"Cache Steps: {len(cached_steps)}, {cached_steps}")
+    print(f"Residual Diffs: {len(residual_diffs)}, {residual_diffs}")
+if hasattr(pipe.transformer, "_cfg_cached_steps"):
+    cfg_cached_steps = pipe.transformer._cfg_cached_steps
+    cfg_residual_diffs = pipe.transformer._cfg_residual_diffs
+    print(f"CFG Cache Steps: {len(cfg_cached_steps)}, {cfg_cached_steps} ")
+    print(
+        f"CFG Residual Diffs: {len(cfg_residual_diffs)}, {cfg_residual_diffs}"
+    )
+
+time_cost = end - start
+save_path = f"flux-fill.{cache_type_str}.png"
+print(f"Time cost: {time_cost:.2f}s")
+print(f"Saving image to {save_path}")
+image.save(save_path)