cache-dit 0.2.36__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cache-dit might be problematic. Click here for more details.

cache_dit/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.2.36'
32
- __version_tuple__ = version_tuple = (0, 2, 36)
31
+ __version__ = version = '0.3.0'
32
+ __version_tuple__ = version_tuple = (0, 3, 0)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -1,4 +1,5 @@
1
1
  import os
2
+ import re
2
3
  import cv2
3
4
  import pathlib
4
5
  import argparse
@@ -160,25 +161,30 @@ def compute_dir_metric(
160
161
  # File
161
162
  if not os.path.isdir(image_true_dir) or not os.path.isdir(image_test_dir):
162
163
  return compute_file_func(image_true_dir, image_test_dir), 1
164
+
163
165
  # Dir
166
+ # compute dir metric
167
+ def natural_sort_key(filename):
168
+ match = re.search(r"(\d+)\D*$", filename)
169
+ return int(match.group(1)) if match else filename
170
+
164
171
  image_true_dir: pathlib.Path = pathlib.Path(image_true_dir)
165
- image_true_files = sorted(
166
- [
167
- file
168
- for ext in _IMAGE_EXTENSIONS
169
- for file in image_true_dir.rglob("*.{}".format(ext))
170
- ]
171
- )
172
- image_test_dir: pathlib.Path = pathlib.Path(image_test_dir)
173
- image_test_files = sorted(
174
- [
175
- file
176
- for ext in _IMAGE_EXTENSIONS
177
- for file in image_test_dir.rglob("*.{}".format(ext))
178
- ]
179
- )
172
+ image_true_files = [
173
+ file
174
+ for ext in _IMAGE_EXTENSIONS
175
+ for file in image_true_dir.rglob("*.{}".format(ext))
176
+ ]
180
177
  image_true_files = [file.as_posix() for file in image_true_files]
178
+ image_true_files = sorted(image_true_files, key=natural_sort_key)
179
+
180
+ image_test_dir: pathlib.Path = pathlib.Path(image_test_dir)
181
+ image_test_files = [
182
+ file
183
+ for ext in _IMAGE_EXTENSIONS
184
+ for file in image_test_dir.rglob("*.{}".format(ext))
185
+ ]
181
186
  image_test_files = [file.as_posix() for file in image_test_files]
187
+ image_test_files = sorted(image_test_files, key=natural_sort_key)
182
188
 
183
189
  # select valid files
184
190
  image_true_files_selected = []
@@ -192,6 +198,7 @@ def compute_dir_metric(
192
198
  ):
193
199
  image_true_files_selected.append(selected_image_true)
194
200
  image_test_files_selected.append(selected_image_test)
201
+
195
202
  image_true_files = image_true_files_selected.copy()
196
203
  image_test_files = image_test_files_selected.copy()
197
204
  if len(image_true_files) == 0:
@@ -206,20 +213,22 @@ def compute_dir_metric(
206
213
 
207
214
  total_metric = 0.0
208
215
  valid_files = 0
216
+ total_files = 0
209
217
  for image_true, image_test in tqdm(
210
218
  zip(image_true_files, image_test_files),
211
219
  total=len(image_true_files),
212
220
  disable=DISABLE_VERBOSE,
213
221
  ):
214
222
  metric = compute_file_func(image_true, image_test)
215
- if metric != float("inf"):
223
+ if metric != float("inf"): # means no cache apply to image_test
216
224
  total_metric += metric
217
225
  valid_files += 1
226
+ total_files += 1
218
227
 
219
228
  if valid_files > 0:
220
229
  average_metric = total_metric / valid_files
221
230
  logger.debug(f"Average: {average_metric:.2f}")
222
- return average_metric, valid_files
231
+ return average_metric, total_files
223
232
  else:
224
233
  logger.debug("No valid files to compare")
225
234
  return None, None
@@ -1217,6 +1226,9 @@ def entrypoint():
1217
1226
 
1218
1227
  if args.gen_markdown_table:
1219
1228
  table = _format_table(format_strs, metric)
1229
+ table = table.replace("Latency(s)(↑)", "SpeedUp(↑)")
1230
+ table = table.replace("TFLOPs(↑)", "SpeedUp(↑)")
1231
+ table = table.replace("FLOPs(↑)", "SpeedUp(↑)")
1220
1232
  print("-" * format_len)
1221
1233
  print(f"{table}")
1222
1234
  print("-" * format_len)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cache_dit
3
- Version: 0.2.36
3
+ Version: 0.3.0
4
4
  Summary: 🤗 A Unified and Training-free Cache Acceleration Toolbox for Diffusion Transformers
5
5
  Author: DefTruth, vipshop.com, etc.
6
6
  Maintainer: DefTruth, vipshop.com, etc
@@ -58,11 +58,12 @@ Dynamic: requires-python
58
58
  <img src=https://img.shields.io/badge/PyPI-pass-brightgreen.svg >
59
59
  <img src=https://static.pepy.tech/badge/cache-dit >
60
60
  <img src=https://img.shields.io/badge/Python-3.10|3.11|3.12-9cf.svg >
61
- <img src=https://img.shields.io/badge/Release-v0.2-brightgreen.svg >
61
+ <img src=https://img.shields.io/badge/Release-v0.3-brightgreen.svg >
62
62
  </div>
63
63
  <p align="center">
64
64
  <b><a href="#unified">📚Unified Cache APIs</a></b> | <a href="#forward-pattern-matching">📚Forward Pattern Matching</a> | <a href="#automatic-block-adapter">📚Automatic Block Adapter</a><br>
65
- <a href="#hybird-forward-pattern">📚Hybrid Forward Pattern</a> | <a href="#dbcache">📚DBCache</a> | <a href="#taylorseer">📚Hybrid TaylorSeer</a> | <a href="#cfg">📚Cache CFG</a>
65
+ <a href="#hybird-forward-pattern">📚Hybrid Forward Pattern</a> | <a href="#dbcache">📚DBCache</a> | <a href="#taylorseer">📚Hybrid TaylorSeer</a> | <a href="#cfg">📚Cache CFG</a><br>
66
+ <a href="#benchmarks">📚Text2Image DrawBench</a> | <a href="#benchmarks">📚Text2Image Distillation DrawBench</a>
66
67
  </p>
67
68
  <p align="center">
68
69
  🎉Now, <b>cache-dit</b> covers <b>most</b> mainstream Diffusers' <b>DiT</b> Pipelines🎉<br>
@@ -275,45 +276,88 @@ Currently, **cache-dit** library supports almost **Any** Diffusion Transformers
275
276
 
276
277
  <div id="benchmarks"></div>
277
278
 
278
- Take FLUX.1-dev as an example. Here, only the results of some precision and performance benchmarks are presented. The test dataset is DrawBench. For a complete benchmark, please refer to [benchmarks](./bench/). **Device**: NVIDIA L20. **F**: Fn_compute_blocks, **B**: Bn_compute_blocks.
279
+ cache-dit will support more mainstream cache acceleration algorithms in the future. More benchmarks will be released; please stay tuned for updates. Only some of the precision and performance benchmark results are presented here. The test dataset is **DrawBench**. For the complete benchmarks, please refer to [📚Benchmarks](./bench/).
279
280
 
280
- | Config | Clip Score(↑) | ImageReward(↑) | PSNR(↑) | TFLOPs(↑) | SpeedUp(↑) |
281
+ ### 📚Text2Image DrawBench: FLUX.1-dev
282
+
283
+ Comparisons between different FnBn compute block configurations show that **more compute blocks result in higher precision**. For example, the F8B0_W8MC0 configuration achieves the best Clip Score (33.007) and ImageReward (1.0333). **Device**: NVIDIA L20. **F**: Fn_compute_blocks, **B**: Bn_compute_blocks, 50 steps.
284
+
285
+
286
+ | Config | Clip Score(↑) | ImageReward(↑) | PSNR(↑) | TFLOPs(↓) | SpeedUp(↑) |
281
287
  | --- | --- | --- | --- | --- | --- |
282
- | [**FLUX.1**-dev]: 50 steps | 32.9217 | 1.0412 | INF | 3726.87 | 1.00 |
283
- | F8B0_W8MC0_R0.08 | 33.0070 | 1.0333 | 35.2008 | 2162.19 | 1.72x |
288
+ | [**FLUX.1**-dev]: 50 steps | 32.9217 | 1.0412 | INF | 3726.87 | 1.00x |
284
289
  | F8B0_W4MC0_R0.08 | 32.9871 | 1.0370 | 33.8317 | 2064.81 | 1.80x |
285
- | F4B0_W4MC2_R0.12 | 32.9718 | 1.0301 | 31.9394 | 1678.98 | 2.22x |
286
- | F8B0_W8MC3_R0.12 | 32.9613 | 1.0270 | 34.2834 | 1977.69 | 1.88x |
287
290
  | F8B0_W4MC2_R0.12 | 32.9535 | 1.0185 | 32.7346 | 1935.73 | 1.93x |
288
- | F8B0_W8MC2_R0.12 | 32.9302 | 1.0227 | 34.7449 | 2072.18 | 1.80x |
289
291
  | F8B0_W4MC3_R0.12 | 32.9234 | 1.0085 | 32.5385 | 1816.58 | 2.05x |
290
- | F8B0_W8MC4_R0.12 | 32.9041 | 1.0140 | 33.9466 | 1897.61 | 1.96x |
291
292
  | F4B0_W4MC3_R0.12 | 32.8981 | 1.0130 | 31.8031 | 1507.83 | 2.47x |
292
- | F4B0_W4MC0_R0.08 | 32.8544 | 1.0065 | 32.3555 | 1654.72 | 2.25x |
293
- | F8B0_W4MC4_R0.12 | 32.8443 | 1.0102 | 32.4231 | 1753.48 | 2.13x |
294
293
  | F4B0_W4MC4_R0.12 | 32.8384 | 1.0065 | 31.5292 | 1400.08 | 2.66x |
295
- | F1B0_W4MC4_R0.12 | 32.8291 | 1.0181 | 32.9462 | 1401.61 | 2.66x |
296
- | F1B0_W4MC3_R0.12 | 32.8236 | 1.0166 | 33.0037 | 1457.62 | 2.56x |
297
294
 
298
- The comparison between DBCache and algorithms such as Δ-DiT, Chipmunk, FORA, DuCa, TaylorSeer and FoCa is as follows. Now, in the comparison with a speedup ratio less than **3x**, cache-dit achieved the best accuracy. Please check [📚How to Reproduce?](./bench/) for more details.
295
+ The comparison between **cache-dit: DBCache** and algorithms such as Δ-DiT, Chipmunk, FORA, DuCa, TaylorSeer and FoCa is as follows. Among the methods with a speedup ratio below **3x**, cache-dit currently achieves the best accuracy. Please check [📚How to Reproduce?](./bench/) for more details.
299
296
 
300
297
  | Method | TFLOPs(↓) | SpeedUp(↑) | ImageReward(↑) | Clip Score(↑) |
301
298
  | --- | --- | --- | --- | --- |
302
299
  | [**FLUX.1**-dev]: 50 steps | 3726.87 | 1.00× | 0.9898 | 32.404 |
303
300
  | [**FLUX.1**-dev]: 60% steps | 2231.70 | 1.67× | 0.9663 | 32.312 |
304
- | Δ-DiT (N=2) | 2480.01 | 1.50× | 0.9444 | 32.273 |
305
- | Δ-DiT (N=3) | 1686.76 | 2.21× | 0.8721 | 32.102 |
301
+ | Δ-DiT(N=2) | 2480.01 | 1.50× | 0.9444 | 32.273 |
302
+ | Δ-DiT(N=3) | 1686.76 | 2.21× | 0.8721 | 32.102 |
306
303
  | [**FLUX.1**-dev]: 34% steps | 1264.63 | 3.13× | 0.9453 | 32.114 |
307
304
  | Chipmunk | 1505.87 | 2.47× | 0.9936 | 32.776 |
308
305
  | FORA (N=3) | 1320.07 | 2.82× | 0.9776 | 32.266 |
309
- | **[DBCache (F=4, B=0)](https://github.com/vipshop/cache-dit)** | **1400.08** | **2.66×** | **1.0065** | **32.838** |
310
- | **[DBCache + TaylorSeer (F=4, B=0)](https://github.com/vipshop/cache-dit)** | **1388.30** | **2.68×** | **1.0287** | **32.914** |
306
+ | **[DBCache(F=4,B=0,W=4,MC=4)](https://github.com/vipshop/cache-dit)** | **1400.08** | **2.66×** | **1.0065** | **32.838** |
311
307
  | DuCa(N=5) | 978.76 | 3.80× | 0.9955 | 32.241 |
312
- | TaylorSeer (N=4, O=2) | 1042.27 | 3.57× | 0.9857 | 32.413 |
313
- | **[FoCa (N=5) arxiv.2508.16211](https://arxiv.org/pdf/2508.16211)** | **893.54** | **4.16×** | **1.0029** | **32.948** |
308
+ | TaylorSeer(N=4,O=2) | 1042.27 | 3.57× | 0.9857 | 32.413 |
309
+ | **[DBCache+TaylorSeer(F=1,B=0,O=1)](https://github.com/vipshop/cache-dit)** | **1153.05** | **3.23×** | **1.0221** | **32.819** |
310
+ | **[FoCa(N=5) arxiv.2508.16211](https://arxiv.org/pdf/2508.16211)** | **893.54** | **4.16×** | **1.0029** | **32.948** |
311
+
312
+ <details>
313
+ <summary> Show all comparison </summary>
314
+
315
+ | Method | TFLOPs(↓) | SpeedUp(↑) | ImageReward(↑) | Clip Score(↑) |
316
+ | --- | --- | --- | --- | --- |
317
+ | [**FLUX.1**-dev]: 50 steps | 3726.87 | 1.00× | 0.9898 | 32.404 |
318
+ | [**FLUX.1**-dev]: 60% steps | 2231.70 | 1.67× | 0.9663 | 32.312 |
319
+ | Δ-DiT(N=2) | 2480.01 | 1.50× | 0.9444 | 32.273 |
320
+ | Δ-DiT(N=3) | 1686.76 | 2.21× | 0.8721 | 32.102 |
321
+ | [**FLUX.1**-dev]: 34% steps | 1264.63 | 3.13× | 0.9453 | 32.114 |
322
+ | Chipmunk | 1505.87 | 2.47× | 0.9936 | 32.776 |
323
+ | FORA (N=3) | 1320.07 | 2.82× | 0.9776 | 32.266 |
324
+ | **[DBCache(F=4,B=0,W=4,MC=4)](https://github.com/vipshop/cache-dit)** | **1400.08** | **2.66×** | **1.0065** | **32.838** |
325
+ | DuCa(N=5) | 978.76 | 3.80× | 0.9955 | 32.241 |
326
+ | TaylorSeer(N=4,O=2) | 1042.27 | 3.57× | 0.9857 | 32.413 |
327
+ | **[DBCache+TaylorSeer(F=1,B=0,O=1)](https://github.com/vipshop/cache-dit)** | **1153.05** | **3.23×** | **1.0221** | **32.819** |
328
+ | **[FoCa(N=5) arxiv.2508.16211](https://arxiv.org/pdf/2508.16211)** | **893.54** | **4.16×** | **1.0029** | **32.948** |
329
+ | [**FLUX.1**-dev]: 22% steps | 818.29 | 4.55× | 0.8183 | 31.772 |
330
+ | FORA(N=4) | 967.91 | 3.84× | 0.9730 | 32.142 |
331
+ | ToCa(N=8) | 784.54 | 4.74× | 0.9451 | 31.993 |
332
+ | DuCa(N=7) | 760.14 | 4.89× | 0.9757 | 32.066 |
333
+ | TeaCache(l=0.8) | 892.35 | 4.17× | 0.8683 | 31.704 |
334
+ | **[DBCache(F=4,B=0,W=4,MC=10)](https://github.com/vipshop/cache-dit)** | 816.65 | 4.56x | 0.8245 | 32.191 |
335
+ | TaylorSeer(N=5,O=2) | 893.54 | 4.16× | 0.9768 | 32.467 |
336
+ | **[FoCa(N=7) arxiv.2508.16211](https://arxiv.org/pdf/2508.16211)** | **670.44** | **5.54×** | **0.9891** | **32.920** |
337
+ | FORA(N=7) | 670.14 | 5.55× | 0.7418 | 31.519 |
338
+ | ToCa(N=12) | 644.70 | 5.77× | 0.7155 | 31.808 |
339
+ | DuCa(N=10) | 606.91 | 6.13× | 0.8382 | 31.759 |
340
+ | TeaCache(l=1.2) | 669.27 | 5.56× | 0.7394 | 31.704 |
341
+ | **[DBCache(F=1,B=0,W=4,MC=10)](https://github.com/vipshop/cache-dit)** | **651.90** | **5.72x** | 0.8796 | **32.318** |
342
+ | TaylorSeer(N=7,O=2) | 670.44 | 5.54× | 0.9128 | 32.128 |
343
+ | **[FoCa(N=8) arxiv.2508.16211](https://arxiv.org/pdf/2508.16211)** | **596.07** | **6.24×** | **0.9502** | **32.706** |
344
+
345
+ NOTE: Except for the DBCache results, all performance data are taken from the paper [FoCa, arxiv.2508.16211](https://arxiv.org/pdf/2508.16211).
346
+
347
+ </details>
348
+
349
+ ### 📚Text2Image Distillation DrawBench: Qwen-Image-Lightning
314
350
 
315
- cache-dit will support more mainstream Cache acceleration algorithms in the future. More benchmarks will be released, please stay tuned for update.
351
+ Surprisingly, cache-dit: DBCache still works on extremely few-step distilled models. For example, for **Qwen-Image-Lightning w/ 4 steps** with the F16B16 configuration, the PSNR is 34.8163, the Clip Score is 35.6109, and the ImageReward is 1.2614, so it maintains relatively high precision.
316
352
 
353
+ | Config | PSNR(↑) | Clip Score(↑) | ImageReward(↑) | TFLOPs(↓) | SpeedUp(↑) |
354
+ |----------------------------|-----------|------------|--------------|----------|------------|
355
+ | [**Lightning**]: 4 steps | INF | 35.5797 | 1.2630 | 274.33 | 1.00x |
356
+ | F24B24_W2MC1_R0.8 | 36.3242 | 35.6224 | 1.2630 | 264.74 | 1.04x |
357
+ | F16B16_W2MC1_R0.8 | 34.8163 | 35.6109 | 1.2614 | 244.25 | 1.12x |
358
+ | F12B12_W2MC1_R0.8 | 33.8953 | 35.6535 | 1.2549 | 234.63 | 1.17x |
359
+ | F8B8_W2MC1_R0.8 | 33.1374 | 35.7284 | 1.2517 | 224.29 | 1.22x |
360
+ | F1B0_W2MC1_R0.8 | 31.8317 | 35.6651 | 1.2397 | 206.90 | 1.33x |
317
361
 
318
362
  ## 🎉Unified Cache APIs
319
363
 
@@ -615,7 +659,7 @@ torch._dynamo.config.recompile_limit = 96 # default is 8
615
659
  torch._dynamo.config.accumulated_recompile_limit = 2048 # default is 256
616
660
  ```
617
661
 
618
- Please check [bench.py](./bench/bench.py) for more details.
662
+ Please check [perf.py](./bench/perf.py) for more details.
619
663
 
620
664
 
621
665
  ## 🛠Metrics CLI
@@ -1,5 +1,5 @@
1
1
  cache_dit/__init__.py,sha256=hzaexC1VQ0TxiWY6TJ1lTm-04e65WOTNHOfYryu1vFA,1284
2
- cache_dit/_version.py,sha256=y3W4kIzeljZ6sUAtO8hW9y1LA6HKsN9jWHCm6JjF5gw,706
2
+ cache_dit/_version.py,sha256=5zTqm8rgXsWYBpB2M3Zw_K1D-aV8wP7NsBLrmMKkrAQ,704
3
3
  cache_dit/logger.py,sha256=0zsu42hN-3-rgGC_C29ms1IvVpV4_b4_SwJCKSenxBE,4304
4
4
  cache_dit/utils.py,sha256=nuHHr6NB286qE9u6klLNfhAVRMOGipihOhM8LRqznmU,10775
5
5
  cache_dit/cache_factory/.gitignore,sha256=5Cb-qT9wsTUoMJ7vACDF7ZcLpAXhi5v-xdcWSRit988,23
@@ -39,13 +39,13 @@ cache_dit/metrics/fid.py,sha256=ZM_FM0XERtpnkMUfphmw2aOdljrh1uba-pnYItu0q6M,1821
39
39
  cache_dit/metrics/image_reward.py,sha256=N8HalJo1T1js0dsNb2V1KRv4kIdcm3nhx7iOXJuqcns,5421
40
40
  cache_dit/metrics/inception.py,sha256=pBVe2X6ylLPIXTG4-GWDM9DWnCviMJbJ45R3ulhktR0,12759
41
41
  cache_dit/metrics/lpips.py,sha256=hrHrmdM-f2B4TKDs0xLqJO5JFaYcCjq2qNIR8oCrVkc,811
42
- cache_dit/metrics/metrics.py,sha256=RADSUUMYKBMkABsYFCEr_9PV8cDXLuxe2xuQ-mRBs4Y,39691
42
+ cache_dit/metrics/metrics.py,sha256=7UV-H2NRbhfr6dvrXEzU97Zy-BSQ5zEfm9CKtaK4ldg,40231
43
43
  cache_dit/quantize/__init__.py,sha256=kWYoMAyZgBXu9BJlZjTQ0dRffW9GqeeY9_iTkXrb70A,59
44
44
  cache_dit/quantize/quantize_ao.py,sha256=Fx1KW4l3gdEkdrcAYtPoDW7WKBJWrs3glOHiEwW_TgE,6160
45
45
  cache_dit/quantize/quantize_interface.py,sha256=2s_R7xPSKuJeFpEGeLwRxnq_CqJcBG3a3lzyW5wh-UM,1241
46
- cache_dit-0.2.36.dist-info/licenses/LICENSE,sha256=Dqb07Ik2dV41s9nIdMUbiRWEfDqo7-dQeRiY7kPO8PE,3769
47
- cache_dit-0.2.36.dist-info/METADATA,sha256=2WIwYYezSUUk0rnIutttA2-s6_yTSCz1BwiAfO9cbQI,42706
48
- cache_dit-0.2.36.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
49
- cache_dit-0.2.36.dist-info/entry_points.txt,sha256=FX2gysXaZx6NeK1iCLMcIdP8Q4_qikkIHtEmi3oWn8o,65
50
- cache_dit-0.2.36.dist-info/top_level.txt,sha256=ZJDydonLEhujzz0FOkVbO-BqfzO9d_VqRHmZU-3MOZo,10
51
- cache_dit-0.2.36.dist-info/RECORD,,
46
+ cache_dit-0.3.0.dist-info/licenses/LICENSE,sha256=Dqb07Ik2dV41s9nIdMUbiRWEfDqo7-dQeRiY7kPO8PE,3769
47
+ cache_dit-0.3.0.dist-info/METADATA,sha256=NW9YEZ1Dt3y0_o89jS3iO9o9-Y83Yo0qTz2iDOGF4j0,45943
48
+ cache_dit-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
49
+ cache_dit-0.3.0.dist-info/entry_points.txt,sha256=FX2gysXaZx6NeK1iCLMcIdP8Q4_qikkIHtEmi3oWn8o,65
50
+ cache_dit-0.3.0.dist-info/top_level.txt,sha256=ZJDydonLEhujzz0FOkVbO-BqfzO9d_VqRHmZU-3MOZo,10
51
+ cache_dit-0.3.0.dist-info/RECORD,,