cache-dit 0.2.36__py3-none-any.whl → 0.2.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cache-dit might be problematic. Click here for more details.

cache_dit/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.2.36'
32
- __version_tuple__ = version_tuple = (0, 2, 36)
31
+ __version__ = version = '0.2.37'
32
+ __version_tuple__ = version_tuple = (0, 2, 37)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -1,4 +1,5 @@
1
1
  import os
2
+ import re
2
3
  import cv2
3
4
  import pathlib
4
5
  import argparse
@@ -160,25 +161,30 @@ def compute_dir_metric(
160
161
  # File
161
162
  if not os.path.isdir(image_true_dir) or not os.path.isdir(image_test_dir):
162
163
  return compute_file_func(image_true_dir, image_test_dir), 1
164
+
163
165
  # Dir
166
+ # compute dir metric
167
+ def natural_sort_key(filename):
168
+ match = re.search(r"(\d+)\D*$", filename)
169
+ return int(match.group(1)) if match else filename
170
+
164
171
  image_true_dir: pathlib.Path = pathlib.Path(image_true_dir)
165
- image_true_files = sorted(
166
- [
167
- file
168
- for ext in _IMAGE_EXTENSIONS
169
- for file in image_true_dir.rglob("*.{}".format(ext))
170
- ]
171
- )
172
- image_test_dir: pathlib.Path = pathlib.Path(image_test_dir)
173
- image_test_files = sorted(
174
- [
175
- file
176
- for ext in _IMAGE_EXTENSIONS
177
- for file in image_test_dir.rglob("*.{}".format(ext))
178
- ]
179
- )
172
+ image_true_files = [
173
+ file
174
+ for ext in _IMAGE_EXTENSIONS
175
+ for file in image_true_dir.rglob("*.{}".format(ext))
176
+ ]
180
177
  image_true_files = [file.as_posix() for file in image_true_files]
178
+ image_true_files = sorted(image_true_files, key=natural_sort_key)
179
+
180
+ image_test_dir: pathlib.Path = pathlib.Path(image_test_dir)
181
+ image_test_files = [
182
+ file
183
+ for ext in _IMAGE_EXTENSIONS
184
+ for file in image_test_dir.rglob("*.{}".format(ext))
185
+ ]
181
186
  image_test_files = [file.as_posix() for file in image_test_files]
187
+ image_test_files = sorted(image_test_files, key=natural_sort_key)
182
188
 
183
189
  # select valid files
184
190
  image_true_files_selected = []
@@ -192,6 +198,7 @@ def compute_dir_metric(
192
198
  ):
193
199
  image_true_files_selected.append(selected_image_true)
194
200
  image_test_files_selected.append(selected_image_test)
201
+
195
202
  image_true_files = image_true_files_selected.copy()
196
203
  image_test_files = image_test_files_selected.copy()
197
204
  if len(image_true_files) == 0:
@@ -206,20 +213,22 @@ def compute_dir_metric(
206
213
 
207
214
  total_metric = 0.0
208
215
  valid_files = 0
216
+ total_files = 0
209
217
  for image_true, image_test in tqdm(
210
218
  zip(image_true_files, image_test_files),
211
219
  total=len(image_true_files),
212
220
  disable=DISABLE_VERBOSE,
213
221
  ):
214
222
  metric = compute_file_func(image_true, image_test)
215
- if metric != float("inf"):
223
+ if metric != float("inf"): # means no cache apply to image_test
216
224
  total_metric += metric
217
225
  valid_files += 1
226
+ total_files += 1
218
227
 
219
228
  if valid_files > 0:
220
229
  average_metric = total_metric / valid_files
221
230
  logger.debug(f"Average: {average_metric:.2f}")
222
- return average_metric, valid_files
231
+ return average_metric, total_files
223
232
  else:
224
233
  logger.debug("No valid files to compare")
225
234
  return None, None
@@ -1217,6 +1226,9 @@ def entrypoint():
1217
1226
 
1218
1227
  if args.gen_markdown_table:
1219
1228
  table = _format_table(format_strs, metric)
1229
+ table = table.replace("Latency(s)(↑)", "SpeedUp(↑)")
1230
+ table = table.replace("TFLOPs(↑)", "SpeedUp(↑)")
1231
+ table = table.replace("FLOPs(↑)", "SpeedUp(↑)")
1220
1232
  print("-" * format_len)
1221
1233
  print(f"{table}")
1222
1234
  print("-" * format_len)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cache_dit
3
- Version: 0.2.36
3
+ Version: 0.2.37
4
4
  Summary: 🤗 A Unified and Training-free Cache Acceleration Toolbox for Diffusion Transformers
5
5
  Author: DefTruth, vipshop.com, etc.
6
6
  Maintainer: DefTruth, vipshop.com, etc
@@ -62,7 +62,8 @@ Dynamic: requires-python
62
62
  </div>
63
63
  <p align="center">
64
64
  <b><a href="#unified">📚Unified Cache APIs</a></b> | <a href="#forward-pattern-matching">📚Forward Pattern Matching</a> | <a href="#automatic-block-adapter">📚Automatic Block Adapter</a><br>
65
- <a href="#hybird-forward-pattern">📚Hybrid Forward Pattern</a> | <a href="#dbcache">📚DBCache</a> | <a href="#taylorseer">📚Hybrid TaylorSeer</a> | <a href="#cfg">📚Cache CFG</a>
65
+ <a href="#hybird-forward-pattern">📚Hybrid Forward Pattern</a> | <a href="#dbcache">📚DBCache</a> | <a href="#taylorseer">📚Hybrid TaylorSeer</a> | <a href="#cfg">📚Cache CFG</a><br>
66
+ <a href="#benchmarks">📚Text2Image DrawBench</a> | <a href="#benchmarks">📚Text2Image Distillation DrawBench</a>
66
67
  </p>
67
68
  <p align="center">
68
69
  🎉Now, <b>cache-dit</b> covers <b>most</b> mainstream Diffusers' <b>DiT</b> Pipelines🎉<br>
@@ -275,11 +276,16 @@ Currently, **cache-dit** library supports almost **Any** Diffusion Transformers
275
276
 
276
277
  <div id="benchmarks"></div>
277
278
 
278
- Take FLUX.1-dev as an example. Here, only the results of some precision and performance benchmarks are presented. The test dataset is DrawBench. For a complete benchmark, please refer to [benchmarks](./bench/). **Device**: NVIDIA L20. **F**: Fn_compute_blocks, **B**: Bn_compute_blocks.
279
+ cache-dit will support more mainstream cache acceleration algorithms in the future, and more benchmarks will be released; please stay tuned for updates. Here, only the results of some precision and performance benchmarks are presented. The test dataset is **DrawBench**. For a complete benchmark, please refer to [📚Benchmarks](./bench/).
279
280
 
280
- | Config | Clip Score(↑) | ImageReward(↑) | PSNR(↑) | TFLOPs(↑) | SpeedUp(↑) |
281
+ ### 📚Text2Image DrawBench: FLUX.1-dev
282
+
283
+ Comparisons between different FnBn compute block configurations show that **more compute blocks result in higher precision**. For example, the F8B0_W8MC0 configuration achieves the best Clip Score (33.007) while keeping a high ImageReward (1.0333). **Device**: NVIDIA L20. **F**: Fn_compute_blocks, **B**: Bn_compute_blocks. All results use 50 steps.
284
+
285
+
286
+ | Config | Clip Score(↑) | ImageReward(↑) | PSNR(↑) | TFLOPs(↓) | SpeedUp(↑) |
281
287
  | --- | --- | --- | --- | --- | --- |
282
- | [**FLUX.1**-dev]: 50 steps | 32.9217 | 1.0412 | INF | 3726.87 | 1.00 |
288
+ | [**FLUX.1**-dev]: 50 steps | 32.9217 | 1.0412 | INF | 3726.87 | 1.00x |
283
289
  | F8B0_W8MC0_R0.08 | 33.0070 | 1.0333 | 35.2008 | 2162.19 | 1.72x |
284
290
  | F8B0_W4MC0_R0.08 | 32.9871 | 1.0370 | 33.8317 | 2064.81 | 1.80x |
285
291
  | F4B0_W4MC2_R0.12 | 32.9718 | 1.0301 | 31.9394 | 1678.98 | 2.22x |
@@ -294,25 +300,57 @@ Take FLUX.1-dev as an example. Here, only the results of some precision and perf
294
300
  | F4B0_W4MC4_R0.12 | 32.8384 | 1.0065 | 31.5292 | 1400.08 | 2.66x |
295
301
  | F1B0_W4MC4_R0.12 | 32.8291 | 1.0181 | 32.9462 | 1401.61 | 2.66x |
296
302
  | F1B0_W4MC3_R0.12 | 32.8236 | 1.0166 | 33.0037 | 1457.62 | 2.56x |
303
+ | F1B0_W4MC10_R1.0 | 32.3183 | 0.8796 | 29.6757 | 651.90 | 5.72x |
297
304
 
298
- The comparison between DBCache and algorithms such as Δ-DiT, Chipmunk, FORA, DuCa, TaylorSeer and FoCa is as follows. Now, in the comparison with a speedup ratio less than **3x**, cache-dit achieved the best accuracy. Please check [📚How to Reproduce?](./bench/) for more details.
305
+ The comparison between **cache-dit: DBCache** and algorithms such as Δ-DiT, Chipmunk, FORA, DuCa, TaylorSeer and FoCa is as follows. Among the methods with a speedup ratio below **3x**, cache-dit achieves the best accuracy. Please check [📚How to Reproduce?](./bench/) for more details.
299
306
 
300
307
  | Method | TFLOPs(↓) | SpeedUp(↑) | ImageReward(↑) | Clip Score(↑) |
301
308
  | --- | --- | --- | --- | --- |
302
309
  | [**FLUX.1**-dev]: 50 steps | 3726.87 | 1.00× | 0.9898 | 32.404 |
303
310
  | [**FLUX.1**-dev]: 60% steps | 2231.70 | 1.67× | 0.9663 | 32.312 |
304
- | Δ-DiT (N=2) | 2480.01 | 1.50× | 0.9444 | 32.273 |
305
- | Δ-DiT (N=3) | 1686.76 | 2.21× | 0.8721 | 32.102 |
311
+ | Δ-DiT(N=2) | 2480.01 | 1.50× | 0.9444 | 32.273 |
312
+ | Δ-DiT(N=3) | 1686.76 | 2.21× | 0.8721 | 32.102 |
306
313
  | [**FLUX.1**-dev]: 34% steps | 1264.63 | 3.13× | 0.9453 | 32.114 |
307
314
  | Chipmunk | 1505.87 | 2.47× | 0.9936 | 32.776 |
308
315
  | FORA (N=3) | 1320.07 | 2.82× | 0.9776 | 32.266 |
309
- | **[DBCache (F=4, B=0)](https://github.com/vipshop/cache-dit)** | **1400.08** | **2.66×** | **1.0065** | **32.838** |
310
- | **[DBCache + TaylorSeer (F=4, B=0)](https://github.com/vipshop/cache-dit)** | **1388.30** | **2.68×** | **1.0287** | **32.914** |
316
+ | **[DBCache(F=4,B=0,W=4,MC=4)](https://github.com/vipshop/cache-dit)** | **1400.08** | **2.66×** | **1.0065** | **32.838** |
311
317
  | DuCa(N=5) | 978.76 | 3.80× | 0.9955 | 32.241 |
312
- | TaylorSeer (N=4, O=2) | 1042.27 | 3.57× | 0.9857 | 32.413 |
313
- | **[FoCa (N=5) arxiv.2508.16211](https://arxiv.org/pdf/2508.16211)** | **893.54** | **4.16×** | **1.0029** | **32.948** |
314
-
315
- cache-dit will support more mainstream Cache acceleration algorithms in the future. More benchmarks will be released, please stay tuned for update.
318
+ | TaylorSeer(N=4,O=2) | 1042.27 | 3.57× | 0.9857 | 32.413 |
319
+ | **[DBCache+TaylorSeer(F=1,B=0,O=1)](https://github.com/vipshop/cache-dit)** | **1153.05** | **3.23×** | **1.0221** | **32.819** |
320
+ | **[FoCa(N=5) arxiv.2508.16211](https://arxiv.org/pdf/2508.16211)** | **893.54** | **4.16×** | **1.0029** | **32.948** |
321
+ | [**FLUX.1**-dev]: 22% steps | 818.29 | 4.55× | 0.8183 | 31.772 |
322
+ | FORA(N=4) | 967.91 | 3.84× | 0.9730 | 32.142 |
323
+ | ToCa(N=8) | 784.54 | 4.74× | 0.9451 | 31.993 |
324
+ | DuCa(N=7) | 760.14 | 4.89× | 0.9757 | 32.066 |
325
+ | TeaCache(l=0.8) | 892.35 | 4.17× | 0.8683 | 31.704 |
326
+ | **[DBCache(F=4,B=0,W=4,MC=10)](https://github.com/vipshop/cache-dit)** | 816.65 | 4.56× | 0.8245 | 32.191 |
327
+ | TaylorSeer(N=5,O=2) | 893.54 | 4.16× | 0.9768 | 32.467 |
328
+ | **[FoCa(N=7) arxiv.2508.16211](https://arxiv.org/pdf/2508.16211)** | **670.44** | **5.54×** | **0.9891** | **32.920** |
329
+ | FORA(N=7) | 670.14 | 5.55× | 0.7418 | 31.519 |
330
+ | ToCa(N=12) | 644.70 | 5.77× | 0.7155 | 31.808 |
331
+ | DuCa(N=10) | 606.91 | 6.13× | 0.8382 | 31.759 |
332
+ | TeaCache(l=1.2) | 669.27 | 5.56× | 0.7394 | 31.704 |
333
+ | **[DBCache(F=1,B=0,W=4,MC=10)](https://github.com/vipshop/cache-dit)** | **651.90** | **5.72×** | 0.8796 | **32.318** |
334
+ | TaylorSeer(N=7,O=2) | 670.44 | 5.54× | 0.9128 | 32.128 |
335
+ | **[FoCa(N=8) arxiv.2508.16211](https://arxiv.org/pdf/2508.16211)** | **596.07** | **6.24×** | **0.9502** | **32.706** |
336
+
337
+ NOTE: Except for DBCache, all other performance data are taken from the paper [FoCa, arxiv.2508.16211](https://arxiv.org/pdf/2508.16211).
338
+
339
+ ### 📚Text2Image Distillation DrawBench: Qwen-Image-Lightning
340
+
341
+ Surprisingly, cache-dit: DBCache still works on extremely few-step distilled models. For example, on **Qwen-Image-Lightning w/ 4 steps** with the F16B16 configuration, the PSNR is 34.8163, the Clip Score is 35.6109, and the ImageReward is 1.2614, so it maintains relatively high precision.
342
+
343
+ | Config | PSNR(↑) | Clip Score(↑) | ImageReward(↑) | TFLOPs(↓) | SpeedUp(↑) |
344
+ |----------------------------|-----------|------------|--------------|----------|------------|
345
+ | [**Lightning**]: 4 steps | INF | 35.5797 | 1.2630 | 274.33 | 1.00x |
346
+ | F24B24_W2MC1_R0.8 | 36.3242 | 35.6224 | 1.2630 | 264.74 | 1.04x |
347
+ | F16B16_W2MC1_R0.8 | 34.8163 | 35.6109 | 1.2614 | 244.25 | 1.12x |
348
+ | F12B12_W2MC1_R0.8 | 33.8953 | 35.6535 | 1.2549 | 234.63 | 1.17x |
349
+ | F8B8_W2MC1_R0.8 | 33.1374 | 35.7284 | 1.2517 | 224.29 | 1.22x |
350
+ | F48B0_W2MC1_R0.8 | 30.0533 | 35.8483 | 1.1979 | 265.56 | 1.03x |
351
+ | F32B0_W2MC1_R0.8 | 29.6490 | 35.7684 | 1.2302 | 261.05 | 1.05x |
352
+ | F24B0_W2MC1_R0.8 | 29.6081 | 35.8599 | 1.1874 | 245.54 | 1.12x |
353
+ | F16B0_W2MC1_R0.8 | 29.4844 | 36.0810 | 1.1586 | 227.06 | 1.21x |
316
354
 
317
355
 
318
356
  ## 🎉Unified Cache APIs
@@ -615,7 +653,7 @@ torch._dynamo.config.recompile_limit = 96 # default is 8
615
653
  torch._dynamo.config.accumulated_recompile_limit = 2048 # default is 256
616
654
  ```
617
655
 
618
- Please check [bench.py](./bench/bench.py) for more details.
656
+ Please check [perf.py](./bench/perf.py) for more details.
619
657
 
620
658
 
621
659
  ## 🛠Metrics CLI
@@ -1,5 +1,5 @@
1
1
  cache_dit/__init__.py,sha256=hzaexC1VQ0TxiWY6TJ1lTm-04e65WOTNHOfYryu1vFA,1284
2
- cache_dit/_version.py,sha256=y3W4kIzeljZ6sUAtO8hW9y1LA6HKsN9jWHCm6JjF5gw,706
2
+ cache_dit/_version.py,sha256=jVUPlUOcnlQRBFP8i5PUv2oJntFMrKgk1rs1guuDZ34,706
3
3
  cache_dit/logger.py,sha256=0zsu42hN-3-rgGC_C29ms1IvVpV4_b4_SwJCKSenxBE,4304
4
4
  cache_dit/utils.py,sha256=nuHHr6NB286qE9u6klLNfhAVRMOGipihOhM8LRqznmU,10775
5
5
  cache_dit/cache_factory/.gitignore,sha256=5Cb-qT9wsTUoMJ7vACDF7ZcLpAXhi5v-xdcWSRit988,23
@@ -39,13 +39,13 @@ cache_dit/metrics/fid.py,sha256=ZM_FM0XERtpnkMUfphmw2aOdljrh1uba-pnYItu0q6M,1821
39
39
  cache_dit/metrics/image_reward.py,sha256=N8HalJo1T1js0dsNb2V1KRv4kIdcm3nhx7iOXJuqcns,5421
40
40
  cache_dit/metrics/inception.py,sha256=pBVe2X6ylLPIXTG4-GWDM9DWnCviMJbJ45R3ulhktR0,12759
41
41
  cache_dit/metrics/lpips.py,sha256=hrHrmdM-f2B4TKDs0xLqJO5JFaYcCjq2qNIR8oCrVkc,811
42
- cache_dit/metrics/metrics.py,sha256=RADSUUMYKBMkABsYFCEr_9PV8cDXLuxe2xuQ-mRBs4Y,39691
42
+ cache_dit/metrics/metrics.py,sha256=7UV-H2NRbhfr6dvrXEzU97Zy-BSQ5zEfm9CKtaK4ldg,40231
43
43
  cache_dit/quantize/__init__.py,sha256=kWYoMAyZgBXu9BJlZjTQ0dRffW9GqeeY9_iTkXrb70A,59
44
44
  cache_dit/quantize/quantize_ao.py,sha256=Fx1KW4l3gdEkdrcAYtPoDW7WKBJWrs3glOHiEwW_TgE,6160
45
45
  cache_dit/quantize/quantize_interface.py,sha256=2s_R7xPSKuJeFpEGeLwRxnq_CqJcBG3a3lzyW5wh-UM,1241
46
- cache_dit-0.2.36.dist-info/licenses/LICENSE,sha256=Dqb07Ik2dV41s9nIdMUbiRWEfDqo7-dQeRiY7kPO8PE,3769
47
- cache_dit-0.2.36.dist-info/METADATA,sha256=2WIwYYezSUUk0rnIutttA2-s6_yTSCz1BwiAfO9cbQI,42706
48
- cache_dit-0.2.36.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
49
- cache_dit-0.2.36.dist-info/entry_points.txt,sha256=FX2gysXaZx6NeK1iCLMcIdP8Q4_qikkIHtEmi3oWn8o,65
50
- cache_dit-0.2.36.dist-info/top_level.txt,sha256=ZJDydonLEhujzz0FOkVbO-BqfzO9d_VqRHmZU-3MOZo,10
51
- cache_dit-0.2.36.dist-info/RECORD,,
46
+ cache_dit-0.2.37.dist-info/licenses/LICENSE,sha256=Dqb07Ik2dV41s9nIdMUbiRWEfDqo7-dQeRiY7kPO8PE,3769
47
+ cache_dit-0.2.37.dist-info/METADATA,sha256=dLOxpSzGT1izGxxPdbFc7jDKtiSgl-XAl-JZXRkX138,45826
48
+ cache_dit-0.2.37.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
49
+ cache_dit-0.2.37.dist-info/entry_points.txt,sha256=FX2gysXaZx6NeK1iCLMcIdP8Q4_qikkIHtEmi3oWn8o,65
50
+ cache_dit-0.2.37.dist-info/top_level.txt,sha256=ZJDydonLEhujzz0FOkVbO-BqfzO9d_VqRHmZU-3MOZo,10
51
+ cache_dit-0.2.37.dist-info/RECORD,,