diffsynth-engine 0.7.1.dev3__py3-none-any.whl → 0.7.1.dev5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -179,6 +179,7 @@ class QwenImageTransformerBlockNunchaku(QwenImageTransformerBlock):
179
179
  rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
180
180
  attn_mask: Optional[torch.Tensor] = None,
181
181
  attn_kwargs: Optional[Dict[str, Any]] = None,
182
+ modulate_index: Optional[List[int]] = None,
182
183
  ) -> Tuple[torch.Tensor, torch.Tensor]:
183
184
  if self.use_nunchaku_awq:
184
185
  img_mod_params = self.img_mod(temb) # [B, 6*dim]
@@ -12,7 +12,7 @@ from diffsynth_engine.configs import QwenImagePipelineConfig
12
12
  from diffsynth_engine.pipelines.qwen_image import QwenImagePipeline
13
13
  from diffsynth_engine.models.qwen_image import QwenImageVAE
14
14
  from diffsynth_engine.models.basic.lora import LoRALinear
15
- from diffsynth_engine.models.qwen_image.qwen_image_dit import QwenImageTransformerBlock
15
+ from diffsynth_engine.models.qwen_image.qwen_image_dit import QwenImageTransformerBlock, QwenEmbedRope
16
16
  from diffsynth_engine.utils import logging
17
17
  from diffsynth_engine.utils.loader import load_file
18
18
  from diffsynth_engine.utils.download import fetch_model
@@ -32,6 +32,7 @@ def odtsr_forward():
32
32
  """
33
33
  original_lora_forward = LoRALinear.forward
34
34
  original_modulate = QwenImageTransformerBlock._modulate
35
+ original_rope_forward = QwenEmbedRope.forward
35
36
 
36
37
  def lora_batch_cfg_forward(self, x):
37
38
  y = nn.Linear.forward(self, x)
@@ -50,6 +51,49 @@ def odtsr_forward():
50
51
  y[:, L:] += lora(x2)
51
52
  return y
52
53
 
54
+ def optimized_rope_forward(self, video_fhw, txt_length, device):
55
+ if self.pos_freqs.device != device:
56
+ self.pos_freqs = self.pos_freqs.to(device)
57
+ self.neg_freqs = self.neg_freqs.to(device)
58
+
59
+ vid_freqs = []
60
+ max_vid_index = 0
61
+ idx = 0
62
+ for fhw in video_fhw:
63
+ frame, height, width = fhw
64
+ rope_key = f"{idx}_{height}_{width}"
65
+
66
+ if rope_key not in self.rope_cache:
67
+ seq_lens = frame * height * width
68
+ freqs_pos = self.pos_freqs.split([x // 2 for x in self.axes_dim], dim=1)
69
+ freqs_neg = self.neg_freqs.split([x // 2 for x in self.axes_dim], dim=1)
70
+ freqs_frame = freqs_pos[0][idx : idx + frame].view(frame, 1, 1, -1).expand(frame, height, width, -1)
71
+ if self.scale_rope:
72
+ freqs_height = torch.cat(
73
+ [freqs_neg[1][-(height - height // 2) :], freqs_pos[1][: height // 2]], dim=0
74
+ )
75
+ freqs_height = freqs_height.view(1, height, 1, -1).expand(frame, height, width, -1)
76
+ freqs_width = torch.cat([freqs_neg[2][-(width - width // 2) :], freqs_pos[2][: width // 2]], dim=0)
77
+ freqs_width = freqs_width.view(1, 1, width, -1).expand(frame, height, width, -1)
78
+
79
+ else:
80
+ freqs_height = freqs_pos[1][:height].view(1, height, 1, -1).expand(frame, height, width, -1)
81
+ freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1)
82
+
83
+ freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1)
84
+ self.rope_cache[rope_key] = freqs.clone().contiguous()
85
+ vid_freqs.append(self.rope_cache[rope_key])
86
+ if self.scale_rope:
87
+ max_vid_index = max(height // 2, width // 2, max_vid_index)
88
+ else:
89
+ max_vid_index = max(height, width, max_vid_index)
90
+
91
+ txt_freqs = self.pos_freqs[max_vid_index : max_vid_index + txt_length, ...]
92
+ vid_freqs = torch.cat(vid_freqs, dim=0)
93
+
94
+ return vid_freqs, txt_freqs
95
+
96
+
53
97
  def optimized_modulate(self, x, mod_params, index=None):
54
98
  if mod_params.ndim == 2:
55
99
  shift, scale, gate = mod_params.chunk(3, dim=-1)
@@ -72,12 +116,14 @@ def odtsr_forward():
72
116
 
73
117
  LoRALinear.forward = lora_batch_cfg_forward
74
118
  QwenImageTransformerBlock._modulate = optimized_modulate
119
+ QwenEmbedRope.forward = optimized_rope_forward
75
120
 
76
121
  try:
77
122
  yield
78
123
  finally:
79
124
  LoRALinear.forward = original_lora_forward
80
125
  QwenImageTransformerBlock._modulate = original_modulate
126
+ QwenEmbedRope.forward = original_rope_forward
81
127
 
82
128
 
83
129
  class QwenImageUpscalerTool:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: diffsynth_engine
3
- Version: 0.7.1.dev3
3
+ Version: 0.7.1.dev5
4
4
  Author: MuseAI x ModelScope
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Operating System :: OS Independent
@@ -122,7 +122,7 @@ diffsynth_engine/models/qwen_image/__init__.py,sha256=_6f0LWaoLdDvD2CsjK2OzEIQry
122
122
  diffsynth_engine/models/qwen_image/qwen2_5_vl.py,sha256=Eu-r-c42t_q74Qpwz21ToCGHpvSi7VND4B1EI0e-ePA,57748
123
123
  diffsynth_engine/models/qwen_image/qwen_image_dit.py,sha256=mMU4zeZi8-uJe9voznNIxZCTCqJPbPXkMxHwgcqJ6z8,24640
124
124
  diffsynth_engine/models/qwen_image/qwen_image_dit_fbcache.py,sha256=LIv9X_BohKk5rcEzyl3ATLwd8MSoFX43wjkArQ68nq8,4828
125
- diffsynth_engine/models/qwen_image/qwen_image_dit_nunchaku.py,sha256=1y1BkPRrX4_RioKjM09D9f9PK9neug1nSGJka0D9bvM,13516
125
+ diffsynth_engine/models/qwen_image/qwen_image_dit_nunchaku.py,sha256=EIojuf27haxqI4wkJE_Y17HMjP82-iqvyJ5v5Kjns3o,13568
126
126
  diffsynth_engine/models/qwen_image/qwen_image_vae.py,sha256=FpauZV9IVvpvBeS9volu7kzH2mmCISS86AbHt0Jk2bQ,38442
127
127
  diffsynth_engine/models/sd/__init__.py,sha256=hjoKRnwoXOLD0wude-w7I6wK5ak7ACMbnbkPuBB2oU0,380
128
128
  diffsynth_engine/models/sd/sd_controlnet.py,sha256=kMGfIdriXhC7reT6iO2Z0rPICXEkXpytjeBQcR_sjT8,50577
@@ -186,7 +186,7 @@ diffsynth_engine/tools/flux_inpainting_tool.py,sha256=qHsYKUG20A19ujRdocpIPC4a_H
186
186
  diffsynth_engine/tools/flux_outpainting_tool.py,sha256=ff4qUj2mMYW6GMts7ifnJG7Rth55pfuggopRCyAXwJ8,3894
187
187
  diffsynth_engine/tools/flux_reference_tool.py,sha256=6v0NRZPsDEHFlPruO-ZJTB4rYWxKVAlmnYEeandD3r8,4723
188
188
  diffsynth_engine/tools/flux_replace_tool.py,sha256=AOyEGxHsaNwpTS2VChAieIfECgMxlKsRw0lWPm1k9C0,4627
189
- diffsynth_engine/tools/qwen_image_upscaler_tool.py,sha256=TFtITz113zoqsdRibVuLtWF8JEhGTqzyV2ZGHJuuYKw,13876
189
+ diffsynth_engine/tools/qwen_image_upscaler_tool.py,sha256=GMhV7Sphg2zgkOJhnZeLVWQJQv1d6QnOuQZXEvHgIyI,16222
190
190
  diffsynth_engine/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
191
191
  diffsynth_engine/utils/cache.py,sha256=Ivef22pCuhEq-4H00gSvkLS8ceVZoGis7OSitYL6gH4,2101
192
192
  diffsynth_engine/utils/constants.py,sha256=Tsn3EAByfZra-nGcx0NEcP9nWTPKaDGdatosE3BuPGE,3846
@@ -209,8 +209,8 @@ diffsynth_engine/utils/video.py,sha256=8FCaeqIdUsWMgWI_6SO9SPynsToGcLCQAVYFTc4CD
209
209
  diffsynth_engine/utils/memory/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
210
210
  diffsynth_engine/utils/memory/linear_regression.py,sha256=oW_EQEw13oPoyUrxiL8A7Ksa5AuJ2ynI2qhCbfAuZbg,3930
211
211
  diffsynth_engine/utils/memory/memory_predcit_model.py,sha256=EXprSl_zlVjgfMWNXP-iw83Ot3hyMcgYaRPv-dvyL84,3943
212
- diffsynth_engine-0.7.1.dev3.dist-info/licenses/LICENSE,sha256=x7aBqQuVI0IYnftgoTPI_A0I_rjdjPPQkjnU6N2nikM,11346
213
- diffsynth_engine-0.7.1.dev3.dist-info/METADATA,sha256=GdfffMwz8CD9vSlEKGlzjwp_fO19sYw0ulei0vx6rQY,1163
214
- diffsynth_engine-0.7.1.dev3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
215
- diffsynth_engine-0.7.1.dev3.dist-info/top_level.txt,sha256=6zgbiIzEHLbhgDKRyX0uBJOV3F6VnGGBRIQvSiYYn6w,17
216
- diffsynth_engine-0.7.1.dev3.dist-info/RECORD,,
212
+ diffsynth_engine-0.7.1.dev5.dist-info/licenses/LICENSE,sha256=x7aBqQuVI0IYnftgoTPI_A0I_rjdjPPQkjnU6N2nikM,11346
213
+ diffsynth_engine-0.7.1.dev5.dist-info/METADATA,sha256=76gzYfIIeo_71jVybkzGLWiMpkm95ifPNZkL12gCRj8,1163
214
+ diffsynth_engine-0.7.1.dev5.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
215
+ diffsynth_engine-0.7.1.dev5.dist-info/top_level.txt,sha256=6zgbiIzEHLbhgDKRyX0uBJOV3F6VnGGBRIQvSiYYn6w,17
216
+ diffsynth_engine-0.7.1.dev5.dist-info/RECORD,,