diffsynth-engine 0.6.1.dev33__py3-none-any.whl → 0.6.1.dev34__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.
diffsynth_engine/models/basic/attention.py

@@ -343,7 +343,7 @@ def long_context_attention(
             f"head_dim={q.shape[-1]}, but aiter_flash_attn only supports head dimension at most {FA3_MAX_HEADDIM}, will use fallback attention implementation"
         )
     if SDPA_AVAILABLE:
-        return LongContextAttention(attn_type=AttnType.TORCH)(q, k, v, softmax_scale=scale)
+        return LongContextAttention(attn_type=AttnType.TORCH_EFFICIENT)(q, k, v, softmax_scale=scale)
     if FLASH_ATTN_2_AVAILABLE:
         return LongContextAttention(attn_type=AttnType.FA)(q, k, v, softmax_scale=scale)
     raise ValueError("No available long context attention implementation")
@@ -379,7 +379,7 @@ def long_context_attention(
     if attn_impl == "fa2":
         return LongContextAttention(attn_type=AttnType.FA)(q, k, v, softmax_scale=scale)
     if attn_impl == "sdpa":
-        return LongContextAttention(attn_type=AttnType.TORCH)(q, k, v, softmax_scale=scale)
+        return LongContextAttention(attn_type=AttnType.TORCH_EFFICIENT)(q, k, v, softmax_scale=scale)
     if attn_impl == "sage":
         return LongContextAttention(attn_type=AttnType.SAGE_AUTO)(q, k, v, softmax_scale=scale)
     if attn_impl == "sparge":
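Both hunks above swap the SDPA path from `AttnType.TORCH` to `AttnType.TORCH_EFFICIENT`. A minimal sketch of the likely distinction, assuming (as the names suggest) that `TORCH_EFFICIENT` pins PyTorch's memory-efficient SDPA kernel rather than letting PyTorch auto-select a backend; the `AttnType` enum itself comes from the long-context attention dependency and is not shown in this diff:

```python
import torch
import torch.nn.functional as F
from torch.nn.attention import sdpa_kernel, SDPBackend

# Toy shapes for illustration; the fused backends require a CUDA device.
q = torch.randn(1, 8, 1024, 64, device="cuda", dtype=torch.float16)
k, v = torch.randn_like(q), torch.randn_like(q)

# Default SDPA: PyTorch auto-selects a backend (flash, efficient, or math).
out_auto = F.scaled_dot_product_attention(q, k, v)

# Pinned memory-efficient backend, presumably what TORCH_EFFICIENT selects.
with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
    out_eff = F.scaled_dot_product_attention(q, k, v)
```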
diffsynth_engine/models/qwen_image/qwen_image_dit.py

@@ -286,16 +286,15 @@ class QwenImageTransformerBlock(nn.Module):
             shift_0, shift_1 = shift[:actual_batch], shift[actual_batch:]
             scale_0, scale_1 = scale[:actual_batch], scale[actual_batch:]
             gate_0, gate_1 = gate[:actual_batch], gate[actual_batch:]
-            index_expanded = index.unsqueeze(-1)
             shift_0_exp = shift_0.unsqueeze(1)
             shift_1_exp = shift_1.unsqueeze(1)
             scale_0_exp = scale_0.unsqueeze(1)
             scale_1_exp = scale_1.unsqueeze(1)
             gate_0_exp = gate_0.unsqueeze(1)
             gate_1_exp = gate_1.unsqueeze(1)
-            shift_result = torch.where(index_expanded == 0, shift_0_exp, shift_1_exp)
-            scale_result = torch.where(index_expanded == 0, scale_0_exp, scale_1_exp)
-            gate_result = torch.where(index_expanded == 0, gate_0_exp, gate_1_exp)
+            shift_result = torch.where(index == 0, shift_0_exp, shift_1_exp)
+            scale_result = torch.where(index == 0, scale_0_exp, scale_1_exp)
+            gate_result = torch.where(index == 0, gate_0_exp, gate_1_exp)
         else:
             shift_result = shift.unsqueeze(1)
             scale_result = scale.unsqueeze(1)
@@ -514,6 +513,7 @@ class QwenImageDiT(PreTrainedModel):
             device=timestep.device,
             dtype=torch.int,
         )
+        modulate_index = modulate_index.unsqueeze(-1)
         rotary_emb = self.pos_embed(video_fhw, text_seq_len, image.device)

         image = self.img_in(image)
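The per-block `index.unsqueeze(-1)` is removed and hoisted into `QwenImageDiT`, which now unsqueezes `modulate_index` once before the transformer blocks run. A small self-contained check of why the `torch.where` calls still broadcast correctly (shapes here are illustrative, not the model's actual ones):

```python
import torch

B, S, D = 2, 16, 8                       # batch, sequence, hidden (toy sizes)
index = torch.randint(0, 2, (B, S, 1))   # already unsqueezed to (B, S, 1)
shift_0_exp = torch.randn(B, 1, D)       # per-sample modulation, (B, 1, D)
shift_1_exp = torch.randn(B, 1, D)

# A (B, S, 1) condition against (B, 1, D) operands broadcasts to (B, S, D),
# so each block can select per token without re-expanding the condition.
shift_result = torch.where(index == 0, shift_0_exp, shift_1_exp)
assert shift_result.shape == (B, S, D)
```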
@@ -535,7 +535,7 @@ class QwenImageDiT(PreTrainedModel):

         # warning: Eligen does not work with sequence parallel because long context attention does not support attention masks
         img_freqs, txt_freqs = rotary_emb
-        with sequence_parallel((image, text, img_freqs, txt_freqs), seq_dims=(1, 1, 0, 0)):
+        with sequence_parallel((image, text, img_freqs, txt_freqs, modulate_index), seq_dims=(1, 1, 0, 0, 1)):
             rotary_emb = (img_freqs, txt_freqs)
             for block in self.transformer_blocks:
                 text, image = block(
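`modulate_index` now enters the `sequence_parallel` context with `seq_dim=1`, so each rank sees the slice of the index that lines up with its local shard of `image`. A rough illustration of that kind of sequence-dim sharding; the real `sequence_parallel` helper presumably handles the distributed scatter/gather, and `world_size`/`rank` here are placeholders:

```python
import torch

world_size, rank = 4, 1                             # placeholder distributed context
modulate_index = torch.randint(0, 2, (1, 4096, 1))  # (batch, seq, 1)

# Shard along the sequence dim (dim=1), mirroring seq_dims=(..., 1): the rank's
# local index slice matches its local image shard token for token.
local_index = modulate_index.chunk(world_size, dim=1)[rank]
assert local_index.shape == (1, 4096 // world_size, 1)
```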
diffsynth_engine/models/qwen_image/qwen_image_vae.py

@@ -685,7 +685,6 @@ class VideoVAE(nn.Module):
         x = patchify(x, patch_size=2 if self.in_channels == 12 else 1)
         t = x.shape[2]
         iter_ = 1 + (t - 1) // 4
-
         for i in range(iter_):
             if i == 0:
                 out = self.encoder(x[:, :, :1, :, :], feat_cache=feat_cache)
diffsynth_engine/pipelines/qwen_image.py

@@ -165,7 +165,7 @@ class QwenImagePipeline(BasePipeline):
         self.edit_prompt_template_encode_start_idx = 64

         # sampler
-        self.noise_scheduler = RecifitedFlowScheduler(shift=3.0, use_dynamic_shifting=True)
+        self.noise_scheduler = RecifitedFlowScheduler(shift=3.0, use_dynamic_shifting=True, shift_terminal=0.02)
         self.sampler = FlowMatchEulerSampler()
         # models
         self.tokenizer = tokenizer
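The scheduler now receives `shift_terminal=0.02`. If `RecifitedFlowScheduler` implements terminal shifting the way diffusers' `FlowMatchEulerDiscreteScheduler` does (an assumption; the scheduler is not shown in this diff), the sigma schedule is stretched so its final value lands exactly on the terminal value:

```python
import torch

def stretch_to_terminal(sigmas: torch.Tensor, shift_terminal: float) -> torch.Tensor:
    # Rescale 1 - sigma so that the last sigma equals shift_terminal.
    one_minus = 1.0 - sigmas
    scale = one_minus[-1] / (1.0 - shift_terminal)
    return 1.0 - one_minus / scale

sigmas = torch.linspace(1.0, 0.05, steps=10)
stretched = stretch_to_terminal(sigmas, 0.02)
print(stretched[-1])  # tensor(0.0200): the schedule now terminates at 0.02
```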
@@ -690,8 +690,9 @@ class QwenImagePipeline(BasePipeline):
             img_width, img_height = img.size
             condition_width, condition_height = self.calculate_dimensions(384 * 384, img_width / img_height)
             vae_width, vae_height = self.calculate_dimensions(1024 * 1024, img_width / img_height)
-            condition_images.append(img.resize((condition_width, condition_height), Image.LANCZOS))
-            vae_images.append(img.resize((vae_width, vae_height), Image.LANCZOS))
+            condition_images.append(img.resize((condition_width, condition_height)))
+            vae_images.append(img.resize((vae_width, vae_height)))
+
         if width is None and height is None:
             width, height = vae_images[-1].size

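Dropping the explicit `Image.LANCZOS` argument means `PIL.Image.resize` falls back to its default resampling filter, which is bicubic for RGB images (nearest-neighbor applies only to palette and bilevel modes):

```python
from PIL import Image

img = Image.new("RGB", (640, 480))
# No resample argument: Pillow defaults to Resampling.BICUBIC for RGB images.
resized = img.resize((512, 384))
assert resized.size == (512, 384)
```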
diffsynth_engine-0.6.1.dev33.dist-info/METADATA → diffsynth_engine-0.6.1.dev34.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diffsynth_engine
-Version: 0.6.1.dev33
+Version: 0.6.1.dev34
 Author: MuseAI x ModelScope
 Classifier: Programming Language :: Python :: 3
 Classifier: Operating System :: OS Independent
diffsynth_engine-0.6.1.dev33.dist-info/RECORD → diffsynth_engine-0.6.1.dev34.dist-info/RECORD

@@ -86,7 +86,7 @@ diffsynth_engine/kernels/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZ
 diffsynth_engine/models/__init__.py,sha256=8Ze7cSE8InetgXWTNb0neVA2Q44K7WlE-h7O-02m2sY,119
 diffsynth_engine/models/base.py,sha256=svao__9WH8VNcyXz5o5dzywYXDcGV0YV9IfkLzDKews,2558
 diffsynth_engine/models/basic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-diffsynth_engine/models/basic/attention.py,sha256=62Ar8_ydnn28F1qH9ueXtvISgNszQK3q8k14gCIXGEs,15681
+diffsynth_engine/models/basic/attention.py,sha256=YrIxkYoekC3I7-sMTw60CL4GIKMLOTrn-eCk-iHT7E4,15701
 diffsynth_engine/models/basic/lora.py,sha256=Y6cBgrBsuDAP9FZz_fgK8vBi_EMg23saFIUSAsPIG-M,10670
 diffsynth_engine/models/basic/lora_nunchaku.py,sha256=7qhzGCzUIfDrwtWG0nspwdyZ7YUkaM4vMqzxZby2Zds,7510
 diffsynth_engine/models/basic/relative_position_emb.py,sha256=rCXOweZMcayVnNUVvBcYXMdhHS257B_PC8PZSWxvhNQ,2540
@@ -111,10 +111,10 @@ diffsynth_engine/models/hunyuan3d/surface_extractor.py,sha256=b15mb1N4PYwAvDk1Gu
 diffsynth_engine/models/hunyuan3d/volume_decoder.py,sha256=sgflj1a8sIerqGSalBAVQOlyiIihkLOLXYysNbulCoQ,2355
 diffsynth_engine/models/qwen_image/__init__.py,sha256=_6f0LWaoLdDvD2CsjK2OzEIQryt9efge8DFS4_GUnHQ,582
 diffsynth_engine/models/qwen_image/qwen2_5_vl.py,sha256=Eu-r-c42t_q74Qpwz21ToCGHpvSi7VND4B1EI0e-ePA,57748
-diffsynth_engine/models/qwen_image/qwen_image_dit.py,sha256=JEyK_yOa0A5xaqlmxI3nfD7NdCaHuvLDA10aWVbnac4,24635
+diffsynth_engine/models/qwen_image/qwen_image_dit.py,sha256=mMU4zeZi8-uJe9voznNIxZCTCqJPbPXkMxHwgcqJ6z8,24640
 diffsynth_engine/models/qwen_image/qwen_image_dit_fbcache.py,sha256=LIv9X_BohKk5rcEzyl3ATLwd8MSoFX43wjkArQ68nq8,4828
 diffsynth_engine/models/qwen_image/qwen_image_dit_nunchaku.py,sha256=1y1BkPRrX4_RioKjM09D9f9PK9neug1nSGJka0D9bvM,13516
-diffsynth_engine/models/qwen_image/qwen_image_vae.py,sha256=eO7f4YqiYXfw7NncBNFTu-xEvdJ5uKY-SnfP15QY0tE,38443
+diffsynth_engine/models/qwen_image/qwen_image_vae.py,sha256=FpauZV9IVvpvBeS9volu7kzH2mmCISS86AbHt0Jk2bQ,38442
 diffsynth_engine/models/sd/__init__.py,sha256=hjoKRnwoXOLD0wude-w7I6wK5ak7ACMbnbkPuBB2oU0,380
 diffsynth_engine/models/sd/sd_controlnet.py,sha256=kMGfIdriXhC7reT6iO2Z0rPICXEkXpytjeBQcR_sjT8,50577
 diffsynth_engine/models/sd/sd_text_encoder.py,sha256=BUOsBtSb7WH4Z37JhtYxOtpXMDJcQXZWzx_7JNbsJwM,5369
@@ -146,7 +146,7 @@ diffsynth_engine/pipelines/__init__.py,sha256=jh-4LSJ0vqlXiT8BgFgRIQxuAr2atEPyHr
 diffsynth_engine/pipelines/base.py,sha256=ShRiX5MY6bUkRKfuGrA1aalAqeHyeZxhzT87Mwc30b4,17231
 diffsynth_engine/pipelines/flux_image.py,sha256=L0ggxpthLD8a5-zdPHu9z668uWBei9YzPb4PFVypDNU,50707
 diffsynth_engine/pipelines/hunyuan3d_shape.py,sha256=TNV0Wr09Dj2bzzlpua9WioCClOj3YiLfE6utI9aWL8A,8164
-diffsynth_engine/pipelines/qwen_image.py,sha256=lrqwF3fikgQouifb-8KwWCxQhNVZard_7buoJqxHD7s,35759
+diffsynth_engine/pipelines/qwen_image.py,sha256=9n0fZCYw5E1iloXqd7vOU0XfHVPxQp_pm-v4D3Oloos,35751
 diffsynth_engine/pipelines/sd_image.py,sha256=nr-Nhsnomq8CsUqhTM3i2l2zG01YjwXdfRXgr_bC3F0,17891
 diffsynth_engine/pipelines/sdxl_image.py,sha256=v7ZACGPb6EcBunL6e5E9jynSQjE7GQx8etEV-ZLP91g,21704
 diffsynth_engine/pipelines/utils.py,sha256=HZbJHErNJS1DhlwJKvZ9dY7Kh8Zdlsw3zE2e88TYGRY,2277
@@ -190,8 +190,8 @@ diffsynth_engine/utils/video.py,sha256=8FCaeqIdUsWMgWI_6SO9SPynsToGcLCQAVYFTc4CD
 diffsynth_engine/utils/memory/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 diffsynth_engine/utils/memory/linear_regression.py,sha256=oW_EQEw13oPoyUrxiL8A7Ksa5AuJ2ynI2qhCbfAuZbg,3930
 diffsynth_engine/utils/memory/memory_predcit_model.py,sha256=EXprSl_zlVjgfMWNXP-iw83Ot3hyMcgYaRPv-dvyL84,3943
-diffsynth_engine-0.6.1.dev33.dist-info/licenses/LICENSE,sha256=x7aBqQuVI0IYnftgoTPI_A0I_rjdjPPQkjnU6N2nikM,11346
-diffsynth_engine-0.6.1.dev33.dist-info/METADATA,sha256=pgyNkuwU3lMQA66waiIU3BVtw-7zN3s8pEvinWC_LpI,1164
-diffsynth_engine-0.6.1.dev33.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-diffsynth_engine-0.6.1.dev33.dist-info/top_level.txt,sha256=6zgbiIzEHLbhgDKRyX0uBJOV3F6VnGGBRIQvSiYYn6w,17
-diffsynth_engine-0.6.1.dev33.dist-info/RECORD,,
+diffsynth_engine-0.6.1.dev34.dist-info/licenses/LICENSE,sha256=x7aBqQuVI0IYnftgoTPI_A0I_rjdjPPQkjnU6N2nikM,11346
+diffsynth_engine-0.6.1.dev34.dist-info/METADATA,sha256=Uu-yhnydrVudp5RdK0ifk9-q4J_18zulQge4fNs24Z0,1164
+diffsynth_engine-0.6.1.dev34.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+diffsynth_engine-0.6.1.dev34.dist-info/top_level.txt,sha256=6zgbiIzEHLbhgDKRyX0uBJOV3F6VnGGBRIQvSiYYn6w,17
+diffsynth_engine-0.6.1.dev34.dist-info/RECORD,,