diffsynth-engine 0.6.1.dev20__py3-none-any.whl → 0.6.1.dev22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -242,6 +242,8 @@ class QwenImagePipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfi
242
242
  vae_tile_size: Tuple[int, int] = (34, 34)
243
243
  vae_tile_stride: Tuple[int, int] = (18, 16)
244
244
 
245
+ load_encoder: bool = True
246
+
245
247
  @classmethod
246
248
  def basic_config(
247
249
  cls,
@@ -830,7 +830,7 @@ class FluxImagePipeline(BasePipeline):
830
830
  masked_image = image.clone()
831
831
  masked_image[(mask > 0.5).repeat(1, 3, 1, 1)] = -1
832
832
  latent = self.encode_image(masked_image)
833
- mask = torch.nn.functional.interpolate(mask, size=(latent.shape[2], latent.shape[3]))
833
+ mask = torch.nn.functional.interpolate(mask, size=(latent.shape[2], latent.shape[3])).to(latent.dtype)
834
834
  mask = 1 - mask
835
835
  latent = torch.cat([latent, mask], dim=1)
836
836
  elif self.config.control_type == ControlType.bfl_fill:
@@ -186,6 +186,7 @@ class QwenImagePipeline(BasePipeline):
186
186
  logger.info(f"loading state dict from {config.vae_path} ...")
187
187
  vae_state_dict = cls.load_model_checkpoint(config.vae_path, device="cpu", dtype=config.vae_dtype)
188
188
 
189
+ encoder_state_dict = None
189
190
  if config.encoder_path is None:
190
191
  config.encoder_path = fetch_model(
191
192
  "MusePublic/Qwen-image",
@@ -197,8 +198,9 @@ class QwenImagePipeline(BasePipeline):
197
198
  "text_encoder/model-00004-of-00004.safetensors",
198
199
  ],
199
200
  )
200
- logger.info(f"loading state dict from {config.encoder_path} ...")
201
- encoder_state_dict = cls.load_model_checkpoint(config.encoder_path, device="cpu", dtype=config.encoder_dtype)
201
+ if config.load_encoder:
202
+ logger.info(f"loading state dict from {config.encoder_path} ...")
203
+ encoder_state_dict = cls.load_model_checkpoint(config.encoder_path, device="cpu", dtype=config.encoder_dtype)
202
204
 
203
205
  state_dicts = QwenImageStateDicts(
204
206
  model=model_state_dict,
@@ -225,22 +227,25 @@ class QwenImagePipeline(BasePipeline):
225
227
  @classmethod
226
228
  def _from_state_dict(cls, state_dicts: QwenImageStateDicts, config: QwenImagePipelineConfig) -> "QwenImagePipeline":
227
229
  init_device = "cpu" if config.offload_mode is not None else config.device
228
- tokenizer = Qwen2TokenizerFast.from_pretrained(QWEN_IMAGE_TOKENIZER_CONF_PATH)
229
- processor = Qwen2VLProcessor.from_pretrained(
230
- tokenizer_config_path=QWEN_IMAGE_TOKENIZER_CONF_PATH,
231
- image_processor_config_path=QWEN_IMAGE_PROCESSOR_CONFIG_FILE,
232
- )
233
- with open(QWEN_IMAGE_VISION_CONFIG_FILE, "r", encoding="utf-8") as f:
234
- vision_config = Qwen2_5_VLVisionConfig(**json.load(f))
235
- with open(QWEN_IMAGE_CONFIG_FILE, "r", encoding="utf-8") as f:
236
- text_config = Qwen2_5_VLConfig(**json.load(f))
237
- encoder = Qwen2_5_VLForConditionalGeneration.from_state_dict(
238
- state_dicts.encoder,
239
- vision_config=vision_config,
240
- config=text_config,
241
- device=("cpu" if config.use_fsdp else init_device),
242
- dtype=config.encoder_dtype,
243
- )
230
+ tokenizer, processor, encoder = None, None, None
231
+ if config.load_encoder:
232
+ tokenizer = Qwen2TokenizerFast.from_pretrained(QWEN_IMAGE_TOKENIZER_CONF_PATH)
233
+ processor = Qwen2VLProcessor.from_pretrained(
234
+ tokenizer_config_path=QWEN_IMAGE_TOKENIZER_CONF_PATH,
235
+ image_processor_config_path=QWEN_IMAGE_PROCESSOR_CONFIG_FILE,
236
+ )
237
+ with open(QWEN_IMAGE_VISION_CONFIG_FILE, "r", encoding="utf-8") as f:
238
+ vision_config = Qwen2_5_VLVisionConfig(**json.load(f))
239
+ with open(QWEN_IMAGE_CONFIG_FILE, "r", encoding="utf-8") as f:
240
+ text_config = Qwen2_5_VLConfig(**json.load(f))
241
+ encoder = Qwen2_5_VLForConditionalGeneration.from_state_dict(
242
+ state_dicts.encoder,
243
+ vision_config=vision_config,
244
+ config=text_config,
245
+ device=("cpu" if config.use_fsdp else init_device),
246
+ dtype=config.encoder_dtype,
247
+ )
248
+
244
249
  with open(QWEN_IMAGE_VAE_CONFIG_FILE, "r", encoding="utf-8") as f:
245
250
  vae_config = json.load(f)
246
251
  vae = QwenImageVAE.from_state_dict(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: diffsynth_engine
3
- Version: 0.6.1.dev20
3
+ Version: 0.6.1.dev22
4
4
  Author: MuseAI x ModelScope
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: Operating System :: OS Independent
@@ -80,7 +80,7 @@ diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer.json,sha256=bhl7TT29cdoU
80
80
  diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer_config.json,sha256=7Zo6iw-qcacKMoR-BDX-A25uES1N9O23u0ipIeNE3AU,61728
81
81
  diffsynth_engine/configs/__init__.py,sha256=f6Y-j_ZQs7bM4Lr7Mh9CXFEBrSNLc9k5GJyJqjLAGiY,1187
82
82
  diffsynth_engine/configs/controlnet.py,sha256=f3vclyP3lcAjxDGD9C1vevhqqQ7W2LL_c6Wye0uxk3Q,1180
83
- diffsynth_engine/configs/pipeline.py,sha256=u4P0JnzSsvS_tfbTYyUARdT88k7TEGRYNqjaAPZlY40,13223
83
+ diffsynth_engine/configs/pipeline.py,sha256=FwHIvj2VdxtmiHxOUYoAzs5YVBprxobhV9AJ2CFrV4c,13254
84
84
  diffsynth_engine/kernels/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
85
85
  diffsynth_engine/models/__init__.py,sha256=8Ze7cSE8InetgXWTNb0neVA2Q44K7WlE-h7O-02m2sY,119
86
86
  diffsynth_engine/models/base.py,sha256=BA5vgMqfy_cjuL2OtXbrFD-Qg5xQnaumHpj5TabwSy8,2559
@@ -140,9 +140,9 @@ diffsynth_engine/models/wan/wan_text_encoder.py,sha256=OERlmwOqthAFPNnnT2sXJ4Ojy
140
140
  diffsynth_engine/models/wan/wan_vae.py,sha256=dC7MoUFeXRL7SIY0LG1OOUiZW-pp9IbXCghutMxpXr4,38889
141
141
  diffsynth_engine/pipelines/__init__.py,sha256=jh-4LSJ0vqlXiT8BgFgRIQxuAr2atEPyHrxXWj-Ud1U,604
142
142
  diffsynth_engine/pipelines/base.py,sha256=BWW7LW0E2qwu8G-6bP3nmeO7VCQxC8srOo8tE4aKA4o,14993
143
- diffsynth_engine/pipelines/flux_image.py,sha256=Dpy8AkwywuLAhvJ6cjg5TgzhSUgFQtv6p2JTTkzUHbo,50919
143
+ diffsynth_engine/pipelines/flux_image.py,sha256=vJKvnYmeeQVX2O1Zjtm4NLrltBp66VSZ-KjAUqJ8zJ8,50936
144
144
  diffsynth_engine/pipelines/hunyuan3d_shape.py,sha256=TNV0Wr09Dj2bzzlpua9WioCClOj3YiLfE6utI9aWL8A,8164
145
- diffsynth_engine/pipelines/qwen_image.py,sha256=jt4rg-U5qWsFD0kUeDwKzgIiTAC80Cj8aq1YQOR1_-k,33052
145
+ diffsynth_engine/pipelines/qwen_image.py,sha256=rksB8tiAEp9TIcLLca269dNFQRPIDxffThKRMuR06A0,33280
146
146
  diffsynth_engine/pipelines/sd_image.py,sha256=nr-Nhsnomq8CsUqhTM3i2l2zG01YjwXdfRXgr_bC3F0,17891
147
147
  diffsynth_engine/pipelines/sdxl_image.py,sha256=v7ZACGPb6EcBunL6e5E9jynSQjE7GQx8etEV-ZLP91g,21704
148
148
  diffsynth_engine/pipelines/utils.py,sha256=lk7sFGEk-fGjgadLpwwppHKG-yZ0RC-4ZmHW7pRRe8A,473
@@ -185,8 +185,8 @@ diffsynth_engine/utils/video.py,sha256=8FCaeqIdUsWMgWI_6SO9SPynsToGcLCQAVYFTc4CD
185
185
  diffsynth_engine/utils/memory/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
186
186
  diffsynth_engine/utils/memory/linear_regression.py,sha256=oW_EQEw13oPoyUrxiL8A7Ksa5AuJ2ynI2qhCbfAuZbg,3930
187
187
  diffsynth_engine/utils/memory/memory_predcit_model.py,sha256=EXprSl_zlVjgfMWNXP-iw83Ot3hyMcgYaRPv-dvyL84,3943
188
- diffsynth_engine-0.6.1.dev20.dist-info/licenses/LICENSE,sha256=x7aBqQuVI0IYnftgoTPI_A0I_rjdjPPQkjnU6N2nikM,11346
189
- diffsynth_engine-0.6.1.dev20.dist-info/METADATA,sha256=JZJRwz1ckJI0aMe_StTIj8LGkSYg8rRMQvqUsvqgx3s,1164
190
- diffsynth_engine-0.6.1.dev20.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
191
- diffsynth_engine-0.6.1.dev20.dist-info/top_level.txt,sha256=6zgbiIzEHLbhgDKRyX0uBJOV3F6VnGGBRIQvSiYYn6w,17
192
- diffsynth_engine-0.6.1.dev20.dist-info/RECORD,,
188
+ diffsynth_engine-0.6.1.dev22.dist-info/licenses/LICENSE,sha256=x7aBqQuVI0IYnftgoTPI_A0I_rjdjPPQkjnU6N2nikM,11346
189
+ diffsynth_engine-0.6.1.dev22.dist-info/METADATA,sha256=K5yUarSjYpbjDvqrG1i7rKrY2r1ILPkcuNupcMTDsvY,1164
190
+ diffsynth_engine-0.6.1.dev22.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
191
+ diffsynth_engine-0.6.1.dev22.dist-info/top_level.txt,sha256=6zgbiIzEHLbhgDKRyX0uBJOV3F6VnGGBRIQvSiYYn6w,17
192
+ diffsynth_engine-0.6.1.dev22.dist-info/RECORD,,