PyPI - diffsynth-engine - Versions diffs - 0.4.4.dev2__tar.gz → 0.4.4.dev4__tar.gz - Mend

diffsynth-engine 0.4.4.dev2.tar.gz → 0.4.4.dev4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (203) hide show

{diffsynth_engine-0.4.4.dev2 → diffsynth_engine-0.4.4.dev4}/.gitignore RENAMED Viewed

@@ -9,4 +9,5 @@ dist/
 .DS_Store/
 .pytest_cache/
 .ruff_cache/
-CLAUDE.md
+CLAUDE.md
+.claude/

{diffsynth_engine-0.4.4.dev2 → diffsynth_engine-0.4.4.dev4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diffsynth_engine
-Version: 0.4.4.dev2
+Version: 0.4.4.dev4
 Author: MuseAI x ModelScope
 Classifier: Programming Language :: Python :: 3
 Classifier: Operating System :: OS Independent

{diffsynth_engine-0.4.4.dev2 → diffsynth_engine-0.4.4.dev4}/diffsynth_engine/__init__.py RENAMED Viewed

@@ -4,17 +4,19 @@ from .configs import (
     FluxPipelineConfig,
     WanPipelineConfig,
     QwenImagePipelineConfig,
+    HunyuanPipelineConfig,
     SDStateDicts,
     SDXLStateDicts,
     FluxStateDicts,
+    WanStateDicts,
     QwenImageStateDicts,
     ControlNetParams,
     ControlType,
 )
 from .pipelines import (
-    FluxImagePipeline,
-    SDXLImagePipeline,
     SDImagePipeline,
+    SDXLImagePipeline,
+    FluxImagePipeline,
     WanVideoPipeline,
     QwenImagePipeline,
     Hunyuan3DShapePipeline,
@@ -22,6 +24,13 @@ from .pipelines import (
 from .models.flux import FluxControlNet, FluxIPAdapter, FluxRedux
 from .models.sd import SDControlNet
 from .models.sdxl import SDXLControlNetUnion
+from .tools import (
+    FluxInpaintingTool,
+    FluxOutpaintingTool,
+    FluxIPAdapterRefTool,
+    FluxReduxRefTool,
+    FluxReplaceByControlTool,
+)
 from .utils.download import (
     fetch_model,
     fetch_modelscope_model,
@@ -30,32 +39,29 @@ from .utils.download import (
     reset_fetch_modelscope_model,
 )
 from .utils.video import load_video, save_video
-from .tools import (
-    FluxInpaintingTool,
-    FluxOutpaintingTool,
-    FluxIPAdapterRefTool,
-    FluxReduxRefTool,
-    FluxReplaceByControlTool,
-)
 __all__ = [
     "SDPipelineConfig",
     "SDXLPipelineConfig",
     "FluxPipelineConfig",
     "WanPipelineConfig",
+    "QwenImagePipelineConfig",
+    "HunyuanPipelineConfig",
     "SDStateDicts",
     "SDXLStateDicts",
     "FluxStateDicts",
+    "WanStateDicts",
     "QwenImageStateDicts",
+    "ControlNetParams",
+    "ControlType",
+    "SDImagePipeline",
+    "SDControlNet",
+    "SDXLImagePipeline",
+    "SDXLControlNetUnion",
     "FluxImagePipeline",
-    "QwenImagePipelineConfig",
     "FluxControlNet",
     "FluxIPAdapter",
     "FluxRedux",
-    "SDControlNet",
-    "SDXLControlNetUnion",
-    "SDXLImagePipeline",
-    "SDImagePipeline",
     "WanVideoPipeline",
     "QwenImagePipeline",
     "Hunyuan3DShapePipeline",
@@ -64,8 +70,6 @@ __all__ = [
     "FluxIPAdapterRefTool",
     "FluxReplaceByControlTool",
     "FluxReduxRefTool",
-    "ControlNetParams",
-    "ControlType",
     "fetch_model",
     "fetch_modelscope_model",
     "register_fetch_modelscope_model",

{diffsynth_engine-0.4.4.dev2 → diffsynth_engine-0.4.4.dev4}/diffsynth_engine/configs/__init__.py RENAMED Viewed

@@ -8,10 +8,12 @@ from .pipeline import (
     FluxPipelineConfig,
     WanPipelineConfig,
     QwenImagePipelineConfig,
+    HunyuanPipelineConfig,
     BaseStateDicts,
     SDStateDicts,
     SDXLStateDicts,
     FluxStateDicts,
+    WanStateDicts,
     QwenImageStateDicts,
 )
 from .controlnet import ControlType, ControlNetParams
@@ -26,11 +28,13 @@ __all__ = [
     "FluxPipelineConfig",
     "WanPipelineConfig",
     "QwenImagePipelineConfig",
-    "ControlType",
-    "ControlNetParams",
+    "HunyuanPipelineConfig",
     "BaseStateDicts",
     "SDStateDicts",
     "SDXLStateDicts",
     "FluxStateDicts",
+    "WanStateDicts",
     "QwenImageStateDicts",
+    "ControlType",
+    "ControlNetParams",
 ]

{diffsynth_engine-0.4.4.dev2 → diffsynth_engine-0.4.4.dev4}/diffsynth_engine/configs/pipeline.py RENAMED Viewed

@@ -1,7 +1,7 @@
 import os
 import torch
 from dataclasses import dataclass, field
-from typing import List, Tuple, Optional, Dict
+from typing import List, Dict, Tuple, Optional
 from diffsynth_engine.configs.controlnet import ControlType
@@ -127,7 +127,7 @@ class FluxPipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig, Ba
             model_path=model_path,
             device=device,
             parallelism=parallelism,
-            use_fsdp=True,
+            use_fsdp=True if parallelism > 1 else False,
             offload_mode=offload_mode,
             offload_to_disk=offload_to_disk,
         )
@@ -174,8 +174,8 @@ class WanPipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig, Bas
             image_encoder_path=image_encoder_path,
             device=device,
             parallelism=parallelism,
-            use_cfg_parallel=True,
-            use_fsdp=True,
+            use_cfg_parallel=True if parallelism > 1 else False,
+            use_fsdp=True if parallelism > 1 else False,
             offload_mode=offload_mode,
             offload_to_disk=offload_to_disk,
         )
@@ -184,16 +184,6 @@ class WanPipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig, Bas
         init_parallel_config(self)
-@dataclass
-class HunyuanPipelineConfig(BaseConfig):
-    model_path: str | os.PathLike | List[str | os.PathLike]
-    model_dtype: torch.dtype = torch.float16
-    vae_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
-    vae_dtype: torch.dtype = torch.float16
-    image_encoder_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
-    image_encoder_dtype: torch.dtype = torch.float16
 @dataclass
 class QwenImagePipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig, BaseConfig):
     model_path: str | os.PathLike | List[str | os.PathLike]
@@ -228,8 +218,8 @@ class QwenImagePipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfi
             encoder_path=encoder_path,
             vae_path=vae_path,
             parallelism=parallelism,
-            use_cfg_parallel=True,
-            use_fsdp=True,
+            use_cfg_parallel=True if parallelism > 1 else False,
+            use_fsdp=True if parallelism > 1 else False,
             offload_mode=offload_mode,
             offload_to_disk=offload_to_disk,
         )
@@ -238,32 +228,57 @@ class QwenImagePipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfi
         init_parallel_config(self)
+@dataclass
+class HunyuanPipelineConfig(BaseConfig):
+    model_path: str | os.PathLike | List[str | os.PathLike]
+    model_dtype: torch.dtype = torch.float16
+    vae_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    vae_dtype: torch.dtype = torch.float16
+    image_encoder_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    image_encoder_dtype: torch.dtype = torch.float16
 @dataclass
 class BaseStateDicts:
-    model: Optional[Dict[str, torch.Tensor]] = None
-    vae: Optional[Dict[str, torch.Tensor]] = None
+    pass
+@dataclass
+class SDStateDicts:
+    model: Dict[str, torch.Tensor]
+    clip: Dict[str, torch.Tensor]
+    vae: Dict[str, torch.Tensor]
 @dataclass
-class SDStateDicts(BaseStateDicts):
-    clip: Optional[Dict[str, torch.Tensor]] = None
+class SDXLStateDicts:
+    model: Dict[str, torch.Tensor]
+    clip_l: Dict[str, torch.Tensor]
+    clip_g: Dict[str, torch.Tensor]
+    vae: Dict[str, torch.Tensor]
 @dataclass
-class SDXLStateDicts(BaseStateDicts):
-    clip_l: Optional[Dict[str, torch.Tensor]] = None
-    clip_g: Optional[Dict[str, torch.Tensor]] = None
+class FluxStateDicts:
+    model: Dict[str, torch.Tensor]
+    t5: Dict[str, torch.Tensor]
+    clip: Dict[str, torch.Tensor]
+    vae: Dict[str, torch.Tensor]
 @dataclass
-class FluxStateDicts(BaseStateDicts):
-    t5: Optional[Dict[str, torch.Tensor]] = None
-    clip: Optional[Dict[str, torch.Tensor]] = None
+class WanStateDicts:
+    model: Dict[str, torch.Tensor] | Dict[str, Dict[str, torch.Tensor]]
+    t5: Dict[str, torch.Tensor]
+    vae: Dict[str, torch.Tensor]
+    image_encoder: Optional[Dict[str, torch.Tensor]] = None
 @dataclass
-class QwenImageStateDicts(BaseStateDicts):
-    encoder: Optional[Dict[str, torch.Tensor]] = None
+class QwenImageStateDicts:
+    model: Dict[str, torch.Tensor]
+    encoder: Dict[str, torch.Tensor]
+    vae: Dict[str, torch.Tensor]
 def init_parallel_config(config: FluxPipelineConfig | QwenImagePipelineConfig | WanPipelineConfig):

{diffsynth_engine-0.4.4.dev2 → diffsynth_engine-0.4.4.dev4}/diffsynth_engine/models/base.py RENAMED Viewed

@@ -4,7 +4,6 @@ import torch.nn as nn
 from typing import Dict, Union, List, Any
 from diffsynth_engine.utils.loader import load_file
 from diffsynth_engine.models.basic.lora import LoRALinear, LoRAConv2d
-from diffsynth_engine.models.utils import no_init_weights
 class StateDictConverter:
@@ -33,10 +32,9 @@ class PreTrainedModel(nn.Module):
     @classmethod
     def from_state_dict(cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype, **kwargs):
-        with no_init_weights():
-            model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype, **kwargs)
-        model.to_empty(device=device)
-        model.load_state_dict(state_dict)
+        model = cls(device="meta", dtype=dtype, **kwargs)
+        model.requires_grad_(False)
+        model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)
         return model

{diffsynth_engine-0.4.4.dev2 → diffsynth_engine-0.4.4.dev4}/diffsynth_engine/models/basic/lora.py RENAMED Viewed

@@ -74,14 +74,13 @@ class LoRALinear(nn.Linear):
     @staticmethod
     def from_linear(linear: nn.Linear):
-        lora_linear = torch.nn.utils.skip_init(
-            LoRALinear,
+        lora_linear = LoRALinear(
             linear.in_features,
             linear.out_features,
             linear.bias is not None,
-            device=linear.weight.device,
+            device="meta",
             dtype=linear.weight.dtype,
-        )
+        ).to_empty(device=linear.weight.device)
         lora_linear.weight = linear.weight
         lora_linear.bias = linear.bias
         return lora_linear
@@ -98,12 +97,20 @@ class LoRALinear(nn.Linear):
         dtype: torch.dtype,
         **kwargs,
     ):
-        up_linear = torch.nn.utils.skip_init(
-            nn.Linear, up.shape[1], up.shape[0], bias=False, device=device, dtype=dtype
-        )
-        down_linear = torch.nn.utils.skip_init(
-            nn.Linear, down.shape[0], down.shape[1], bias=False, device=device, dtype=dtype
-        )
+        up_linear = nn.Linear(
+            up.shape[1],
+            up.shape[0],
+            bias=False,
+            device="meta",
+            dtype=dtype,
+        ).to_empty(device=device)
+        down_linear = nn.Linear(
+            down.shape[0],
+            down.shape[1],
+            bias=False,
+            device="meta",
+            dtype=dtype,
+        ).to_empty(device=device)
         up_linear.weight.data = up
         down_linear.weight.data = down
         lora = LoRA(scale, rank, alpha, up_linear, down_linear, device, dtype)
@@ -182,8 +189,7 @@ class LoRAConv2d(nn.Conv2d):
     @staticmethod
     def from_conv2d(conv2d: nn.Conv2d):
-        lora_conv2d = torch.nn.utils.skip_init(
-            LoRAConv2d,
+        lora_conv2d = LoRAConv2d(
             conv2d.in_channels,
             conv2d.out_channels,
             conv2d.kernel_size,
@@ -193,9 +199,9 @@ class LoRAConv2d(nn.Conv2d):
             conv2d.groups,
             conv2d.bias is not None,
             conv2d.padding_mode,
-            device=conv2d.weight.device,
+            device="meta",
             dtype=conv2d.weight.dtype,
-        )
+        ).to_empty(device=conv2d.weight.device)
         lora_conv2d.weight = conv2d.weight
         lora_conv2d.bias = conv2d.bias
         return lora_conv2d
@@ -211,31 +217,29 @@ class LoRAConv2d(nn.Conv2d):
         device: str,
         dtype: torch.dtype,
     ):
-        down_conv = torch.nn.utils.skip_init(
-            nn.Conv2d,
+        down_conv = nn.Conv2d(
             self.in_channels,
             rank,
             kernel_size=self.kernel_size,
             stride=self.stride,
             padding=self.padding,
             bias=False,
-            device=device,
+            device="meta",
             dtype=dtype,
-        )
+        ).to_empty(device=device)
         down_conv.weight.data = down
         # according to the official kohya_ss trainer kernel_size are always fixed for the up layer
         # see: https://github.com/bmaltais/kohya_ss/blob/2accb1305979ba62f5077a23aabac23b4c37e935/networks/lora_diffusers.py#L129
         # refer from diffusers
-        up_conv = torch.nn.utils.skip_init(
-            nn.Conv2d,
+        up_conv = nn.Conv2d(
             rank,
             self.out_channels,
             kernel_size=(1, 1),
             stride=(1, 1),
             bias=False,
-            device=device,
+            device="meta",
             dtype=dtype,
-        )
+        ).to_empty(device=device)
         up_conv.weight.data = up
         lora = LoRA(scale, rank, alpha, up_conv, down_conv, device, dtype)

{diffsynth_engine-0.4.4.dev2 → diffsynth_engine-0.4.4.dev4}/diffsynth_engine/models/flux/flux_controlnet.py RENAMED Viewed

@@ -8,7 +8,6 @@ from diffsynth_engine.models.flux.flux_dit import (
     RoPEEmbedding,
     TimestepEmbeddings,
 )
-from diffsynth_engine.models.utils import no_init_weights
 class FluxControlNetStateDictConverter(StateDictConverter):
@@ -164,10 +163,13 @@ class FluxControlNet(PreTrainedModel):
         else:
             condition_channels = 64
-        with no_init_weights():
-            model = torch.nn.utils.skip_init(
-                cls, condition_channels=condition_channels, attn_kwargs=attn_kwargs, device=device, dtype=dtype
-            )
-        model.load_state_dict(state_dict)
+        model = cls(
+            condition_channels=condition_channels,
+            attn_kwargs=attn_kwargs,
+            device="meta",
+            dtype=dtype,
+        )
+        model.requires_grad_(False)
+        model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)
         return model

{diffsynth_engine-0.4.4.dev2 → diffsynth_engine-0.4.4.dev4}/diffsynth_engine/models/flux/flux_dit.py RENAMED Viewed

@@ -14,7 +14,6 @@ from diffsynth_engine.models.basic.transformer_helper import (
 from diffsynth_engine.models.basic.timestep import TimestepEmbeddings
 from diffsynth_engine.models.base import PreTrainedModel, StateDictConverter
 from diffsynth_engine.models.basic import attention as attention_ops
-from diffsynth_engine.models.utils import no_init_weights
 from diffsynth_engine.utils.gguf import gguf_inference
 from diffsynth_engine.utils.fp8_linear import fp8_inference
 from diffsynth_engine.utils.constants import FLUX_DIT_CONFIG_FILE
@@ -503,18 +502,20 @@ class FluxDiT(PreTrainedModel):
         in_channel: int = 64,
         attn_kwargs: Optional[Dict[str, Any]] = None,
     ):
-        with no_init_weights():
-            model = torch.nn.utils.skip_init(
-                cls,
-                device=device,
-                dtype=dtype,
-                in_channel=in_channel,
-                attn_kwargs=attn_kwargs,
-            )
-            model = model.requires_grad_(False)  # for loading gguf
+        model = cls(
+            device="meta",
+            dtype=dtype,
+            in_channel=in_channel,
+            attn_kwargs=attn_kwargs,
+        )
+        model = model.requires_grad_(False)
         model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)
         return model
+    def compile_repeated_blocks(self, *args, **kwargs):
+        for block in self.blocks:
+            block.compile(*args, **kwargs)
     def get_fsdp_modules(self):
         return ["blocks", "single_blocks"]

{diffsynth_engine-0.4.4.dev2 → diffsynth_engine-0.4.4.dev4}/diffsynth_engine/models/flux/flux_dit_fbcache.py RENAMED Viewed

@@ -2,7 +2,6 @@ import torch
 import numpy as np
 from typing import Any, Dict, Optional
-from diffsynth_engine.models.utils import no_init_weights
 from diffsynth_engine.utils.gguf import gguf_inference
 from diffsynth_engine.utils.fp8_linear import fp8_inference
 from diffsynth_engine.utils.parallel import (
@@ -190,16 +189,14 @@ class FluxDiTFBCache(FluxDiT):
         attn_kwargs: Optional[Dict[str, Any]] = None,
         relative_l1_threshold: float = 0.05,
     ):
-        with no_init_weights():
-            model = torch.nn.utils.skip_init(
-                cls,
-                device=device,
-                dtype=dtype,
-                in_channel=in_channel,
-                attn_kwargs=attn_kwargs,
-                relative_l1_threshold=relative_l1_threshold,
-            )
-            model = model.requires_grad_(False)  # for loading gguf
+        model = cls(
+            device="meta",
+            dtype=dtype,
+            in_channel=in_channel,
+            attn_kwargs=attn_kwargs,
+            relative_l1_threshold=relative_l1_threshold,
+        )
+        model = model.requires_grad_(False)
         model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)
         return model

{diffsynth_engine-0.4.4.dev2 → diffsynth_engine-0.4.4.dev4}/diffsynth_engine/models/flux/flux_ipadapter.py RENAMED Viewed

@@ -4,7 +4,6 @@ from torch import nn
 from PIL import Image
 from typing import Any, Dict, List, Optional
 from functools import partial
-from diffsynth_engine.models.utils import no_init_weights
 from diffsynth_engine.models.text_encoder.siglip import SiglipImageEncoder
 from diffsynth_engine.models.basic.transformer_helper import RMSNorm
 from diffsynth_engine.models.basic.attention import attention
@@ -39,9 +38,8 @@ class FluxIPAdapterAttention(nn.Module):
     @classmethod
     def from_state_dict(cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype, **kwargs):
-        with no_init_weights():
-            model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype, **kwargs)
-        model.to_empty(device=device)
+        model = cls(device="meta", dtype=dtype, **kwargs)
+        model.requires_grad_(False)
         model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)
         return model
@@ -74,9 +72,8 @@ class FluxIPAdapterMLP(torch.nn.Module):
     @classmethod
     def from_state_dict(cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype, **kwargs):
-        with no_init_weights():
-            model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype, **kwargs)
-        model.to_empty(device=device)
+        model = cls(device="meta", dtype=dtype, **kwargs)
+        model.requires_grad_(False)
         model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)
         return model

{diffsynth_engine-0.4.4.dev2 → diffsynth_engine-0.4.4.dev4}/diffsynth_engine/models/flux/flux_redux.py RENAMED Viewed

@@ -4,7 +4,6 @@ import torch.nn as nn
 from typing import Dict
 from diffsynth_engine.utils.download import fetch_model
 from diffsynth_engine.models.base import PreTrainedModel
-from diffsynth_engine.models.utils import no_init_weights
 from diffsynth_engine.models.text_encoder.siglip import SiglipImageEncoder
@@ -30,13 +29,8 @@ class FluxReduxImageEmbedder(nn.Module):
         device: str,
         dtype: torch.dtype,
     ):
-        with no_init_weights():
-            model = torch.nn.utils.skip_init(
-                cls,
-                device=device,
-                dtype=dtype,
-            )
-            model = model.requires_grad_(False)  # for loading gguf
+        model = cls(device="meta", dtype=dtype)
+        model = model.requires_grad_(False)
         model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)
         return model

{diffsynth_engine-0.4.4.dev2 → diffsynth_engine-0.4.4.dev4}/diffsynth_engine/models/flux/flux_text_encoder.py RENAMED Viewed

@@ -5,7 +5,6 @@ from typing import Dict
 from diffsynth_engine.models.sd import SDTextEncoder
 from diffsynth_engine.models.text_encoder.t5 import T5EncoderModel
 from diffsynth_engine.models.base import StateDictConverter
-from diffsynth_engine.models.utils import no_init_weights
 from diffsynth_engine.utils.constants import FLUX_TEXT_ENCODER_CONFIG_FILE
 from diffsynth_engine.utils import logging
@@ -61,10 +60,10 @@ class FluxTextEncoder1(SDTextEncoder):
     def from_state_dict(
         cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype, vocab_size: int = 49408
     ):
-        with no_init_weights():
-            model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype, vocab_size=vocab_size)
-        model.load_state_dict(state_dict)
-        model.to(device=device, dtype=dtype)
+        model = cls(device="meta", dtype=dtype, vocab_size=vocab_size)
+        model.requires_grad_(False)
+        model.load_state_dict(state_dict, assign=True)
+        model.to(device=device, dtype=dtype, non_blocking=True)
         return model

{diffsynth_engine-0.4.4.dev2 → diffsynth_engine-0.4.4.dev4}/diffsynth_engine/models/flux/flux_vae.py RENAMED Viewed

@@ -3,7 +3,6 @@ import torch
 from typing import Dict
 from diffsynth_engine.models.vae import VAEDecoder, VAEEncoder, VAEStateDictConverter
-from diffsynth_engine.models.utils import no_init_weights
 from diffsynth_engine.utils.constants import FLUX_VAE_CONFIG_FILE
 from diffsynth_engine.utils import logging
@@ -51,8 +50,8 @@ class FluxVAEEncoder(VAEEncoder):
     @classmethod
     def from_state_dict(cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype):
-        with no_init_weights():
-            model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype)
+        model = cls(device="meta", dtype=dtype)
+        model.requires_grad_(False)
         model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)
         return model
@@ -73,8 +72,8 @@ class FluxVAEDecoder(VAEDecoder):
     @classmethod
     def from_state_dict(cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype):
-        with no_init_weights():
-            model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype)
+        model = cls(device="meta", dtype=dtype)
+        model.requires_grad_(False)
         model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)
         return model

{diffsynth_engine-0.4.4.dev2 → diffsynth_engine-0.4.4.dev4}/diffsynth_engine/models/qwen_image/qwen2_5_vl.py RENAMED Viewed

@@ -8,7 +8,6 @@ from typing import Any, Dict, List, Tuple, Optional
 from diffsynth_engine.models.base import PreTrainedModel
 from diffsynth_engine.models.basic.transformer_helper import RMSNorm
 from diffsynth_engine.models.basic import attention as attention_ops
-from diffsynth_engine.models.utils import no_init_weights
 from diffsynth_engine.utils.cache import Cache, DynamicCache
 from diffsynth_engine.utils import logging
@@ -968,8 +967,8 @@ class Qwen2_5_VLForConditionalGeneration(PreTrainedModel):
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
-        with torch.device("meta"), no_init_weights():
-            model = cls(vision_config=vision_config, config=config, device=device, dtype=dtype)
+        model = cls(vision_config=vision_config, config=config, device="meta", dtype=dtype)
+        model.requires_grad_(False)
         model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)
         return model

diffsynth-engine 0.4.4.dev2__tar.gz → 0.4.4.dev4__tar.gz

diffsynth-engine 0.4.4.dev2.tar.gz → 0.4.4.dev4.tar.gz