PyPI - diffsynth-engine - Versions diffs - 0.1.0__py3-none-any.whl - Mend

diffsynth-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (113) hide show

diffsynth_engine/__init__.py +25 -0
diffsynth_engine/algorithm/__init__.py +0 -0
diffsynth_engine/algorithm/noise_scheduler/__init__.py +21 -0
diffsynth_engine/algorithm/noise_scheduler/base_scheduler.py +10 -0
diffsynth_engine/algorithm/noise_scheduler/flow_match/__init__.py +5 -0
diffsynth_engine/algorithm/noise_scheduler/flow_match/flow_beta.py +28 -0
diffsynth_engine/algorithm/noise_scheduler/flow_match/flow_ddim.py +25 -0
diffsynth_engine/algorithm/noise_scheduler/flow_match/recifited_flow.py +48 -0
diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/__init__.py +0 -0
diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/beta.py +26 -0
diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/ddim.py +28 -0
diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/exponential.py +19 -0
diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/karras.py +21 -0
diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/linear.py +77 -0
diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/sgm_uniform.py +20 -0
diffsynth_engine/algorithm/sampler/__init__.py +19 -0
diffsynth_engine/algorithm/sampler/flow_match/__init__.py +0 -0
diffsynth_engine/algorithm/sampler/flow_match/flow_match_euler.py +22 -0
diffsynth_engine/algorithm/sampler/stable_diffusion/__init__.py +0 -0
diffsynth_engine/algorithm/sampler/stable_diffusion/brownian_tree.py +54 -0
diffsynth_engine/algorithm/sampler/stable_diffusion/ddpm.py +32 -0
diffsynth_engine/algorithm/sampler/stable_diffusion/deis.py +125 -0
diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_2m.py +29 -0
diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_2m_sde.py +53 -0
diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_3m_sde.py +59 -0
diffsynth_engine/algorithm/sampler/stable_diffusion/epsilon.py +29 -0
diffsynth_engine/algorithm/sampler/stable_diffusion/euler.py +12 -0
diffsynth_engine/algorithm/sampler/stable_diffusion/euler_ancestral.py +30 -0
diffsynth_engine/conf/tokenizers/flux/tokenizer_1/merges.txt +48895 -0
diffsynth_engine/conf/tokenizers/flux/tokenizer_1/special_tokens_map.json +30 -0
diffsynth_engine/conf/tokenizers/flux/tokenizer_1/tokenizer_config.json +30 -0
diffsynth_engine/conf/tokenizers/flux/tokenizer_1/vocab.json +49410 -0
diffsynth_engine/conf/tokenizers/flux/tokenizer_2/special_tokens_map.json +125 -0
diffsynth_engine/conf/tokenizers/flux/tokenizer_2/spiece.model +0 -0
diffsynth_engine/conf/tokenizers/flux/tokenizer_2/tokenizer.json +129428 -0
diffsynth_engine/conf/tokenizers/flux/tokenizer_2/tokenizer_config.json +940 -0
diffsynth_engine/conf/tokenizers/sdxl/tokenizer/merges.txt +48895 -0
diffsynth_engine/conf/tokenizers/sdxl/tokenizer/special_tokens_map.json +24 -0
diffsynth_engine/conf/tokenizers/sdxl/tokenizer/tokenizer_config.json +30 -0
diffsynth_engine/conf/tokenizers/sdxl/tokenizer/vocab.json +49410 -0
diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/merges.txt +40213 -0
diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/special_tokens_map.json +24 -0
diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/tokenizer_config.json +38 -0
diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/vocab.json +49411 -0
diffsynth_engine/conf/tokenizers/wan/umt5-xxl/special_tokens_map.json +308 -0
diffsynth_engine/conf/tokenizers/wan/umt5-xxl/spiece.model +0 -0
diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer.json +1028026 -0
diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer_config.json +2748 -0
diffsynth_engine/models/__init__.py +0 -0
diffsynth_engine/models/base.py +55 -0
diffsynth_engine/models/basic/__init__.py +0 -0
diffsynth_engine/models/basic/attention.py +137 -0
diffsynth_engine/models/basic/lora.py +293 -0
diffsynth_engine/models/basic/relative_position_emb.py +56 -0
diffsynth_engine/models/basic/timestep.py +81 -0
diffsynth_engine/models/basic/transformer_helper.py +88 -0
diffsynth_engine/models/basic/unet_helper.py +244 -0
diffsynth_engine/models/components/__init__.py +0 -0
diffsynth_engine/models/components/clip.py +56 -0
diffsynth_engine/models/components/t5.py +222 -0
diffsynth_engine/models/components/vae.py +393 -0
diffsynth_engine/models/flux/__init__.py +14 -0
diffsynth_engine/models/flux/flux_dit.py +504 -0
diffsynth_engine/models/flux/flux_text_encoder.py +90 -0
diffsynth_engine/models/flux/flux_vae.py +78 -0
diffsynth_engine/models/sd/__init__.py +12 -0
diffsynth_engine/models/sd/sd_text_encoder.py +142 -0
diffsynth_engine/models/sd/sd_unet.py +293 -0
diffsynth_engine/models/sd/sd_vae.py +38 -0
diffsynth_engine/models/sd3/__init__.py +14 -0
diffsynth_engine/models/sd3/sd3_dit.py +302 -0
diffsynth_engine/models/sd3/sd3_text_encoder.py +163 -0
diffsynth_engine/models/sd3/sd3_vae.py +43 -0
diffsynth_engine/models/sdxl/__init__.py +13 -0
diffsynth_engine/models/sdxl/sdxl_text_encoder.py +307 -0
diffsynth_engine/models/sdxl/sdxl_unet.py +306 -0
diffsynth_engine/models/sdxl/sdxl_vae.py +38 -0
diffsynth_engine/models/utils.py +54 -0
diffsynth_engine/models/wan/__init__.py +0 -0
diffsynth_engine/models/wan/attention.py +200 -0
diffsynth_engine/models/wan/wan_dit.py +431 -0
diffsynth_engine/models/wan/wan_image_encoder.py +495 -0
diffsynth_engine/models/wan/wan_text_encoder.py +264 -0
diffsynth_engine/models/wan/wan_vae.py +771 -0
diffsynth_engine/pipelines/__init__.py +17 -0
diffsynth_engine/pipelines/base.py +216 -0
diffsynth_engine/pipelines/flux_image.py +548 -0
diffsynth_engine/pipelines/sd_image.py +386 -0
diffsynth_engine/pipelines/sdxl_image.py +430 -0
diffsynth_engine/pipelines/wan_video.py +481 -0
diffsynth_engine/tokenizers/__init__.py +4 -0
diffsynth_engine/tokenizers/base.py +157 -0
diffsynth_engine/tokenizers/clip.py +288 -0
diffsynth_engine/tokenizers/t5.py +194 -0
diffsynth_engine/tokenizers/wan.py +79 -0
diffsynth_engine/utils/__init__.py +0 -0
diffsynth_engine/utils/constants.py +34 -0
diffsynth_engine/utils/download.py +139 -0
diffsynth_engine/utils/env.py +7 -0
diffsynth_engine/utils/fp8_linear.py +64 -0
diffsynth_engine/utils/gguf.py +415 -0
diffsynth_engine/utils/loader.py +14 -0
diffsynth_engine/utils/lock.py +56 -0
diffsynth_engine/utils/logging.py +12 -0
diffsynth_engine/utils/offload.py +44 -0
diffsynth_engine/utils/parallel.py +191 -0
diffsynth_engine/utils/prompt.py +9 -0
diffsynth_engine/utils/video.py +40 -0
diffsynth_engine-0.1.0.dist-info/LICENSE +201 -0
diffsynth_engine-0.1.0.dist-info/METADATA +237 -0
diffsynth_engine-0.1.0.dist-info/RECORD +113 -0
diffsynth_engine-0.1.0.dist-info/WHEEL +5 -0
diffsynth_engine-0.1.0.dist-info/top_level.txt +1 -0

diffsynth_engine/models/sd/sd_text_encoder.py ADDED Viewed

@@ -0,0 +1,142 @@
+import json
+import torch
+import torch.nn as nn
+from typing import Dict
+from diffsynth_engine.models.components.clip import CLIPEncoderLayer
+from diffsynth_engine.models.base import PreTrainedModel, StateDictConverter
+from diffsynth_engine.models.utils import no_init_weights
+from diffsynth_engine.utils.constants import SD_TEXT_ENCODER_CONFIG_FILE
+from diffsynth_engine.utils import logging
+logger = logging.get_logger(__name__)
+with open(SD_TEXT_ENCODER_CONFIG_FILE, "r") as f:
+    config = json.load(f)
+class SDTextEncoderStateDictConverter(StateDictConverter):
+    def __init__(self):
+        pass
+    def _from_diffusers(self, state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+        rename_dict = config["diffusers"]["rename_dict"]
+        attn_rename_dict = config["diffusers"]["attn_rename_dict"]
+        state_dict_ = {}
+        for name, param in state_dict.items():
+            if name in rename_dict:
+                if name == "text_model.embeddings.position_embedding.weight":
+                    param = param.reshape((1, param.shape[0], param.shape[1]))
+                state_dict_[rename_dict[name]] = param
+            elif name.startswith("text_model.encoder.layers."):
+                names = name.split(".")
+                layer_id, layer_type, tail = names[3], ".".join(names[4:-1]), names[-1]
+                name_ = ".".join(["encoders", layer_id, attn_rename_dict[layer_type], tail])
+                state_dict_[name_] = param
+        return state_dict_
+    def _from_civitai(self, state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+        rename_dict = config["civitai"]["rename_dict"]
+        state_dict_ = {}
+        for name, param in state_dict.items():
+            if name not in rename_dict:
+                continue
+            name_ = rename_dict[name]
+            if name == "cond_stage_model.transformer.text_model.embeddings.position_embedding.weight":
+                param = param.reshape((1, param.shape[0], param.shape[1]))
+            state_dict_[name_] = param
+        return state_dict_
+    def convert(self, state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+        if "cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.weight" in state_dict:
+            state_dict = self._from_civitai(state_dict)
+            logger.info("use civitai format state dict")
+        elif "text_model.encoder.layers.0.layer_norm1.weight" in state_dict:
+            state_dict = self._from_diffusers(state_dict)
+            logger.info("use diffusers format state dict")
+        else:
+            logger.info("use diffsynth format state dict")
+        return state_dict
+class SDTextEncoder(PreTrainedModel):
+    converter = SDTextEncoderStateDictConverter()
+    def __init__(
+        self,
+        embed_dim=768,
+        vocab_size=49408,
+        max_position_embeddings=77,
+        num_encoder_layers=12,
+        encoder_intermediate_size=3072,
+        device: str = "cuda:0",
+        dtype: torch.dtype = torch.float16,
+    ):
+        super().__init__()
+        # token_embedding
+        self.token_embedding = nn.Embedding(vocab_size, embed_dim, device=device, dtype=dtype)
+        # position_embeds (This is a fixed tensor)
+        self.position_embeds = nn.Parameter(
+            torch.zeros(1, max_position_embeddings, embed_dim, device=device, dtype=dtype)
+        )
+        # encoders
+        self.encoders = nn.ModuleList(
+            [
+                CLIPEncoderLayer(embed_dim, encoder_intermediate_size, device=device, dtype=dtype)
+                for _ in range(num_encoder_layers)
+            ]
+        )
+        # attn_mask
+        self.attn_mask = self.attention_mask(max_position_embeddings)
+        # final_layer_norm
+        self.final_layer_norm = nn.LayerNorm(embed_dim, device=device, dtype=dtype)
+    def attention_mask(self, length):
+        mask = torch.empty(length, length)
+        mask.fill_(float("-inf"))
+        mask.triu_(1)
+        return mask
+    def forward(self, input_ids, clip_skip=1):
+        clip_skip = max(clip_skip, 1)
+        embeds = self.token_embedding(input_ids)
+        embeds += self.position_embeds.to(device=embeds.device, dtype=embeds.dtype)
+        attn_mask = self.attn_mask.to(device=embeds.device, dtype=embeds.dtype)
+        for encoder_id, encoder in enumerate(self.encoders):
+            embeds = encoder(embeds, attn_mask=attn_mask)
+            if encoder_id + clip_skip == len(self.encoders):
+                break
+        embeds = self.final_layer_norm(embeds)
+        return embeds
+    @classmethod
+    def from_state_dict(
+        cls,
+        state_dict: Dict[str, torch.Tensor],
+        device: str,
+        dtype: torch.dtype,
+        embed_dim: int = 768,
+        vocab_size: int = 49408,
+        max_position_embeddings: int = 77,
+        num_encoder_layers: int = 12,
+        encoder_intermediate_size: int = 3072,
+    ):
+        with no_init_weights():
+            model = torch.nn.utils.skip_init(
+                cls,
+                device=device,
+                dtype=dtype,
+                embed_dim=embed_dim,
+                vocab_size=vocab_size,
+                max_position_embeddings=max_position_embeddings,
+                num_encoder_layers=num_encoder_layers,
+                encoder_intermediate_size=encoder_intermediate_size,
+            )
+        model.load_state_dict(state_dict)
+        return model

diffsynth_engine/models/sd/sd_unet.py ADDED Viewed

@@ -0,0 +1,293 @@
+import json
+import torch
+import torch.nn as nn
+from typing import Dict
+from diffsynth_engine.models.base import PreTrainedModel, StateDictConverter, split_suffix
+from diffsynth_engine.models.basic.timestep import TimestepEmbeddings
+from diffsynth_engine.models.utils import no_init_weights
+from diffsynth_engine.models.basic.unet_helper import (
+    ResnetBlock,
+    AttentionBlock,
+    PushBlock,
+    DownSampler,
+    PopBlock,
+    UpSampler,
+)
+from diffsynth_engine.utils.constants import SD_UNET_CONFIG_FILE
+from diffsynth_engine.utils import logging
+logger = logging.get_logger(__name__)
+with open(SD_UNET_CONFIG_FILE) as f:
+    config = json.load(f)
+class SDUNetStateDictConverter(StateDictConverter):
+    def _from_diffusers(self, state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+        # architecture
+        block_types = [
+            "ResnetBlock",
+            "AttentionBlock",
+            "PushBlock",
+            "ResnetBlock",
+            "AttentionBlock",
+            "PushBlock",
+            "DownSampler",
+            "PushBlock",
+            "ResnetBlock",
+            "AttentionBlock",
+            "PushBlock",
+            "ResnetBlock",
+            "AttentionBlock",
+            "PushBlock",
+            "DownSampler",
+            "PushBlock",
+            "ResnetBlock",
+            "AttentionBlock",
+            "PushBlock",
+            "ResnetBlock",
+            "AttentionBlock",
+            "PushBlock",
+            "DownSampler",
+            "PushBlock",
+            "ResnetBlock",
+            "PushBlock",
+            "ResnetBlock",
+            "PushBlock",
+            "ResnetBlock",
+            "AttentionBlock",
+            "ResnetBlock",
+            "PopBlock",
+            "ResnetBlock",
+            "PopBlock",
+            "ResnetBlock",
+            "PopBlock",
+            "ResnetBlock",
+            "UpSampler",
+            "PopBlock",
+            "ResnetBlock",
+            "AttentionBlock",
+            "PopBlock",
+            "ResnetBlock",
+            "AttentionBlock",
+            "PopBlock",
+            "ResnetBlock",
+            "AttentionBlock",
+            "UpSampler",
+            "PopBlock",
+            "ResnetBlock",
+            "AttentionBlock",
+            "PopBlock",
+            "ResnetBlock",
+            "AttentionBlock",
+            "PopBlock",
+            "ResnetBlock",
+            "AttentionBlock",
+            "UpSampler",
+            "PopBlock",
+            "ResnetBlock",
+            "AttentionBlock",
+            "PopBlock",
+            "ResnetBlock",
+            "AttentionBlock",
+            "PopBlock",
+            "ResnetBlock",
+            "AttentionBlock",
+        ]
+        # rename each parameter
+        name_list = sorted([name for name in state_dict])
+        rename_dict = {}
+        block_id = {"ResnetBlock": -1, "AttentionBlock": -1, "DownSampler": -1, "UpSampler": -1}
+        last_block_type_with_id = {"ResnetBlock": "", "AttentionBlock": "", "DownSampler": "", "UpSampler": ""}
+        for name in name_list:
+            names = name.split(".")
+            if names[0] in ["conv_in", "conv_norm_out", "conv_out"]:
+                pass
+            elif names[0] in ["time_embedding", "add_embedding"]:
+                if names[0] == "add_embedding":
+                    names[0] = "add_time_embedding"
+                else:
+                    names[0] = "time_embedding.timestep_embedder"
+                names[1] = {"linear_1": "0", "linear_2": "2"}[names[1]]
+            elif names[0] in ["down_blocks", "mid_block", "up_blocks"]:
+                if names[0] == "mid_block":
+                    names.insert(1, "0")
+                block_type = {
+                    "resnets": "ResnetBlock",
+                    "attentions": "AttentionBlock",
+                    "downsamplers": "DownSampler",
+                    "upsamplers": "UpSampler",
+                }[names[2]]
+                block_type_with_id = ".".join(names[:4])
+                if block_type_with_id != last_block_type_with_id[block_type]:
+                    block_id[block_type] += 1
+                last_block_type_with_id[block_type] = block_type_with_id
+                while block_id[block_type] < len(block_types) and block_types[block_id[block_type]] != block_type:
+                    block_id[block_type] += 1
+                block_type_with_id = ".".join(names[:4])
+                names = ["blocks", str(block_id[block_type])] + names[4:]
+                if "ff" in names:
+                    ff_index = names.index("ff")
+                    component = ".".join(names[ff_index : ff_index + 3])
+                    component = {"ff.net.0": "act_fn", "ff.net.2": "ff"}[component]
+                    names = names[:ff_index] + [component] + names[ff_index + 3 :]
+                if "to_out" in names:
+                    names.pop(names.index("to_out") + 1)
+            else:
+                raise ValueError(f"Unknown parameters: {name}")
+            rename_dict[name] = ".".join(names)
+        # convert state_dict
+        state_dict_ = {}
+        for name, param in state_dict.items():
+            if ".proj_in." in name or ".proj_out." in name:
+                param = param.squeeze()
+            state_dict_[rename_dict[name]] = param
+        return state_dict_
+    def _from_civitai(self, state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+        rename_dict = config["civitai"]["rename_dict"]
+        state_dict_ = {}
+        for name, param in state_dict.items():
+            name, suffix = split_suffix(name)
+            if name in rename_dict:
+                if ".proj_in" in name or ".proj_out" in name:
+                    param = param.squeeze()
+                new_key = rename_dict[name] + suffix
+                state_dict_[new_key] = param
+        return state_dict_
+    def convert(self, state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+        if "model.diffusion_model.input_blocks.0.0.weight" in state_dict:
+            state_dict = self._from_civitai(state_dict)
+            logger.info("use civitai format state dict")
+        elif "down_blocks.0.attentions.0.norm.weight" in state_dict:
+            state_dict = self._from_diffusers(state_dict)
+            logger.info("use diffusers format state dict")
+        else:
+            logger.info("user diffsynth format state dict")
+        return state_dict
+class SDUNet(PreTrainedModel):
+    converter = SDUNetStateDictConverter()
+    def __init__(self, device: str = "cuda:0", dtype: torch.dtype = torch.float16):
+        super().__init__()
+        self.time_embedding = TimestepEmbeddings(dim_in=320, dim_out=1280, device=device, dtype=dtype)
+        self.conv_in = nn.Conv2d(4, 320, kernel_size=3, padding=1, device=device, dtype=dtype)
+        self.blocks = nn.ModuleList(
+            [
+                # CrossAttnDownBlock2D
+                ResnetBlock(320, 320, 1280, device=device, dtype=dtype),
+                AttentionBlock(8, 40, 320, 1, 768, eps=1e-6, device=device, dtype=dtype),
+                PushBlock(),
+                ResnetBlock(320, 320, 1280, device=device, dtype=dtype),
+                AttentionBlock(8, 40, 320, 1, 768, eps=1e-6, device=device, dtype=dtype),
+                PushBlock(),
+                DownSampler(320, device=device, dtype=dtype),
+                PushBlock(),
+                # CrossAttnDownBlock2D
+                ResnetBlock(320, 640, 1280, device=device, dtype=dtype),
+                AttentionBlock(8, 80, 640, 1, 768, eps=1e-6, device=device, dtype=dtype),
+                PushBlock(),
+                ResnetBlock(640, 640, 1280, device=device, dtype=dtype),
+                AttentionBlock(8, 80, 640, 1, 768, eps=1e-6, device=device, dtype=dtype),
+                PushBlock(),
+                DownSampler(640, device=device, dtype=dtype),
+                PushBlock(),
+                # CrossAttnDownBlock2D
+                ResnetBlock(640, 1280, 1280, device=device, dtype=dtype),
+                AttentionBlock(8, 160, 1280, 1, 768, eps=1e-6, device=device, dtype=dtype),
+                PushBlock(),
+                ResnetBlock(1280, 1280, 1280, device=device, dtype=dtype),
+                AttentionBlock(8, 160, 1280, 1, 768, eps=1e-6, device=device, dtype=dtype),
+                PushBlock(),
+                DownSampler(1280, device=device, dtype=dtype),
+                PushBlock(),
+                # DownBlock2D
+                ResnetBlock(1280, 1280, 1280, device=device, dtype=dtype),
+                PushBlock(),
+                ResnetBlock(1280, 1280, 1280, device=device, dtype=dtype),
+                PushBlock(),
+                # UNetMidBlock2DCrossAttn
+                ResnetBlock(1280, 1280, 1280, device=device, dtype=dtype),
+                AttentionBlock(8, 160, 1280, 1, 768, eps=1e-6, device=device, dtype=dtype),
+                ResnetBlock(1280, 1280, 1280, device=device, dtype=dtype),
+                # UpBlock2D
+                PopBlock(),
+                ResnetBlock(2560, 1280, 1280, device=device, dtype=dtype),
+                PopBlock(),
+                ResnetBlock(2560, 1280, 1280, device=device, dtype=dtype),
+                PopBlock(),
+                ResnetBlock(2560, 1280, 1280, device=device, dtype=dtype),
+                UpSampler(1280, device=device, dtype=dtype),
+                # CrossAttnUpBlock2D
+                PopBlock(),
+                ResnetBlock(2560, 1280, 1280, device=device, dtype=dtype),
+                AttentionBlock(8, 160, 1280, 1, 768, eps=1e-6, device=device, dtype=dtype),
+                PopBlock(),
+                ResnetBlock(2560, 1280, 1280, device=device, dtype=dtype),
+                AttentionBlock(8, 160, 1280, 1, 768, eps=1e-6, device=device, dtype=dtype),
+                PopBlock(),
+                ResnetBlock(1920, 1280, 1280, device=device, dtype=dtype),
+                AttentionBlock(8, 160, 1280, 1, 768, eps=1e-6, device=device, dtype=dtype),
+                UpSampler(1280, device=device, dtype=dtype),
+                # CrossAttnUpBlock2D
+                PopBlock(),
+                ResnetBlock(1920, 640, 1280, device=device, dtype=dtype),
+                AttentionBlock(8, 80, 640, 1, 768, eps=1e-6, device=device, dtype=dtype),
+                PopBlock(),
+                ResnetBlock(1280, 640, 1280, device=device, dtype=dtype),
+                AttentionBlock(8, 80, 640, 1, 768, eps=1e-6, device=device, dtype=dtype),
+                PopBlock(),
+                ResnetBlock(960, 640, 1280, device=device, dtype=dtype),
+                AttentionBlock(8, 80, 640, 1, 768, eps=1e-6, device=device, dtype=dtype),
+                UpSampler(640, device=device, dtype=dtype),
+                # CrossAttnUpBlock2D
+                PopBlock(),
+                ResnetBlock(960, 320, 1280, device=device, dtype=dtype),
+                AttentionBlock(8, 40, 320, 1, 768, eps=1e-6, device=device, dtype=dtype),
+                PopBlock(),
+                ResnetBlock(640, 320, 1280, device=device, dtype=dtype),
+                AttentionBlock(8, 40, 320, 1, 768, eps=1e-6, device=device, dtype=dtype),
+                PopBlock(),
+                ResnetBlock(640, 320, 1280, device=device, dtype=dtype),
+                AttentionBlock(8, 40, 320, 1, 768, eps=1e-6, device=device, dtype=dtype),
+            ]
+        )
+        self.conv_norm_out = nn.GroupNorm(num_channels=320, num_groups=32, eps=1e-5, device=device, dtype=dtype)
+        self.conv_act = nn.SiLU()
+        self.conv_out = nn.Conv2d(320, 4, kernel_size=3, padding=1, device=device, dtype=dtype)
+    def forward(self, x, timestep, context, **kwargs):
+        # 1. time
+        time_emb = self.time_embedding(timestep, dtype=x.dtype)
+        # 2. pre-process
+        hidden_states = self.conv_in(x)
+        text_emb = context
+        res_stack = [hidden_states]
+        # 3. blocks
+        for i, block in enumerate(self.blocks):
+            hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack)
+        # 4. output
+        hidden_states = self.conv_norm_out(hidden_states)
+        hidden_states = self.conv_act(hidden_states)
+        hidden_states = self.conv_out(hidden_states)
+        return hidden_states
+    @classmethod
+    def from_state_dict(cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype):
+        with no_init_weights():
+            model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype)
+        model.load_state_dict(state_dict, assign=True)
+        model.to(device=device, dtype=dtype, non_blocking=True)
+        return model

diffsynth_engine/models/sd/sd_vae.py ADDED Viewed

@@ -0,0 +1,38 @@
+import torch
+from typing import Dict
+from diffsynth_engine.models.components.vae import VAEDecoder, VAEEncoder
+from diffsynth_engine.models.utils import no_init_weights
+class SDVAEEncoder(VAEEncoder):
+    def __init__(self, device: str = "cuda:0", dtype: torch.dtype = torch.float32):
+        super().__init__(
+            latent_channels=4, scaling_factor=0.18215, shift_factor=0, use_quant_conv=True, device=device, dtype=dtype
+        )
+    @classmethod
+    def from_state_dict(cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype):
+        with no_init_weights():
+            model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype)
+        model.load_state_dict(state_dict)
+        return model
+class SDVAEDecoder(VAEDecoder):
+    def __init__(self, device: str = "cuda:0", dtype: torch.dtype = torch.float32):
+        super().__init__(
+            latent_channels=4,
+            scaling_factor=0.18215,
+            shift_factor=0,
+            use_post_quant_conv=True,
+            device=device,
+            dtype=dtype,
+        )
+    @classmethod
+    def from_state_dict(cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype):
+        with no_init_weights():
+            model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype)
+        model.load_state_dict(state_dict)
+        return model

diffsynth_engine/models/sd3/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+from .sd3_dit import SD3DiT, config as sd3_dit_config
+from .sd3_text_encoder import SD3TextEncoder1, SD3TextEncoder2, SD3TextEncoder3, config as sd3_text_encoder_config
+from .sd3_vae import SD3VAEEncoder, SD3VAEDecoder
+# Public names of this package: the model classes imported above, plus the two
+# `config` objects re-exported under the aliases sd3_dit_config and
+# sd3_text_encoder_config.
+__all__ = [
+    "SD3DiT",
+    "SD3TextEncoder1",
+    "SD3TextEncoder2",
+    "SD3TextEncoder3",
+    "SD3VAEEncoder",
+    "SD3VAEDecoder",
+    "sd3_dit_config",
+    "sd3_text_encoder_config",
+]