diffsynth-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. diffsynth_engine/__init__.py +25 -0
  2. diffsynth_engine/algorithm/__init__.py +0 -0
  3. diffsynth_engine/algorithm/noise_scheduler/__init__.py +21 -0
  4. diffsynth_engine/algorithm/noise_scheduler/base_scheduler.py +10 -0
  5. diffsynth_engine/algorithm/noise_scheduler/flow_match/__init__.py +5 -0
  6. diffsynth_engine/algorithm/noise_scheduler/flow_match/flow_beta.py +28 -0
  7. diffsynth_engine/algorithm/noise_scheduler/flow_match/flow_ddim.py +25 -0
  8. diffsynth_engine/algorithm/noise_scheduler/flow_match/recifited_flow.py +48 -0
  9. diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/__init__.py +0 -0
  10. diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/beta.py +26 -0
  11. diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/ddim.py +28 -0
  12. diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/exponential.py +19 -0
  13. diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/karras.py +21 -0
  14. diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/linear.py +77 -0
  15. diffsynth_engine/algorithm/noise_scheduler/stable_diffusion/sgm_uniform.py +20 -0
  16. diffsynth_engine/algorithm/sampler/__init__.py +19 -0
  17. diffsynth_engine/algorithm/sampler/flow_match/__init__.py +0 -0
  18. diffsynth_engine/algorithm/sampler/flow_match/flow_match_euler.py +22 -0
  19. diffsynth_engine/algorithm/sampler/stable_diffusion/__init__.py +0 -0
  20. diffsynth_engine/algorithm/sampler/stable_diffusion/brownian_tree.py +54 -0
  21. diffsynth_engine/algorithm/sampler/stable_diffusion/ddpm.py +32 -0
  22. diffsynth_engine/algorithm/sampler/stable_diffusion/deis.py +125 -0
  23. diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_2m.py +29 -0
  24. diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_2m_sde.py +53 -0
  25. diffsynth_engine/algorithm/sampler/stable_diffusion/dpmpp_3m_sde.py +59 -0
  26. diffsynth_engine/algorithm/sampler/stable_diffusion/epsilon.py +29 -0
  27. diffsynth_engine/algorithm/sampler/stable_diffusion/euler.py +12 -0
  28. diffsynth_engine/algorithm/sampler/stable_diffusion/euler_ancestral.py +30 -0
  29. diffsynth_engine/conf/tokenizers/flux/tokenizer_1/merges.txt +48895 -0
  30. diffsynth_engine/conf/tokenizers/flux/tokenizer_1/special_tokens_map.json +30 -0
  31. diffsynth_engine/conf/tokenizers/flux/tokenizer_1/tokenizer_config.json +30 -0
  32. diffsynth_engine/conf/tokenizers/flux/tokenizer_1/vocab.json +49410 -0
  33. diffsynth_engine/conf/tokenizers/flux/tokenizer_2/special_tokens_map.json +125 -0
  34. diffsynth_engine/conf/tokenizers/flux/tokenizer_2/spiece.model +0 -0
  35. diffsynth_engine/conf/tokenizers/flux/tokenizer_2/tokenizer.json +129428 -0
  36. diffsynth_engine/conf/tokenizers/flux/tokenizer_2/tokenizer_config.json +940 -0
  37. diffsynth_engine/conf/tokenizers/sdxl/tokenizer/merges.txt +48895 -0
  38. diffsynth_engine/conf/tokenizers/sdxl/tokenizer/special_tokens_map.json +24 -0
  39. diffsynth_engine/conf/tokenizers/sdxl/tokenizer/tokenizer_config.json +30 -0
  40. diffsynth_engine/conf/tokenizers/sdxl/tokenizer/vocab.json +49410 -0
  41. diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/merges.txt +40213 -0
  42. diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/special_tokens_map.json +24 -0
  43. diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/tokenizer_config.json +38 -0
  44. diffsynth_engine/conf/tokenizers/sdxl/tokenizer_2/vocab.json +49411 -0
  45. diffsynth_engine/conf/tokenizers/wan/umt5-xxl/special_tokens_map.json +308 -0
  46. diffsynth_engine/conf/tokenizers/wan/umt5-xxl/spiece.model +0 -0
  47. diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer.json +1028026 -0
  48. diffsynth_engine/conf/tokenizers/wan/umt5-xxl/tokenizer_config.json +2748 -0
  49. diffsynth_engine/models/__init__.py +0 -0
  50. diffsynth_engine/models/base.py +55 -0
  51. diffsynth_engine/models/basic/__init__.py +0 -0
  52. diffsynth_engine/models/basic/attention.py +137 -0
  53. diffsynth_engine/models/basic/lora.py +293 -0
  54. diffsynth_engine/models/basic/relative_position_emb.py +56 -0
  55. diffsynth_engine/models/basic/timestep.py +81 -0
  56. diffsynth_engine/models/basic/transformer_helper.py +88 -0
  57. diffsynth_engine/models/basic/unet_helper.py +244 -0
  58. diffsynth_engine/models/components/__init__.py +0 -0
  59. diffsynth_engine/models/components/clip.py +56 -0
  60. diffsynth_engine/models/components/t5.py +222 -0
  61. diffsynth_engine/models/components/vae.py +393 -0
  62. diffsynth_engine/models/flux/__init__.py +14 -0
  63. diffsynth_engine/models/flux/flux_dit.py +504 -0
  64. diffsynth_engine/models/flux/flux_text_encoder.py +90 -0
  65. diffsynth_engine/models/flux/flux_vae.py +78 -0
  66. diffsynth_engine/models/sd/__init__.py +12 -0
  67. diffsynth_engine/models/sd/sd_text_encoder.py +142 -0
  68. diffsynth_engine/models/sd/sd_unet.py +293 -0
  69. diffsynth_engine/models/sd/sd_vae.py +38 -0
  70. diffsynth_engine/models/sd3/__init__.py +14 -0
  71. diffsynth_engine/models/sd3/sd3_dit.py +302 -0
  72. diffsynth_engine/models/sd3/sd3_text_encoder.py +163 -0
  73. diffsynth_engine/models/sd3/sd3_vae.py +43 -0
  74. diffsynth_engine/models/sdxl/__init__.py +13 -0
  75. diffsynth_engine/models/sdxl/sdxl_text_encoder.py +307 -0
  76. diffsynth_engine/models/sdxl/sdxl_unet.py +306 -0
  77. diffsynth_engine/models/sdxl/sdxl_vae.py +38 -0
  78. diffsynth_engine/models/utils.py +54 -0
  79. diffsynth_engine/models/wan/__init__.py +0 -0
  80. diffsynth_engine/models/wan/attention.py +200 -0
  81. diffsynth_engine/models/wan/wan_dit.py +431 -0
  82. diffsynth_engine/models/wan/wan_image_encoder.py +495 -0
  83. diffsynth_engine/models/wan/wan_text_encoder.py +264 -0
  84. diffsynth_engine/models/wan/wan_vae.py +771 -0
  85. diffsynth_engine/pipelines/__init__.py +17 -0
  86. diffsynth_engine/pipelines/base.py +216 -0
  87. diffsynth_engine/pipelines/flux_image.py +548 -0
  88. diffsynth_engine/pipelines/sd_image.py +386 -0
  89. diffsynth_engine/pipelines/sdxl_image.py +430 -0
  90. diffsynth_engine/pipelines/wan_video.py +481 -0
  91. diffsynth_engine/tokenizers/__init__.py +4 -0
  92. diffsynth_engine/tokenizers/base.py +157 -0
  93. diffsynth_engine/tokenizers/clip.py +288 -0
  94. diffsynth_engine/tokenizers/t5.py +194 -0
  95. diffsynth_engine/tokenizers/wan.py +79 -0
  96. diffsynth_engine/utils/__init__.py +0 -0
  97. diffsynth_engine/utils/constants.py +34 -0
  98. diffsynth_engine/utils/download.py +139 -0
  99. diffsynth_engine/utils/env.py +7 -0
  100. diffsynth_engine/utils/fp8_linear.py +64 -0
  101. diffsynth_engine/utils/gguf.py +415 -0
  102. diffsynth_engine/utils/loader.py +14 -0
  103. diffsynth_engine/utils/lock.py +56 -0
  104. diffsynth_engine/utils/logging.py +12 -0
  105. diffsynth_engine/utils/offload.py +44 -0
  106. diffsynth_engine/utils/parallel.py +191 -0
  107. diffsynth_engine/utils/prompt.py +9 -0
  108. diffsynth_engine/utils/video.py +40 -0
  109. diffsynth_engine-0.1.0.dist-info/LICENSE +201 -0
  110. diffsynth_engine-0.1.0.dist-info/METADATA +237 -0
  111. diffsynth_engine-0.1.0.dist-info/RECORD +113 -0
  112. diffsynth_engine-0.1.0.dist-info/WHEEL +5 -0
  113. diffsynth_engine-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,264 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from typing import Dict
6
+
7
+ from diffsynth_engine.models.base import StateDictConverter, PreTrainedModel
8
+ from diffsynth_engine.models.utils import no_init_weights
9
+
10
+
11
+ def fp16_clamp(x):
12
+ if x.dtype == torch.float16 and torch.isinf(x).any():
13
+ clamp = torch.finfo(x.dtype).max - 1000
14
+ x = torch.clamp(x, min=-clamp, max=clamp)
15
+ return x
16
+
17
+
18
class GELU(nn.Module):
    """Tanh approximation of the Gaussian Error Linear Unit activation."""

    def forward(self, x):
        inner = math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
        return 0.5 * x * (1.0 + torch.tanh(inner))
21
+
22
+
23
class T5LayerNorm(nn.Module):
    """T5-style RMS normalization: no mean centering, no bias, learned scale."""

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        # Variance is computed in float32 for numerical stability.
        variance = x.float().pow(2).mean(dim=-1, keepdim=True)
        x = x * torch.rsqrt(variance + self.eps)
        # Cast back down when the scale parameter lives in reduced precision.
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            x = x.type_as(self.weight)
        return self.weight * x
35
+
36
+
37
class T5Attention(nn.Module):
    """Multi-head attention without query scaling, per the T5 convention."""

    def __init__(self, dim, dim_attn, num_heads, dropout=0.0):
        assert dim_attn % num_heads == 0
        super().__init__()
        self.dim = dim
        self.dim_attn = dim_attn
        self.num_heads = num_heads
        self.head_dim = dim_attn // num_heads

        # projection layers
        self.q = nn.Linear(dim, dim_attn, bias=False)
        self.k = nn.Linear(dim, dim_attn, bias=False)
        self.v = nn.Linear(dim, dim_attn, bias=False)
        self.o = nn.Linear(dim_attn, dim, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, context=None, mask=None, pos_bias=None):
        """
        x: [B, L1, C].
        context: [B, L2, C] or None (self-attention when None).
        mask: [B, L2] or [B, L1, L2] or None; zero entries are masked out.
        pos_bias: optional additive relative-position bias.
        """
        if context is None:
            context = x
        batch, heads, hdim = x.size(0), self.num_heads, self.head_dim

        # project and split into heads
        query = self.q(x).view(batch, -1, heads, hdim)
        key = self.k(context).view(batch, -1, heads, hdim)
        value = self.v(context).view(batch, -1, heads, hdim)

        # additive attention bias: relative positions plus masking
        bias = x.new_zeros(batch, heads, query.size(1), key.size(1))
        if pos_bias is not None:
            bias = bias + pos_bias
        if mask is not None:
            assert mask.ndim in [2, 3]
            expanded = mask.view(batch, 1, 1, -1) if mask.ndim == 2 else mask.unsqueeze(1)
            bias = bias.masked_fill(expanded == 0, torch.finfo(x.dtype).min)

        # T5 applies no 1/sqrt(d) scaling; softmax runs in float32
        scores = torch.einsum("binc,bjnc->bnij", query, key) + bias
        weights = F.softmax(scores.float(), dim=-1).type_as(scores)
        out = torch.einsum("bnij,bjnc->binc", weights, value)

        # merge heads and project back
        out = out.reshape(batch, -1, heads * hdim)
        out = self.o(out)
        return self.dropout(out)
87
+
88
+
89
class T5FeedForward(nn.Module):
    """Gated feed-forward block (GELU-gated, as in T5 v1.1)."""

    def __init__(self, dim, dim_ffn, dropout=0.0):
        super().__init__()
        self.dim = dim
        self.dim_ffn = dim_ffn

        # gate branch produces the multiplicative activation
        self.gate = nn.Sequential(nn.Linear(dim, dim_ffn, bias=False), GELU())
        self.fc1 = nn.Linear(dim, dim_ffn, bias=False)
        self.fc2 = nn.Linear(dim_ffn, dim, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # up-project, gate, then down-project; dropout after each stage
        hidden = self.dropout(self.fc1(x) * self.gate(x))
        return self.dropout(self.fc2(hidden))
107
+
108
+
109
class T5SelfAttention(nn.Module):
    """Pre-norm transformer encoder block: self-attention then feed-forward.

    Residual sums are passed through ``fp16_clamp`` to keep fp16 activations
    finite.
    """

    def __init__(self, dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos=True, dropout=0.0):
        super().__init__()
        self.dim = dim
        self.dim_attn = dim_attn
        self.dim_ffn = dim_ffn
        self.num_heads = num_heads
        self.num_buckets = num_buckets
        self.shared_pos = shared_pos

        # sub-layers
        self.norm1 = T5LayerNorm(dim)
        self.attn = T5Attention(dim, dim_attn, num_heads, dropout)
        self.norm2 = T5LayerNorm(dim)
        self.ffn = T5FeedForward(dim, dim_ffn, dropout)
        # With shared positions the bias is supplied by the caller; otherwise
        # each block owns its own relative-position table.
        self.pos_embedding = None if shared_pos else T5RelativeEmbedding(num_buckets, num_heads, bidirectional=True)

    def forward(self, x, mask=None, pos_bias=None):
        if self.shared_pos:
            bias = pos_bias
        else:
            bias = self.pos_embedding(x.size(1), x.size(1))
        x = fp16_clamp(x + self.attn(self.norm1(x), mask=mask, pos_bias=bias))
        return fp16_clamp(x + self.ffn(self.norm2(x)))
131
+
132
+
133
class T5RelativeEmbedding(nn.Module):
    """Bucketed relative-position bias table in the T5 style.

    ``forward(lq, lk)`` returns an additive attention bias of shape
    [1, num_heads, lq, lk].
    """

    def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128):
        super().__init__()
        self.num_buckets = num_buckets
        self.num_heads = num_heads
        self.bidirectional = bidirectional
        self.max_dist = max_dist

        # one learned bias per (bucket, head) pair
        self.embedding = nn.Embedding(num_buckets, num_heads)

    def forward(self, lq, lk):
        device = self.embedding.weight.device
        # offset of every key position relative to every query position
        rel_pos = torch.arange(lk, device=device).unsqueeze(0) - torch.arange(lq, device=device).unsqueeze(1)
        buckets = self._relative_position_bucket(rel_pos)
        bias = self.embedding(buckets)  # [Lq, Lk, N]
        bias = bias.permute(2, 0, 1).unsqueeze(0)  # [1, N, Lq, Lk]
        return bias.contiguous()

    def _relative_position_bucket(self, rel_pos):
        if self.bidirectional:
            # Half the buckets encode "key after query", half "key before".
            num_buckets = self.num_buckets // 2
            rel_buckets = (rel_pos > 0).long() * num_buckets
            rel_pos = torch.abs(rel_pos)
        else:
            num_buckets = self.num_buckets
            rel_buckets = 0
            rel_pos = -torch.min(rel_pos, torch.zeros_like(rel_pos))

        # Exact buckets for small distances; distances in [max_exact, max_dist)
        # map logarithmically onto the remaining buckets.
        max_exact = num_buckets // 2
        rel_pos_large = max_exact + (
            torch.log(rel_pos.float() / max_exact) / math.log(self.max_dist / max_exact) * (num_buckets - max_exact)
        ).long()
        # cap everything beyond max_dist at the last bucket
        rel_pos_large = torch.min(rel_pos_large, torch.full_like(rel_pos_large, num_buckets - 1))
        rel_buckets += torch.where(rel_pos < max_exact, rel_pos, rel_pos_large)
        return rel_buckets
176
+
177
+
178
def init_weights(m):
    """Initialize T5 submodules with the original T5 scheme.

    Intended for use via ``model.apply(init_weights)``; modules of other
    types are left untouched.
    """
    if isinstance(m, T5LayerNorm):
        nn.init.ones_(m.weight)
    elif isinstance(m, T5FeedForward):
        # input projections scale with 1/sqrt(dim), output with 1/sqrt(dim_ffn)
        in_std = m.dim**-0.5
        nn.init.normal_(m.gate[0].weight, std=in_std)
        nn.init.normal_(m.fc1.weight, std=in_std)
        nn.init.normal_(m.fc2.weight, std=m.dim_ffn**-0.5)
    elif isinstance(m, T5Attention):
        nn.init.normal_(m.q.weight, std=(m.dim * m.dim_attn) ** -0.5)
        nn.init.normal_(m.k.weight, std=m.dim**-0.5)
        nn.init.normal_(m.v.weight, std=m.dim**-0.5)
        nn.init.normal_(m.o.weight, std=(m.num_heads * m.dim_attn) ** -0.5)
    elif isinstance(m, T5RelativeEmbedding):
        nn.init.normal_(m.embedding.weight, std=(2 * m.num_buckets * m.num_heads) ** -0.5)
192
+
193
+
194
class WanTextEncoderStateDictConverter(StateDictConverter):
    """State-dict converter for :class:`WanTextEncoder`.

    Every conversion path is the identity: checkpoint keys are used as-is,
    with no renaming for either diffusers- or civitai-style checkpoints.
    """

    def convert(self, state_dict):
        return state_dict

    def from_diffusers(self, state_dict):
        return state_dict

    def from_civitai(self, state_dict):
        return state_dict
203
+
204
+
205
class WanTextEncoder(PreTrainedModel):
    """UMT5-style encoder-only text encoder for the Wan video pipeline.

    Pipeline: token embedding -> dropout -> ``num_layers`` pre-norm
    self-attention blocks -> final RMS norm -> dropout.
    """

    converter = WanTextEncoderStateDictConverter()

    def __init__(
        self,
        vocab=256384,  # vocabulary size, or a pre-built nn.Embedding to reuse
        dim=4096,  # hidden width
        dim_attn=4096,  # total attention width (num_heads * head_dim)
        dim_ffn=10240,  # feed-forward inner width
        num_heads=64,
        num_layers=24,
        num_buckets=32,  # relative-position buckets per bias table
        shared_pos=False,  # True: one shared position table; False: one per block
        dropout=0.0,
        device: str = "cuda:0",
        dtype: torch.dtype = torch.bfloat16,
    ):
        # NOTE(review): `device` and `dtype` are accepted but not used here;
        # placement/casting appears to happen in `from_state_dict` (and
        # `torch.nn.utils.skip_init` consumes the `device` kwarg) — confirm
        # that direct construction on the requested device is not expected.
        super().__init__()
        self.dim = dim
        self.dim_attn = dim_attn
        self.dim_ffn = dim_ffn
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.num_buckets = num_buckets
        self.shared_pos = shared_pos

        # layers
        self.token_embedding = vocab if isinstance(vocab, nn.Embedding) else nn.Embedding(vocab, dim)
        # shared_pos=True: a single bias table owned here and passed to every
        # block; otherwise each T5SelfAttention builds its own.
        self.pos_embedding = T5RelativeEmbedding(num_buckets, num_heads, bidirectional=True) if shared_pos else None
        self.dropout = nn.Dropout(dropout)
        self.blocks = nn.ModuleList(
            [
                T5SelfAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos, dropout)
                for _ in range(num_layers)
            ]
        )
        self.norm = T5LayerNorm(dim)

    def forward(self, ids, mask=None):
        """Encode token ids [B, L] into hidden states [B, L, dim].

        mask: optional attention mask forwarded to every block; zero entries
        are masked out of attention.
        """
        x = self.token_embedding(ids)
        x = self.dropout(x)
        # Compute the shared relative-position bias once for all blocks.
        e = self.pos_embedding(x.size(1), x.size(1)) if self.shared_pos else None
        for block in self.blocks:
            x = block(x, mask, pos_bias=e)
        x = self.norm(x)
        x = self.dropout(x)
        return x

    @classmethod
    def from_state_dict(
        cls,
        state_dict: Dict[str, torch.Tensor],
        device: str,
        dtype: torch.dtype,
    ):
        """Build a model directly from checkpoint weights, skipping random init.

        `skip_init` + `no_init_weights` avoid allocating/initializing weights
        that `load_state_dict(assign=True)` immediately replaces with the
        checkpoint tensors; the model is then moved/cast to the target
        device/dtype.
        """
        with no_init_weights():
            model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype)
            model.load_state_dict(state_dict, assign=True)
            model.to(device=device, dtype=dtype, non_blocking=True)
        return model