PyPI - diffsynth-engine - Versions diffs - 0.3.6.dev13__py3-none-any.whl → 0.3.6.dev14__py3-none-any.whl - Mend

diffsynth-engine 0.3.6.dev13__py3-none-any.whl → 0.3.6.dev14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

diffsynth_engine/algorithm/sampler/flow_match/flow_match_euler.py CHANGED Viewed

@@ -9,13 +9,12 @@ class FlowMatchEulerSampler:
         self.mask = mask
     def step(self, latents, model_outputs, i):
-        if self.mask is not None:
-            model_outputs = model_outputs * self.mask + self.init_latents * (1 - self.mask)
         dt = self.sigmas[i + 1] - self.sigmas[i]
         latents = latents.to(dtype=torch.float32)
         latents = latents + model_outputs * dt
         latents = latents.to(dtype=model_outputs.dtype)
+        if self.mask is not None:
+            latents = latents * self.mask + self.init_latents * (1 - self.mask)
         return latents
     def add_noise(self, latents, noise, sigma):

diffsynth_engine/conf/models/wan/dit/{14b-i2v.json → wan2.1-flf2v-14b.json} RENAMED Viewed

@@ -1,5 +1,7 @@
 {
-    "has_image_input": true,
+    "has_clip_feature": true,
+    "has_vae_feature": true,
+    "flf_pos_emb": true,
     "patch_size": [1, 2, 2],
     "in_dim": 36,
     "dim": 5120,
@@ -9,5 +11,6 @@
     "out_dim": 16,
     "num_heads": 40,
     "num_layers": 40,
-    "eps": 1e-6
+    "eps": 1e-6,
+    "shift": 16.0
 }

diffsynth_engine/conf/models/wan/dit/{14b-flf2v.json → wan2.1-i2v-14b.json} RENAMED Viewed

@@ -1,6 +1,6 @@
 {
-    "has_image_input": true,
-    "flf_pos_emb": true,
+    "has_clip_feature": true,
+    "has_vae_feature": true,
     "patch_size": [1, 2, 2],
     "in_dim": 36,
     "dim": 5120,

diffsynth_engine/conf/models/wan/dit/{1.3b-t2v.json → wan2.1-t2v-1.3b.json} RENAMED Viewed

@@ -1,5 +1,4 @@
 {
-    "has_image_input": false,
     "patch_size": [1, 2, 2],
     "in_dim": 16,
     "dim": 1536,

diffsynth_engine/conf/models/wan/dit/{14b-t2v.json → wan2.1-t2v-14b.json} RENAMED Viewed

@@ -1,5 +1,4 @@
 {
-    "has_image_input": false,
     "patch_size": [1, 2, 2],
     "in_dim": 16,
     "dim": 5120,

diffsynth_engine/conf/models/wan/dit/wan2.2-i2v-a14b.json ADDED Viewed

@@ -0,0 +1,16 @@
+{
+    "has_vae_feature": true,
+    "patch_size": [1, 2, 2],
+    "in_dim": 36,
+    "dim": 5120,
+    "ffn_dim": 13824,
+    "freq_dim": 256,
+    "text_dim": 4096,
+    "out_dim": 16,
+    "num_heads": 40,
+    "num_layers": 40,
+    "eps": 1e-6,
+    "boundary": 0.900,
+    "cfg_scale": [3.5, 3.5],
+    "num_inference_steps": 40
+}

diffsynth_engine/conf/models/wan/dit/wan2.2-t2v-a14b.json ADDED Viewed

@@ -0,0 +1,16 @@
+{
+    "patch_size": [1, 2, 2],
+    "in_dim": 16,
+    "dim": 5120,
+    "ffn_dim": 13824,
+    "freq_dim": 256,
+    "text_dim": 4096,
+    "out_dim": 16,
+    "num_heads": 40,
+    "num_layers": 40,
+    "eps": 1e-6,
+    "boundary": 0.875,
+    "shift": 12.0,
+    "cfg_scale": [3.0, 4.0],
+    "num_inference_steps": 40
+}

diffsynth_engine/conf/models/wan/dit/wan2.2-ti2v-5b.json ADDED Viewed

@@ -0,0 +1,14 @@
+{
+    "fuse_image_latents": true,
+    "patch_size": [1, 2, 2],
+    "in_dim": 48,
+    "dim": 3072,
+    "ffn_dim": 14336,
+    "freq_dim": 256,
+    "text_dim": 4096,
+    "out_dim": 48,
+    "num_heads": 24,
+    "num_layers": 30,
+    "eps": 1e-6,
+    "fps": 24
+}

diffsynth_engine/conf/models/wan/vae/wan2.1-vae.json ADDED Viewed

@@ -0,0 +1,48 @@
+{
+    "in_channels": 3,
+    "out_channels": 3,
+    "encoder_dim": 96,
+    "decoder_dim": 96,
+    "z_dim": 16,
+    "dim_mult": [1, 2, 4, 4],
+    "num_res_blocks": 2,
+    "temperal_downsample": [false, true, true],
+    "dropout": 0.0,
+    "patch_size": 1,
+    "mean": [
+        -0.7571,
+        -0.7089,
+        -0.9113,
+        0.1075,
+        -0.1745,
+        0.9653,
+        -0.1517,
+        1.5508,
+        0.4134,
+        -0.0715,
+        0.5517,
+        -0.3632,
+        -0.1922,
+        -0.9497,
+        0.2503,
+        -0.2921
+    ],
+    "std": [
+        2.8184,
+        1.4541,
+        2.3275,
+        2.6558,
+        1.2196,
+        1.7708,
+        2.6052,
+        2.0743,
+        3.2687,
+        2.1526,
+        2.8652,
+        1.5579,
+        1.6382,
+        1.1253,
+        2.8251,
+        1.9160
+    ]
+}

diffsynth_engine/conf/models/wan/vae/wan2.2-vae.json ADDED Viewed

@@ -0,0 +1,112 @@
+{
+    "in_channels": 12,
+    "out_channels": 12,
+    "encoder_dim": 160,
+    "decoder_dim": 256,
+    "z_dim": 48,
+    "dim_mult": [1, 2, 4, 4],
+    "num_res_blocks": 2,
+    "temperal_downsample": [false, true, true],
+    "dropout": 0.0,
+    "patch_size": 2,
+    "mean": [
+        -0.2289,
+        -0.0052,
+        -0.1323,
+        -0.2339,
+        -0.2799,
+        0.0174,
+        0.1838,
+        0.1557,
+        -0.1382,
+        0.0542,
+        0.2813,
+        0.0891,
+        0.1570,
+        -0.0098,
+        0.0375,
+        -0.1825,
+        -0.2246,
+        -0.1207,
+        -0.0698,
+        0.5109,
+        0.2665,
+        -0.2108,
+        -0.2158,
+        0.2502,
+        -0.2055,
+        -0.0322,
+        0.1109,
+        0.1567,
+        -0.0729,
+        0.0899,
+        -0.2799,
+        -0.1230,
+        -0.0313,
+        -0.1649,
+        0.0117,
+        0.0723,
+        -0.2839,
+        -0.2083,
+        -0.0520,
+        0.3748,
+        0.0152,
+        0.1957,
+        0.1433,
+        -0.2944,
+        0.3573,
+        -0.0548,
+        -0.1681,
+        -0.0667
+    ],
+    "std": [
+        0.4765,
+        1.0364,
+        0.4514,
+        1.1677,
+        0.5313,
+        0.4990,
+        0.4818,
+        0.5013,
+        0.8158,
+        1.0344,
+        0.5894,
+        1.0901,
+        0.6885,
+        0.6165,
+        0.8454,
+        0.4978,
+        0.5759,
+        0.3523,
+        0.7135,
+        0.6804,
+        0.5833,
+        1.4146,
+        0.8986,
+        0.5659,
+        0.7069,
+        0.5338,
+        0.4889,
+        0.4917,
+        0.4069,
+        0.4999,
+        0.6866,
+        0.4093,
+        0.5709,
+        0.6065,
+        0.6415,
+        0.4944,
+        0.5726,
+        1.2042,
+        0.5458,
+        1.6887,
+        0.3971,
+        1.0600,
+        0.3943,
+        0.5537,
+        0.5444,
+        0.4089,
+        0.7468,
+        0.7744
+    ]
+}

diffsynth_engine/configs/pipeline.py CHANGED Viewed

@@ -139,7 +139,12 @@ class WanPipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig, Bas
     vae_dtype: torch.dtype = torch.bfloat16
     image_encoder_dtype: torch.dtype = torch.bfloat16
-    shift: Optional[float] = field(default=None, init=False)  # RecifitedFlowScheduler shift factor, set by model type
+    # default params set by model type
+    boundary: Optional[float] = field(default=None, init=False)  # boundary
+    shift: Optional[float] = field(default=None, init=False)  # RecifitedFlowScheduler shift factor
+    cfg_scale: Optional[float | Tuple[float, float]] = field(default=None, init=False)  # default CFG scale
+    num_inference_steps: Optional[int] = field(default=None, init=False)  # default inference steps
+    fps: Optional[int] = field(default=None, init=False)  # default FPS
     # override BaseConfig
     vae_tiled: bool = True

diffsynth_engine/models/wan/wan_dit.py CHANGED Viewed

@@ -10,10 +10,13 @@ from diffsynth_engine.models.basic import attention as attention_ops
 from diffsynth_engine.models.basic.transformer_helper import RMSNorm
 from diffsynth_engine.models.utils import no_init_weights
 from diffsynth_engine.utils.constants import (
-    WAN_DIT_1_3B_T2V_CONFIG_FILE,
-    WAN_DIT_14B_I2V_CONFIG_FILE,
-    WAN_DIT_14B_T2V_CONFIG_FILE,
-    WAN_DIT_14B_FLF2V_CONFIG_FILE,
+    WAN2_1_DIT_T2V_1_3B_CONFIG_FILE,
+    WAN2_1_DIT_I2V_14B_CONFIG_FILE,
+    WAN2_1_DIT_T2V_14B_CONFIG_FILE,
+    WAN2_1_DIT_FLF2V_14B_CONFIG_FILE,
+    WAN2_2_DIT_TI2V_5B_CONFIG_FILE,
+    WAN2_2_DIT_I2V_A14B_CONFIG_FILE,
+    WAN2_2_DIT_T2V_A14B_CONFIG_FILE,
 )
 from diffsynth_engine.utils.gguf import gguf_inference
 from diffsynth_engine.utils.parallel import (
@@ -182,7 +185,9 @@ class DiTBlock(nn.Module):
     def forward(self, x, context, t_mod, freqs):
         # msa: multi-head self-attention  mlp: multi-layer perceptron
-        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.modulation + t_mod).chunk(6, dim=1)
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = [
+            t.squeeze(1) for t in (self.modulation + t_mod).chunk(6, dim=1)
+        ]
         input_x = modulate(self.norm1(x), shift_msa, scale_msa)
         x = x + gate_msa * self.self_attn(input_x, freqs)
         x = x + self.cross_attn(self.norm3(x), context)
@@ -237,7 +242,7 @@ class Head(nn.Module):
         self.modulation = nn.Parameter(torch.randn(1, 2, dim, device=device, dtype=dtype) / dim**0.5)
     def forward(self, x, t_mod):
-        shift, scale = (self.modulation + t_mod).chunk(2, dim=1)
+        shift, scale = [t.squeeze(1) for t in (self.modulation + t_mod.unsqueeze(1)).chunk(2, dim=1)]
         x = self.head(self.norm(x) * (1 + scale) + shift)
         return x
@@ -263,17 +268,22 @@ class WanDiT(PreTrainedModel):
         patch_size: Tuple[int, int, int],
         num_heads: int,
         num_layers: int,
-        has_image_input: bool,
+        has_clip_feature: bool = False,
+        has_vae_feature: bool = False,
+        fuse_image_latents: bool = False,
         flf_pos_emb: bool = False,
         attn_kwargs: Optional[Dict[str, Any]] = None,
-        device: str = "cpu",
+        device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
         super().__init__()
+        self.in_dim = in_dim
         self.dim = dim
         self.freq_dim = freq_dim
-        self.has_image_input = has_image_input
+        self.has_clip_feature = has_clip_feature
+        self.has_vae_feature = has_vae_feature
+        self.fuse_image_latents = fuse_image_latents
         self.patch_size = patch_size
         self.patch_embedding = nn.Conv3d(
@@ -296,7 +306,7 @@ class WanDiT(PreTrainedModel):
         )
         self.blocks = nn.ModuleList(
             [
-                DiTBlock(has_image_input, dim, num_heads, ffn_dim, eps, attn_kwargs, device=device, dtype=dtype)
+                DiTBlock(has_clip_feature, dim, num_heads, ffn_dim, eps, attn_kwargs, device=device, dtype=dtype)
                 for _ in range(num_layers)
             ]
         )
@@ -305,7 +315,7 @@ class WanDiT(PreTrainedModel):
         head_dim = dim // num_heads
         self.freqs = precompute_freqs_cis_3d(head_dim)
-        if has_image_input:
+        if has_clip_feature:
             self.img_emb = MLP(1280, dim, flf_pos_emb, device=device, dtype=dtype)  # clip_feature_dim = 1280
     def patchify(self, x: torch.Tensor):
@@ -339,13 +349,14 @@ class WanDiT(PreTrainedModel):
             gguf_inference(),
             cfg_parallel((x, context, timestep, clip_feature, y), use_cfg=use_cfg),
         ):
-            t = self.time_embedding(sinusoidal_embedding_1d(self.freq_dim, timestep))
-            t_mod = self.time_projection(t).unflatten(1, (6, self.dim))
+            t = self.time_embedding(sinusoidal_embedding_1d(self.freq_dim, timestep))  # (s, d)
+            t_mod = self.time_projection(t).unflatten(1, (6, self.dim))  # (s, 6, d)
             context = self.text_embedding(context)
-            if self.has_image_input:
+            if self.has_vae_feature:
                 x = torch.cat([x, y], dim=1)  # (b, c_x + c_y, f, h, w)
-                clip_embdding = self.img_emb(clip_feature)
-                context = torch.cat([clip_embdding, context], dim=1)  # (b, s1 + s2, d)
+            if self.has_clip_feature:
+                clip_embedding = self.img_emb(clip_feature)
+                context = torch.cat([clip_embedding, context], dim=1)  # (b, s1 + s2, d)
             x, (f, h, w) = self.patchify(x)
             freqs = (
                 torch.cat(
@@ -360,7 +371,7 @@ class WanDiT(PreTrainedModel):
                 .to(x.device)
             )
-            with sequence_parallel((x, freqs), seq_dims=(1, 0)):
+            with sequence_parallel((x, t, t_mod, freqs), seq_dims=(1, 0, 0, 0)):
                 for block in self.blocks:
                     x = block(x, context, t_mod, freqs)
                 x = self.head(x, t)
@@ -369,26 +380,35 @@ class WanDiT(PreTrainedModel):
             (x,) = cfg_parallel_unshard((x,), use_cfg=use_cfg)
             return x
+    @staticmethod
+    def get_model_config(model_type: str):
+        MODEL_CONFIG_FILES = {
+            "wan2.1-t2v-1.3b": WAN2_1_DIT_T2V_1_3B_CONFIG_FILE,
+            "wan2.1-t2v-14b": WAN2_1_DIT_T2V_14B_CONFIG_FILE,
+            "wan2.1-i2v-14b": WAN2_1_DIT_I2V_14B_CONFIG_FILE,
+            "wan2.1-flf2v-14b": WAN2_1_DIT_FLF2V_14B_CONFIG_FILE,
+            "wan2.2-ti2v-5b": WAN2_2_DIT_TI2V_5B_CONFIG_FILE,
+            "wan2.2-t2v-a14b": WAN2_2_DIT_T2V_A14B_CONFIG_FILE,
+            "wan2.2-i2v-a14b": WAN2_2_DIT_I2V_A14B_CONFIG_FILE,
+        }
+        if model_type not in MODEL_CONFIG_FILES:
+            raise ValueError(f"Unsupported model type: {model_type}")
+        config_file = MODEL_CONFIG_FILES[model_type]
+        with open(config_file, "r") as f:
+            config = json.load(f)
+        return config
     @classmethod
     def from_state_dict(
         cls,
-        state_dict,
-        device,
-        dtype,
-        model_type="1.3b-t2v",
+        state_dict: Dict[str, torch.Tensor],
+        config: Dict[str, Any],
+        device: str = "cuda:0",
+        dtype: torch.dtype = torch.bfloat16,
         attn_kwargs: Optional[Dict[str, Any]] = None,
-        assign=True,
+        assign: bool = True,
     ):
-        if model_type == "1.3b-t2v":
-            config = json.load(open(WAN_DIT_1_3B_T2V_CONFIG_FILE, "r"))
-        elif model_type == "14b-t2v":
-            config = json.load(open(WAN_DIT_14B_T2V_CONFIG_FILE, "r"))
-        elif model_type == "14b-i2v":
-            config = json.load(open(WAN_DIT_14B_I2V_CONFIG_FILE, "r"))
-        elif model_type == "14b-flf2v":
-            config = json.load(open(WAN_DIT_14B_FLF2V_CONFIG_FILE, "r"))
-        else:
-            raise ValueError(f"Unsupported model type: {model_type}")
         with no_init_weights():
             model = torch.nn.utils.skip_init(cls, **config, device=device, dtype=dtype, attn_kwargs=attn_kwargs)
             model = model.requires_grad_(False)

diffsynth-engine 0.3.6.dev13__py3-none-any.whl → 0.3.6.dev14__py3-none-any.whl

diffsynth-engine 0.3.6.dev13__py3-none-any.whl → 0.3.6.dev14__py3-none-any.whl