PyPI - diffsynth-engine - Versions diffs - 0.3.6.dev12__tar.gz → 0.3.6.dev14__tar.gz - Mend

diffsynth-engine 0.3.6.dev12__tar.gz → 0.3.6.dev14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (169) hide show

{diffsynth_engine-0.3.6.dev12 → diffsynth_engine-0.3.6.dev14}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diffsynth_engine
-Version: 0.3.6.dev12
+Version: 0.3.6.dev14
 Author: MuseAI x ModelScope
 Classifier: Programming Language :: Python :: 3
 Classifier: Operating System :: OS Independent

{diffsynth_engine-0.3.6.dev12 → diffsynth_engine-0.3.6.dev14}/README.md RENAMED Viewed

@@ -21,6 +21,12 @@ and offloading strategies, enabling loading of larger diffusion models (e.g., Fl
 - **Cross-Platform Support:** Runnable on Windows, macOS (Apple Silicon), and Linux, ensuring a smooth experience across different operating systems.
+## News
+- **[v0.4.0](https://github.com/modelscope/DiffSynth-Engine/releases/tag/v0.4.0)** | **August 1, 2025**:
+  - 🔥Supports [Wan2.2](https://modelscope.cn/collections/tongyiwanxiang-22--shipinshengcheng-2bb5b1adef2840) video generation model
+  - ⚠️[**Breaking Change**] Improved `from_pretrained` method pipeline initialization
 ## Quick Start
 ### Requirements

{diffsynth_engine-0.3.6.dev12 → diffsynth_engine-0.3.6.dev14}/diffsynth_engine/algorithm/sampler/flow_match/flow_match_euler.py RENAMED Viewed

@@ -9,13 +9,12 @@ class FlowMatchEulerSampler:
         self.mask = mask
     def step(self, latents, model_outputs, i):
-        if self.mask is not None:
-            model_outputs = model_outputs * self.mask + self.init_latents * (1 - self.mask)
         dt = self.sigmas[i + 1] - self.sigmas[i]
         latents = latents.to(dtype=torch.float32)
         latents = latents + model_outputs * dt
         latents = latents.to(dtype=model_outputs.dtype)
+        if self.mask is not None:
+            latents = latents * self.mask + self.init_latents * (1 - self.mask)
         return latents
     def add_noise(self, latents, noise, sigma):

diffsynth_engine-0.3.6.dev12/diffsynth_engine/conf/models/wan/dit/14b-i2v.json → diffsynth_engine-0.3.6.dev14/diffsynth_engine/conf/models/wan/dit/wan2.1-flf2v-14b.json RENAMED Viewed

@@ -1,5 +1,7 @@
 {
-    "has_image_input": true,
+    "has_clip_feature": true,
+    "has_vae_feature": true,
+    "flf_pos_emb": true,
     "patch_size": [1, 2, 2],
     "in_dim": 36,
     "dim": 5120,
@@ -9,5 +11,6 @@
     "out_dim": 16,
     "num_heads": 40,
     "num_layers": 40,
-    "eps": 1e-6
+    "eps": 1e-6,
+    "shift": 16.0
 }

diffsynth_engine-0.3.6.dev12/diffsynth_engine/conf/models/wan/dit/14b-flf2v.json → diffsynth_engine-0.3.6.dev14/diffsynth_engine/conf/models/wan/dit/wan2.1-i2v-14b.json RENAMED Viewed

@@ -1,6 +1,6 @@
 {
-    "has_image_input": true,
-    "flf_pos_emb": true,
+    "has_clip_feature": true,
+    "has_vae_feature": true,
     "patch_size": [1, 2, 2],
     "in_dim": 36,
     "dim": 5120,

diffsynth_engine-0.3.6.dev12/diffsynth_engine/conf/models/wan/dit/1.3b-t2v.json → diffsynth_engine-0.3.6.dev14/diffsynth_engine/conf/models/wan/dit/wan2.1-t2v-1.3b.json RENAMED Viewed

@@ -1,5 +1,4 @@
 {
-    "has_image_input": false,
     "patch_size": [1, 2, 2],
     "in_dim": 16,
     "dim": 1536,

diffsynth_engine-0.3.6.dev12/diffsynth_engine/conf/models/wan/dit/14b-t2v.json → diffsynth_engine-0.3.6.dev14/diffsynth_engine/conf/models/wan/dit/wan2.1-t2v-14b.json RENAMED Viewed

@@ -1,5 +1,4 @@
 {
-    "has_image_input": false,
     "patch_size": [1, 2, 2],
     "in_dim": 16,
     "dim": 5120,

diffsynth_engine-0.3.6.dev14/diffsynth_engine/conf/models/wan/dit/wan2.2-i2v-a14b.json ADDED Viewed

@@ -0,0 +1,16 @@
+{
+    "has_vae_feature": true,
+    "patch_size": [1, 2, 2],
+    "in_dim": 36,
+    "dim": 5120,
+    "ffn_dim": 13824,
+    "freq_dim": 256,
+    "text_dim": 4096,
+    "out_dim": 16,
+    "num_heads": 40,
+    "num_layers": 40,
+    "eps": 1e-6,
+    "boundary": 0.900,
+    "cfg_scale": [3.5, 3.5],
+    "num_inference_steps": 40
+}

diffsynth_engine-0.3.6.dev14/diffsynth_engine/conf/models/wan/dit/wan2.2-t2v-a14b.json ADDED Viewed

@@ -0,0 +1,16 @@
+{
+    "patch_size": [1, 2, 2],
+    "in_dim": 16,
+    "dim": 5120,
+    "ffn_dim": 13824,
+    "freq_dim": 256,
+    "text_dim": 4096,
+    "out_dim": 16,
+    "num_heads": 40,
+    "num_layers": 40,
+    "eps": 1e-6,
+    "boundary": 0.875,
+    "shift": 12.0,
+    "cfg_scale": [3.0, 4.0],
+    "num_inference_steps": 40
+}

diffsynth_engine-0.3.6.dev14/diffsynth_engine/conf/models/wan/dit/wan2.2-ti2v-5b.json ADDED Viewed

@@ -0,0 +1,14 @@
+{
+    "fuse_image_latents": true,
+    "patch_size": [1, 2, 2],
+    "in_dim": 48,
+    "dim": 3072,
+    "ffn_dim": 14336,
+    "freq_dim": 256,
+    "text_dim": 4096,
+    "out_dim": 48,
+    "num_heads": 24,
+    "num_layers": 30,
+    "eps": 1e-6,
+    "fps": 24
+}

diffsynth_engine-0.3.6.dev14/diffsynth_engine/conf/models/wan/vae/wan2.1-vae.json ADDED Viewed

@@ -0,0 +1,48 @@
+{
+    "in_channels": 3,
+    "out_channels": 3,
+    "encoder_dim": 96,
+    "decoder_dim": 96,
+    "z_dim": 16,
+    "dim_mult": [1, 2, 4, 4],
+    "num_res_blocks": 2,
+    "temperal_downsample": [false, true, true],
+    "dropout": 0.0,
+    "patch_size": 1,
+    "mean": [
+        -0.7571,
+        -0.7089,
+        -0.9113,
+        0.1075,
+        -0.1745,
+        0.9653,
+        -0.1517,
+        1.5508,
+        0.4134,
+        -0.0715,
+        0.5517,
+        -0.3632,
+        -0.1922,
+        -0.9497,
+        0.2503,
+        -0.2921
+    ],
+    "std": [
+        2.8184,
+        1.4541,
+        2.3275,
+        2.6558,
+        1.2196,
+        1.7708,
+        2.6052,
+        2.0743,
+        3.2687,
+        2.1526,
+        2.8652,
+        1.5579,
+        1.6382,
+        1.1253,
+        2.8251,
+        1.9160
+    ]
+}

diffsynth_engine-0.3.6.dev14/diffsynth_engine/conf/models/wan/vae/wan2.2-vae.json ADDED Viewed

@@ -0,0 +1,112 @@
+{
+    "in_channels": 12,
+    "out_channels": 12,
+    "encoder_dim": 160,
+    "decoder_dim": 256,
+    "z_dim": 48,
+    "dim_mult": [1, 2, 4, 4],
+    "num_res_blocks": 2,
+    "temperal_downsample": [false, true, true],
+    "dropout": 0.0,
+    "patch_size": 2,
+    "mean": [
+        -0.2289,
+        -0.0052,
+        -0.1323,
+        -0.2339,
+        -0.2799,
+        0.0174,
+        0.1838,
+        0.1557,
+        -0.1382,
+        0.0542,
+        0.2813,
+        0.0891,
+        0.1570,
+        -0.0098,
+        0.0375,
+        -0.1825,
+        -0.2246,
+        -0.1207,
+        -0.0698,
+        0.5109,
+        0.2665,
+        -0.2108,
+        -0.2158,
+        0.2502,
+        -0.2055,
+        -0.0322,
+        0.1109,
+        0.1567,
+        -0.0729,
+        0.0899,
+        -0.2799,
+        -0.1230,
+        -0.0313,
+        -0.1649,
+        0.0117,
+        0.0723,
+        -0.2839,
+        -0.2083,
+        -0.0520,
+        0.3748,
+        0.0152,
+        0.1957,
+        0.1433,
+        -0.2944,
+        0.3573,
+        -0.0548,
+        -0.1681,
+        -0.0667
+    ],
+    "std": [
+        0.4765,
+        1.0364,
+        0.4514,
+        1.1677,
+        0.5313,
+        0.4990,
+        0.4818,
+        0.5013,
+        0.8158,
+        1.0344,
+        0.5894,
+        1.0901,
+        0.6885,
+        0.6165,
+        0.8454,
+        0.4978,
+        0.5759,
+        0.3523,
+        0.7135,
+        0.6804,
+        0.5833,
+        1.4146,
+        0.8986,
+        0.5659,
+        0.7069,
+        0.5338,
+        0.4889,
+        0.4917,
+        0.4069,
+        0.4999,
+        0.6866,
+        0.4093,
+        0.5709,
+        0.6065,
+        0.6415,
+        0.4944,
+        0.5726,
+        1.2042,
+        0.5458,
+        1.6887,
+        0.3971,
+        1.0600,
+        0.3943,
+        0.5537,
+        0.5444,
+        0.4089,
+        0.7468,
+        0.7744
+    ]
+}

{diffsynth_engine-0.3.6.dev12 → diffsynth_engine-0.3.6.dev14}/diffsynth_engine/configs/pipeline.py RENAMED Viewed

@@ -139,7 +139,12 @@ class WanPipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig, Bas
     vae_dtype: torch.dtype = torch.bfloat16
     image_encoder_dtype: torch.dtype = torch.bfloat16
-    shift: Optional[float] = field(default=None, init=False)  # RecifitedFlowScheduler shift factor, set by model type
+    # default params set by model type
+    boundary: Optional[float] = field(default=None, init=False)  # boundary
+    shift: Optional[float] = field(default=None, init=False)  # RecifitedFlowScheduler shift factor
+    cfg_scale: Optional[float | Tuple[float, float]] = field(default=None, init=False)  # default CFG scale
+    num_inference_steps: Optional[int] = field(default=None, init=False)  # default inference steps
+    fps: Optional[int] = field(default=None, init=False)  # default FPS
     # override BaseConfig
     vae_tiled: bool = True

{diffsynth_engine-0.3.6.dev12 → diffsynth_engine-0.3.6.dev14}/diffsynth_engine/models/basic/attention.py RENAMED Viewed

@@ -14,6 +14,8 @@ from diffsynth_engine.utils.flag import (
     SPARGE_ATTN_AVAILABLE,
 )
+FA3_MAX_HEADDIM = 256
 logger = logging.get_logger(__name__)
@@ -130,31 +132,40 @@ def attention(
         "sage_attn",
         "sparge_attn",
     ]
+    flash_attn3_compatible = q.shape[-1] <= FA3_MAX_HEADDIM
     if attn_impl is None or attn_impl == "auto":
         if FLASH_ATTN_3_AVAILABLE:
-            return flash_attn3(q, k, v, softmax_scale=scale)
-        elif XFORMERS_AVAILABLE:
+            if flash_attn3_compatible:
+                return flash_attn3(q, k, v, softmax_scale=scale)
+            else:
+                logger.warning(
+                    f"head_dim={q.shape[-1]}, but flash_attn_3 only supports head dimension at most {FA3_MAX_HEADDIM}, will use fallback attention implementation"
+                )
+        if XFORMERS_AVAILABLE:
             return xformers_attn(q, k, v, attn_mask=attn_mask, scale=scale)
-        elif SDPA_AVAILABLE:
+        if SDPA_AVAILABLE:
             return sdpa_attn(q, k, v, attn_mask=attn_mask, scale=scale)
-        elif FLASH_ATTN_2_AVAILABLE:
+        if FLASH_ATTN_2_AVAILABLE:
             return flash_attn2(q, k, v, softmax_scale=scale)
-        else:
-            return eager_attn(q, k, v, attn_mask=attn_mask, scale=scale)
+        return eager_attn(q, k, v, attn_mask=attn_mask, scale=scale)
     else:
         if attn_impl == "eager":
             return eager_attn(q, k, v, attn_mask=attn_mask, scale=scale)
-        elif attn_impl == "flash_attn_3":
+        if attn_impl == "flash_attn_3":
+            if not flash_attn3_compatible:
+                raise RuntimeError(
+                    f"head_dim={q.shape[-1]}, but flash_attn_3 only supports head dimension at most {FA3_MAX_HEADDIM}"
+                )
             return flash_attn3(q, k, v, softmax_scale=scale)
-        elif attn_impl == "flash_attn_2":
+        if attn_impl == "flash_attn_2":
             return flash_attn2(q, k, v, softmax_scale=scale)
-        elif attn_impl == "xformers":
+        if attn_impl == "xformers":
             return xformers_attn(q, k, v, attn_mask=attn_mask, scale=scale)
-        elif attn_impl == "sdpa":
+        if attn_impl == "sdpa":
             return sdpa_attn(q, k, v, attn_mask=attn_mask, scale=scale)
-        elif attn_impl == "sage_attn":
+        if attn_impl == "sage_attn":
             return sage_attn(q, k, v, attn_mask=attn_mask, scale=scale)
-        elif attn_impl == "sparge_attn":
+        if attn_impl == "sparge_attn":
             return sparge_attn(
                 q,
                 k,
@@ -166,8 +177,7 @@ def attention(
                 cdfthreshd=kwargs.get("sparge_cdfthreshd", 0.98),
                 pvthreshd=kwargs.get("sparge_pvthreshd", 50),
             )
-        else:
-            raise ValueError(f"Invalid attention implementation: {attn_impl}")
+        raise ValueError(f"Invalid attention implementation: {attn_impl}")
 class Attention(nn.Module):
@@ -240,32 +250,42 @@ def long_context_attention(
         "sage_attn",
         "sparge_attn",
     ]
+    flash_attn3_compatible = q.shape[-1] <= FA3_MAX_HEADDIM
     if attn_impl is None or attn_impl == "auto":
         if FLASH_ATTN_3_AVAILABLE:
-            attn_func = LongContextAttention(attn_type=AttnType.FA3)
-        elif SDPA_AVAILABLE:
-            attn_func = LongContextAttention(attn_type=AttnType.TORCH)
-        elif FLASH_ATTN_2_AVAILABLE:
-            attn_func = LongContextAttention(attn_type=AttnType.FA)
-        else:
-            raise ValueError("No available long context attention implementation")
+            if flash_attn3_compatible:
+                return LongContextAttention(attn_type=AttnType.FA3)(q, k, v, softmax_scale=scale)
+            else:
+                logger.warning(
+                    f"head_dim={q.shape[-1]}, but flash_attn_3 only supports head dimension at most {FA3_MAX_HEADDIM}, will use fallback attention implementation"
+                )
+        if SDPA_AVAILABLE:
+            return LongContextAttention(attn_type=AttnType.TORCH)(q, k, v, softmax_scale=scale)
+        if FLASH_ATTN_2_AVAILABLE:
+            return LongContextAttention(attn_type=AttnType.FA)(q, k, v, softmax_scale=scale)
+        raise ValueError("No available long context attention implementation")
     else:
         if attn_impl == "flash_attn_3":
-            attn_func = LongContextAttention(attn_type=AttnType.FA3)
-        elif attn_impl == "flash_attn_2":
-            attn_func = LongContextAttention(attn_type=AttnType.FA)
-        elif attn_impl == "sdpa":
-            attn_func = LongContextAttention(attn_type=AttnType.TORCH)
-        elif attn_impl == "sage_attn":
-            attn_func = LongContextAttention(attn_type=AttnType.SAGE_FP8)
-        elif attn_impl == "sparge_attn":
+            if flash_attn3_compatible:
+                return LongContextAttention(attn_type=AttnType.FA3)(q, k, v, softmax_scale=scale)
+            else:
+                raise RuntimeError(
+                    f"head_dim={q.shape[-1]}, but flash_attn_3 only supports head dimension at most {FA3_MAX_HEADDIM}"
+                )
+        if attn_impl == "flash_attn_2":
+            return LongContextAttention(attn_type=AttnType.FA)(q, k, v, softmax_scale=scale)
+        if attn_impl == "sdpa":
+            return LongContextAttention(attn_type=AttnType.TORCH)(q, k, v, softmax_scale=scale)
+        if attn_impl == "sage_attn":
+            return LongContextAttention(attn_type=AttnType.SAGE_FP8)(q, k, v, softmax_scale=scale)
+        if attn_impl == "sparge_attn":
             attn_processor = SparseAttentionMeansim()
             # default args from spas_sage2_attn_meansim_cuda
             attn_processor.smooth_k = torch.tensor(kwargs.get("sparge_smooth_k", True))
             attn_processor.simthreshd1 = torch.tensor(kwargs.get("sparge_simthreshd1", 0.6))
             attn_processor.cdfthreshd = torch.tensor(kwargs.get("sparge_cdfthreshd", 0.98))
             attn_processor.pvthreshd = torch.tensor(kwargs.get("sparge_pvthreshd", 50))
-            attn_func = LongContextAttention(attn_type=AttnType.SPARSE_SAGE, attn_processor=attn_processor)
-        else:
-            raise ValueError(f"Invalid long context attention implementation: {attn_impl}")
-    return attn_func(q, k, v, softmax_scale=scale)
+            return LongContextAttention(attn_type=AttnType.SPARSE_SAGE, attn_processor=attn_processor)(
+                q, k, v, softmax_scale=scale
+            )
+        raise ValueError(f"Invalid long context attention implementation: {attn_impl}")

{diffsynth_engine-0.3.6.dev12 → diffsynth_engine-0.3.6.dev14}/diffsynth_engine/models/wan/wan_dit.py RENAMED Viewed

@@ -10,10 +10,13 @@ from diffsynth_engine.models.basic import attention as attention_ops
 from diffsynth_engine.models.basic.transformer_helper import RMSNorm
 from diffsynth_engine.models.utils import no_init_weights
 from diffsynth_engine.utils.constants import (
-    WAN_DIT_1_3B_T2V_CONFIG_FILE,
-    WAN_DIT_14B_I2V_CONFIG_FILE,
-    WAN_DIT_14B_T2V_CONFIG_FILE,
-    WAN_DIT_14B_FLF2V_CONFIG_FILE,
+    WAN2_1_DIT_T2V_1_3B_CONFIG_FILE,
+    WAN2_1_DIT_I2V_14B_CONFIG_FILE,
+    WAN2_1_DIT_T2V_14B_CONFIG_FILE,
+    WAN2_1_DIT_FLF2V_14B_CONFIG_FILE,
+    WAN2_2_DIT_TI2V_5B_CONFIG_FILE,
+    WAN2_2_DIT_I2V_A14B_CONFIG_FILE,
+    WAN2_2_DIT_T2V_A14B_CONFIG_FILE,
 )
 from diffsynth_engine.utils.gguf import gguf_inference
 from diffsynth_engine.utils.parallel import (
@@ -182,7 +185,9 @@ class DiTBlock(nn.Module):
     def forward(self, x, context, t_mod, freqs):
         # msa: multi-head self-attention  mlp: multi-layer perceptron
-        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.modulation + t_mod).chunk(6, dim=1)
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = [
+            t.squeeze(1) for t in (self.modulation + t_mod).chunk(6, dim=1)
+        ]
         input_x = modulate(self.norm1(x), shift_msa, scale_msa)
         x = x + gate_msa * self.self_attn(input_x, freqs)
         x = x + self.cross_attn(self.norm3(x), context)
@@ -237,7 +242,7 @@ class Head(nn.Module):
         self.modulation = nn.Parameter(torch.randn(1, 2, dim, device=device, dtype=dtype) / dim**0.5)
     def forward(self, x, t_mod):
-        shift, scale = (self.modulation + t_mod).chunk(2, dim=1)
+        shift, scale = [t.squeeze(1) for t in (self.modulation + t_mod.unsqueeze(1)).chunk(2, dim=1)]
         x = self.head(self.norm(x) * (1 + scale) + shift)
         return x
@@ -263,17 +268,22 @@ class WanDiT(PreTrainedModel):
         patch_size: Tuple[int, int, int],
         num_heads: int,
         num_layers: int,
-        has_image_input: bool,
+        has_clip_feature: bool = False,
+        has_vae_feature: bool = False,
+        fuse_image_latents: bool = False,
         flf_pos_emb: bool = False,
         attn_kwargs: Optional[Dict[str, Any]] = None,
-        device: str = "cpu",
+        device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
         super().__init__()
+        self.in_dim = in_dim
         self.dim = dim
         self.freq_dim = freq_dim
-        self.has_image_input = has_image_input
+        self.has_clip_feature = has_clip_feature
+        self.has_vae_feature = has_vae_feature
+        self.fuse_image_latents = fuse_image_latents
         self.patch_size = patch_size
         self.patch_embedding = nn.Conv3d(
@@ -296,7 +306,7 @@ class WanDiT(PreTrainedModel):
         )
         self.blocks = nn.ModuleList(
             [
-                DiTBlock(has_image_input, dim, num_heads, ffn_dim, eps, attn_kwargs, device=device, dtype=dtype)
+                DiTBlock(has_clip_feature, dim, num_heads, ffn_dim, eps, attn_kwargs, device=device, dtype=dtype)
                 for _ in range(num_layers)
             ]
         )
@@ -305,7 +315,7 @@ class WanDiT(PreTrainedModel):
         head_dim = dim // num_heads
         self.freqs = precompute_freqs_cis_3d(head_dim)
-        if has_image_input:
+        if has_clip_feature:
             self.img_emb = MLP(1280, dim, flf_pos_emb, device=device, dtype=dtype)  # clip_feature_dim = 1280
     def patchify(self, x: torch.Tensor):
@@ -339,13 +349,14 @@ class WanDiT(PreTrainedModel):
             gguf_inference(),
             cfg_parallel((x, context, timestep, clip_feature, y), use_cfg=use_cfg),
         ):
-            t = self.time_embedding(sinusoidal_embedding_1d(self.freq_dim, timestep))
-            t_mod = self.time_projection(t).unflatten(1, (6, self.dim))
+            t = self.time_embedding(sinusoidal_embedding_1d(self.freq_dim, timestep))  # (s, d)
+            t_mod = self.time_projection(t).unflatten(1, (6, self.dim))  # (s, 6, d)
             context = self.text_embedding(context)
-            if self.has_image_input:
+            if self.has_vae_feature:
                 x = torch.cat([x, y], dim=1)  # (b, c_x + c_y, f, h, w)
-                clip_embdding = self.img_emb(clip_feature)
-                context = torch.cat([clip_embdding, context], dim=1)  # (b, s1 + s2, d)
+            if self.has_clip_feature:
+                clip_embedding = self.img_emb(clip_feature)
+                context = torch.cat([clip_embedding, context], dim=1)  # (b, s1 + s2, d)
             x, (f, h, w) = self.patchify(x)
             freqs = (
                 torch.cat(
@@ -360,7 +371,7 @@ class WanDiT(PreTrainedModel):
                 .to(x.device)
             )
-            with sequence_parallel((x, freqs), seq_dims=(1, 0)):
+            with sequence_parallel((x, t, t_mod, freqs), seq_dims=(1, 0, 0, 0)):
                 for block in self.blocks:
                     x = block(x, context, t_mod, freqs)
                 x = self.head(x, t)
@@ -369,26 +380,35 @@ class WanDiT(PreTrainedModel):
             (x,) = cfg_parallel_unshard((x,), use_cfg=use_cfg)
             return x
+    @staticmethod
+    def get_model_config(model_type: str):
+        MODEL_CONFIG_FILES = {
+            "wan2.1-t2v-1.3b": WAN2_1_DIT_T2V_1_3B_CONFIG_FILE,
+            "wan2.1-t2v-14b": WAN2_1_DIT_T2V_14B_CONFIG_FILE,
+            "wan2.1-i2v-14b": WAN2_1_DIT_I2V_14B_CONFIG_FILE,
+            "wan2.1-flf2v-14b": WAN2_1_DIT_FLF2V_14B_CONFIG_FILE,
+            "wan2.2-ti2v-5b": WAN2_2_DIT_TI2V_5B_CONFIG_FILE,
+            "wan2.2-t2v-a14b": WAN2_2_DIT_T2V_A14B_CONFIG_FILE,
+            "wan2.2-i2v-a14b": WAN2_2_DIT_I2V_A14B_CONFIG_FILE,
+        }
+        if model_type not in MODEL_CONFIG_FILES:
+            raise ValueError(f"Unsupported model type: {model_type}")
+        config_file = MODEL_CONFIG_FILES[model_type]
+        with open(config_file, "r") as f:
+            config = json.load(f)
+        return config
     @classmethod
     def from_state_dict(
         cls,
-        state_dict,
-        device,
-        dtype,
-        model_type="1.3b-t2v",
+        state_dict: Dict[str, torch.Tensor],
+        config: Dict[str, Any],
+        device: str = "cuda:0",
+        dtype: torch.dtype = torch.bfloat16,
         attn_kwargs: Optional[Dict[str, Any]] = None,
-        assign=True,
+        assign: bool = True,
     ):
-        if model_type == "1.3b-t2v":
-            config = json.load(open(WAN_DIT_1_3B_T2V_CONFIG_FILE, "r"))
-        elif model_type == "14b-t2v":
-            config = json.load(open(WAN_DIT_14B_T2V_CONFIG_FILE, "r"))
-        elif model_type == "14b-i2v":
-            config = json.load(open(WAN_DIT_14B_I2V_CONFIG_FILE, "r"))
-        elif model_type == "14b-flf2v":
-            config = json.load(open(WAN_DIT_14B_FLF2V_CONFIG_FILE, "r"))
-        else:
-            raise ValueError(f"Unsupported model type: {model_type}")
         with no_init_weights():
             model = torch.nn.utils.skip_init(cls, **config, device=device, dtype=dtype, attn_kwargs=attn_kwargs)
             model = model.requires_grad_(False)

diffsynth-engine 0.3.6.dev12__tar.gz → 0.3.6.dev14__tar.gz

diffsynth-engine 0.3.6.dev12__tar.gz → 0.3.6.dev14__tar.gz