diffsynth-engine 0.5.1.dev4__py3-none-any.whl → 0.6.1.dev25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffsynth_engine/__init__.py +12 -0
- diffsynth_engine/algorithm/noise_scheduler/base_scheduler.py +19 -0
- diffsynth_engine/algorithm/noise_scheduler/flow_match/recifited_flow.py +22 -6
- diffsynth_engine/conf/models/flux/flux_dit.json +20 -1
- diffsynth_engine/conf/models/flux/flux_vae.json +253 -5
- diffsynth_engine/conf/models/wan/dit/wan_dit_keymap.json +41 -0
- diffsynth_engine/configs/__init__.py +16 -1
- diffsynth_engine/configs/controlnet.py +13 -0
- diffsynth_engine/configs/pipeline.py +37 -11
- diffsynth_engine/models/base.py +1 -1
- diffsynth_engine/models/basic/attention.py +105 -43
- diffsynth_engine/models/basic/transformer_helper.py +36 -2
- diffsynth_engine/models/basic/video_sparse_attention.py +238 -0
- diffsynth_engine/models/flux/flux_controlnet.py +16 -30
- diffsynth_engine/models/flux/flux_dit.py +49 -62
- diffsynth_engine/models/flux/flux_dit_fbcache.py +26 -28
- diffsynth_engine/models/flux/flux_ipadapter.py +5 -5
- diffsynth_engine/models/flux/flux_text_encoder.py +1 -1
- diffsynth_engine/models/flux/flux_vae.py +20 -2
- diffsynth_engine/models/hunyuan3d/dino_image_encoder.py +4 -2
- diffsynth_engine/models/qwen_image/qwen2_5_vl.py +5 -0
- diffsynth_engine/models/qwen_image/qwen_image_dit.py +151 -58
- diffsynth_engine/models/qwen_image/qwen_image_dit_fbcache.py +14 -6
- diffsynth_engine/models/qwen_image/qwen_image_vae.py +1 -1
- diffsynth_engine/models/sd/sd_text_encoder.py +1 -1
- diffsynth_engine/models/sd/sd_unet.py +1 -1
- diffsynth_engine/models/sd3/sd3_dit.py +1 -1
- diffsynth_engine/models/sd3/sd3_text_encoder.py +1 -1
- diffsynth_engine/models/sdxl/sdxl_text_encoder.py +1 -1
- diffsynth_engine/models/sdxl/sdxl_unet.py +1 -1
- diffsynth_engine/models/vae/vae.py +1 -1
- diffsynth_engine/models/wan/wan_audio_encoder.py +6 -3
- diffsynth_engine/models/wan/wan_dit.py +65 -28
- diffsynth_engine/models/wan/wan_s2v_dit.py +1 -1
- diffsynth_engine/models/wan/wan_text_encoder.py +13 -13
- diffsynth_engine/models/wan/wan_vae.py +2 -2
- diffsynth_engine/pipelines/base.py +73 -7
- diffsynth_engine/pipelines/flux_image.py +139 -120
- diffsynth_engine/pipelines/hunyuan3d_shape.py +4 -0
- diffsynth_engine/pipelines/qwen_image.py +272 -87
- diffsynth_engine/pipelines/sdxl_image.py +1 -1
- diffsynth_engine/pipelines/utils.py +52 -0
- diffsynth_engine/pipelines/wan_s2v.py +25 -14
- diffsynth_engine/pipelines/wan_video.py +43 -19
- diffsynth_engine/tokenizers/base.py +6 -0
- diffsynth_engine/tokenizers/qwen2.py +12 -4
- diffsynth_engine/utils/constants.py +13 -12
- diffsynth_engine/utils/download.py +4 -2
- diffsynth_engine/utils/env.py +2 -0
- diffsynth_engine/utils/flag.py +6 -0
- diffsynth_engine/utils/loader.py +25 -6
- diffsynth_engine/utils/parallel.py +62 -29
- diffsynth_engine/utils/video.py +3 -1
- {diffsynth_engine-0.5.1.dev4.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/METADATA +1 -1
- {diffsynth_engine-0.5.1.dev4.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/RECORD +69 -67
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-flf2v-14b.json → wan2.1_flf2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-i2v-14b.json → wan2.1_i2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-t2v-1.3b.json → wan2.1_t2v_1.3b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.1-t2v-14b.json → wan2.1_t2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-i2v-a14b.json → wan2.2_i2v_a14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-s2v-14b.json → wan2.2_s2v_14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-t2v-a14b.json → wan2.2_t2v_a14b.json} +0 -0
- /diffsynth_engine/conf/models/wan/dit/{wan2.2-ti2v-5b.json → wan2.2_ti2v_5b.json} +0 -0
- /diffsynth_engine/conf/models/wan/vae/{wan2.1-vae.json → wan2.1_vae.json} +0 -0
- /diffsynth_engine/conf/models/wan/vae/{wan2.2-vae.json → wan2.2_vae.json} +0 -0
- /diffsynth_engine/conf/models/wan/vae/{wan-vae-keymap.json → wan_vae_keymap.json} +0 -0
- {diffsynth_engine-0.5.1.dev4.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/WHEEL +0 -0
- {diffsynth_engine-0.5.1.dev4.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/licenses/LICENSE +0 -0
- {diffsynth_engine-0.5.1.dev4.dist-info → diffsynth_engine-0.6.1.dev25.dist-info}/top_level.txt +0 -0
diffsynth_engine/models/qwen_image/qwen_image_dit.py

@@ -1,15 +1,20 @@
 import torch
 import torch.nn as nn
-from typing import Any, Dict, Tuple, Union, Optional
+from typing import Any, Dict, List, Tuple, Union, Optional
 from einops import rearrange
 
 from diffsynth_engine.models.base import StateDictConverter, PreTrainedModel
 from diffsynth_engine.models.basic import attention as attention_ops
 from diffsynth_engine.models.basic.timestep import TimestepEmbeddings
-from diffsynth_engine.models.basic.transformer_helper import AdaLayerNorm,
+from diffsynth_engine.models.basic.transformer_helper import AdaLayerNorm, GELU, RMSNorm
 from diffsynth_engine.utils.gguf import gguf_inference
 from diffsynth_engine.utils.fp8_linear import fp8_inference
-from diffsynth_engine.utils.parallel import
+from diffsynth_engine.utils.parallel import (
+    cfg_parallel,
+    cfg_parallel_unshard,
+    sequence_parallel,
+    sequence_parallel_unshard,
+)
 
 
 class QwenImageDiTStateDictConverter(StateDictConverter):
@@ -139,7 +144,7 @@ class QwenFeedForward(nn.Module):
         super().__init__()
         inner_dim = int(dim * 4)
         self.net = nn.ModuleList([])
-        self.net.append(
+        self.net.append(GELU(dim, inner_dim, approximate="tanh", device=device, dtype=dtype))
         self.net.append(nn.Dropout(dropout))
         self.net.append(nn.Linear(inner_dim, dim_out, device=device, dtype=dtype))
 
@@ -150,8 +155,8 @@ class QwenFeedForward(nn.Module):
 
 
 def apply_rotary_emb_qwen(x: torch.Tensor, freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]]):
-    x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
-    x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
+    x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))  # (b, s, h, d) -> (b, s, h, d/2, 2)
+    x_out = torch.view_as_real(x_rotated * freqs_cis.unsqueeze(1)).flatten(3)  # (b, s, h, d/2, 2) -> (b, s, h, d)
     return x_out.type_as(x)
 
 
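
The two rewritten lines above switch apply_rotary_emb_qwen to a (b, s, h, d) head layout and broadcast the complex rotary phasor across heads via freqs_cis.unsqueeze(1). A small self-contained sketch of that rotation with made-up shapes, checked against the usual real-valued cos/sin rotary formula:

```python
import torch

# Illustrative shapes only; the real tensors come from the DiT's q/k projections.
b, s, h, d = 1, 4, 2, 8
x = torch.randn(b, s, h, d)
angles = torch.randn(s, d // 2)
freqs_cis = torch.polar(torch.ones_like(angles), angles)  # (s, d/2) complex phasors

# complex rotation as in the hunk: pair adjacent channels, multiply, flatten back
x_c = torch.view_as_complex(x.float().reshape(b, s, h, d // 2, 2))  # (b, s, h, d/2)
out = torch.view_as_real(x_c * freqs_cis.unsqueeze(1)).flatten(3)   # (b, s, h, d)

# equivalent real-valued rotary formula on the same channel pairs
x0, x1 = x[..., 0::2], x[..., 1::2]
cos, sin = angles.cos()[:, None, :], angles.sin()[:, None, :]
ref = torch.stack([x0 * cos - x1 * sin, x0 * sin + x1 * cos], dim=-1).flatten(3)
assert torch.allclose(out, ref, atol=1e-5)
```
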
@@ -162,7 +167,6 @@ class QwenDoubleStreamAttention(nn.Module):
         dim_b,
         num_heads,
         head_dim,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -184,44 +188,42 @@ class QwenDoubleStreamAttention(nn.Module):
 
         self.to_out = nn.Linear(dim_a, dim_a, device=device, dtype=dtype)
         self.to_add_out = nn.Linear(dim_b, dim_b, device=device, dtype=dtype)
-        self.attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
 
     def forward(
         self,
         image: torch.FloatTensor,
         text: torch.FloatTensor,
-
+        rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        attn_mask: Optional[torch.Tensor] = None,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
     ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
         img_q, img_k, img_v = self.to_q(image), self.to_k(image), self.to_v(image)
         txt_q, txt_k, txt_v = self.add_q_proj(text), self.add_k_proj(text), self.add_v_proj(text)
 
-        img_q = rearrange(img_q, "b s (h d) -> b h
-        img_k = rearrange(img_k, "b s (h d) -> b h
-        img_v = rearrange(img_v, "b s (h d) -> b h
+        img_q = rearrange(img_q, "b s (h d) -> b s h d", h=self.num_heads)
+        img_k = rearrange(img_k, "b s (h d) -> b s h d", h=self.num_heads)
+        img_v = rearrange(img_v, "b s (h d) -> b s h d", h=self.num_heads)
 
-        txt_q = rearrange(txt_q, "b s (h d) -> b h
-        txt_k = rearrange(txt_k, "b s (h d) -> b h
-        txt_v = rearrange(txt_v, "b s (h d) -> b h
+        txt_q = rearrange(txt_q, "b s (h d) -> b s h d", h=self.num_heads)
+        txt_k = rearrange(txt_k, "b s (h d) -> b s h d", h=self.num_heads)
+        txt_v = rearrange(txt_v, "b s (h d) -> b s h d", h=self.num_heads)
 
         img_q, img_k = self.norm_q(img_q), self.norm_k(img_k)
         txt_q, txt_k = self.norm_added_q(txt_q), self.norm_added_k(txt_k)
 
-        if
-            img_freqs, txt_freqs =
+        if rotary_emb is not None:
+            img_freqs, txt_freqs = rotary_emb
             img_q = apply_rotary_emb_qwen(img_q, img_freqs)
             img_k = apply_rotary_emb_qwen(img_k, img_freqs)
             txt_q = apply_rotary_emb_qwen(txt_q, txt_freqs)
             txt_k = apply_rotary_emb_qwen(txt_k, txt_freqs)
 
-        joint_q = torch.cat([txt_q, img_q], dim=
-        joint_k = torch.cat([txt_k, img_k], dim=
-        joint_v = torch.cat([txt_v, img_v], dim=
+        joint_q = torch.cat([txt_q, img_q], dim=1)
+        joint_k = torch.cat([txt_k, img_k], dim=1)
+        joint_v = torch.cat([txt_v, img_v], dim=1)
 
-
-
-        joint_v = joint_v.transpose(1, 2)
-
-        joint_attn_out = attention_ops.attention(joint_q, joint_k, joint_v, **self.attn_kwargs)
+        attn_kwargs = attn_kwargs if attn_kwargs is not None else {}
+        joint_attn_out = attention_ops.attention(joint_q, joint_k, joint_v, attn_mask=attn_mask, **attn_kwargs)
 
         joint_attn_out = rearrange(joint_attn_out, "b s h d -> b s (h d)").to(joint_q.dtype)
 
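
With this hunk, QwenDoubleStreamAttention no longer stores attn_kwargs on the module: the joint q/k/v stay in (b, s, h, d) layout, and an optional additive attn_mask plus per-call attn_kwargs are handed to attention_ops.attention. A rough stand-in for that joint attention step using plain PyTorch SDPA (illustrative only; attention_ops.attention may dispatch to other backends):

```python
import torch
import torch.nn.functional as F

def joint_attention_sketch(txt_q, txt_k, txt_v, img_q, img_k, img_v, attn_mask=None):
    # all inputs are (b, s, h, d); text tokens are concatenated before image tokens
    q = torch.cat([txt_q, img_q], dim=1)
    k = torch.cat([txt_k, img_k], dim=1)
    v = torch.cat([txt_v, img_v], dim=1)
    # SDPA wants (b, h, s, d); attn_mask may be an additive (b, 1, s, s) float bias
    out = F.scaled_dot_product_attention(
        q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), attn_mask=attn_mask
    )
    return out.transpose(1, 2)  # back to (b, s, h, d), matching the rearrange that follows
```
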
@@ -241,7 +243,6 @@ class QwenImageTransformerBlock(nn.Module):
         num_attention_heads: int,
         attention_head_dim: int,
         eps: float = 1e-6,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -261,7 +262,6 @@ class QwenImageTransformerBlock(nn.Module):
             dim_b=dim,
             num_heads=num_attention_heads,
             head_dim=attention_head_dim,
-            attn_kwargs=attn_kwargs,
             device=device,
             dtype=dtype,
         )
@@ -285,7 +285,9 @@ class QwenImageTransformerBlock(nn.Module):
         image: torch.Tensor,
         text: torch.Tensor,
         temb: torch.Tensor,
-
+        rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        attn_mask: Optional[torch.Tensor] = None,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         img_mod_attn, img_mod_mlp = self.img_mod(temb).chunk(2, dim=-1)  # [B, 3*dim] each
         txt_mod_attn, txt_mod_mlp = self.txt_mod(temb).chunk(2, dim=-1)  # [B, 3*dim] each
@@ -299,7 +301,9 @@ class QwenImageTransformerBlock(nn.Module):
         img_attn_out, txt_attn_out = self.attn(
             image=img_modulated,
             text=txt_modulated,
-
+            rotary_emb=rotary_emb,
+            attn_mask=attn_mask,
+            attn_kwargs=attn_kwargs,
         )
 
         image = image + img_gate * img_attn_out
@@ -327,7 +331,6 @@ class QwenImageDiT(PreTrainedModel):
     def __init__(
         self,
         num_layers: int = 60,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -348,7 +351,6 @@ class QwenImageDiT(PreTrainedModel):
                 dim=3072,
                 num_attention_heads=24,
                 attention_head_dim=128,
-                attn_kwargs=attn_kwargs,
                 device=device,
                 dtype=dtype,
             )
@@ -368,13 +370,75 @@ class QwenImageDiT(PreTrainedModel):
         )
         return hidden_states
 
+    def process_entity_masks(
+        self,
+        text: torch.Tensor,
+        text_seq_lens: torch.LongTensor,
+        rotary_emb: Tuple[torch.Tensor, torch.Tensor],
+        video_fhw: List[Tuple[int, int, int]],
+        entity_text: List[torch.Tensor],
+        entity_seq_lens: List[torch.LongTensor],
+        entity_masks: List[torch.Tensor],
+        device: str,
+        dtype: torch.dtype,
+    ):
+        entity_seq_lens = [seq_lens.max().item() for seq_lens in entity_seq_lens]
+        text_seq_lens = entity_seq_lens + [text_seq_lens.max().item()]
+        entity_text = [
+            self.txt_in(self.txt_norm(text[:, :seq_len])) for text, seq_len in zip(entity_text, entity_seq_lens)
+        ]
+        text = torch.cat(entity_text + [text], dim=1)
+
+        entity_txt_freqs = [self.pos_embed(video_fhw, seq_len, device)[1] for seq_len in entity_seq_lens]
+        img_freqs, txt_freqs = rotary_emb
+        txt_freqs = torch.cat(entity_txt_freqs + [txt_freqs], dim=0)
+        rotary_emb = (img_freqs, txt_freqs)
+
+        global_mask = torch.ones_like(entity_masks[0], device=device, dtype=dtype)
+        patched_masks = [self.patchify(mask) for mask in entity_masks + [global_mask]]
+        batch_size, image_seq_len = patched_masks[0].shape[:2]
+        total_seq_len = sum(text_seq_lens) + image_seq_len
+        attention_mask = torch.ones((batch_size, total_seq_len, total_seq_len), device=device, dtype=torch.bool)
+
+        # text-image attention mask
+        img_start, img_end = sum(text_seq_lens), total_seq_len
+        cumsum = [0]
+        for seq_len in text_seq_lens:
+            cumsum.append(cumsum[-1] + seq_len)
+        for i, patched_mask in enumerate(patched_masks):
+            txt_start, txt_end = cumsum[i], cumsum[i + 1]
+            mask = torch.sum(patched_mask, dim=-1) > 0
+            mask = mask.unsqueeze(1).repeat(1, text_seq_lens[i], 1)
+            # text-to-image attention
+            attention_mask[:, txt_start:txt_end, img_start:img_end] = mask
+            # image-to-text attention
+            attention_mask[:, img_start:img_end, txt_start:txt_end] = mask.transpose(1, 2)
+        # entity text tokens should not attend to each other
+        for i in range(len(text_seq_lens)):
+            for j in range(len(text_seq_lens)):
+                if i == j:
+                    continue
+                i_start, i_end = cumsum[i], cumsum[i + 1]
+                j_start, j_end = cumsum[j], cumsum[j + 1]
+                attention_mask[:, i_start:i_end, j_start:j_end] = False
+
+        attn_mask = torch.zeros_like(attention_mask, device=device, dtype=dtype)
+        attn_mask[~attention_mask] = -torch.inf
+        attn_mask = attn_mask.unsqueeze(1)
+        return text, rotary_emb, attn_mask
+
     def forward(
         self,
         image: torch.Tensor,
         edit: torch.Tensor = None,
-        text: torch.Tensor = None,
         timestep: torch.LongTensor = None,
-
+        text: torch.Tensor = None,
+        text_seq_lens: torch.LongTensor = None,
+        context_latents: Optional[torch.Tensor] = None,
+        entity_text: Optional[List[torch.Tensor]] = None,
+        entity_seq_lens: Optional[List[torch.LongTensor]] = None,
+        entity_masks: Optional[List[torch.Tensor]] = None,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
     ):
         h, w = image.shape[-2:]
         fp8_linear_enabled = getattr(self, "fp8_linear_enabled", False)
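
process_entity_masks above finishes by converting a boolean allow-matrix (may token i attend to token j?) into an additive bias: allowed positions stay 0, blocked positions become -inf, and a head dimension is added for broadcasting. The same conversion in isolation, with made-up sizes:

```python
import torch

allowed = torch.ones((1, 6, 6), dtype=torch.bool)  # (batch, seq, seq), True = may attend
allowed[:, 0:2, 2:4] = False                       # e.g. block one entity prompt from another

bias = torch.zeros_like(allowed, dtype=torch.bfloat16)
bias[~allowed] = -torch.inf                        # softmax weight becomes exactly zero
bias = bias.unsqueeze(1)                           # (batch, 1, seq, seq), broadcast over heads
```
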
@@ -385,37 +449,72 @@ class QwenImageDiT(PreTrainedModel):
             cfg_parallel(
                 (
                     image,
-                    edit,
-                    text,
+                    *(edit if edit is not None else ()),
                     timestep,
-
+                    text,
+                    text_seq_lens,
+                    *(entity_text if entity_text is not None else ()),
+                    *(entity_seq_lens if entity_seq_lens is not None else ()),
+                    *(entity_masks if entity_masks is not None else ()),
+                    context_latents,
                 ),
                 use_cfg=use_cfg,
             ),
         ):
             conditioning = self.time_text_embed(timestep, image.dtype)
             video_fhw = [(1, h // 2, w // 2)]  # frame, height, width
-
+            text_seq_len = text_seq_lens.max().item()
             image = self.patchify(image)
             image_seq_len = image.shape[1]
+            if context_latents is not None:
+                context_latents = context_latents.to(dtype=image.dtype)
+                context_latents = self.patchify(context_latents)
+                image = torch.cat([image, context_latents], dim=1)
+                video_fhw += [(1, h // 2, w // 2)]
             if edit is not None:
-
-
-
-
+                for img in edit:
+                    img = img.to(dtype=image.dtype)
+                    edit_h, edit_w = img.shape[-2:]
+                    img = self.patchify(img)
+                    image = torch.cat([image, img], dim=1)
+                    video_fhw += [(1, edit_h // 2, edit_w // 2)]
 
-
+            rotary_emb = self.pos_embed(video_fhw, text_seq_len, image.device)
 
             image = self.img_in(image)
-            text = self.txt_in(self.txt_norm(text[:, :
+            text = self.txt_in(self.txt_norm(text[:, :text_seq_len]))
 
-
-
-
-
-
-
+            attn_mask = None
+            if entity_text is not None:
+                text, rotary_emb, attn_mask = self.process_entity_masks(
+                    text,
+                    text_seq_lens,
+                    rotary_emb,
+                    video_fhw,
+                    entity_text,
+                    entity_seq_lens,
+                    entity_masks,
+                    image.device,
+                    image.dtype,
+                )
 
+            # warning: Eligen does not work with sequence parallel because long context attention does not support attention masks
+            img_freqs, txt_freqs = rotary_emb
+            with sequence_parallel((image, text, img_freqs, txt_freqs), seq_dims=(1, 1, 0, 0)):
+                rotary_emb = (img_freqs, txt_freqs)
+                for block in self.transformer_blocks:
+                    text, image = block(
+                        image=image,
+                        text=text,
+                        temb=conditioning,
+                        rotary_emb=rotary_emb,
+                        attn_mask=attn_mask,
+                        attn_kwargs=attn_kwargs,
+                    )
+                image = self.norm_out(image, conditioning)
+                image = self.proj_out(image)
+                (image,) = sequence_parallel_unshard((image,), seq_dims=(1,), seq_lens=(image_seq_len,))
+            image = image[:, :image_seq_len]
             image = self.unpatchify(image, h, w)
 
             (image,) = cfg_parallel_unshard((image,), use_cfg=use_cfg)
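
The rebuilt cfg_parallel call above flattens the optional edit/entity lists into one flat tuple via conditional star-unpacking, so a None argument simply contributes nothing. The pattern on its own, with stand-in values rather than tensors:

```python
image, timestep, text = "img", "t", "txt"  # stand-ins for tensors
edit = ["edit0", "edit1"]                  # may also be None
entity_text = None

args = (
    image,
    *(edit if edit is not None else ()),
    timestep,
    text,
    *(entity_text if entity_text is not None else ()),
)
print(args)  # ('img', 'edit0', 'edit1', 't', 'txt')
```
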
@@ -428,14 +527,8 @@ class QwenImageDiT(PreTrainedModel):
         device: str,
         dtype: torch.dtype,
         num_layers: int = 60,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
     ):
-        model = cls(
-            device="meta",
-            dtype=dtype,
-            num_layers=num_layers,
-            attn_kwargs=attn_kwargs,
-        )
+        model = cls(device="meta", dtype=dtype, num_layers=num_layers)
         model = model.requires_grad_(False)
         model.load_state_dict(state_dict, assign=True)
         model.to(device=device, dtype=dtype, non_blocking=True)
@@ -445,5 +538,5 @@ class QwenImageDiT(PreTrainedModel):
         for block in self.transformer_blocks:
             block.compile(*args, **kwargs)
 
-    def
-        return
+    def get_fsdp_module_cls(self):
+        return {QwenImageTransformerBlock}

diffsynth_engine/models/qwen_image/qwen_image_dit_fbcache.py

@@ -11,12 +11,11 @@ class QwenImageDiTFBCache(QwenImageDiT):
     def __init__(
         self,
         num_layers: int = 60,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
         relative_l1_threshold: float = 0.05,
     ):
-        super().__init__(num_layers=num_layers,
+        super().__init__(num_layers=num_layers, device=device, dtype=dtype)
         self.relative_l1_threshold = relative_l1_threshold
         self.step_count = 0
         self.num_inference_steps = 0
@@ -43,6 +42,7 @@ class QwenImageDiTFBCache(QwenImageDiT):
         text: torch.Tensor = None,
         timestep: torch.LongTensor = None,
         txt_seq_lens: torch.LongTensor = None,
+        attn_kwargs: Optional[Dict[str, Any]] = None,
     ):
         h, w = image.shape[-2:]
         fp8_linear_enabled = getattr(self, "fp8_linear_enabled", False)
@@ -72,7 +72,11 @@ class QwenImageDiTFBCache(QwenImageDiT):
             # first block
             original_hidden_states = image
             text, image = self.transformer_blocks[0](
-                image=image,
+                image=image,
+                text=text,
+                temb=conditioning,
+                image_rotary_emb=image_rotary_emb,
+                attn_kwargs=attn_kwargs,
             )
             first_hidden_states_residual = image - original_hidden_states
 
@@ -94,7 +98,13 @@ class QwenImageDiTFBCache(QwenImageDiT):
                 first_hidden_states = image.clone()
 
                 for block in self.transformer_blocks[1:]:
-                    text, image = block(
+                    text, image = block(
+                        image=image,
+                        text=text,
+                        temb=conditioning,
+                        image_rotary_emb=image_rotary_emb,
+                        attn_kwargs=attn_kwargs,
+                    )
 
                 previous_residual = image - first_hidden_states
                 self.previous_residual = previous_residual
@@ -114,14 +124,12 @@ class QwenImageDiTFBCache(QwenImageDiT):
         device: str,
         dtype: torch.dtype,
         num_layers: int = 60,
-        attn_kwargs: Optional[Dict[str, Any]] = None,
         relative_l1_threshold: float = 0.05,
     ):
         model = cls(
             device="meta",
             dtype=dtype,
             num_layers=num_layers,
-            attn_kwargs=attn_kwargs,
             relative_l1_threshold=relative_l1_threshold,
         )
         model = model.requires_grad_(False)

diffsynth_engine/models/wan/wan_audio_encoder.py

@@ -223,7 +223,6 @@ class Wav2Vec2StateDictConverter:
 
 class Wav2Vec2Model(PreTrainedModel):
     converter = Wav2Vec2StateDictConverter()
-    _supports_parallelization = False
 
     def __init__(self, config: Wav2Vec2Config, device: str = "cuda:0", dtype: torch.dtype = torch.bfloat16):
         super().__init__()
@@ -267,9 +266,13 @@ def linear_interpolation(features: torch.Tensor, input_fps: int, output_fps: int
     return output_features.transpose(1, 2)  # [1, output_len, 512]
 
 
-def extract_audio_feat(
+def extract_audio_feat(
+    audio_input: torch.Tensor, model: Wav2Vec2Model, dtype=torch.float32, device="cuda:0"
+) -> torch.Tensor:
     video_rate = 30
-    input_values = (audio_input - audio_input.mean(dim=1, keepdim=True)) / torch.sqrt(
+    input_values = (audio_input - audio_input.mean(dim=1, keepdim=True)) / torch.sqrt(
+        audio_input.var(dim=1, keepdim=True) + 1e-7
+    )
     feat = torch.cat(model(input_values.to(device)))
     feat = linear_interpolation(feat, input_fps=50, output_fps=video_rate)
     return feat.to(dtype)  # Encoding for the motion