PyPI - diffsynth-engine - Versions diffs - 0.6.1.dev29__py3-none-any.whl → 0.6.1.dev31__py3-none-any.whl - Mend

diffsynth-engine 0.6.1.dev29__py3-none-any.whl → 0.6.1.dev31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

diffsynth_engine/models/wan/wan_image_encoder.py CHANGED Viewed

@@ -439,7 +439,7 @@ class WanImageEncoder(PreTrainedModel):
     def __init__(self, device: str = "cuda:0", dtype: torch.dtype = torch.bfloat16):
         super().__init__()
         # init model
-        self.model, self.transforms = clip_xlm_roberta_vit_h_14(dtype=torch.float32, device="cpu")
+        self.model, self.transforms = clip_xlm_roberta_vit_h_14(dtype=torch.float32, device=device)
     def encode_image(self, images: List[torch.Tensor]):
         # preprocess

diffsynth_engine/models/wan/wan_text_encoder.py CHANGED Viewed

@@ -38,19 +38,20 @@ class T5LayerNorm(nn.Module):
 class T5Attention(nn.Module):
-    def __init__(self, dim, dim_attn, num_heads, dropout=0.0):
+    def __init__(self, dim, dim_attn, num_heads, dropout=0.0, device="cuda:0"):
         assert dim_attn % num_heads == 0
         super(T5Attention, self).__init__()
         self.dim = dim
         self.dim_attn = dim_attn
         self.num_heads = num_heads
         self.head_dim = dim_attn // num_heads
+        self.device = device
         # layers
-        self.q = nn.Linear(dim, dim_attn, bias=False)
-        self.k = nn.Linear(dim, dim_attn, bias=False)
-        self.v = nn.Linear(dim, dim_attn, bias=False)
-        self.o = nn.Linear(dim_attn, dim, bias=False)
+        self.q = nn.Linear(dim, dim_attn, bias=False, device=device)
+        self.k = nn.Linear(dim, dim_attn, bias=False, device=device)
+        self.v = nn.Linear(dim, dim_attn, bias=False, device=device)
+        self.o = nn.Linear(dim_attn, dim, bias=False, device=device)
         self.dropout = nn.Dropout(dropout)
     def forward(self, x, context=None, mask=None, pos_bias=None):
@@ -90,15 +91,16 @@ class T5Attention(nn.Module):
 class T5FeedForward(nn.Module):
-    def __init__(self, dim, dim_ffn, dropout=0.0):
+    def __init__(self, dim, dim_ffn, dropout=0.0, device="cuda:0"):
         super(T5FeedForward, self).__init__()
         self.dim = dim
         self.dim_ffn = dim_ffn
+        self.device = device
         # layers
-        self.gate = nn.Sequential(nn.Linear(dim, dim_ffn, bias=False), GELU())
-        self.fc1 = nn.Linear(dim, dim_ffn, bias=False)
-        self.fc2 = nn.Linear(dim_ffn, dim, bias=False)
+        self.gate = nn.Sequential(nn.Linear(dim, dim_ffn, bias=False, device=device), GELU())
+        self.fc1 = nn.Linear(dim, dim_ffn, bias=False, device=device)
+        self.fc2 = nn.Linear(dim_ffn, dim, bias=False, device=device)
         self.dropout = nn.Dropout(dropout)
     def forward(self, x):
@@ -110,7 +112,7 @@ class T5FeedForward(nn.Module):
 class T5SelfAttention(nn.Module):
-    def __init__(self, dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos=True, dropout=0.0):
+    def __init__(self, dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos=True, dropout=0.0, device="cuda:0"):
         super(T5SelfAttention, self).__init__()
         self.dim = dim
         self.dim_attn = dim_attn
@@ -118,13 +120,14 @@ class T5SelfAttention(nn.Module):
         self.num_heads = num_heads
         self.num_buckets = num_buckets
         self.shared_pos = shared_pos
+        self.device = device
         # layers
         self.norm1 = T5LayerNorm(dim)
-        self.attn = T5Attention(dim, dim_attn, num_heads, dropout)
+        self.attn = T5Attention(dim, dim_attn, num_heads, dropout, device)
         self.norm2 = T5LayerNorm(dim)
-        self.ffn = T5FeedForward(dim, dim_ffn, dropout)
-        self.pos_embedding = None if shared_pos else T5RelativeEmbedding(num_buckets, num_heads, bidirectional=True)
+        self.ffn = T5FeedForward(dim, dim_ffn, dropout, device)
+        self.pos_embedding = None if shared_pos else T5RelativeEmbedding(num_buckets, num_heads, bidirectional=True, device=device)
     def forward(self, x, mask=None, pos_bias=None):
         e = pos_bias if self.shared_pos else self.pos_embedding(x.size(1), x.size(1))
@@ -134,15 +137,16 @@ class T5SelfAttention(nn.Module):
 class T5RelativeEmbedding(nn.Module):
-    def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128):
+    def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128, device="cuda:0"):
         super(T5RelativeEmbedding, self).__init__()
         self.num_buckets = num_buckets
         self.num_heads = num_heads
         self.bidirectional = bidirectional
         self.max_dist = max_dist
+        self.device = device
         # layers
-        self.embedding = nn.Embedding(num_buckets, num_heads)
+        self.embedding = nn.Embedding(num_buckets, num_heads, device=device)
     def forward(self, lq, lk):
         device = self.embedding.weight.device
@@ -257,12 +261,12 @@ class WanTextEncoder(PreTrainedModel):
         self.shared_pos = shared_pos
         # layers
-        self.token_embedding = vocab if isinstance(vocab, nn.Embedding) else nn.Embedding(vocab, dim)
-        self.pos_embedding = T5RelativeEmbedding(num_buckets, num_heads, bidirectional=True) if shared_pos else None
+        self.token_embedding = vocab if isinstance(vocab, nn.Embedding) else nn.Embedding(vocab, dim, device=device)
+        self.pos_embedding = T5RelativeEmbedding(num_buckets, num_heads, bidirectional=True, device=device) if shared_pos else None
         self.dropout = nn.Dropout(dropout)
         self.blocks = nn.ModuleList(
             [
-                T5SelfAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos, dropout)
+                T5SelfAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos, dropout, device)
                 for _ in range(num_layers)
             ]
         )

diffsynth_engine/pipelines/base.py CHANGED Viewed

@@ -74,9 +74,9 @@ class BasePipeline:
             component.load_state_dict(state_dict, assign=True)
             component.to(device=device, dtype=dtype, non_blocking=True)
-    def load_loras(
+    def _load_lora_state_dicts(
         self,
-        lora_list: List[Tuple[str, Union[float, LoraConfig]]],
+        lora_state_dict_list: List[Tuple[Dict[str, torch.Tensor], Union[float, LoraConfig], str]],
         fused: bool = True,
         save_original_weight: bool = False,
         lora_converter: Optional[LoRAStateDictConverter] = None,
@@ -84,29 +84,30 @@ class BasePipeline:
         if not lora_converter:
             lora_converter = self.lora_converter
-        for lora_path, lora_item in lora_list:
+        for state_dict, lora_item, lora_name in lora_state_dict_list:
             if isinstance(lora_item, float):
                 lora_scale = lora_item
                 scheduler_config = None
-            if isinstance(lora_item, LoraConfig):
+            elif isinstance(lora_item, LoraConfig):
                 lora_scale = lora_item.scale
                 scheduler_config = lora_item.scheduler_config
+            else:
+                raise ValueError(f"lora_item must be float or LoraConfig, got {type(lora_item)}")
-            logger.info(f"loading lora from {lora_path} with LoraConfig (scale={lora_scale})")
-            state_dict = load_file(lora_path, device=self.device)
+            logger.info(f"loading lora from state_dict '{lora_name}' with scale={lora_scale}")
             if scheduler_config is not None:
                 self.apply_scheduler_config(scheduler_config)
                 logger.info(f"Applied scheduler args from LoraConfig: {scheduler_config}")
             lora_state_dict = lora_converter.convert(state_dict)
-            for model_name, state_dict in lora_state_dict.items():
+            for model_name, model_state_dict in lora_state_dict.items():
                 model = getattr(self, model_name)
                 lora_args = []
-                for key, param in state_dict.items():
+                for key, param in model_state_dict.items():
                     lora_args.append(
                         {
-                            "name": lora_path,
+                            "name": lora_name,
                             "key": key,
                             "scale": lora_scale,
                             "rank": param["rank"],
@@ -120,6 +121,26 @@ class BasePipeline:
                     )
                 model.load_loras(lora_args, fused=fused)
+    def load_loras(
+        self,
+        lora_list: List[Tuple[str, Union[float, LoraConfig]]],
+        fused: bool = True,
+        save_original_weight: bool = False,
+        lora_converter: Optional[LoRAStateDictConverter] = None,
+    ):
+        lora_state_dict_list = []
+        for lora_path, lora_item in lora_list:
+            logger.info(f"loading lora from {lora_path}")
+            state_dict = load_file(lora_path, device=self.device)
+            lora_state_dict_list.append((state_dict, lora_item, lora_path))
+        self._load_lora_state_dicts(
+            lora_state_dict_list=lora_state_dict_list,
+            fused=fused,
+            save_original_weight=save_original_weight,
+            lora_converter=lora_converter,
+        )
     def load_lora(self, path: str, scale: float, fused: bool = True, save_original_weight: bool = False):
         self.load_loras([(path, scale)], fused, save_original_weight)

{diffsynth_engine-0.6.1.dev29.dist-info → diffsynth_engine-0.6.1.dev31.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diffsynth_engine
-Version: 0.6.1.dev29
+Version: 0.6.1.dev31
 Author: MuseAI x ModelScope
 Classifier: Programming Language :: Python :: 3
 Classifier: Operating System :: OS Independent

{diffsynth_engine-0.6.1.dev29.dist-info → diffsynth_engine-0.6.1.dev31.dist-info}/RECORD RENAMED Viewed

@@ -138,12 +138,12 @@ diffsynth_engine/models/vae/vae.py,sha256=1Hz5Yb6f8V-psC0qothfzg8EZBPVPpg9KGlSMD
 diffsynth_engine/models/wan/__init__.py,sha256=eYwZ2Upo2mTjaAcBWuSft1m4mLnqE47bz2V_u-WtkwQ,246
 diffsynth_engine/models/wan/wan_audio_encoder.py,sha256=i8mVu5lhVlTnzVTDcSv7qGC6HjB3MuS9hFVkUrw9458,13629
 diffsynth_engine/models/wan/wan_dit.py,sha256=MEt9eWy6djWT1dtlFEHP9Yevat4-M_LSzWRauNSIHck,21599
-diffsynth_engine/models/wan/wan_image_encoder.py,sha256=VE7crdTxOFN2UCMN2cQlvHB9BilSbKOBQYgnXgl4E2Y,14313
+diffsynth_engine/models/wan/wan_image_encoder.py,sha256=Vdd39lv_QvOsmPxihZWZZbpP-9QuCFpNJ39bdtI5qTQ,14314
 diffsynth_engine/models/wan/wan_s2v_dit.py,sha256=j63ulcWLY4XGITOKUMGX292LtSEtP-n8BTvqb98YExU,23615
-diffsynth_engine/models/wan/wan_text_encoder.py,sha256=OERlmwOqthAFPNnnT2sXJ4OjyyRmsRLx7VGp1zlBkLU,11021
+diffsynth_engine/models/wan/wan_text_encoder.py,sha256=ePeOifbTI_o650mckzugyWPuHn5vhM-uFMcDVCijxPM,11394
 diffsynth_engine/models/wan/wan_vae.py,sha256=dC7MoUFeXRL7SIY0LG1OOUiZW-pp9IbXCghutMxpXr4,38889
 diffsynth_engine/pipelines/__init__.py,sha256=jh-4LSJ0vqlXiT8BgFgRIQxuAr2atEPyHrxXWj-Ud1U,604
-diffsynth_engine/pipelines/base.py,sha256=BNMNL-OU-9ilUv7O60trA3_rjHA21d6Oc5PKzKYBa80,16347
+diffsynth_engine/pipelines/base.py,sha256=ShRiX5MY6bUkRKfuGrA1aalAqeHyeZxhzT87Mwc30b4,17231
 diffsynth_engine/pipelines/flux_image.py,sha256=L0ggxpthLD8a5-zdPHu9z668uWBei9YzPb4PFVypDNU,50707
 diffsynth_engine/pipelines/hunyuan3d_shape.py,sha256=TNV0Wr09Dj2bzzlpua9WioCClOj3YiLfE6utI9aWL8A,8164
 diffsynth_engine/pipelines/qwen_image.py,sha256=ktOirdU2ljgb6vHhXosC0tWgXI3gwvsoAtrYKYvMwzI,35719
@@ -190,8 +190,8 @@ diffsynth_engine/utils/video.py,sha256=8FCaeqIdUsWMgWI_6SO9SPynsToGcLCQAVYFTc4CD
 diffsynth_engine/utils/memory/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 diffsynth_engine/utils/memory/linear_regression.py,sha256=oW_EQEw13oPoyUrxiL8A7Ksa5AuJ2ynI2qhCbfAuZbg,3930
 diffsynth_engine/utils/memory/memory_predcit_model.py,sha256=EXprSl_zlVjgfMWNXP-iw83Ot3hyMcgYaRPv-dvyL84,3943
-diffsynth_engine-0.6.1.dev29.dist-info/licenses/LICENSE,sha256=x7aBqQuVI0IYnftgoTPI_A0I_rjdjPPQkjnU6N2nikM,11346
-diffsynth_engine-0.6.1.dev29.dist-info/METADATA,sha256=8A5q0qhRMxeJi7IOvP3dcqk58BsgIBxy16ndlnDM_6I,1164
-diffsynth_engine-0.6.1.dev29.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-diffsynth_engine-0.6.1.dev29.dist-info/top_level.txt,sha256=6zgbiIzEHLbhgDKRyX0uBJOV3F6VnGGBRIQvSiYYn6w,17
-diffsynth_engine-0.6.1.dev29.dist-info/RECORD,,
+diffsynth_engine-0.6.1.dev31.dist-info/licenses/LICENSE,sha256=x7aBqQuVI0IYnftgoTPI_A0I_rjdjPPQkjnU6N2nikM,11346
+diffsynth_engine-0.6.1.dev31.dist-info/METADATA,sha256=PGHUdyy75RQEl6ownCDC66hY24x07mNdRA7oFszGvss,1164
+diffsynth_engine-0.6.1.dev31.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+diffsynth_engine-0.6.1.dev31.dist-info/top_level.txt,sha256=6zgbiIzEHLbhgDKRyX0uBJOV3F6VnGGBRIQvSiYYn6w,17
+diffsynth_engine-0.6.1.dev31.dist-info/RECORD,,

{diffsynth_engine-0.6.1.dev29.dist-info → diffsynth_engine-0.6.1.dev31.dist-info}/WHEEL RENAMED Viewed

File without changes

{diffsynth_engine-0.6.1.dev29.dist-info → diffsynth_engine-0.6.1.dev31.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{diffsynth_engine-0.6.1.dev29.dist-info → diffsynth_engine-0.6.1.dev31.dist-info}/top_level.txt RENAMED Viewed

File without changes

diffsynth-engine 0.6.1.dev29__py3-none-any.whl → 0.6.1.dev31__py3-none-any.whl

diffsynth-engine 0.6.1.dev29__py3-none-any.whl → 0.6.1.dev31__py3-none-any.whl