diffsynth-engine 0.6.1.dev30__py3-none-any.whl → 0.6.1.dev31__py3-none-any.whl

--- a/diffsynth_engine/models/wan/wan_image_encoder.py
+++ b/diffsynth_engine/models/wan/wan_image_encoder.py
@@ -439,7 +439,7 @@ class WanImageEncoder(PreTrainedModel):
     def __init__(self, device: str = "cuda:0", dtype: torch.dtype = torch.bfloat16):
         super().__init__()
         # init model
-        self.model, self.transforms = clip_xlm_roberta_vit_h_14(dtype=torch.float32, device="cpu")
+        self.model, self.transforms = clip_xlm_roberta_vit_h_14(dtype=torch.float32, device=device)

     def encode_image(self, images: List[torch.Tensor]):
         # preprocess
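The effect of this one-line change is that the CLIP weights are materialized on the requested device instead of on the CPU and moved afterwards. A minimal sketch of the two construction strategies; build_model is a hypothetical stand-in for a large factory such as clip_xlm_roberta_vit_h_14, and only the construction pattern matters:

import torch
import torch.nn as nn

# Hypothetical stand-in for a large factory-built model; the pattern,
# not the architecture, is the point.
def build_model(device="cpu"):
    return nn.Sequential(
        nn.Linear(1024, 4096, device=device),
        nn.GELU(),
        nn.Linear(4096, 1024, device=device),
    )

if torch.cuda.is_available():
    # Old pattern: every parameter is first allocated in host RAM,
    # then copied tensor-by-tensor to the GPU.
    cpu_then_move = build_model(device="cpu").to("cuda:0")

    # New pattern: parameters are allocated directly on the target device,
    # skipping the host-side allocation and the host-to-device copy.
    direct = build_model(device="cuda:0")
    print(next(direct.parameters()).device)  # cuda:0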
--- a/diffsynth_engine/models/wan/wan_text_encoder.py
+++ b/diffsynth_engine/models/wan/wan_text_encoder.py
@@ -38,19 +38,20 @@ class T5LayerNorm(nn.Module):


 class T5Attention(nn.Module):
-    def __init__(self, dim, dim_attn, num_heads, dropout=0.0):
+    def __init__(self, dim, dim_attn, num_heads, dropout=0.0, device="cuda:0"):
         assert dim_attn % num_heads == 0
         super(T5Attention, self).__init__()
         self.dim = dim
         self.dim_attn = dim_attn
         self.num_heads = num_heads
         self.head_dim = dim_attn // num_heads
+        self.device = device

         # layers
-        self.q = nn.Linear(dim, dim_attn, bias=False)
-        self.k = nn.Linear(dim, dim_attn, bias=False)
-        self.v = nn.Linear(dim, dim_attn, bias=False)
-        self.o = nn.Linear(dim_attn, dim, bias=False)
+        self.q = nn.Linear(dim, dim_attn, bias=False, device=device)
+        self.k = nn.Linear(dim, dim_attn, bias=False, device=device)
+        self.v = nn.Linear(dim, dim_attn, bias=False, device=device)
+        self.o = nn.Linear(dim_attn, dim, bias=False, device=device)
         self.dropout = nn.Dropout(dropout)

     def forward(self, x, context=None, mask=None, pos_bias=None):
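PyTorch's nn.Linear (like other built-in modules) accepts device and dtype keyword arguments that control where the weight tensor is allocated at construction time; without them, parameters default to the CPU. A minimal sketch of what the patched constructor now relies on, assuming CUDA is available:

import torch
import torch.nn as nn

# Without `device`, parameters are created on the CPU and must be moved later.
q_cpu = nn.Linear(512, 512, bias=False)
print(q_cpu.weight.device)  # cpu

# With `device`, the weight is allocated on the target device directly,
# which is what the patched T5Attention projections now do.
if torch.cuda.is_available():
    q_gpu = nn.Linear(512, 512, bias=False, device="cuda:0")
    print(q_gpu.weight.device)  # cuda:0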
@@ -90,15 +91,16 @@ class T5Attention(nn.Module):


 class T5FeedForward(nn.Module):
-    def __init__(self, dim, dim_ffn, dropout=0.0):
+    def __init__(self, dim, dim_ffn, dropout=0.0, device="cuda:0"):
         super(T5FeedForward, self).__init__()
         self.dim = dim
         self.dim_ffn = dim_ffn
+        self.device = device

         # layers
-        self.gate = nn.Sequential(nn.Linear(dim, dim_ffn, bias=False), GELU())
-        self.fc1 = nn.Linear(dim, dim_ffn, bias=False)
-        self.fc2 = nn.Linear(dim_ffn, dim, bias=False)
+        self.gate = nn.Sequential(nn.Linear(dim, dim_ffn, bias=False, device=device), GELU())
+        self.fc1 = nn.Linear(dim, dim_ffn, bias=False, device=device)
+        self.fc2 = nn.Linear(dim_ffn, dim, bias=False, device=device)
         self.dropout = nn.Dropout(dropout)

     def forward(self, x):
@@ -110,7 +112,7 @@ class T5FeedForward(nn.Module):


 class T5SelfAttention(nn.Module):
-    def __init__(self, dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos=True, dropout=0.0):
+    def __init__(self, dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos=True, dropout=0.0, device="cuda:0"):
         super(T5SelfAttention, self).__init__()
         self.dim = dim
         self.dim_attn = dim_attn
@@ -118,13 +120,14 @@ class T5SelfAttention(nn.Module):
         self.num_heads = num_heads
         self.num_buckets = num_buckets
         self.shared_pos = shared_pos
+        self.device = device

         # layers
         self.norm1 = T5LayerNorm(dim)
-        self.attn = T5Attention(dim, dim_attn, num_heads, dropout)
+        self.attn = T5Attention(dim, dim_attn, num_heads, dropout, device)
         self.norm2 = T5LayerNorm(dim)
-        self.ffn = T5FeedForward(dim, dim_ffn, dropout)
-        self.pos_embedding = None if shared_pos else T5RelativeEmbedding(num_buckets, num_heads, bidirectional=True)
+        self.ffn = T5FeedForward(dim, dim_ffn, dropout, device)
+        self.pos_embedding = None if shared_pos else T5RelativeEmbedding(num_buckets, num_heads, bidirectional=True, device=device)

     def forward(self, x, mask=None, pos_bias=None):
         e = pos_bias if self.shared_pos else self.pos_embedding(x.size(1), x.size(1))
@@ -134,15 +137,16 @@ class T5SelfAttention(nn.Module):


 class T5RelativeEmbedding(nn.Module):
-    def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128):
+    def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128, device="cuda:0"):
         super(T5RelativeEmbedding, self).__init__()
         self.num_buckets = num_buckets
         self.num_heads = num_heads
         self.bidirectional = bidirectional
         self.max_dist = max_dist
+        self.device = device

         # layers
-        self.embedding = nn.Embedding(num_buckets, num_heads)
+        self.embedding = nn.Embedding(num_buckets, num_heads, device=device)

     def forward(self, lq, lk):
         device = self.embedding.weight.device
@@ -257,12 +261,12 @@ class WanTextEncoder(PreTrainedModel):
         self.shared_pos = shared_pos

         # layers
-        self.token_embedding = vocab if isinstance(vocab, nn.Embedding) else nn.Embedding(vocab, dim)
-        self.pos_embedding = T5RelativeEmbedding(num_buckets, num_heads, bidirectional=True) if shared_pos else None
+        self.token_embedding = vocab if isinstance(vocab, nn.Embedding) else nn.Embedding(vocab, dim, device=device)
+        self.pos_embedding = T5RelativeEmbedding(num_buckets, num_heads, bidirectional=True, device=device) if shared_pos else None
         self.dropout = nn.Dropout(dropout)
         self.blocks = nn.ModuleList(
             [
-                T5SelfAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos, dropout)
+                T5SelfAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos, dropout, device)
                 for _ in range(num_layers)
             ]
         )
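Taken together, the wan_text_encoder.py hunks thread the same device argument from the top-level encoder down through every submodule, so all parameters are born on the target device. A hypothetical miniature of that constructor chain; Block and Encoder are illustrative stand-ins, not classes from diffsynth_engine:

import torch
import torch.nn as nn

# Each level of the hierarchy accepts `device` and forwards it downward,
# mirroring WanTextEncoder -> T5SelfAttention -> T5Attention/T5FeedForward.
class Block(nn.Module):
    def __init__(self, dim, device="cuda:0"):
        super().__init__()
        self.fc = nn.Linear(dim, dim, bias=False, device=device)

class Encoder(nn.Module):
    def __init__(self, dim, num_layers, device="cuda:0"):
        super().__init__()
        self.blocks = nn.ModuleList(Block(dim, device) for _ in range(num_layers))

if torch.cuda.is_available():
    enc = Encoder(256, num_layers=4, device="cuda:0")
    assert all(p.device.type == "cuda" for p in enc.parameters())

    # An alternative that avoids threading the argument by hand: since
    # PyTorch 2.0, torch.device works as a context manager that sets the
    # default device for parameter creation inside the block.
    with torch.device("cuda:0"):
        lin = nn.Linear(256, 256, bias=False)  # no device kwarg needed
    assert lin.weight.device.type == "cuda"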
--- a/diffsynth_engine-0.6.1.dev30.dist-info/METADATA
+++ b/diffsynth_engine-0.6.1.dev31.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: diffsynth_engine
-Version: 0.6.1.dev30
+Version: 0.6.1.dev31
 Author: MuseAI x ModelScope
 Classifier: Programming Language :: Python :: 3
 Classifier: Operating System :: OS Independent
--- a/diffsynth_engine-0.6.1.dev30.dist-info/RECORD
+++ b/diffsynth_engine-0.6.1.dev31.dist-info/RECORD
@@ -138,9 +138,9 @@ diffsynth_engine/models/vae/vae.py,sha256=1Hz5Yb6f8V-psC0qothfzg8EZBPVPpg9KGlSMD
 diffsynth_engine/models/wan/__init__.py,sha256=eYwZ2Upo2mTjaAcBWuSft1m4mLnqE47bz2V_u-WtkwQ,246
 diffsynth_engine/models/wan/wan_audio_encoder.py,sha256=i8mVu5lhVlTnzVTDcSv7qGC6HjB3MuS9hFVkUrw9458,13629
 diffsynth_engine/models/wan/wan_dit.py,sha256=MEt9eWy6djWT1dtlFEHP9Yevat4-M_LSzWRauNSIHck,21599
-diffsynth_engine/models/wan/wan_image_encoder.py,sha256=VE7crdTxOFN2UCMN2cQlvHB9BilSbKOBQYgnXgl4E2Y,14313
+diffsynth_engine/models/wan/wan_image_encoder.py,sha256=Vdd39lv_QvOsmPxihZWZZbpP-9QuCFpNJ39bdtI5qTQ,14314
 diffsynth_engine/models/wan/wan_s2v_dit.py,sha256=j63ulcWLY4XGITOKUMGX292LtSEtP-n8BTvqb98YExU,23615
-diffsynth_engine/models/wan/wan_text_encoder.py,sha256=OERlmwOqthAFPNnnT2sXJ4OjyyRmsRLx7VGp1zlBkLU,11021
+diffsynth_engine/models/wan/wan_text_encoder.py,sha256=ePeOifbTI_o650mckzugyWPuHn5vhM-uFMcDVCijxPM,11394
 diffsynth_engine/models/wan/wan_vae.py,sha256=dC7MoUFeXRL7SIY0LG1OOUiZW-pp9IbXCghutMxpXr4,38889
 diffsynth_engine/pipelines/__init__.py,sha256=jh-4LSJ0vqlXiT8BgFgRIQxuAr2atEPyHrxXWj-Ud1U,604
 diffsynth_engine/pipelines/base.py,sha256=ShRiX5MY6bUkRKfuGrA1aalAqeHyeZxhzT87Mwc30b4,17231
@@ -190,8 +190,8 @@ diffsynth_engine/utils/video.py,sha256=8FCaeqIdUsWMgWI_6SO9SPynsToGcLCQAVYFTc4CD
 diffsynth_engine/utils/memory/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 diffsynth_engine/utils/memory/linear_regression.py,sha256=oW_EQEw13oPoyUrxiL8A7Ksa5AuJ2ynI2qhCbfAuZbg,3930
 diffsynth_engine/utils/memory/memory_predcit_model.py,sha256=EXprSl_zlVjgfMWNXP-iw83Ot3hyMcgYaRPv-dvyL84,3943
-diffsynth_engine-0.6.1.dev30.dist-info/licenses/LICENSE,sha256=x7aBqQuVI0IYnftgoTPI_A0I_rjdjPPQkjnU6N2nikM,11346
-diffsynth_engine-0.6.1.dev30.dist-info/METADATA,sha256=z-j4fdSyJwgilKYRl-MrSlhicE8MJP9uvoGYYTFrYKk,1164
-diffsynth_engine-0.6.1.dev30.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-diffsynth_engine-0.6.1.dev30.dist-info/top_level.txt,sha256=6zgbiIzEHLbhgDKRyX0uBJOV3F6VnGGBRIQvSiYYn6w,17
-diffsynth_engine-0.6.1.dev30.dist-info/RECORD,,
+diffsynth_engine-0.6.1.dev31.dist-info/licenses/LICENSE,sha256=x7aBqQuVI0IYnftgoTPI_A0I_rjdjPPQkjnU6N2nikM,11346
+diffsynth_engine-0.6.1.dev31.dist-info/METADATA,sha256=PGHUdyy75RQEl6ownCDC66hY24x07mNdRA7oFszGvss,1164
+diffsynth_engine-0.6.1.dev31.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+diffsynth_engine-0.6.1.dev31.dist-info/top_level.txt,sha256=6zgbiIzEHLbhgDKRyX0uBJOV3F6VnGGBRIQvSiYYn6w,17
+diffsynth_engine-0.6.1.dev31.dist-info/RECORD,,