PyPI - Stackformer - Versions diffs - 0.1.2__tar.gz → 0.1.3__tar.gz - Mend

Stackformer 0.1.2__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

{stackformer-0.1.2 → stackformer-0.1.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: Stackformer
-Version: 0.1.2
+Version: 0.1.3
 Summary: Modular transformer blocks built in PyTorch
 Home-page: https://github.com/Gurumurthy30/Stackformer
 Author: Gurumurthy
@@ -12,8 +12,8 @@ Project-URL: Discussions, https://github.com/Gurumurthy30/Stackformer/discussion
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: torch<2.6,>=2.0
-Requires-Dist: tqdm>=4.67
+Requires-Dist: torch<2.7,>=2.0
+Requires-Dist: tqdm<5.0,>=4.5
 Dynamic: author
 Dynamic: home-page
 Dynamic: license-file

{stackformer-0.1.2 → stackformer-0.1.3}/Stackformer.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: Stackformer
-Version: 0.1.2
+Version: 0.1.3
 Summary: Modular transformer blocks built in PyTorch
 Home-page: https://github.com/Gurumurthy30/Stackformer
 Author: Gurumurthy
@@ -12,8 +12,8 @@ Project-URL: Discussions, https://github.com/Gurumurthy30/Stackformer/discussion
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: torch<2.6,>=2.0
-Requires-Dist: tqdm>=4.67
+Requires-Dist: torch<2.7,>=2.0
+Requires-Dist: tqdm<5.0,>=4.5
 Dynamic: author
 Dynamic: home-page
 Dynamic: license-file

{stackformer-0.1.2 → stackformer-0.1.3}/Stackformer.egg-info/SOURCES.txt RENAMED Viewed

@@ -8,6 +8,7 @@ Stackformer.egg-info/dependency_links.txt
 Stackformer.egg-info/requires.txt
 Stackformer.egg-info/top_level.txt
 stackformer/__init__.py
+stackformer/generate.py
 stackformer/trainer.py
 stackformer/models/Meta.py
 stackformer/models/OpenAI.py

stackformer-0.1.3/Stackformer.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ torch<2.7,>=2.0
2	+ tqdm<5.0,>=4.5

{stackformer-0.1.2 → stackformer-0.1.3}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "Stackformer"
-version = "0.1.2"
+version = "0.1.3"
 description = "Modular transformer blocks built in PyTorch"
 readme = "README.md"
 requires-python = ">=3.9"
@@ -11,8 +11,8 @@ authors = [
 ]
 dependencies = [
-  "torch>=2.0,<2.6",
-  "tqdm>=4.67"
+  "torch>=2.0,<2.7",
+  "tqdm>=4.5,<5.0"
 ]
 [project.urls]

{stackformer-0.1.2 → stackformer-0.1.3}/setup.py RENAMED Viewed

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 setup(
     name="Stackformer",
-    version="0.1.2",
+    version="0.1.3",
     description="Modular transformer blocks built in PyTorch",
     # long_description=open("README.md", "r", encoding="utf-8").read(),
     # long_description_content_type="text/markdown",
@@ -18,8 +18,8 @@ setup(
     python_requires=">=3.9",
     packages=find_packages(exclude=["tests", "examples"]),
     install_requires=[
-        "torch>=2.0,<2.6",
-        "tqdm>=4.67",
+    "torch>=2.0,<2.7",
+    "tqdm>=4.5,<5.0",
     ],
     classifiers=[
         "Programming Language :: Python :: 3",

{stackformer-0.1.2 → stackformer-0.1.3}/stackformer/__init__.py RENAMED Viewed

@@ -9,6 +9,7 @@ from .modules.position_embedding import RoPE
 # --- Attention mechanisms ---
 from .modules.Attention import Self_Attention
 from .modules.Attention import Multi_Head_Attention
+from .modules.Attention import Multi_Head_Attention_with_RoPE
 from .modules.Attention import Cross_MultiHead_Attention
 from .modules.Attention import Multi_query_Attention
 from .modules.Attention import Group_query_Attention
@@ -28,11 +29,17 @@ from .modules.Feed_forward import FF_GELU
 from .modules.Feed_forward import FF_LeakyReLU
 from .modules.Feed_forward import FF_Sigmoid
 from .modules.Feed_forward import FF_SiLU
+from .modules.Feed_forward import FF_SwiGLU
 # --- Model ---
+from .models.OpenAI import GPT_1
 from .models.OpenAI import GPT_2
-from .models.Meta import Llama_2
+from .models.Meta import llama_1
+from .models.Meta import llama_2
 from .models.Transformer import transformer
 # --- Trainer ---
-from .trainer import Trainer
+from .trainer import Trainer
+# --- Generate ---
+from .generate import text_generate

stackformer-0.1.3/stackformer/generate.py ADDED Viewed

@@ -0,0 +1,53 @@
+import torch
+import torch.nn.functional as F
+def text_generate(self, prompt_ids, max_context_len=128, max_new_tokens=50, temperature=1.0, top_k=None, top_p=1.0, eos_token_id=None):
+    """Autoregressively sample tokens from a model, one token per step.
+
+    Args:
+        self: Model object; only ``self.forward(input_ids)`` is called and its
+            result is indexed as (batch, position, vocab) logits.
+        prompt_ids: 1-D or 2-D tensor of token ids; a 1-D prompt is treated
+            as a batch of one.
+        max_context_len: Only the last ``max_context_len`` tokens are fed to
+            the model at each step.
+        max_new_tokens: Upper bound on the number of sampled tokens.
+        temperature: Logits are divided by this value before sampling.
+        top_k: If a positive int, sample only from the ``top_k`` largest
+            logits (clamped to the vocabulary size).
+        top_p: If below 1.0, apply nucleus filtering with this cumulative
+            probability threshold.
+        eos_token_id: Optional token id that ends a sequence.
+
+    Returns:
+        2-D tensor holding the prompt followed by the sampled tokens. When
+        ``eos_token_id`` is given, generation stops once every row has
+        produced it; rows that finish early are padded with ``eos_token_id``.
+    """
+    if prompt_ids.dim() == 1:
+        prompt_ids = prompt_ids.unsqueeze(0)
+    generated = prompt_ids.clone()
+    # Per-row completion flag: calling .item() on the sampled token only works
+    # for a batch of one, so EOS is tracked row by row instead.
+    finished = torch.zeros(generated.size(0), dtype=torch.bool, device=generated.device)
+    for _ in range(max_new_tokens):
+        # Use sliding window if sequence gets too long
+        if generated.size(1) > max_context_len:
+            input_ids = generated[:, -max_context_len:]
+        else:
+            input_ids = generated
+        logits = self.forward(input_ids)  # (batch_size, seq_len, vocab_size)
+        logits = logits[:, -1, :]  # (batch_size, vocab_size)
+        # --- Temperature scaling ---
+        if temperature != 1.0:
+            logits = logits / temperature
+        # --- Top-k filtering ---
+        if top_k is not None and top_k > 0:
+            # torch.topk raises when k exceeds the dimension size, so clamp it.
+            k = min(top_k, logits.size(-1))
+            topk_vals, topk_indices = torch.topk(logits, k)
+            mask = torch.full_like(logits, float('-inf'))
+            mask.scatter_(dim=-1, index=topk_indices, src=topk_vals)
+            logits = mask
+        # --- Top-p ---
+        if top_p < 1.0:
+            sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
+            probs = F.softmax(sorted_logits, dim=-1)
+            cum_probs = torch.cumsum(probs, dim=-1)
+            sorted_mask = cum_probs > top_p
+            # Shift right so the first token crossing the threshold is kept,
+            # and never drop the single most likely token.
+            sorted_mask[..., 1:] = sorted_mask[..., :-1].clone()
+            sorted_mask[..., 0] = 0
+            # Map the mask from sorted order back to vocabulary order.
+            indices_to_remove = sorted_mask.scatter(dim=-1, index=sorted_indices, src=sorted_mask)
+            logits = logits.masked_fill(indices_to_remove, float('-inf'))
+        # Sample next token
+        probs = F.softmax(logits, dim=-1)
+        next_token = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)
+        if eos_token_id is not None:
+            # Rows that already ended keep emitting EOS as padding.
+            next_token = next_token.masked_fill(finished.unsqueeze(-1), eos_token_id)
+            finished = finished | (next_token.squeeze(-1) == eos_token_id)
+        generated = torch.cat([generated, next_token], dim=-1)
+        # Stop once every sequence in the batch has reached EOS.
+        if eos_token_id is not None and bool(finished.all()):
+            break
+    return generated

stackformer-0.1.3/stackformer/models/Meta.py ADDED Viewed

@@ -0,0 +1,159 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+from stackformer.modules.Attention import kv_cache_group_query, Multi_Head_Attention_with_RoPE
+from stackformer.modules.Feed_forward import FF_SwiGLU
+from stackformer.modules.Normalization import RMSNormilization
+from stackformer.generate import text_generate
+'''
+llama 1
+Attention: MHA
+Mask: Causal
+position: RoPE
+FF: SwiGLU
+Norm: pre norm (RMS norm)
+'''
+class llama_1_Block(nn.Module):
+    """One llama_1 layer: norm -> attention -> residual, then norm -> SwiGLU feed-forward -> residual."""
+    def __init__(self, emb_dim, num_heads, dropout, hidden_dim, eps=1e-5, device='cpu', dtype=torch.float32):
+        super().__init__()
+        # Shared placement arguments forwarded to the sub-modules that accept them.
+        placement = dict(device=device, dtype=dtype)
+        self.attention = Multi_Head_Attention_with_RoPE(emb_dim, num_heads, dropout, **placement)
+        self.norm1 = RMSNormilization(emb_dim, eps=eps)
+        self.FF_SwiGLU = FF_SwiGLU(emb_dim, hidden_dim, dropout, **placement)
+        self.norm2 = RMSNormilization(emb_dim, eps=eps)
+    def forward(self, x):
+        # Attention sub-layer: normalise first, then add the untouched input back.
+        attn_out = self.attention(self.norm1(x))
+        hidden = attn_out + x
+        # Feed-forward sub-layer follows the same norm-then-residual pattern.
+        ff_out = self.FF_SwiGLU(self.norm2(hidden))
+        return ff_out + hidden
+# --- Encoder ---
+class llama_1_Encoder(nn.Module):
+    """Sequential stack of ``num_layers`` llama_1_Block modules."""
+    def __init__(self, num_layers, emb_dim, num_heads, dropout, hidden_dim, eps=1e-5, device='cpu', dtype=torch.float32):
+        super().__init__()
+        blocks = []
+        for _ in range(num_layers):
+            blocks.append(llama_1_Block(emb_dim, num_heads, dropout, hidden_dim, eps, device=device, dtype=dtype))
+        self.layers = nn.ModuleList(blocks)
+    def forward(self, x):
+        # Feed the output of each block into the next one.
+        hidden = x
+        for block in self.layers:
+            hidden = block(hidden)
+        return hidden
+class llama_1(nn.Module):
+    """Language model: token embedding -> llama_1_Encoder -> RMS norm -> vocabulary projection."""
+    def __init__(self, vocab_size, num_layers, emb_dim, num_heads, seq_len,
+            dropout, hidden_dim, eps=1e-5, device='cpu', dtype=torch.float32):
+        super().__init__()
+        self.device = device
+        self.dtype = dtype
+        self.seq_len = seq_len
+        # Token ids -> embedding vectors.
+        self.embedding = nn.Embedding(vocab_size, emb_dim, dtype=self.dtype, device=self.device)
+        # Stack of llama_1 blocks.
+        self.encoder = llama_1_Encoder(
+            num_layers=num_layers,
+            emb_dim=emb_dim,
+            num_heads=num_heads,
+            dropout=dropout,
+            hidden_dim=hidden_dim,
+            eps=eps,
+            device=self.device,
+            dtype=self.dtype,
+        )
+        # Normalisation applied after the last block.
+        self.final_norm = RMSNormilization(emb_dim, eps=eps)
+        # Bias-free projection from embedding space to vocabulary logits.
+        self.lm_head = nn.Linear(emb_dim, vocab_size, bias=False, dtype=self.dtype, device=self.device)
+    def forward(self, x):
+        """Return the lm_head output for the token ids in ``x``."""
+        hidden = self.encoder(self.embedding(x))
+        return self.lm_head(self.final_norm(hidden))
+    @torch.no_grad()
+    def generate(self, prompt_ids, max_context_len=128, max_new_tokens=50, temperature=1.0, top_k=None, top_p=1.0, eos_token_id=None):
+        """Delegate to text_generate with this model; runs without gradient tracking."""
+        sampling_args = (max_context_len, max_new_tokens, temperature, top_k, top_p, eos_token_id)
+        return text_generate(self, prompt_ids, *sampling_args)
+'''
+llama 2
+Attention: GQA with KV cache
+Mask: Causal
+position: RoPE
+FF: SwiGLU
+Norm: pre norm (RMS norm)
+'''
+class llama_2_Block(nn.Module):
+    """One llama_2 layer: norm -> kv_cache_group_query attention -> residual, then norm -> SwiGLU -> residual."""
+    def __init__(self, emb_dim, query_num_heads, kv_num_heads, batch_size, kv_seq_len, hidden_dim,
+                eps=1e-5, dropout=0.1, dtype=torch.float32, device='cpu'):
+        super().__init__()
+        self.attn_norm = RMSNormilization(emb_dim, eps=eps)
+        self.ff_norm = RMSNormilization(emb_dim, eps=eps)
+        self.attn = kv_cache_group_query(
+            emb_dim=emb_dim,
+            query_num_heads=query_num_heads,
+            kv_num_heads=kv_num_heads,
+            batch_size=batch_size,
+            kv_seq_len=kv_seq_len,
+            dtype=dtype,
+            dropout=dropout,
+            device=device,
+        )
+        self.ff = FF_SwiGLU(emb_dim=emb_dim, hidden_dim=hidden_dim, device=device, dtype=dtype)
+    def forward(self, x, start_pos):
+        # Attention sub-layer; start_pos is passed straight through to the attention module.
+        attn_out = self.attn(self.attn_norm(x), start_pos, rope=True)
+        hidden = attn_out + x
+        # Feed-forward sub-layer with its own norm and residual.
+        ff_out = self.ff(self.ff_norm(hidden))
+        return ff_out + hidden
+class llama_2_Encoder(nn.Module):
+    """Sequential stack of ``num_layers`` llama_2_Block modules sharing one configuration."""
+    def __init__(self, num_layers, emb_dim, query_num_heads, kv_num_heads, batch_size, kv_seq_len,
+                hidden_dim, eps=1e-5, dropout=0.1, dtype=torch.float32, device='cpu'):
+        super().__init__()
+        # Every block is built from the same keyword arguments.
+        block_config = dict(
+            emb_dim=emb_dim, query_num_heads=query_num_heads, kv_num_heads=kv_num_heads,
+            batch_size=batch_size, kv_seq_len=kv_seq_len, hidden_dim=hidden_dim,
+            eps=eps, dropout=dropout, dtype=dtype, device=device,
+        )
+        self.layers = nn.ModuleList([llama_2_Block(**block_config) for _ in range(num_layers)])
+    def forward(self, x, start_pos):
+        # Each block receives the previous block's output and the same start_pos.
+        hidden = x
+        for block in self.layers:
+            hidden = block(hidden, start_pos)
+        return hidden
+class llama_2(nn.Module):
+    """Language model: token embedding -> llama_2_Encoder -> RMS norm -> vocabulary projection."""
+    def __init__(self, num_layers, emb_dim, query_num_heads, kv_num_heads, batch_size, kv_seq_len, vocab_size,
+                hidden_dim, eps=1e-5, dropout=0.1, dtype=torch.float32, device='cpu'):
+        super().__init__()
+        self.device = device
+        self.vocab_size = vocab_size
+        self.dtype = dtype
+        # Stored under the generic name seq_len (value is kv_seq_len).
+        self.seq_len = kv_seq_len
+        self.embedding = nn.Embedding(vocab_size, emb_dim, dtype=dtype, device=device)
+        self.llama_2_Encoder = llama_2_Encoder(
+            num_layers=num_layers,
+            emb_dim=emb_dim,
+            query_num_heads=query_num_heads,
+            kv_num_heads=kv_num_heads,
+            batch_size=batch_size,
+            kv_seq_len=kv_seq_len,
+            hidden_dim=hidden_dim,
+            eps=eps,
+            dropout=dropout,
+            dtype=dtype,
+            device=device,
+        )
+        self.final_norm = RMSNormilization(emb_dim, eps=eps)
+        # Bias-free projection from embedding space to vocabulary logits.
+        self.lm_head = nn.Linear(emb_dim, vocab_size, bias=False, dtype=dtype, device=device)
+    def forward(self, input_ids, start_pos=0):
+        """Return the lm_head output for ``input_ids``; ``start_pos`` is forwarded to the encoder."""
+        hidden = self.llama_2_Encoder(self.embedding(input_ids), start_pos)
+        return self.lm_head(self.final_norm(hidden))
+    @torch.no_grad()
+    def generate(self, prompt_ids, max_context_len=128, max_new_tokens=50, temperature=1.0, top_k=None, top_p=1.0, eos_token_id=None):
+        """Delegate to text_generate with this model; runs without gradient tracking."""
+        sampling_args = (max_context_len, max_new_tokens, temperature, top_k, top_p, eos_token_id)
+        return text_generate(self, prompt_ids, *sampling_args)

stackformer-0.1.3/stackformer/models/OpenAI.py ADDED Viewed

@@ -0,0 +1,177 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from stackformer.modules.Attention import Multi_Head_Attention
+from stackformer.modules.position_embedding import AbsolutePositionEmbedding
+from stackformer.modules.Normalization import LayerNorm
+from stackformer.modules.Feed_forward import FF_GELU
+from stackformer.generate import text_generate
+'''
+GPT-1
+Attention: MHA
+Mask: Causal
+position: absolute
+FF: GeLU
+Norm: post normalization (layer norm)
+'''
+# --- GPT_1 Encoder Block ---
+class GPT_1_Block(nn.Module):
+    """One GPT_1 layer: attention -> LayerNorm -> residual, then GELU feed-forward -> LayerNorm -> residual."""
+    def __init__(self, emb_dim, num_heads, dropout, hidden_dim, eps=1e-5, device='cpu', dtype=torch.float32):
+        super().__init__()
+        # Shared placement arguments forwarded to the sub-modules that accept them.
+        placement = dict(device=device, dtype=dtype)
+        self.attention = Multi_Head_Attention(emb_dim, num_heads, dropout, **placement)
+        self.norm1 = LayerNorm(emb_dim, eps=eps)
+        self.FF_GELU = FF_GELU(emb_dim, hidden_dim, dropout, **placement)
+        self.norm2 = LayerNorm(emb_dim, eps=eps)
+    def forward(self, x):
+        # The norm is applied to the sub-layer output before the residual is added.
+        attn_out = self.norm1(self.attention(x))
+        hidden = attn_out + x
+        ff_out = self.norm2(self.FF_GELU(hidden))
+        return ff_out + hidden
+# --- GPT_1 Encoder ---
+class GPT_1_Encoder(nn.Module):
+    """Sequential stack of ``num_layers`` GPT_1_Block modules."""
+    def __init__(self, num_layers, emb_dim, num_heads, dropout, hidden_dim, eps=1e-5, device='cpu', dtype=torch.float32):
+        super().__init__()
+        blocks = []
+        for _ in range(num_layers):
+            blocks.append(GPT_1_Block(emb_dim, num_heads, dropout, hidden_dim, eps, device=device, dtype=dtype))
+        self.layers = nn.ModuleList(blocks)
+    def forward(self, x):
+        # Feed the output of each block into the next one.
+        hidden = x
+        for block in self.layers:
+            hidden = block(hidden)
+        return hidden
+class GPT_1(nn.Module):
+    """Language model: token + absolute position embeddings -> GPT_1_Encoder -> LayerNorm -> vocabulary projection."""
+    def __init__(self, vocab_size, num_layers, emb_dim, num_heads, seq_len,
+            dropout, hidden_dim, eps=1e-5, device='cpu', dtype=torch.float32):
+        super().__init__()
+        self.device = device
+        self.dtype = dtype
+        self.seq_len = seq_len
+        # Token ids -> embedding vectors.
+        self.embedding = nn.Embedding(vocab_size, emb_dim, dtype=self.dtype, device=self.device)
+        # Absolute position embedding sized for seq_len positions.
+        self.position_embedding = AbsolutePositionEmbedding(emb_dim=emb_dim, seq_len=seq_len)
+        # Stack of GPT_1 blocks.
+        self.encoder = GPT_1_Encoder(
+            num_layers=num_layers,
+            emb_dim=emb_dim,
+            num_heads=num_heads,
+            dropout=dropout,
+            hidden_dim=hidden_dim,
+            eps=eps,
+            device=self.device,
+            dtype=self.dtype,
+        )
+        # Normalisation applied after the last block.
+        self.final_norm = LayerNorm(emb_dim, eps=eps)
+        # Bias-free projection from embedding space to vocabulary logits.
+        self.lm_head = nn.Linear(emb_dim, vocab_size, bias=False, dtype=self.dtype, device=self.device)
+    def forward(self, x):
+        """Return the lm_head output for the token ids in ``x``."""
+        # Both embeddings are computed from the same id tensor and summed.
+        hidden = self.embedding(x) + self.position_embedding(x)
+        hidden = self.encoder(hidden)
+        return self.lm_head(self.final_norm(hidden))
+    @torch.no_grad()
+    def generate(self, prompt_ids, max_context_len=128, max_new_tokens=50, temperature=1.0, top_k=None, top_p=1.0, eos_token_id=None):
+        """Delegate to text_generate with this model; runs without gradient tracking."""
+        sampling_args = (max_context_len, max_new_tokens, temperature, top_k, top_p, eos_token_id)
+        return text_generate(self, prompt_ids, *sampling_args)
+'''
+GPT-2
+Attention: MHA
+Mask: Causal
+position: absolute
+FF: GeLU
+Norm: pre normalization (layer norm)
+'''
+# --- Encoder Block ---
+class GPT_2_Block(nn.Module):
+    """One GPT_2 layer: LayerNorm -> attention -> residual, then LayerNorm -> GELU feed-forward -> residual."""
+    def __init__(self, emb_dim, num_heads, dropout, hidden_dim, eps=1e-5, device='cpu', dtype=torch.float32):
+        super().__init__()
+        # Shared placement arguments forwarded to the sub-modules that accept them.
+        placement = dict(device=device, dtype=dtype)
+        self.attention = Multi_Head_Attention(emb_dim, num_heads, dropout, **placement)
+        self.norm1 = LayerNorm(emb_dim, eps=eps)
+        self.FF_GELU = FF_GELU(emb_dim, hidden_dim, dropout, **placement)
+        self.norm2 = LayerNorm(emb_dim, eps=eps)
+    def forward(self, x):
+        # The norm is applied to the sub-layer input; the residual uses the un-normalised value.
+        attn_out = self.attention(self.norm1(x))
+        hidden = attn_out + x
+        ff_out = self.FF_GELU(self.norm2(hidden))
+        return ff_out + hidden
+# --- Encoder ---
+class GPT_2_Encoder(nn.Module):
+    """Sequential stack of ``num_layers`` GPT_2_Block modules."""
+    def __init__(self, num_layers, emb_dim, num_heads, dropout, hidden_dim, eps=1e-5, device='cpu', dtype=torch.float32):
+        super().__init__()
+        blocks = []
+        for _ in range(num_layers):
+            blocks.append(GPT_2_Block(emb_dim, num_heads, dropout, hidden_dim, eps, device=device, dtype=dtype))
+        self.layers = nn.ModuleList(blocks)
+    def forward(self, x):
+        # Feed the output of each block into the next one.
+        hidden = x
+        for block in self.layers:
+            hidden = block(hidden)
+        return hidden
+class GPT_2(nn.Module):
+    """Language model: token + absolute position embeddings -> GPT_2_Encoder -> LayerNorm -> vocabulary projection."""
+    def __init__(self, vocab_size, num_layers, emb_dim, num_heads, seq_len,
+            dropout, hidden_dim, eps=1e-5, device='cpu', dtype=torch.float32):
+        super().__init__()
+        self.device = device
+        self.dtype = dtype
+        self.seq_len = seq_len
+        # Token ids -> embedding vectors.
+        self.embedding = nn.Embedding(vocab_size, emb_dim, dtype=self.dtype, device=self.device)
+        # Absolute position embedding sized for seq_len positions.
+        self.position_embedding = AbsolutePositionEmbedding(emb_dim=emb_dim, seq_len=seq_len)
+        # Stack of GPT_2 blocks.
+        self.encoder = GPT_2_Encoder(
+            num_layers=num_layers,
+            emb_dim=emb_dim,
+            num_heads=num_heads,
+            dropout=dropout,
+            hidden_dim=hidden_dim,
+            eps=eps,
+            device=self.device,
+            dtype=self.dtype,
+        )
+        # Normalisation applied after the last block.
+        self.final_norm = LayerNorm(emb_dim, eps=eps)
+        # Bias-free projection from embedding space to vocabulary logits.
+        self.lm_head = nn.Linear(emb_dim, vocab_size, bias=False, dtype=self.dtype, device=self.device)
+    def forward(self, x):
+        """Return the lm_head output for the token ids in ``x``."""
+        # Both embeddings are computed from the same id tensor and summed.
+        hidden = self.embedding(x) + self.position_embedding(x)
+        hidden = self.encoder(hidden)
+        return self.lm_head(self.final_norm(hidden))
+    @torch.no_grad()
+    def generate(self, prompt_ids, max_context_len=128, max_new_tokens=50, temperature=1.0, top_k=None, top_p=1.0, eos_token_id=None):
+        """Delegate to text_generate with this model; runs without gradient tracking."""
+        sampling_args = (max_context_len, max_new_tokens, temperature, top_k, top_p, eos_token_id)
+        return text_generate(self, prompt_ids, *sampling_args)

stackformer-0.1.3/stackformer/models/Transformer.py ADDED Viewed

@@ -0,0 +1,104 @@
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from stackformer.modules.Attention import Multi_Head_Attention, Cross_MultiHead_Attention
+from stackformer.modules.position_embedding import SinusoidalPositionalEmbedding
+from stackformer.modules.Feed_forward import FF_ReLU
+from stackformer.modules.Normalization import LayerNorm
+class Encoder(nn.Module):
+    """Encoder layer: self-attention -> LayerNorm -> residual, then ReLU feed-forward -> LayerNorm -> residual."""
+    def __init__(self, emb_dim, num_heads, dropout, hidden_dim, eps=1e-5, device='cpu', dtype=torch.float32):
+        super().__init__()
+        # Shared placement arguments forwarded to every sub-module.
+        placement = dict(device=device, dtype=dtype)
+        self.attention = Multi_Head_Attention(emb_dim, num_heads, dropout, **placement)
+        self.norm1 = LayerNorm(emb_dim, eps=eps, **placement)
+        self.ff_relu = FF_ReLU(emb_dim, hidden_dim, dropout, **placement)
+        self.norm2 = LayerNorm(emb_dim, eps=eps, **placement)
+    def forward(self, x):
+        # The norm is applied to the sub-layer output before the residual is added.
+        attn_out = self.norm1(self.attention(x))
+        hidden = attn_out + x
+        ff_out = self.norm2(self.ff_relu(hidden))
+        return ff_out + hidden
+class Decoder(nn.Module):
+    """Decoder layer: self-attention, cross-attention over the encoder output, then ReLU feed-forward.
+
+    Each of the three sub-layers is followed by its own LayerNorm and a residual add.
+    """
+    def __init__(self, emb_dim, num_heads, dropout, hidden_dim, eps=1e-5, device='cpu', dtype=torch.float32):
+        super().__init__()
+        # Shared placement arguments forwarded to every sub-module.
+        placement = dict(device=device, dtype=dtype)
+        self.attention = Multi_Head_Attention(emb_dim, num_heads, dropout, **placement)
+        self.norm1 = LayerNorm(emb_dim, eps=eps, **placement)
+        self.cross_attention = Cross_MultiHead_Attention(emb_dim, num_heads, dropout, **placement)
+        self.norm2 = LayerNorm(emb_dim, eps=eps, **placement)
+        self.ff_relu = FF_ReLU(emb_dim, hidden_dim, dropout, **placement)
+        self.norm3 = LayerNorm(emb_dim, eps=eps, **placement)
+    def forward(self, x, enc_output):
+        # Self-attention over the decoder input.
+        self_out = self.norm1(self.attention(x))
+        hidden = self_out + x
+        # Cross-attention: the encoder output is supplied as the context.
+        cross_out = self.norm2(self.cross_attention(hidden, context=enc_output))
+        hidden = cross_out + hidden
+        # Position-wise feed-forward.
+        ff_out = self.norm3(self.ff_relu(hidden))
+        return ff_out + hidden
+class transformer(nn.Module):
+    """Encoder-decoder model built from Encoder and Decoder layers.
+
+    Source and target ids share one token embedding and one sinusoidal
+    positional embedding; the decoder output is normalised and projected
+    to vocabulary size.
+    """
+    def __init__(self, vocab_size, emb_dim, num_heads, dropout, hidden_dim,
+                encoder_layers, decoder_layers, seq_len, eps=1e-5, device='cpu', dtype=torch.float32,
+                ):
+        super().__init__()
+        self.encoder_layers = encoder_layers
+        self.decoder_layers = decoder_layers
+        self.token_emb = nn.Embedding(vocab_size, emb_dim, device=device, dtype=dtype)
+        self.pos = SinusoidalPositionalEmbedding(seq_len=seq_len, emb_dim=emb_dim)
+        # Keyword arguments common to every encoder and decoder layer.
+        layer_kwargs = dict(eps=eps, device=device, dtype=dtype)
+        enc_blocks = []
+        for _ in range(encoder_layers):
+            enc_blocks.append(Encoder(emb_dim, num_heads, dropout, hidden_dim, **layer_kwargs))
+        self.encoder_stack = nn.ModuleList(enc_blocks)
+        dec_blocks = []
+        for _ in range(decoder_layers):
+            dec_blocks.append(Decoder(emb_dim, num_heads, dropout, hidden_dim, **layer_kwargs))
+        self.decoder_stack = nn.ModuleList(dec_blocks)
+        # Normalisation applied to the decoder output.
+        self.final_norm = LayerNorm(emb_dim, eps=eps, device=device, dtype=dtype)
+        # Projection from embedding space to vocabulary size.
+        self.out_proj = nn.Linear(emb_dim, vocab_size, device=device, dtype=dtype)
+    def encoder(self, x):
+        """Embed ``x`` and run it through every encoder layer in order."""
+        hidden = self.token_emb(x) + self.pos(x)
+        for layer in self.encoder_stack:
+            hidden = layer(hidden)
+        return hidden
+    def decoder(self, x, enc_output):
+        """Embed ``x`` and run it through every decoder layer, each also given ``enc_output``."""
+        hidden = self.token_emb(x) + self.pos(x)
+        for layer in self.decoder_stack:
+            hidden = layer(hidden, enc_output)
+        return hidden
+    def forward(self, source, target):
+        """Encode ``source``, decode ``target`` against it, and return the projected output."""
+        memory = self.encoder(source)
+        decoded = self.decoder(target, memory)
+        return self.out_proj(self.final_norm(decoded))

Stackformer 0.1.2__tar.gz → 0.1.3__tar.gz

Stackformer 0.1.2__tar.gz → 0.1.3__tar.gz