PyPI - Stackformer - Versions diffs - 0.1.0__tar.gz → 0.1.2__tar.gz - Mend

Stackformer 0.1.0__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

{stackformer-0.1.0 → stackformer-0.1.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: Stackformer
-Version: 0.1.0
+Version: 0.1.2
 Summary: Modular transformer blocks built in PyTorch
 Home-page: https://github.com/Gurumurthy30/Stackformer
 Author: Gurumurthy
@@ -12,7 +12,7 @@ Project-URL: Discussions, https://github.com/Gurumurthy30/Stackformer/discussion
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: torch>=2.6
+Requires-Dist: torch<2.6,>=2.0
 Requires-Dist: tqdm>=4.67
 Dynamic: author
 Dynamic: home-page
@@ -61,11 +61,16 @@ stackformer/ \
 ## 💻 Installation
-Clone the repository and install in development mode:
+✅ Method 1: Install from PyPI:
+```bash
+pip install Stackformer
+import stackformer
+```
+🔧 Method 2: Clone the repository:
 ```bash
 git clone https://github.com/Gurumurthy30/Stackformer
-cd transformers
+cd Stackformer
 pip install -e .
 ```

{stackformer-0.1.0 → stackformer-0.1.2}/README.md RENAMED Viewed

@@ -40,11 +40,16 @@ stackformer/ \
 ## 💻 Installation
-Clone the repository and install in development mode:
+✅ Method 1: Install from PyPI:
+```bash
+pip install Stackformer
+import stackformer
+```
+🔧 Method 2: Clone the repository:
 ```bash
 git clone https://github.com/Gurumurthy30/Stackformer
-cd transformers
+cd Stackformer
 pip install -e .
 ```

{stackformer-0.1.0 → stackformer-0.1.2}/Stackformer.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: Stackformer
-Version: 0.1.0
+Version: 0.1.2
 Summary: Modular transformer blocks built in PyTorch
 Home-page: https://github.com/Gurumurthy30/Stackformer
 Author: Gurumurthy
@@ -12,7 +12,7 @@ Project-URL: Discussions, https://github.com/Gurumurthy30/Stackformer/discussion
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: torch>=2.6
+Requires-Dist: torch<2.6,>=2.0
 Requires-Dist: tqdm>=4.67
 Dynamic: author
 Dynamic: home-page
@@ -61,11 +61,16 @@ stackformer/ \
 ## 💻 Installation
-Clone the repository and install in development mode:
+✅ Method 1: Install from PyPI:
+```bash
+pip install Stackformer
+import stackformer
+```
+🔧 Method 2: Clone the repository:
 ```bash
 git clone https://github.com/Gurumurthy30/Stackformer
-cd transformers
+cd Stackformer
 pip install -e .
 ```

stackformer-0.1.2/Stackformer.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,22 @@
+LICENSE
+README.md
+pyproject.toml
+setup.py
+Stackformer.egg-info/PKG-INFO
+Stackformer.egg-info/SOURCES.txt
+Stackformer.egg-info/dependency_links.txt
+Stackformer.egg-info/requires.txt
+Stackformer.egg-info/top_level.txt
+stackformer/__init__.py
+stackformer/trainer.py
+stackformer/models/Meta.py
+stackformer/models/OpenAI.py
+stackformer/models/Transformer.py
+stackformer/models/__init__.py
+stackformer/modules/Attention.py
+stackformer/modules/Feed_forward.py
+stackformer/modules/Normalization.py
+stackformer/modules/__init__.py
+stackformer/modules/mask.py
+stackformer/modules/position_embedding.py
+stackformer/modules/tokenizer.py

stackformer-0.1.2/Stackformer.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,2 @@
+torch<2.6,>=2.0
+tqdm>=4.67

stackformer-0.1.2/Stackformer.egg-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+stackformer

{stackformer-0.1.0 → stackformer-0.1.2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "Stackformer"
-version = "0.1.0"
+version = "0.1.2"
 description = "Modular transformer blocks built in PyTorch"
 readme = "README.md"
 requires-python = ">=3.9"
@@ -11,7 +11,7 @@ authors = [
 ]
 dependencies = [
-  "torch>=2.6",
+  "torch>=2.0,<2.6",
   "tqdm>=4.67"
 ]

{stackformer-0.1.0 → stackformer-0.1.2}/setup.py RENAMED Viewed

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 setup(
     name="Stackformer",
-    version="0.1.0",
+    version="0.1.2",
     description="Modular transformer blocks built in PyTorch",
     # long_description=open("README.md", "r", encoding="utf-8").read(),
     # long_description_content_type="text/markdown",
@@ -18,7 +18,7 @@ setup(
     python_requires=">=3.9",
     packages=find_packages(exclude=["tests", "examples"]),
     install_requires=[
-        "torch>=2.6",
+        "torch>=2.0,<2.6",
         "tqdm>=4.67",
     ],
     classifiers=[

stackformer-0.1.2/stackformer/__init__.py ADDED Viewed

@@ -0,0 +1,38 @@
+# --- Tokenizer ---
+from .modules.tokenizer import Embedding_using_tiktoken
+# --- Position Embeddings ---
+from .modules.position_embedding import AbsolutePositionEmbedding
+from .modules.position_embedding import SinusoidalPositionalEmbedding
+from .modules.position_embedding import RoPE
+# --- Attention mechanisms ---
+from .modules.Attention import Self_Attention
+from .modules.Attention import Multi_Head_Attention
+from .modules.Attention import Cross_MultiHead_Attention
+from .modules.Attention import Multi_query_Attention
+from .modules.Attention import Group_query_Attention
+from .modules.Attention import Linear_Attention
+from .modules.Attention import Multi_latent_Attention
+from .modules.Attention import Local_Attention
+from .modules.Attention import kv_cache_multihead
+from .modules.Attention import kv_cache_group_query
+# --- Normalization layers ---
+from .modules.Normalization import LayerNorm
+from .modules.Normalization import RMSNormilization
+# --- Feed Forward layers ---
+from .modules.Feed_forward import FF_ReLU
+from .modules.Feed_forward import FF_GELU
+from .modules.Feed_forward import FF_LeakyReLU
+from .modules.Feed_forward import FF_Sigmoid
+from .modules.Feed_forward import FF_SiLU
+# --- Model ---
+from .models.OpenAI import GPT_2
+from .models.Meta import Llama_2
+from .models.Transformer import transformer
+# --- Trainer ---
+from .trainer import Trainer

stackformer-0.1.2/stackformer/models/Meta.py ADDED Viewed

@@ -0,0 +1,213 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+def precompute_theta_position_frequency(head_dim, seq_len, device='cpu', theta=10000.0):
+    assert head_dim % 2 == 0, "head_dim must be even"
+    theta_numerator = torch.arange(0, head_dim, 2, device=device)
+    inv_freq = 1.0 / (theta ** (theta_numerator / head_dim))
+    m = torch.arange(seq_len, device=device)
+    freqs = torch.outer(m, inv_freq)
+    freq_complex = torch.polar(torch.ones_like(freqs), freqs)
+    return freq_complex
+def apply_rotry_position_embedding(x, freq_complex, device='cpu', dtype=torch.float32):
+    batch_size, seq_len, num_head, emb_dim = x.shape
+    assert emb_dim % 2 == 0, "emb_dim must be even"
+    x_reshaped = x.view(batch_size, seq_len, num_head, emb_dim // 2, 2).to(device=device, dtype=dtype)
+    x_complex = torch.view_as_complex(x_reshaped)
+    freq_complex = freq_complex[:seq_len].unsqueeze(0).unsqueeze(2).to(device=device)
+    x_rotated = x_complex * freq_complex
+    x_out = torch.view_as_real(x_rotated).contiguous().view(batch_size, seq_len, num_head, emb_dim)
+    return x_out.to(device=device, dtype=dtype)
+class kv_cache_group_query(nn.Module):
+    def __init__(self, emb_dim, query_num_heads, kv_num_heads, batch_size, kv_seq_len,
+                device='cpu', dtype=torch.float32, dropout=0.1):
+        super().__init__()
+        assert emb_dim % query_num_heads == 0, "Embedding dim must be divisible by query heads"
+        assert query_num_heads % kv_num_heads == 0, "query heads must be divisible by kv heads"
+        self.device = device
+        self.dtype = dtype
+        self.emb_dim = emb_dim
+        self.query_num_heads = query_num_heads
+        self.kv_num_heads = kv_num_heads
+        self.head_dim = emb_dim // query_num_heads
+        self.num_queries_per_kv = query_num_heads // kv_num_heads
+        self.kv_seq_len = kv_seq_len
+        self.query = nn.Linear(emb_dim, emb_dim, bias=False, dtype=dtype, device=device)
+        self.key = nn.Linear(emb_dim, kv_num_heads * self.head_dim, bias=False, dtype=dtype, device=device)
+        self.value = nn.Linear(emb_dim, kv_num_heads * self.head_dim, bias=False, dtype=dtype, device=device)
+        self.out_proj = nn.Linear(query_num_heads * self.head_dim, emb_dim, dtype=dtype, device=device)
+        self.dropout = nn.Dropout(dropout)
+        self.register_buffer("cache_keys", torch.zeros(batch_size, kv_seq_len, kv_num_heads, self.head_dim, device=device, dtype=dtype))
+        self.register_buffer("cache_value", torch.zeros(batch_size, kv_seq_len, kv_num_heads, self.head_dim, device=device, dtype=dtype))
+    def forward(self, x, start_pos):
+        batch_size, seq_len, _ = x.shape
+        xq = self.query(x).view(batch_size, seq_len, self.query_num_heads, self.head_dim)
+        xk = self.key(x).view(batch_size, seq_len, self.kv_num_heads, self.head_dim)
+        xv = self.value(x).view(batch_size, seq_len, self.kv_num_heads, self.head_dim)
+        freq_q = precompute_theta_position_frequency(head_dim=self.head_dim, seq_len=seq_len, device=self.device)
+        xq = apply_rotry_position_embedding(xq, freq_q, device=self.device, dtype=self.dtype)
+        freq_k = precompute_theta_position_frequency(head_dim=self.head_dim, seq_len=self.kv_seq_len, device=self.device)
+        xk = apply_rotry_position_embedding(xk, freq_k, device=self.device, dtype=self.dtype)
+        self.cache_keys[:, start_pos:start_pos + seq_len] = xk
+        self.cache_value[:, start_pos:start_pos + seq_len] = xv
+        xk_full = self.cache_keys[:, :start_pos + seq_len]
+        xv_full = self.cache_value[:, :start_pos + seq_len]
+        query = xq.transpose(1, 2)
+        key = xk_full.transpose(1, 2).repeat_interleave(self.num_queries_per_kv, dim=1)
+        value = xv_full.transpose(1, 2).repeat_interleave(self.num_queries_per_kv, dim=1)
+        attn_scores = torch.matmul(query, key.transpose(2, 3)) / (self.head_dim ** 0.5)
+        causal_mask = torch.triu(torch.ones(seq_len, attn_scores.shape[-1], dtype=torch.bool, device=self.device), diagonal=1)
+        attn_scores.masked_fill_(causal_mask[None, None, :, :], float('-inf'))
+        attn_weights = F.softmax(attn_scores, dim=-1)
+        out = torch.matmul(attn_weights, value)
+        out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, self.query_num_heads * self.head_dim)
+        return self.dropout(self.out_proj(out))
+class RMSNormilization(nn.Module):
+    def __init__(self, emb_dim, eps=1e-5, device='cpu', dtype=torch.float32):
+        super().__init__()
+        self.eps = eps
+        self.scale = nn.Parameter(torch.ones(emb_dim, dtype=dtype, device=device))
+    def forward(self, x):
+        norm = x.norm(2, dim=-1, keepdim=True)
+        rms = norm / (x.shape[-1] ** 0.5)
+        return (x / (rms + self.eps)) * self.scale
+class FF_SiLU(nn.Module):
+    def __init__(self, emb_dim, hidden_dim, device='cpu', dtype=torch.float32):
+        super().__init__()
+        self.silu = nn.Sequential(
+            nn.Linear(emb_dim, hidden_dim, device=device, dtype=dtype),
+            nn.SiLU(),
+            nn.Linear(hidden_dim, emb_dim, device=device, dtype=dtype),
+        )
+    def forward(self, x):
+        return self.silu(x)
+class block(nn.Module):
+    def __init__(self, emb_dim, query_num_heads, kv_num_heads, batch_size, kv_seq_len, hidden_dim,
+                 eps=1e-5, dropout=0.1, dtype=torch.float32, device='cpu'):
+        super().__init__()
+        self.attn_norm = RMSNormilization(emb_dim=emb_dim, eps=eps, device=device, dtype=dtype)
+        self.ff_norm = RMSNormilization(emb_dim=emb_dim, eps=eps, device=device, dtype=dtype)
+        self.attn = kv_cache_group_query(emb_dim=emb_dim, query_num_heads=query_num_heads, kv_num_heads=kv_num_heads,
+                                        batch_size=batch_size, kv_seq_len=kv_seq_len, dtype=dtype,
+                                        dropout=dropout, device=device)
+        self.ff = FF_SiLU(emb_dim=emb_dim, hidden_dim=hidden_dim, device=device, dtype=dtype)
+    def forward(self, x, start_pos):
+        residual = x
+        x = self.attn_norm(x)
+        x = self.attn(x, start_pos)
+        x = x + residual
+        residual = x
+        x = self.ff_norm(x)
+        x = self.ff(x)
+        x = x + residual
+        return x
+class Encoder(nn.Module):
+    def __init__(self, num_layers, emb_dim, query_num_heads, kv_num_heads, batch_size, kv_seq_len,
+                hidden_dim, eps=1e-5, dropout=0.1, dtype=torch.float32, device='cpu'):
+        super().__init__()
+        self.layers = nn.ModuleList([
+            block(emb_dim=emb_dim, query_num_heads=query_num_heads, kv_num_heads=kv_num_heads,
+                batch_size=batch_size, kv_seq_len=kv_seq_len, hidden_dim=hidden_dim,
+                eps=eps, dropout=dropout, dtype=dtype, device=device)
+            for _ in range(num_layers)
+        ])
+    def forward(self, x, start_pos):
+        for layer in self.layers:
+            x = layer(x, start_pos)
+        return x
+class Llama_2(nn.Module):
+    def __init__(self, num_layers, emb_dim, query_num_heads, kv_num_heads, batch_size, kv_seq_len, vocab_size,
+                hidden_dim, eps=1e-5, dropout=0.1, dtype=torch.float32, device='cpu'):
+        super().__init__()
+        self.device = device
+        self.vocab_size = vocab_size
+        self.dtype = dtype
+        self.seq_len = kv_seq_len  # For generation slicing
+        self.embedding = nn.Embedding(vocab_size, emb_dim, dtype=dtype, device=device)
+        self.encoder = Encoder(num_layers=num_layers, emb_dim=emb_dim, query_num_heads=query_num_heads,
+                            kv_num_heads=kv_num_heads, batch_size=batch_size, kv_seq_len=kv_seq_len,
+                            hidden_dim=hidden_dim, eps=eps, dropout=dropout, dtype=dtype, device=device)
+        self.final_norm = RMSNormilization(emb_dim, eps=eps, device=device, dtype=dtype)
+        self.lm_head = nn.Linear(emb_dim, vocab_size, bias=False, dtype=dtype, device=device)
+    def forward(self, input_ids, start_pos=0):
+        x = self.embedding(input_ids)
+        x = self.encoder(x, start_pos)
+        x = self.final_norm(x)
+        logits = self.lm_head(x)
+        return logits
+    @torch.no_grad()
+    def generate(self, prompt_ids, max_new_tokens=50, temperature=1.0, top_k=None, top_p=1.0):
+        self.eval()
+        if prompt_ids.dim() == 1:
+            prompt_ids = prompt_ids.unsqueeze(0)
+        generated = prompt_ids.clone()
+        for step in range(max_new_tokens):
+            input_ids = generated[:, -self.seq_len:]
+            logits = self.forward(input_ids, start_pos=step)  # Correct start_pos
+            logits = logits[:, -1, :]
+            if temperature != 1.0:
+                logits = logits / temperature
+            if top_k is not None and top_k > 0:
+                topk_vals, topk_indices = torch.topk(logits, top_k)
+                mask = torch.full_like(logits, float('-inf'))
+                mask.scatter_(dim=-1, index=topk_indices, src=topk_vals)
+                logits = mask
+            if top_p < 1.0:
+                sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
+                probs = F.softmax(sorted_logits, dim=-1)
+                cum_probs = torch.cumsum(probs, dim=-1)
+                sorted_mask = cum_probs > top_p
+                sorted_mask[..., 1:] = sorted_mask[..., :-1].clone()
+                sorted_mask[..., 0] = 0
+                indices_to_remove = sorted_mask.scatter(dim=-1, index=sorted_indices, src=sorted_mask)
+                logits = logits.masked_fill(indices_to_remove, float('-inf'))
+            probs = F.softmax(logits, dim=-1)
+            next_token = torch.multinomial(probs, num_samples=1)
+            generated = torch.cat([generated, next_token], dim=-1)
+        return generated

stackformer-0.1.0/models/GPT_2.py → stackformer-0.1.2/stackformer/models/OpenAI.py RENAMED Viewed

@@ -22,7 +22,8 @@ class SinusoidalPositionalEmbedding(nn.Module):
     def forward(self, x):
         # x shape: (batch_size, seq_len, emb_dim) or (batch_size, seq_len)
         batch_size, seq_len = x.shape[0], x.shape[1]
-        return self.pe[:seq_len].unsqueeze(0).expand(batch_size, seq_len, -1).to(x.device)
+        out = self.pe[:seq_len].unsqueeze(0).expand(batch_size, seq_len, -1)
+        return out.to(device=x.device,dtype=x.dtype)
 # --- Multi Head Attention ---
 class MultiHeadAttention(nn.Module):
@@ -135,11 +136,14 @@ class Encoder(nn.Module):
             x = layer(x)
         return x
-class GPTModel(nn.Module):
+class GPT_2(nn.Module):
     def __init__(self, vocab_size, num_layers, Emb_dim, num_heads, seq_len,
             dropout, hidden_dim, eps=1e-5, device='cpu', dtype=torch.float32):
         super().__init__()
+        self.device = device
+        self.dtype = dtype
+        self.seq_len = seq_len
         # --- Token embedding ---
         self.embedding = nn.Embedding(vocab_size, Emb_dim, dtype=self.dtype, device=self.device)

stackformer-0.1.2/stackformer/models/Transformer.py ADDED Viewed

@@ -0,0 +1,238 @@
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# --- position embedding ---
+class SinusoidalPositionalEmbedding(nn.Module):
+    def __init__(self, seq_len, emb_dim):
+        super().__init__()
+        self.seq_len = seq_len
+        self.emb_dim = emb_dim
+        position = torch.arange(0, seq_len).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, emb_dim, 2) * -(math.log(10000.0) / emb_dim))
+        pe = torch.zeros(seq_len, emb_dim)
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        self.register_buffer("pe", pe)
+    def forward(self, x):
+        # x shape: (batch_size, seq_len, emb_dim) or (batch_size, seq_len)
+        batch_size, seq_len = x.shape[0], x.shape[1]
+        out = self.pe[:seq_len].unsqueeze(0).expand(batch_size, seq_len, -1)
+        return out.to(device=x.device,dtype=x.dtype)
+# --- multi-head attention ---
+class Multi_Head_Attention(nn.Module):
+    def __init__(self, emb_dim, num_heads, dropout, device='cpu',dtype=torch.float32):
+        super().__init__()
+        assert emb_dim % num_heads == 0, "emb_dim must be divisible by num_heads"
+        self.emb_dim = emb_dim
+        self.num_heads = num_heads
+        self.device = device
+        self.head_dim = emb_dim // num_heads
+        self.key = nn.Linear(emb_dim, emb_dim, bias=False,dtype=dtype,device=device)
+        self.query = nn.Linear(emb_dim, emb_dim, bias=False,dtype=dtype,device=device)
+        self.value = nn.Linear(emb_dim, emb_dim, bias=False,dtype=dtype,device=device)
+        self.scale = torch.tensor(self.head_dim ** 0.5,device=device,dtype=dtype)
+        self.out_proj = nn.Linear(emb_dim, emb_dim,dtype=dtype,device=device)
+        self.dropout = nn.Dropout(dropout)
+    def forward(self, x):
+        Batch_size, Seq_len, _ = x.shape
+        # Generate Q, K, V and reshape for multi-head attention
+        Keys = self.key(x).view(Batch_size, Seq_len, self.num_heads, self.head_dim).transpose(1, 2)  # (Batch_size, nh, Seq_len, hd)
+        Querys = self.query(x).view(Batch_size, Seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+        Values = self.value(x).view(Batch_size, Seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+        # Compute attention scores
+        scores = (Querys @ Keys.transpose(-2, -1)) / self.scale  # (Batch_size, nh, Seq_len, Seq_len)
+        # Apply causal mask if requested
+        causal_mask = torch.triu(torch.ones(Seq_len, Seq_len, dtype=torch.bool, device=self.device), diagonal=1)
+        scores = scores.masked_fill_(causal_mask[None, None, :, :], float('-inf'))
+        # Apply softmax and dropout
+        attn = F.softmax(scores, dim=-1)
+        attn = self.dropout(attn)
+        # Apply attention to values
+        out = attn @ Values  # (Batch_size, nh, Seq_len, hd)
+        # Concatenate heads and project
+        out = out.transpose(1, 2).contiguous().view(Batch_size, Seq_len, self.emb_dim)  # (Batch_size, Seq_len, emb_dim)
+        return self.out_proj(out)
+# --- cross-attention ---
+class Cross_MultiHead_Attention(nn.Module):
+    def __init__(self, emb_dim, num_heads, dropout,device='cpu', dtype=torch.float32):
+        super().__init__()
+        assert emb_dim % num_heads == 0, "emb_dim must be divisible by num_heads"
+        self.emb_dim = emb_dim
+        self.device = device
+        self.num_heads = num_heads
+        self.head_dim = emb_dim // num_heads
+        # Querys, Key, Value projections
+        self.query = nn.Linear(emb_dim, emb_dim, bias=False,dtype=dtype,device=device)
+        self.key = nn.Linear(emb_dim, emb_dim, bias=False,dtype=dtype,device=device)
+        self.value = nn.Linear(emb_dim, emb_dim, bias=False,dtype=dtype,device=device)
+        self.scale = torch.tensor(self.head_dim ** 0.5,device=device,dtype=dtype)
+        self.out_proj = nn.Linear(emb_dim, emb_dim,dtype=dtype,device=device)
+        self.dropout = nn.Dropout(dropout)
+    def forward(self, x, context=None):
+        Batch_size, query_seq_len, _ = x.shape
+        context = x if context is None else context  # self-attention fallback
+        KV_seq_len = context.shape[1]
+        # Project Q, K, V
+        Querys = self.query(x).view(Batch_size, query_seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+        Keys = self.key(context).view(Batch_size, KV_seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+        Values = self.value(context).view(Batch_size, KV_seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+        # Attention scores
+        scores = (Querys @ Keys.transpose(-2, -1)) / self.scale  # (Batch_size, nh, query_seq_len, KV_seq_len)
+        causal_mask = torch.triu(torch.ones(query_seq_len, query_seq_len, dtype=torch.bool, device=self.device), diagonal=1)
+        scores = scores.masked_fill_(causal_mask[None, None, :, :], float('-inf'))
+        attn = F.softmax(scores, dim=-1)
+        attn = self.dropout(attn)
+        out = attn @ Values
+        out = out.transpose(1, 2).contiguous().view(Batch_size, query_seq_len, self.emb_dim)  # (Batch_size, query_seq_len, emb_dim)
+        return self.out_proj(out)
+# --- Feed Forward ---
+class FF_ReLU(nn.Module):
+    def __init__(self, emb_dim, hidden_dim, dropout=0.1, device='cpu', dtype=torch.float32):
+        super().__init__()
+        self.relu = nn.Sequential(
+            nn.Linear(emb_dim, hidden_dim, device=device, dtype=dtype),
+            nn.ReLU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim, emb_dim, device=device, dtype=dtype),
+        )
+    def forward(self, x):
+        return self.relu(x)
+class LayerNorm(nn.Module):
+    def __init__(self, emb_dim, eps=1e-5, device='cpu', dtype=torch.float32):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(emb_dim, device=device, dtype=dtype))
+        self.bias = nn.Parameter(torch.zeros(emb_dim, device=device, dtype=dtype))
+    def forward(self, x):
+        mean = x.mean(dim=-1, keepdim=True)
+        var = x.var(dim=-1, keepdim=True, unbiased=False)
+        norm_x = (x - mean) / torch.sqrt(var + self.eps)
+        return norm_x * self.weight + self.bias
+class Encoder(nn.Module):
+    def __init__(self, emb_dim, num_heads, dropout, hidden_dim, eps=1e-5, device='cpu', dtype=torch.float32):
+        super().__init__()
+        self.attention = Multi_Head_Attention(emb_dim, num_heads, dropout, device=device, dtype=dtype)
+        self.norm1 = LayerNorm(emb_dim, eps=eps, device=device, dtype=dtype)
+        self.ff_relu = FF_ReLU(emb_dim, hidden_dim, dropout, device=device, dtype=dtype)
+        self.norm2 = LayerNorm(emb_dim, eps=eps, device=device, dtype=dtype)
+    def forward(self, x):
+        residual = x
+        x = self.attention(x)
+        x = self.norm1(x)
+        x = x + residual
+        residual = x
+        x = self.ff_relu(x)
+        x = self.norm2(x)
+        x = x + residual
+        return x
+class Decoder(nn.Module):
+    def __init__(self, emb_dim, num_heads, dropout, hidden_dim, eps=1e-5, device='cpu', dtype=torch.float32):
+        super().__init__()
+        self.attention = Multi_Head_Attention(emb_dim, num_heads, dropout, device=device, dtype=dtype)
+        self.norm1 = LayerNorm(emb_dim, eps=eps, device=device, dtype=dtype)
+        self.cross_attention = Cross_MultiHead_Attention(emb_dim, num_heads, dropout, device=device, dtype=dtype)
+        self.norm2 = LayerNorm(emb_dim, eps=eps, device=device, dtype=dtype)
+        self.ff_relu = FF_ReLU(emb_dim, hidden_dim, dropout, device=device, dtype=dtype)
+        self.norm3 = LayerNorm(emb_dim, eps=eps, device=device, dtype=dtype)
+    def forward(self, x, enc_output):
+        residual = x
+        x = self.attention(x)
+        x = self.norm1(x)
+        x = x + residual
+        residual = x
+        x = self.cross_attention(x, context = enc_output)
+        x = self.norm2(x)
+        x = x + residual
+        residual = x
+        x = self.ff_relu(x)
+        x = self.norm3(x)
+        x = x + residual
+        return x
+class transformer(nn.Module):
+    def __init__(self, vocab_size, emb_dim, num_heads, dropout, hidden_dim,
+                encoder_layers, decoder_layers, seq_len, eps=1e-5, device='cpu', dtype=torch.float32,
+                ):
+        super().__init__()
+        self.encoder_layers = encoder_layers
+        self.decoder_layers = decoder_layers
+        self.token_emb = nn.Embedding(vocab_size, emb_dim, device=device, dtype=dtype)
+        self.pos = SinusoidalPositionalEmbedding(seq_len=seq_len, emb_dim=emb_dim)
+        self.encoder_stack = nn.ModuleList([
+            Encoder(emb_dim, num_heads, dropout, hidden_dim, eps=eps, device=device, dtype=dtype)
+            for _ in range(encoder_layers)
+        ])
+        self.decoder_stack = nn.ModuleList([
+            Decoder(emb_dim, num_heads, dropout, hidden_dim, eps=eps, device=device, dtype=dtype)
+            for _ in range(decoder_layers)
+        ])
+        # --- final norm ---
+        self.final_norm = LayerNorm(emb_dim, eps=eps, device=device, dtype=dtype)
+        # --- output projection ---
+        self.out_proj = nn.Linear(emb_dim, vocab_size, device=device, dtype=dtype)
+    def encoder(self, x):
+        x = self.token_emb(x) + self.pos(x)
+        for block in self.encoder_stack:
+            x = block(x)
+        return x
+    def decoder(self, x, enc_output):
+        x = self.token_emb(x) + self.pos(x)
+        for block in self.decoder_stack:
+            x = block(x, enc_output)
+        return x
+    def forward(self, source, target):
+        enc_output = self.encoder(source)
+        out = self.decoder(target, enc_output)
+        out = self.final_norm(out)
+        out = self.out_proj(out)
+        return out

{stackformer-0.1.0 → stackformer-0.1.2/stackformer}/modules/Attention.py RENAMED Viewed

@@ -1,6 +1,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 class Self_Attention(nn.Module):
     def __init__(self, Emb_dim, dropout,dtype=torch.float32,device='cpu'):
         super().__init__()
@@ -412,23 +413,23 @@ class kv_cache_multihead(nn.Module):
         self.dtype = dtype
         self.device = device
-        assert emb_dim % num_heads == 0
+        assert emb_dim % num_heads == 0, "emb_dim must be divisible by num_heads"
         self.emb_dim = emb_dim
         self.num_heads = num_heads
         self.head_dim = emb_dim // num_heads
         self.kv_seq_len = kv_seq_len
-        self.query = nn.Linear(emb_dim, emb_dim, bias=False,dtype=dtype,device=device)
-        self.key = nn.Linear(emb_dim, emb_dim, bias=False,dtype=dtype,device=device)
-        self.value = nn.Linear(emb_dim, emb_dim, bias=False,dtype=dtype,device=device)
+        self.query = nn.Linear(emb_dim, emb_dim, bias=False, dtype=dtype, device=device)
+        self.key = nn.Linear(emb_dim, emb_dim, bias=False, dtype=dtype, device=device)
+        self.value = nn.Linear(emb_dim, emb_dim, bias=False, dtype=dtype, device=device)
-        self.out_proj = nn.Linear(emb_dim, emb_dim,dtype=dtype,device=device)
+        self.out_proj = nn.Linear(emb_dim, emb_dim, dtype=dtype, device=device)
         self.dropout = nn.Dropout(dropout)
+        # KV caches
+        self.register_buffer("cache_keys", torch.zeros(batch_size, kv_seq_len*2, num_heads, self.head_dim,device=device,dtype=dtype))
+        self.register_buffer("cache_value", torch.zeros(batch_size, kv_seq_len*2, num_heads, self.head_dim,device=device,dtype=dtype))
-        self.cache_keys = torch.zeros(batch_size, kv_seq_len, num_heads, self.head_dim,dtype=dtype,device=device)
-        self.cache_value = torch.zeros(batch_size, kv_seq_len, num_heads, self.head_dim,dtype=dtype,device=device)
-    def forward(self, x, start_pos, RoPE: False):
+    def forward(self, x, start_pos, RoPE=False):
         batch_size, seq_len, C = x.shape
         xq = self.query(x).view(batch_size, seq_len, self.num_heads, self.head_dim)
@@ -441,36 +442,35 @@ class kv_cache_multihead(nn.Module):
             freq_complex = precompute_theta_position_frequency(head_dim=self.head_dim, seq_len=self.kv_seq_len, device=self.device)
             xk = apply_rotry_position_embedding(xk, freq_complex, device=self.device, dtype=self.dtype)
-        # Cache keys and values
-        self.cache_keys[:, start_pos:start_pos+seq_len] = xk
-        self.cache_value[:, start_pos:start_pos+seq_len] = xv
-        xk_full = self.cache_keys[:, :start_pos+seq_len]
-        xv_full = self.cache_value[:, :start_pos+seq_len]
+        # Cache keys and values - only update the batch_size portion we're using
+        self.cache_keys[:batch_size, start_pos:start_pos+seq_len] = xk
+        self.cache_value[:batch_size, start_pos:start_pos+seq_len] = xv
+        # Only use the relevant batch portion from cache
+        xk_full = self.cache_keys[:batch_size, :start_pos+seq_len]
+        xv_full = self.cache_value[:batch_size, :start_pos+seq_len]
         query = xq.transpose(1, 2)         # (batch_size, num_head, seq_len, emb_dim)
         key = xk_full.transpose(1, 2)    # (batch_size, num_head, T_total, emb_dim)
         value = xv_full.transpose(1, 2)    # (batch_size, num_head, T_total, emb_dim)
         attn_scores = torch.matmul(query, key.transpose(2, 3)) / (self.head_dim ** 0.5)
         # Causal mask
-        causal_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool, device=self.device), diagonal=1)
+        causal_mask = torch.triu(torch.ones(attn_scores.shape[-2], attn_scores.shape[-1], dtype=torch.bool, device=self.device), diagonal=1)
         attn_scores.masked_fill_(causal_mask[None, None, :, :], float('-inf'))
         attn_weights = F.softmax(attn_scores, dim=-1)
         out = torch.matmul(attn_weights, value)
         out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
         return self.dropout(self.out_proj(out))
 class kv_cache_group_query(nn.Module):
-    def __init__(self, emb_dim, query_num_heads, kv_num_heads, batch_size, kv_seq_len,device='cpu' , dtype=torch.float32 , dropout=0.1):
+    def __init__(self, emb_dim, query_num_heads, kv_num_heads, batch_size, kv_seq_len, device='cpu', dtype=torch.float32, dropout=0.1):
         super().__init__()
         self.dtype = dtype
         self.device = device
-        assert query_num_heads % kv_num_heads == 0, "query heads must be divisible by kv heads"
+        assert query_num_heads % kv_num_heads  == 0, "query heads must be divisible by kv heads"
         assert emb_dim % query_num_heads == 0, "embedding must be divisible by query heads"
         self.emb_dim = emb_dim
@@ -488,8 +488,8 @@ class kv_cache_group_query(nn.Module):
         self.dropout = nn.Dropout(dropout)
         # KV caches
-        self.register_buffer("cache_keys", torch.zeros(batch_size, kv_seq_len, kv_num_heads, self.head_dim,device=device,dtype=dtype))
-        self.register_buffer("cache_value", torch.zeros(batch_size, kv_seq_len, kv_num_heads, self.head_dim,device=device,dtype=dtype))
+        self.register_buffer("cache_keys", torch.zeros(batch_size, kv_seq_len*2, kv_num_heads, self.head_dim,device=device,dtype=dtype))
+        self.register_buffer("cache_value", torch.zeros(batch_size, kv_seq_len*2, kv_num_heads, self.head_dim,device=device,dtype=dtype))
     def forward(self, x, start_pos, RoPE=False):
         batch_size, seq_len, _ = x.shape
@@ -501,15 +501,16 @@ class kv_cache_group_query(nn.Module):
         if RoPE:
             freq_q = precompute_theta_position_frequency(head_dim=self.head_dim, seq_len=seq_len, device=self.device)
             xq = apply_rotry_position_embedding(xq, freq_q, device=self.device, dtype=self.dtype)
-            freq_k = precompute_theta_position_frequency(head_dim=self.head_dim, seq_len=self.kv_seq_len, device=self.device)
+            freq_k = precompute_theta_position_frequency(head_dim=self.head_dim, seq_len=seq_len, device=self.device)
             xk = apply_rotry_position_embedding(xk, freq_k, device=self.device, dtype=self.dtype)
-        # Cache
-        self.cache_keys[:, start_pos:start_pos+seq_len] = xk
-        self.cache_value[:, start_pos:start_pos+seq_len] = xv
-        xk_full = self.cache_keys[:, :start_pos+seq_len]  # [B, T, kv_heads, D]
-        xv_full = self.cache_value[:, :start_pos+seq_len]
+        # Cache keys and values - only update the batch_size portion we're using
+        self.cache_keys[:batch_size, start_pos:start_pos+seq_len] = xk
+        self.cache_value[:batch_size, start_pos:start_pos+seq_len] = xv
+        # Only use the relevant batch portion from cache
+        xk_full = self.cache_keys[:batch_size, :start_pos+seq_len]
+        xv_full = self.cache_value[:batch_size, :start_pos+seq_len]
         # Transpose for attention: [B, H, T, D]
         query = xq.transpose(1, 2)  # [B, q_heads, seq_len, D]
         key = xk_full.transpose(1, 2)  # [B, kv_heads, total_kv_len, D]
@@ -518,16 +519,14 @@ class kv_cache_group_query(nn.Module):
         # Repeat keys and values to match query heads
         key = key.repeat_interleave(self.num_queries_per_kv, dim=1)
         value = value.repeat_interleave(self.num_queries_per_kv, dim=1)
         # Attention
         attn_scores = torch.matmul(query, key.transpose(2, 3)) / (self.head_dim ** 0.5)
         # Causal mask
-        causal_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool, device=self.device), diagonal=1)
+        causal_mask = torch.triu(torch.ones(attn_scores.shape[-2], attn_scores.shape[-1], dtype=torch.bool, device=self.device), diagonal=1)
         attn_scores.masked_fill_(causal_mask[None, None, :, :], float('-inf'))
+        #softmax
         attn_weights = F.softmax(attn_scores, dim=-1)
+        # atten weight @ value
         out = torch.matmul(attn_weights, value)
         out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, self.emb_dim)
         return self.dropout(self.out_proj(out))

{stackformer-0.1.0 → stackformer-0.1.2/stackformer}/modules/mask.py RENAMED Viewed

@@ -33,4 +33,4 @@ def global_mask(Seq_len, global_index):
     for g in global_index:
         mask[g,:] = 1
     mask[:,global_index_tensor] = 1
-    return ~mask
+    return ~mask.bool()

{stackformer-0.1.0 → stackformer-0.1.2/stackformer}/modules/position_embedding.py RENAMED Viewed

@@ -14,7 +14,8 @@ class AbsolutePositionEmbedding(nn.Module):
         batch_size, seq_len = x.shape[0], x.shape[1]
         positions = torch.arange(0, seq_len)
         abs_pos = self.embedding(positions)  # (seq_len, emb_dim)
-        return abs_pos.unsqueeze(0).expand(batch_size, seq_len, -1).to(x.device)
+        out = abs_pos.unsqueeze(0).expand(batch_size, seq_len, -1)
+        return out.to(device=x.device,dtype=x.dtype)
 # --- Sinusoidal Positional Embedding ---
 class SinusoidalPositionalEmbedding(nn.Module):
@@ -35,8 +36,9 @@ class SinusoidalPositionalEmbedding(nn.Module):
     def forward(self, x):
         # x shape: (batch_size, seq_len, emb_dim) or (batch_size, seq_len)
         batch_size, seq_len = x.shape[0], x.shape[1]
-        return self.pe[:seq_len].unsqueeze(0).expand(batch_size, seq_len, -1).to(x.device)
+        out = self.pe[:seq_len].unsqueeze(0).expand(batch_size, seq_len, -1)
+        return out.to(device=x.device,dtype=x.dtype)
 # --- RoPE ---
 class RoPE(nn.Module):
     def __init__(self, head_dim, seq_len, theta=10000.0, device='cpu', dtype=torch.float32):
@@ -58,4 +60,4 @@ class RoPE(nn.Module):
         freqs = self.freq_complex[:seq_len].unsqueeze(0).unsqueeze(2)  # (1, seq_len, 1, head_dim//2)
         x_rotated = x_complex * freqs
         x_out = torch.view_as_real(x_rotated).contiguous().view(batch_size, seq_len, num_head, emb_dim)
-        return x_out.to(device=self.device, dtype=self.dtype)
+        return x_out.to(device=x.device, dtype=x.dtype)

stackformer-0.1.2/stackformer/trainer.py ADDED Viewed

@@ -0,0 +1,356 @@
+import torch
+import os
+from torch.utils.data import DataLoader
+from torch.optim import AdamW, SGD
+from torch.optim.lr_scheduler import LinearLR, CosineAnnealingLR, CosineAnnealingWarmRestarts
+from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup
+from tqdm import tqdm
+class Trainer:
+    def __init__(self,
+                model,
+                train_dataset,
+                eval_dataset,
+                train_batch_size,
+                eval_batch_size,
+                vocab_size,
+                output_dir,
+                num_epoch,
+                lr: float,
+                scheduler_type=None,
+                optimizer_type="adamw",
+                eval_per_epoch = 1,
+                eval_per_step = None,
+                weight_decay=0.01,
+                warmup_steps=0,
+                grad_accumulation_step=1,
+                max_eval_step=None,
+                max_steps=None,
+                Save_step=None,
+                Save_epoch=None,
+                max_epoch=None,
+                model_to_resume=None,
+                resume_training=False,
+                seed=42,
+                device='cpu'):
+        self.model = model
+        self.train_dataset = train_dataset
+        self.train_batch_size = train_batch_size
+        self.eval_dataset = eval_dataset
+        self.eval_batch_size = eval_batch_size
+        self.vocab_size = vocab_size
+        self.num_epoch = num_epoch
+        self.max_steps = max_steps
+        self.max_epoch = max_epoch
+        self.eval_per_epoch = eval_per_epoch
+        self.eval_per_step = eval_per_step
+        self.max_eval_step = max_eval_step
+        self.lr = lr
+        self.scheduler_type = scheduler_type
+        self.output_dir = output_dir
+        self.model_to_resume = model_to_resume
+        self.resume_training = resume_training
+        self.Save_step = Save_step
+        self.Save_epoch = Save_epoch
+        self.grad_accumulation_step = grad_accumulation_step
+        self.optimizer_type = optimizer_type
+        self.weight_decay = weight_decay
+        self.warmup_steps = warmup_steps
+        self.seed = seed
+        self.device = device
+    # --- random seed ---
+    def set_seed(self, seed):
+        torch.manual_seed(seed)
+        if self.device == 'cuda' and torch.cuda.is_available():
+            torch.cuda.manual_seed_all(seed)
+    # --- scheduler ---
+    def get_scheduler(self, scheduler_type, total_training_steps, optimizer):
+        if scheduler_type is None:
+            return None
+        elif scheduler_type == "linear":
+            return get_linear_schedule_with_warmup(
+                optimizer,
+                num_warmup_steps=self.warmup_steps,
+                num_training_steps=total_training_steps
+            )
+        elif scheduler_type == "cosine":
+            return get_cosine_schedule_with_warmup(
+                optimizer,
+                num_warmup_steps=self.warmup_steps,
+                num_training_steps=total_training_steps
+            )
+        elif scheduler_type == "cosine_restarts":
+            return get_cosine_with_hard_restarts_schedule_with_warmup(
+                optimizer,
+                num_warmup_steps=self.warmup_steps,
+                num_training_steps=total_training_steps,
+                num_cycles=4  # Number of restarts
+            )
+        elif scheduler_type == "cosineannealing":
+            return CosineAnnealingLR(optimizer, T_max=total_training_steps)
+        elif scheduler_type == "cosine_warm_restarts":
+            return CosineAnnealingWarmRestarts(optimizer, T_0=total_training_steps//4, T_mult=2)
+        else:
+            raise ValueError(f"Unsupported scheduler type: {scheduler_type}")
+    # --- optimizer ---
+    def get_optimizer(self, optimizer_type, model, lr, weight_decay):
+        if optimizer_type.lower() == "adamw":
+            return AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
+        elif optimizer_type.lower() == "sgd":
+            return SGD(model.parameters(), lr=lr, weight_decay=weight_decay, momentum=0.9)
+        else:
+            raise ValueError(f"Unsupported optimizer {optimizer_type}")
+    # --- validate model ---
+    def eval_model(self, model, eval_loader, max_val_steps):
+        eval_loss = 0
+        model.eval()
+        max_val_steps = min(max_val_steps or len(eval_loader), len(eval_loader))
+        with torch.no_grad():
+            pbar = tqdm(eval_loader, total=max_val_steps, desc="Evaluating", leave=False)
+            for step, (inputs, targets) in enumerate(pbar):
+                inputs = inputs.to(self.device)
+                targets = targets.to(self.device)
+                output = model(inputs)  # shape: [B, T, V]
+                loss = torch.nn.functional.cross_entropy(
+                    output.view(-1, output.size(-1)),
+                    targets.view(-1), ignore_index=-100)
+                pbar.set_postfix(loss=loss.item())
+                eval_loss += loss.item()
+                if step + 1 >= max_val_steps:
+                    break
+        model.train()
+        avg_eval_loss = eval_loss / max_val_steps
+        return avg_eval_loss
+    # --- train dataloader ---
+    def get_train_loader(self, train_dataset, batch_size, seed):
+        generator = torch.Generator()
+        generator.manual_seed(seed)
+        train_loader = DataLoader(
+            train_dataset,
+            batch_size=batch_size,
+            shuffle=True,
+            generator=generator,
+            pin_memory=True if self.device == 'cuda' else False
+        )
+        return train_loader
+    # --- validation dataloader ---
+    def get_eval_loader(self, eval_dataset, batch_size, seed):
+        generator = torch.Generator()
+        generator.manual_seed(seed)
+        eval_loader = DataLoader(
+            eval_dataset,
+            batch_size=batch_size,
+            shuffle=False,
+            generator=generator,
+            pin_memory=True if self.device == 'cuda' else False
+        )
+        return eval_loader
+    # --- save model ---
+    def save_model(self, model, optimizer, scheduler, epoch, num_epoch, loss, global_step,
+                accumulated_steps, batch_idx_to_resume, output_dir, name):
+        checkpoint = {
+            'model_state_dict': model.state_dict(),
+            'optimizer_state_dict': optimizer.state_dict(),
+            'scheduler_state_dict': scheduler.state_dict() if scheduler else None,
+            'current_epoch': epoch,
+            'num_epoch': num_epoch,
+            'loss': loss,
+            'accumulated_steps': accumulated_steps,
+            'global_step': global_step,
+            'batch_idx_to_resume': batch_idx_to_resume,
+            'rng_state': {
+                'torch': torch.get_rng_state(),
+                'cuda': torch.cuda.get_rng_state_all() if torch.cuda.is_available() else None
+            }
+        }
+        os.makedirs(output_dir, exist_ok=True)
+        path = f'{output_dir}/checkpoint_{name}.pt'
+        torch.save(checkpoint, path)
+        print(f'Saved training state to {path}')
+    def load_checkpoint(self, path, model, optimizer, scheduler):
+        checkpoint = torch.load(path, map_location=self.device)
+        model.load_state_dict(checkpoint['model_state_dict'])
+        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+        if checkpoint.get('scheduler_state_dict') is not None and scheduler is not None:
+            scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
+        current_epoch = checkpoint['current_epoch']
+        global_step = checkpoint['global_step']
+        loss = checkpoint['loss']
+        num_epoch = checkpoint['num_epoch']
+        accumulated_steps = checkpoint['accumulated_steps']
+        batch_idx_to_resume = checkpoint['batch_idx_to_resume']
+        # RNG
+        torch.set_rng_state(checkpoint['rng_state']['torch'])
+        if torch.cuda.is_available() and checkpoint['rng_state']['cuda']:
+            torch.cuda.set_rng_state_all(checkpoint['rng_state']['cuda'])
+        return {
+            'current_epoch': current_epoch,
+            'num_epoch': num_epoch,
+            'accumulated_steps' : accumulated_steps,
+            'batch_idx_to_resume': batch_idx_to_resume,
+            'global_step': global_step,
+            'loss': loss
+        }
+    # --- train ---
+    def train(self):
+        # --- seed ---
+        self.set_seed(self.seed)
+        # --- dataloader ---
+        train_loader = self.get_train_loader(self.train_dataset, self.train_batch_size, self.seed)
+        eval_loader = self.get_eval_loader(self.eval_dataset, self.eval_batch_size, self.seed)
+        # --- Calculate the total step ---
+        steps_per_epoch = len(train_loader) // self.grad_accumulation_step
+        total_training_steps = self.max_steps if self.max_steps is not None else steps_per_epoch * self.num_epoch
+        os.makedirs(self.output_dir, exist_ok=True)
+        model = self.model.to(self.device)
+        optimizer = self.get_optimizer(self.optimizer_type, model, self.lr, self.weight_decay)
+        criterion = torch.nn.functional.cross_entropy
+        scheduler = self.get_scheduler(self.scheduler_type, total_training_steps, optimizer)
+        global_step = 0
+        start_epoch = 0
+        num_epoch = self.num_epoch
+        batch_idx_to_resume = 0
+        accumulated_steps = 0
+        if self.resume_training and self.model_to_resume:
+            ckpt_data = self.load_checkpoint(self.model_to_resume, model, optimizer, scheduler)
+            start_epoch = ckpt_data['current_epoch']
+            global_step = ckpt_data['global_step']
+            num_epoch = ckpt_data['num_epoch']
+            batch_idx_to_resume = ckpt_data['batch_idx_to_resume']
+            accumulated_steps = ckpt_data['accumulated_steps']
+            print(f"♻️ Resuming training from epoch {start_epoch}, step {global_step}")
+        # --- print info ---
+        print(f"🧠 Number of parameters: {sum(p.numel() for p in self.model.parameters()):,}")
+        print(f"🍱 Number of train samples: {len(self.train_dataset):,}")
+        print(f"📊 Number of eval samples: {len(self.eval_dataset):,}")
+        print(f"📦 Train steps per epoch (batches): {len(train_loader):,}")
+        print(f"📦 Eval steps per epoch (batches): {len(eval_loader):,}")
+        for epoch in range(start_epoch, num_epoch):
+            model.train()
+            epoch_loss = 0
+            pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epoch}", leave=False)
+            for batch_idx, batch in enumerate(pbar):
+                if epoch == start_epoch and self.resume_training:
+                    if batch_idx < batch_idx_to_resume:
+                        continue
+                    elif batch_idx == batch_idx_to_resume:
+                        batch_idx_to_resume = 0
+                # --- load the inputs and targets ---
+                inputs, targets = batch
+                inputs = inputs.to(self.device, non_blocking=True)
+                targets = targets.to(self.device, non_blocking=True)
+                # --- get prediction from model ---
+                output = model(inputs)
+                # --- calculate loss ---
+                loss = criterion(
+                    output.view(-1, self.vocab_size),
+                    targets.view(-1),
+                    ignore_index=-100
+                )
+                loss = loss / self.grad_accumulation_step
+                loss.backward()
+                pbar.set_postfix(loss=loss.item() * self.grad_accumulation_step)
+                epoch_loss += loss.item() * self.grad_accumulation_step
+                accumulated_steps += 1
+                # --- gradient accumulation ---
+                if accumulated_steps % self.grad_accumulation_step == 0:
+                    # Gradient clipping for stable training
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+                    optimizer.step()
+                    if scheduler is not None:
+                        scheduler.step()
+                    optimizer.zero_grad()
+                    global_step += 1
+                    accumulated_steps = 0
+                is_last_step = (self.max_steps is not None and global_step >= self.max_steps)
+                # check eval_per_step
+                if (self.eval_per_step is not None and global_step+1 % self.eval_per_step == 0) or is_last_step:
+                                avg_eval_loss = self.eval_model(model, eval_loader, self.max_eval_step)
+                                print(f"🎯 Eval loss: {avg_eval_loss:.4f}")
+                # Check max steps
+                if is_last_step:
+                    self.save_model(
+                        model=model, optimizer=optimizer, scheduler=scheduler,
+                        epoch=epoch+1, num_epoch=num_epoch, loss=epoch_loss,
+                        global_step=global_step, output_dir=self.output_dir,
+                        batch_idx_to_resume=batch_idx+1,accumulated_steps=accumulated_steps,
+                        name=f'final_step_epoch_{epoch+1}_step_{global_step}'
+                    )
+                    return
+                # Save at specific steps
+                if (self.Save_step is not None and
+                    global_step > 0 and
+                    global_step % self.Save_step == 0):
+                    self.save_model(
+                        model=model, optimizer=optimizer, scheduler=scheduler,
+                        epoch=epoch+1, num_epoch=num_epoch, loss=epoch_loss,
+                        global_step=global_step, output_dir=self.output_dir,
+                        batch_idx_to_resume=batch_idx+1,accumulated_steps=accumulated_steps,
+                        name=f'epoch_{epoch+1}_step_{global_step}'
+                    )
+            # Handle remaining accumulated gradients at epoch end
+            if accumulated_steps > 0:
+                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+                optimizer.step()
+                if scheduler is not None:
+                    scheduler.step()
+                optimizer.zero_grad()
+                global_step += 1
+            is_last_epoch = (self.max_epoch is not None and (epoch+1) == self.max_epoch)
+            # Evaluation
+            if (self.eval_per_epoch is not None and (epoch+1) % self.eval_per_epoch == 0) or is_last_epoch:
+                avg_eval_loss = self.eval_model(model, eval_loader, self.max_eval_step)
+                print(f"🎯 Eval loss: {avg_eval_loss:.4f}")
+            # Check max epoch
+            if is_last_epoch:
+                self.save_model(
+                    model=model, optimizer=optimizer, scheduler=scheduler,
+                    epoch=epoch+1, num_epoch=num_epoch, loss=epoch_loss,
+                    global_step=global_step, output_dir=self.output_dir,
+                    batch_idx_to_resume=batch_idx+1,accumulated_steps=accumulated_steps,
+                    name=f'final_model_epoch_{epoch+1}_step_{global_step}'
+                )
+                return
+            # print epoch loss
+            avg_epoch_loss = epoch_loss / len(train_loader)
+            print(f"🔥 Epoch {epoch+1} finished - Training Loss: {avg_epoch_loss:.4f}")
+            # Save at specific epochs
+            if (self.Save_epoch is not None and
+                (epoch + 1) % self.Save_epoch == 0):
+                self.save_model(
+                    model=model, optimizer=optimizer, scheduler=scheduler,
+                    epoch=epoch+1, num_epoch=num_epoch, loss=epoch_loss,
+                    global_step=global_step, output_dir=self.output_dir,
+                    batch_idx_to_resume=batch_idx+1,accumulated_steps=accumulated_steps,
+                    name=f'epoch_{epoch+1}_step_{global_step}'
+                )

stackformer-0.1.0/Stackformer.egg-info/SOURCES.txt DELETED Viewed

@@ -1,18 +0,0 @@
-LICENSE
-README.md
-pyproject.toml
-setup.py
-Stackformer.egg-info/PKG-INFO
-Stackformer.egg-info/SOURCES.txt
-Stackformer.egg-info/dependency_links.txt
-Stackformer.egg-info/requires.txt
-Stackformer.egg-info/top_level.txt
-models/GPT_2.py
-models/__init__.py
-modules/Attention.py
-modules/Feed_forward.py
-modules/Normalization.py
-modules/__init__.py
-modules/mask.py
-modules/position_embedding.py
-modules/tokenizer.py

stackformer-0.1.0/Stackformer.egg-info/requires.txt DELETED Viewed

@@ -1,2 +0,0 @@
-torch>=2.6
-tqdm>=4.67

stackformer-0.1.0/Stackformer.egg-info/top_level.txt DELETED Viewed

@@ -1,2 +0,0 @@
-models
-modules

{stackformer-0.1.0 → stackformer-0.1.2}/LICENSE RENAMED Viewed

File without changes

{stackformer-0.1.0 → stackformer-0.1.2}/Stackformer.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{stackformer-0.1.0 → stackformer-0.1.2}/setup.cfg RENAMED Viewed

File without changes

{stackformer-0.1.0 → stackformer-0.1.2/stackformer}/models/__init__.py RENAMED Viewed

File without changes

{stackformer-0.1.0 → stackformer-0.1.2/stackformer}/modules/Feed_forward.py RENAMED Viewed

File without changes

{stackformer-0.1.0 → stackformer-0.1.2/stackformer}/modules/Normalization.py RENAMED Viewed

File without changes

{stackformer-0.1.0 → stackformer-0.1.2/stackformer}/modules/__init__.py RENAMED Viewed

File without changes

{stackformer-0.1.0 → stackformer-0.1.2/stackformer}/modules/tokenizer.py RENAMED Viewed

File without changes

Stackformer 0.1.0__tar.gz → 0.1.2__tar.gz

Stackformer 0.1.0__tar.gz → 0.1.2__tar.gz