PyPI - transformers-from-scratch - Versions diffs - 0.1.0__py3-none-any.whl - Mend

transformers-from-scratch 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

transformer_variants/__init__.py +13 -0
transformer_variants/model.py +294 -0
transformers_from_scratch-0.1.0.dist-info/METADATA +98 -0
transformers_from_scratch-0.1.0.dist-info/RECORD +7 -0
transformers_from_scratch-0.1.0.dist-info/WHEEL +5 -0
transformers_from_scratch-0.1.0.dist-info/licenses/LICENSE +21 -0
transformers_from_scratch-0.1.0.dist-info/top_level.txt +1 -0

transformer_variants/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+# Re-export the four model classes defined in .model so they can be imported
+# directly from the package.
+from .model import (
+    EncoderTransformer,
+    DecoderOnlyTransformer,
+    DecoderTransformer,
+    EncoderDecoderTransformer,
+)
+# Names exposed by `from <package> import *`; mirrors the import list above.
+__all__ = [
+    "EncoderTransformer",
+    "DecoderOnlyTransformer",
+    "DecoderTransformer",
+    "EncoderDecoderTransformer",
+]

transformer_variants/model.py ADDED Viewed

@@ -0,0 +1,294 @@
+"""
+Implementations of encoder, decoder, and encoder-decoder Transformer architectures,
+following the design of "Attention Is All You Need" (Vaswani et al., 2017).
+Notation used in shape comments throughout this module:
+  B       — batch size
+  T       — sequence length (generic)
+  T_src   — source sequence length
+  T_tgt   — target sequence length
+  d_model — model embedding dimension
+  d_ff    — feed-forward hidden dimension
+  d_key   — per-head key/query dimension  (= d_model // n_heads)
+  d_val   — per-head value dimension      (= d_model // n_heads)
+Mask convention: BoolTensor where True marks positions to ignore (filled with -inf).
+"""
+import math
+import torch
+from torch import Tensor, BoolTensor
+import torch.nn as nn
+import torch.nn.functional as F
+class PositionalEncoding(nn.Module):
+    """
+    Fixed sinusoidal positional encoding.
+    Precomputes a [1, max_seq_len, d_model] table whose even channels hold
+    sin(pos / 10000^(i / d_model)) and whose odd channels hold the matching
+    cosine, then adds the first T rows of that table to the input in forward().
+    The table is registered as a buffer named 'pe', not as a parameter.
+    Works for both even and odd d_model.
+    """
+    def __init__(self, max_seq_len: int, d_model: int):
+        super().__init__()
+        pos = torch.arange(max_seq_len).unsqueeze(1)  # [T, 1]
+        i = torch.arange(0, d_model, 2)               # [ceil(d_model/2)]
+        angles = pos / 10000 ** (i / d_model)         # [T, ceil(d_model/2)]
+        pe = torch.zeros(1, max_seq_len, d_model)      # [1, T, d_model]
+        pe[0, :, 0::2] = torch.sin(angles)
+        # An odd d_model has one fewer odd channel than even channels, so the
+        # cosine table is trimmed to the number of odd slots actually present.
+        pe[0, :, 1::2] = torch.cos(angles[:, :d_model // 2])
+        self.register_buffer('pe', pe)
+    def forward(self, x: Tensor) -> Tensor:
+        # x: [B, T, d_model]; T must not exceed max_seq_len, otherwise the
+        # sliced table is shorter than x and the addition cannot broadcast.
+        return x + self.pe[:, :x.size(1)]  # [B, T, d_model]
+class FeedForward(nn.Module):
+    """
+    Position-wise feed-forward network: Linear(d_model -> d_ff), GELU,
+    Linear(d_ff -> d_model). Acts on the last dimension only.
+    """
+    def __init__(self, d_model: int, d_ff: int):
+        super().__init__()
+        self.gelu = nn.GELU()
+        self.linear1 = nn.Linear(d_model, d_ff)  # expand to the hidden width
+        self.linear2 = nn.Linear(d_ff, d_model)  # project back to the model width
+    def forward(self, x: Tensor):
+        # x: [B, T, d_model]
+        hidden = self.gelu(self.linear1(x))  # [B, T, d_ff]
+        return self.linear2(hidden)          # [B, T, d_model]
+class SingleHeadCrossAttention(nn.Module):
+    """
+    One head of scaled dot-product cross-attention.
+    Queries are projected from tgt; keys and values are projected from src.
+    All three projections are bias-free linear layers.
+    """
+    def __init__(self, d_model: int, d_key: int, d_val: int):
+        super().__init__()
+        self.softmax = nn.Softmax(-1)
+        self.q_proj = nn.Linear(d_model, d_key, bias=False)
+        self.k_proj = nn.Linear(d_model, d_key, bias=False)
+        self.v_proj = nn.Linear(d_model, d_val, bias=False)
+    def forward(self, src: Tensor, tgt: Tensor, mask: BoolTensor = None):
+        # src: [B, T_src, d_model], tgt: [B, T_tgt, d_model], mask: [B, 1, T_src] or [B, T_tgt, T_src]
+        queries = self.q_proj(tgt)  # [B, T_tgt, d_key]
+        keys = self.k_proj(src)     # [B, T_src, d_key]
+        values = self.v_proj(src)   # [B, T_src, d_val]
+        # Scale by sqrt(d_key) before the softmax.
+        scale = math.sqrt(keys.size(-1))
+        scores = torch.matmul(queries, keys.transpose(-1, -2)) / scale  # [B, T_tgt, T_src]
+        if mask is not None:
+            # True entries are excluded: -inf becomes weight 0 after softmax.
+            scores = scores.masked_fill(mask, float('-inf'))
+        weights = self.softmax(scores)       # [B, T_tgt, T_src]
+        return torch.matmul(weights, values)  # [B, T_tgt, d_val]
+class SingleHeadSelfAttention(nn.Module):
+    """
+    One head of scaled dot-product self-attention.
+    Queries, keys and values are all bias-free linear projections of the same input.
+    """
+    def __init__(self, d_model: int, d_key: int, d_val: int):
+        super().__init__()
+        self.softmax = nn.Softmax(-1)
+        self.q_proj = nn.Linear(d_model, d_key, bias=False)
+        self.k_proj = nn.Linear(d_model, d_key, bias=False)
+        self.v_proj = nn.Linear(d_model, d_val, bias=False)
+    def forward(self, x: Tensor, mask: BoolTensor = None):
+        # x: [B, T, d_model], mask: [B, T, T] or [1, T, T]
+        queries = self.q_proj(x)  # [B, T, d_key]
+        keys = self.k_proj(x)     # [B, T, d_key]
+        values = self.v_proj(x)   # [B, T, d_val]
+        # Scale by sqrt(d_key) before the softmax.
+        scale = math.sqrt(keys.size(-1))
+        scores = torch.matmul(queries, keys.transpose(-1, -2)) / scale  # [B, T, T]
+        if mask is not None:
+            # True entries are excluded: -inf becomes weight 0 after softmax.
+            scores = scores.masked_fill(mask, float('-inf'))
+        weights = self.softmax(scores)       # [B, T, T]
+        return torch.matmul(weights, values)  # [B, T, d_val]
+class MultiHeadCrossAttention(nn.Module):
+    """
+    Multi-head cross-attention built from n_heads independent
+    SingleHeadCrossAttention modules, each of width d_model // n_heads.
+    Head outputs are concatenated on the last dimension and passed through a
+    bias-free output projection.
+    """
+    def __init__(self, d_model: int, n_heads: int):
+        super().__init__()
+        assert d_model % n_heads == 0
+        d_head = d_model // n_heads
+        head_modules = [
+            SingleHeadCrossAttention(d_model, d_head, d_head) for _ in range(n_heads)
+        ]
+        self.heads = nn.ModuleList(head_modules)
+        self.out_proj = nn.Linear(d_model, d_model, bias=False)
+    def forward(self, src: Tensor, tgt: Tensor, mask: BoolTensor = None):
+        # src: [B, T_src, d_model], tgt: [B, T_tgt, d_model], mask: [B, 1, T_src] or [B, T_tgt, T_src]
+        per_head = [head(src, tgt, mask) for head in self.heads]  # n_heads x [B, T_tgt, d_head]
+        concatenated = torch.cat(per_head, dim=-1)                # [B, T_tgt, d_model]
+        return self.out_proj(concatenated)                        # [B, T_tgt, d_model]
+class MultiHeadSelfAttention(nn.Module):
+    """
+    Multi-head self-attention built from n_heads independent
+    SingleHeadSelfAttention modules, each of width d_model // n_heads.
+    Head outputs are concatenated on the last dimension and passed through a
+    bias-free output projection.
+    """
+    def __init__(self, d_model: int, n_heads: int):
+        super().__init__()
+        assert d_model % n_heads == 0
+        d_head = d_model // n_heads
+        head_modules = [
+            SingleHeadSelfAttention(d_model, d_head, d_head) for _ in range(n_heads)
+        ]
+        self.heads = nn.ModuleList(head_modules)
+        self.out_proj = nn.Linear(d_model, d_model, bias=False)
+    def forward(self, x: Tensor, mask: BoolTensor = None):
+        # x: [B, T, d_model], mask: [B, T, T] or [1, T, T]
+        per_head = [head(x, mask) for head in self.heads]  # n_heads x [B, T, d_head]
+        concatenated = torch.cat(per_head, dim=-1)         # [B, T, d_model]
+        return self.out_proj(concatenated)                 # [B, T, d_model]
+class CrossAttentionTransformerBlock(nn.Module):
+    """
+    Pre-norm decoder block with three residual sub-layers, in order:
+    self-attention over tgt, cross-attention from tgt to src, feed-forward.
+    LayerNorm is applied to the input of each sub-layer; src is used as given.
+    """
+    def __init__(self, d_model: int, d_ff: int, n_heads: int):
+        super().__init__()
+        self.layer_norm1 = nn.LayerNorm(d_model)
+        self.layer_norm2 = nn.LayerNorm(d_model)
+        self.layer_norm3 = nn.LayerNorm(d_model)
+        self.multi_head_self_attn = MultiHeadSelfAttention(d_model, n_heads)
+        self.multi_head_cross_attn = MultiHeadCrossAttention(d_model, n_heads)
+        self.feed_forward = FeedForward(d_model, d_ff)
+    def forward(self, src: Tensor, tgt: Tensor, src_mask: BoolTensor = None, tgt_mask: BoolTensor = None):
+        # src: [B, T_src, d_model], tgt: [B, T_tgt, d_model], src_mask: [B, 1, T_src], tgt_mask: [B, T_tgt, T_tgt]
+        # 1) self-attention over the target sequence
+        self_attended = self.multi_head_self_attn(self.layer_norm1(tgt), tgt_mask)  # [B, T_tgt, d_model]
+        tgt = tgt + self_attended
+        # 2) cross-attention: normalised target queries the source
+        cross_attended = self.multi_head_cross_attn(src, self.layer_norm2(tgt), src_mask)  # [B, T_tgt, d_model]
+        tgt = tgt + cross_attended
+        # 3) position-wise feed-forward
+        return tgt + self.feed_forward(self.layer_norm3(tgt))  # [B, T_tgt, d_model]
+class TransformerBlock(nn.Module):
+    """
+    Pre-norm transformer block with two residual sub-layers, in order:
+    multi-head self-attention, then feed-forward.
+    LayerNorm is applied to the input of each sub-layer.
+    """
+    def __init__(self, d_model: int, d_ff: int, n_heads: int):
+        super().__init__()
+        self.layer_norm1 = nn.LayerNorm(d_model)
+        self.layer_norm2 = nn.LayerNorm(d_model)
+        self.multi_head_attn = MultiHeadSelfAttention(d_model, n_heads)
+        self.feed_forward = FeedForward(d_model, d_ff)
+    def forward(self, x: Tensor, mask: BoolTensor = None):
+        # x: [B, T, d_model], mask: [B, T, T] or [1, T, T]
+        attended = self.multi_head_attn(self.layer_norm1(x), mask)  # [B, T, d_model]
+        x = x + attended
+        return x + self.feed_forward(self.layer_norm2(x))           # [B, T, d_model]
+class DecoderOnlyTransformer(nn.Module):
+    """
+    An autoregressive, GPT-style, decoder-only transformer for text generation purposes.
+    Importantly, returns outputs with dimension vocab_size, i.e. logits.
+    Stand-alone, can be used on its own.
+    The unembedding layer shares its weight tensor with the token embedding.
+    A causal mask is built internally; an optional padding mask is merged into it.
+    """
+    def __init__(
+            self,
+            vocab_size: int,
+            max_seq_len: int,
+            d_model: int,
+            d_ff: int,
+            n_heads: int,
+            n_blocks: int,
+        ):
+        super().__init__()
+        assert d_model % n_heads == 0
+        self.embedding = nn.Embedding(vocab_size, d_model)
+        self.pos_encoding = PositionalEncoding(max_seq_len, d_model)
+        self.attn_blocks = nn.ModuleList([
+            TransformerBlock(d_model, d_ff, n_heads) for _ in range(n_blocks)
+        ])
+        self.final_layer_norm = nn.LayerNorm(d_model)
+        # In decoder-only Transformer, we use the unembedding matrix to map to logits of the vocabulary
+        self.unembedding = nn.Linear(d_model, vocab_size, bias=False)
+        # Weight tying: both layers hold the same [vocab_size, d_model] parameter.
+        self.unembedding.weight = self.embedding.weight
+    def forward(self, x: Tensor, pad_mask: BoolTensor = None):
+        # x: [B, T] (token indices), pad_mask: [B, 1, T] (optional, True = padding)
+        T = x.size(1)
+        causal_mask = torch.ones(T, T, dtype=torch.bool, device=x.device).triu(1).unsqueeze(0)  # [1, T, T]
+        if pad_mask is None:
+            mask = causal_mask
+        else:
+            # Always let a position attend to itself. With left padding the first
+            # rows of (causal | pad) would otherwise be masked everywhere; softmax
+            # over an all -inf row is NaN, and from the second block on that NaN
+            # reaches every position through the weights @ V product (0 * NaN).
+            # Rows of non-padding tokens already have an unmasked diagonal, so
+            # their attention pattern is unchanged.
+            diagonal = torch.eye(T, dtype=torch.bool, device=x.device).unsqueeze(0)  # [1, T, T]
+            mask = (causal_mask | pad_mask) & ~diagonal                              # [B, T, T]
+        x = self.pos_encoding(self.embedding(x))  # [B, T, d_model]
+        for attn_blk in self.attn_blocks:
+            x = attn_blk(x, mask)                 # [B, T, d_model]
+        x = self.final_layer_norm(x)              # [B, T, d_model]
+        x = self.unembedding(x)                   # [B, T, vocab_size]
+        return x
+class EncoderTransformer(nn.Module):
+    """
+    An encoder Transformer, with BERT-style bidirectionality.
+    Importantly, returns outputs with dimension d_model, i.e. same as the input embeddings.
+    Stand-alone, can be used on its own.
+    Pipeline: token embedding -> positional encoding -> n_blocks TransformerBlocks -> LayerNorm.
+    """
+    def __init__(
+            self,
+            vocab_size: int,
+            max_seq_len: int,
+            d_model: int,
+            d_ff: int,
+            n_heads: int,
+            n_blocks: int,
+        ):
+        super().__init__()
+        assert d_model % n_heads == 0
+        self.embedding = nn.Embedding(vocab_size, d_model)
+        self.pos_encoding = PositionalEncoding(max_seq_len, d_model)
+        blocks = [TransformerBlock(d_model, d_ff, n_heads) for _ in range(n_blocks)]
+        self.attn_blocks = nn.ModuleList(blocks)
+        self.final_layer_norm = nn.LayerNorm(d_model)
+    def forward(self, x: Tensor, mask: BoolTensor = None):
+        # x: [B, T] (token indices), mask: [B, 1, T] (optional, True = padding)
+        hidden = self.embedding(x)           # [B, T, d_model]
+        hidden = self.pos_encoding(hidden)   # [B, T, d_model]
+        # No causal mask is added here; only the caller-supplied mask is applied.
+        for block in self.attn_blocks:
+            hidden = block(hidden, mask)     # [B, T, d_model]
+        return self.final_layer_norm(hidden)  # [B, T, d_model]
+class DecoderTransformer(nn.Module):
+    """
+    A decoder Transformer, which assumes that the src argument already consists of encoded vectors.
+    Importantly, returns outputs with dimension vocab_size, i.e. logits.
+    Not for stand-alone use.
+    The unembedding layer shares its weight tensor with the target token embedding.
+    No causal mask is built here; tgt_mask is applied exactly as passed in.
+    """
+    def __init__(
+            self,
+            vocab_size: int,
+            max_seq_len: int,
+            d_model: int,
+            d_ff: int,
+            n_heads: int,
+            n_blocks: int,
+        ):
+        super().__init__()
+        assert d_model % n_heads == 0
+        self.embedding = nn.Embedding(vocab_size, d_model)
+        self.pos_encoding = PositionalEncoding(max_seq_len, d_model)
+        blocks = [CrossAttentionTransformerBlock(d_model, d_ff, n_heads) for _ in range(n_blocks)]
+        self.attn_blocks = nn.ModuleList(blocks)
+        self.final_layer_norm = nn.LayerNorm(d_model)
+        # In decoder Transformer, we use the unembedding matrix to map to logits of the vocabulary
+        self.unembedding = nn.Linear(d_model, vocab_size, bias=False)
+        # Weight tying: both layers hold the same [vocab_size, d_model] parameter.
+        self.unembedding.weight = self.embedding.weight
+    def forward(self, src: Tensor, tgt: Tensor, src_mask: BoolTensor = None, tgt_mask: BoolTensor = None):
+        # src: [B, T_src, d_model] (encoded), tgt: [B, T_tgt] (token indices), src_mask: [B, 1, T_src], tgt_mask: [B, T_tgt, T_tgt]
+        hidden = self.embedding(tgt)          # [B, T_tgt, d_model]
+        hidden = self.pos_encoding(hidden)    # [B, T_tgt, d_model]
+        for block in self.attn_blocks:
+            hidden = block(src, hidden, src_mask, tgt_mask)  # [B, T_tgt, d_model]
+        hidden = self.final_layer_norm(hidden)  # [B, T_tgt, d_model]
+        return self.unembedding(hidden)         # [B, T_tgt, vocab_size]
+class EncoderDecoderTransformer(nn.Module):
+    """
+    An encoder-decoder Transformer.
+    Importantly, returns outputs with dimension vocab_size, i.e. logits.
+    Stand-alone, can be used on its own.
+    Wraps an EncoderTransformer and a DecoderTransformer built with the same
+    vocab_size, max_seq_len, d_model, d_ff and n_heads; each has its own embedding.
+    """
+    def __init__(
+            self,
+            vocab_size: int,
+            max_seq_len: int,
+            d_model: int,
+            d_ff: int,
+            n_heads: int,
+            n_blocks_encoder: int,
+            n_blocks_decoder: int,
+        ):
+        super().__init__()
+        assert d_model % n_heads == 0
+        self.encoder = EncoderTransformer(vocab_size, max_seq_len, d_model, d_ff, n_heads, n_blocks_encoder)
+        self.decoder = DecoderTransformer(vocab_size, max_seq_len, d_model, d_ff, n_heads, n_blocks_decoder)
+    def forward(self, src: Tensor, tgt: Tensor, src_mask: BoolTensor = None, tgt_mask: BoolTensor = None):
+        # src: [B, T_src] (token indices), tgt: [B, T_tgt] (token indices), src_mask: [B, 1, T_src], tgt_mask: [B, T_tgt, T_tgt]
+        # src_mask is used twice: in the encoder's self-attention and in the
+        # decoder's cross-attention over the encoded source.
+        memory = self.encoder(src, src_mask)                     # [B, T_src, d_model]
+        return self.decoder(memory, tgt, src_mask, tgt_mask)     # [B, T_tgt, vocab_size]

transformers_from_scratch-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,98 @@
+Metadata-Version: 2.4
+Name: transformers-from-scratch
+Version: 0.1.0
+Summary: Encoder, decoder, and encoder-decoder Transformer architectures in PyTorch
+License-Expression: MIT
+Project-URL: Repository, https://github.com/etfrer-yi/Transformer-Variants
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: torch>=2.0
+Dynamic: license-file
+# Transformer Variants
+Implementations of encoder, decoder, and encoder-decoder Transformer architectures in PyTorch, following "Attention Is All You Need" (Vaswani et al., 2017).
+## Installation
+```bash
+pip install transformers-from-scratch
+```
+PyTorch (`torch>=2.0`) is declared as a dependency, so pip installs it automatically if it is missing. To choose a specific CUDA build, install PyTorch first from [pytorch.org](https://pytorch.org/get-started/locally/).
+## Development Setup
+```bash
+python3 -m venv venv
+source venv/bin/activate
+pip install -r requirements.txt
+```
+## Modules
+`model.py` provides three stand-alone models and the building blocks they are composed of.
+| Class | Description |
+|---|---|
+| `EncoderTransformer` | BERT-style bidirectional encoder |
+| `DecoderOnlyTransformer` | GPT-style autoregressive decoder |
+| `EncoderDecoderTransformer` | Sequence-to-sequence encoder-decoder |
+| `DecoderTransformer` | Decoder component — not for stand-alone use |
+## Usage
+```python
+import torch
+from transformer_variants import EncoderTransformer, DecoderOnlyTransformer, EncoderDecoderTransformer
+VOCAB_SIZE, MAX_SEQ_LEN = 32000, 512
+D_MODEL, D_FF, N_HEADS, N_BLOCKS = 512, 2048, 8, 6
+```
+### Encoder
+```python
+model = EncoderTransformer(VOCAB_SIZE, MAX_SEQ_LEN, D_MODEL, D_FF, N_HEADS, N_BLOCKS)
+src = torch.randint(0, VOCAB_SIZE, (B, T_src))          # [B, T_src]
+src_mask = (src == pad_id).unsqueeze(1)                 # [B, 1, T_src]  True = padding
+out = model(src, src_mask)                              # [B, T_src, d_model]
+```
+### Decoder-only
+```python
+model = DecoderOnlyTransformer(VOCAB_SIZE, MAX_SEQ_LEN, D_MODEL, D_FF, N_HEADS, N_BLOCKS)
+tgt = torch.randint(0, VOCAB_SIZE, (B, T))              # [B, T]
+pad_mask = (tgt == pad_id).unsqueeze(1)                 # [B, 1, T]  True = padding (optional)
+logits = model(tgt, pad_mask)                           # [B, T, vocab_size]
+```
+A causal mask is generated internally — no need to pass one.
+### Encoder-decoder
+```python
+model = EncoderDecoderTransformer(VOCAB_SIZE, MAX_SEQ_LEN, D_MODEL, D_FF, N_HEADS, N_BLOCKS, N_BLOCKS)
+src = torch.randint(0, VOCAB_SIZE, (B, T_src))          # [B, T_src]
+tgt = torch.randint(0, VOCAB_SIZE, (B, T_tgt))          # [B, T_tgt]
+src_mask = (src == pad_id).unsqueeze(1)                 # [B, 1, T_src]
+T_tgt = tgt.size(1)
+causal = torch.ones(T_tgt, T_tgt, dtype=torch.bool).triu(1).unsqueeze(0)  # [1, T_tgt, T_tgt]
+tgt_mask = causal | (tgt == pad_id).unsqueeze(1)        # [B, T_tgt, T_tgt]
+logits = model(src, tgt, src_mask, tgt_mask)            # [B, T_tgt, vocab_size]
+```
+## Running tests
+```bash
+python -m unittest test -v
+```

transformers_from_scratch-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,7 @@
+transformer_variants/__init__.py,sha256=U743Fca33knFSbDz3_FHwfxkpcHrEP9-uCYVL-YRSAg,260
+transformer_variants/model.py,sha256=POoztWJBJzGFyN59fwToQi3Mt2of32rEjrLyGBvHvkE,13295
+transformers_from_scratch-0.1.0.dist-info/licenses/LICENSE,sha256=vLbi0g8BI5P1FqzwEAkoqy-di-NXJKt-YP-bPzwVQjQ,1066
+transformers_from_scratch-0.1.0.dist-info/METADATA,sha256=pG5KAWgiD6mDsMSTqE_b69Sk3qNSBHdl_HZ30gHN_bo,3005
+transformers_from_scratch-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+transformers_from_scratch-0.1.0.dist-info/top_level.txt,sha256=GDjutKqgZ7wS82s968eaptW75Dx4cxZ87h-cOh9l-WA,21
+transformers_from_scratch-0.1.0.dist-info/RECORD,,

transformers_from_scratch-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

transformers_from_scratch-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 etfrer-yi
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

transformers_from_scratch-0.1.0.dist-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+transformer_variants