gptmed 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gptmed/__init__.py +37 -0
- gptmed/configs/__init__.py +1 -0
- gptmed/configs/train_config.py +154 -0
- gptmed/data/__init__.py +5 -0
- gptmed/data/parsers/__init__.py +10 -0
- gptmed/data/parsers/medquad_parser.py +257 -0
- gptmed/data/parsers/text_formatter.py +148 -0
- gptmed/inference/__init__.py +1 -0
- gptmed/inference/decoding_utils.py +190 -0
- gptmed/inference/generation_config.py +83 -0
- gptmed/inference/generator.py +253 -0
- gptmed/inference/sampling.py +261 -0
- gptmed/model/__init__.py +9 -0
- gptmed/model/architecture/__init__.py +35 -0
- gptmed/model/architecture/attention.py +188 -0
- gptmed/model/architecture/decoder_block.py +130 -0
- gptmed/model/architecture/embeddings.py +146 -0
- gptmed/model/architecture/feedforward.py +109 -0
- gptmed/model/architecture/transformer.py +204 -0
- gptmed/model/configs/__init__.py +17 -0
- gptmed/model/configs/model_config.py +155 -0
- gptmed/tokenizer/__init__.py +7 -0
- gptmed/tokenizer/tokenize_data.py +286 -0
- gptmed/tokenizer/train_tokenizer.py +218 -0
- gptmed/training/__init__.py +1 -0
- gptmed/training/dataset.py +183 -0
- gptmed/training/train.py +272 -0
- gptmed/training/trainer.py +331 -0
- gptmed/training/utils.py +212 -0
- gptmed/utils/__init__.py +1 -0
- gptmed/utils/checkpoints.py +224 -0
- gptmed/utils/logging.py +189 -0
- gptmed-0.0.1.dist-info/METADATA +325 -0
- gptmed-0.0.1.dist-info/RECORD +38 -0
- gptmed-0.0.1.dist-info/WHEEL +5 -0
- gptmed-0.0.1.dist-info/entry_points.txt +3 -0
- gptmed-0.0.1.dist-info/licenses/LICENSE +21 -0
- gptmed-0.0.1.dist-info/top_level.txt +1 -0

gptmed/model/architecture/embeddings.py
@@ -0,0 +1,146 @@
"""
Token and Positional Embeddings

PURPOSE:
This file converts token IDs (integers) into continuous vector representations
that the Transformer can process. It also adds positional information so the
model knows the order of tokens (since attention has no inherent notion of position).

WHAT THIS STEP DOES:
1. Token Embedding: Maps each token ID to a learned vector of size d_model
   - Input: [batch_size, seq_len] of token IDs
   - Output: [batch_size, seq_len, d_model] of vectors

2. Positional Embedding: Adds position information to each token
   - Two approaches: learned embeddings or sinusoidal encoding
   - Same shape as token embeddings: [batch_size, seq_len, d_model]

3. Combines both: token_emb + pos_emb
   - Final output: [batch_size, seq_len, d_model]

PACKAGES USED:
- torch: PyTorch tensors and neural network modules
- torch.nn: Embedding layer, Dropout

FILES FROM THIS PROJECT:
- None (this is a base component)

TENSOR SHAPES EXPLAINED:
- batch_size: Number of sequences processed together
- seq_len: Length of each sequence (512 in our case)
- vocab_size: Size of tokenizer vocabulary (8000)
- d_model: Embedding dimension (256-512)

COMMON FAILURE MODES TO AVOID:
- Forgetting dropout → overfitting
- Wrong positional encoding dimension → shape mismatch
- Not scaling embeddings → training instability
- Using fixed positions > max_seq_len → index out of bounds
"""

import torch
import torch.nn as nn
import math


class TokenEmbedding(nn.Module):
    """
    Convert token IDs to embeddings.

    Tensor shape flow:
        Input: [batch_size, seq_len] (token IDs)
        Output: [batch_size, seq_len, d_model] (embeddings)
    """

    def __init__(self, vocab_size: int, d_model: int):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.d_model = d_model

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Token IDs [batch_size, seq_len]

        Returns:
            Embeddings [batch_size, seq_len, d_model]

        Note: We scale by sqrt(d_model) following the original Transformer paper.
        This prevents embeddings from being too small relative to positional encodings.
        """
        return self.embedding(x) * math.sqrt(self.d_model)


class PositionalEmbedding(nn.Module):
    """
    Learned positional embeddings.

    Alternative to sinusoidal encoding - the model learns optimal position representations.
    GPT uses learned embeddings, so we follow that.

    Tensor shape flow:
        Input: [batch_size, seq_len]
        Output: [batch_size, seq_len, d_model]
    """

    def __init__(self, max_seq_len: int, d_model: int):
        super().__init__()
        self.embedding = nn.Embedding(max_seq_len, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Token IDs [batch_size, seq_len] (we only use seq_len from this)

        Returns:
            Position embeddings [batch_size, seq_len, d_model]
        """
        batch_size, seq_len = x.size()

        # Create position indices: [0, 1, 2, ..., seq_len-1]
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0)  # [1, seq_len]

        # Expand to batch: [batch_size, seq_len]
        positions = positions.expand(batch_size, seq_len)

        return self.embedding(positions)


class TokenPositionalEmbedding(nn.Module):
    """
    Combined token + positional embeddings.

    This is what the transformer actually uses.

    Tensor shape flow:
        Input: [batch_size, seq_len] (token IDs)
        Output: [batch_size, seq_len, d_model] (combined embeddings)
    """

    def __init__(self, vocab_size: int, d_model: int, max_seq_len: int, dropout: float = 0.1):
        super().__init__()
        self.token_embedding = TokenEmbedding(vocab_size, d_model)
        self.positional_embedding = PositionalEmbedding(max_seq_len, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Token IDs [batch_size, seq_len]

        Returns:
            Combined embeddings [batch_size, seq_len, d_model]
        """
        # Get token embeddings: [batch_size, seq_len, d_model]
        token_emb = self.token_embedding(x)

        # Get positional embeddings: [batch_size, seq_len, d_model]
        pos_emb = self.positional_embedding(x)

        # Combine by addition
        # Why addition? Because we want the model to learn relationships
        # between both "what" (token) and "where" (position)
        embeddings = token_emb + pos_emb

        # Apply dropout for regularization
        return self.dropout(embeddings)
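
A minimal shape-check sketch for the embeddings module above. It is not part of the package; it assumes torch and the installed gptmed wheel, and the batch size and sequence length are arbitrary example values.

import torch
from gptmed.model.architecture.embeddings import TokenPositionalEmbedding

# Arbitrary example sizes; vocab_size/d_model/max_seq_len mirror the defaults documented above
emb = TokenPositionalEmbedding(vocab_size=8000, d_model=256, max_seq_len=512, dropout=0.1)
ids = torch.randint(0, 8000, (2, 16))   # [batch_size=2, seq_len=16] of token IDs
out = emb(ids)                           # token embedding (scaled) + learned position embedding + dropout
assert out.shape == (2, 16, 256)         # [batch_size, seq_len, d_model]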

gptmed/model/architecture/feedforward.py
@@ -0,0 +1,109 @@
"""
Feed-Forward Network (FFN)

PURPOSE:
Applies position-wise transformations to each token independently.
This adds non-linear processing power to the model beyond what attention provides.

WHAT THIS STEP DOES:
1. First linear projection: Expand dimension
   - Input: [batch_size, seq_len, d_model]
   - Output: [batch_size, seq_len, d_ff]
   - Typically d_ff = 4 * d_model (expansion)

2. Non-linear activation (GELU or ReLU)
   - Introduces non-linearity
   - GELU is smoother than ReLU, often better for transformers

3. Second linear projection: Project back
   - Input: [batch_size, seq_len, d_ff]
   - Output: [batch_size, seq_len, d_model]

4. Dropout for regularization

PACKAGES USED:
- torch: PyTorch tensors
- torch.nn: Linear, Dropout, GELU/ReLU

FILES FROM THIS PROJECT:
- None (this is a base component)

TENSOR SHAPES EXPLAINED:
- d_ff: Hidden dimension in FFN (usually 4 * d_model)
- For d_model=256, d_ff=1024
- For d_model=512, d_ff=2048

COMMON FAILURE MODES TO AVOID:
- d_ff too small → insufficient expressiveness
- d_ff too large → OOM on GPU, overfitting
- Forgetting activation → just linear transformation (useless)
- Using ReLU with high learning rate → dead neurons
"""

import torch
import torch.nn as nn


class FeedForward(nn.Module):
    """
    Position-wise Feed-Forward Network.

    Applies the same transformation to each position independently.
    This is why it's called "position-wise" - no interaction between positions.

    Architecture:
        Linear(d_model → d_ff) → GELU → Linear(d_ff → d_model) → Dropout

    Tensor shape flow:
        Input: [batch_size, seq_len, d_model]
        Hidden: [batch_size, seq_len, d_ff]
        Output: [batch_size, seq_len, d_model]
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1):
        """
        Args:
            d_model: Model dimension (embedding size)
            d_ff: Feed-forward hidden dimension (typically 4 * d_model)
            dropout: Dropout probability
        """
        super().__init__()

        # First linear projection: expand
        self.linear1 = nn.Linear(d_model, d_ff)

        # Non-linear activation
        # GELU (Gaussian Error Linear Unit) - smoother than ReLU
        # Used in GPT-2, BERT, and most modern transformers
        self.activation = nn.GELU()

        # Second linear projection: compress back
        self.linear2 = nn.Linear(d_ff, d_model)

        # Dropout for regularization
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Input tensor [batch_size, seq_len, d_model]

        Returns:
            Output tensor [batch_size, seq_len, d_model]

        Note: Each position is processed identically and independently.
        No attention or interaction between different positions here.
        """
        # Expand: [batch_size, seq_len, d_model] → [batch_size, seq_len, d_ff]
        x = self.linear1(x)

        # Non-linearity
        x = self.activation(x)

        # Compress back: [batch_size, seq_len, d_ff] → [batch_size, seq_len, d_model]
        x = self.linear2(x)

        # Dropout
        x = self.dropout(x)

        return x
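
A similar illustrative sketch for FeedForward (again not part of the package; arbitrary tensor sizes, assumes torch and the installed gptmed wheel). The key property to verify is that the input and output shapes match, since the expansion to d_ff happens only internally.

import torch
from gptmed.model.architecture.feedforward import FeedForward

ffn = FeedForward(d_model=256, d_ff=1024, dropout=0.1)   # d_ff = 4 * d_model, as documented above
x = torch.randn(2, 16, 256)   # [batch_size, seq_len, d_model]
y = ffn(x)                    # expand to 1024 → GELU → project back to 256 → dropout
assert y.shape == x.shape     # position-wise: the shape is preserved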

gptmed/model/architecture/transformer.py
@@ -0,0 +1,204 @@
"""
Full GPT-Style Transformer Model

PURPOSE:
Assembles all components (embeddings, decoder blocks, output projection)
into a complete causal language model for next-token prediction.

WHAT THIS STEP DOES:
1. Token + Positional Embeddings
   - Convert input IDs to vectors: [batch_size, seq_len, d_model]

2. Stack of N Decoder Blocks
   - Each block applies self-attention + FFN
   - Typically N = 4-6 layers for our hardware

3. Final Layer Normalization
   - Stabilize outputs before projection

4. Output Projection (Language Modeling Head)
   - Project to vocabulary: [batch_size, seq_len, vocab_size]
   - No activation (raw logits for CrossEntropyLoss)

5. Forward Pass
   - Input: token IDs [batch_size, seq_len]
   - Output: logits [batch_size, seq_len, vocab_size]

PACKAGES USED:
- torch: PyTorch tensors
- torch.nn: Module, Linear, LayerNorm, ModuleList

FILES FROM THIS PROJECT:
- architecture/embeddings.py: TokenPositionalEmbedding
- architecture/decoder_block.py: TransformerDecoderBlock
- configs/model_config.py: Hyperparameters (d_model, n_layers, etc.)

TENSOR SHAPES:
- Input IDs: [batch_size, seq_len] (integers)
- Embeddings: [batch_size, seq_len, d_model]
- After blocks: [batch_size, seq_len, d_model]
- Logits: [batch_size, seq_len, vocab_size]

HYPERPARAMETERS (for GTX 1080):
- vocab_size: 8000 (from tokenizer)
- d_model: 256-512 (embedding dimension)
- n_layers: 4-6 (number of transformer blocks)
- n_heads: 4-8 (attention heads)
- d_ff: 4 * d_model (FFN hidden size)
- dropout: 0.1-0.2
- max_seq_len: 512

COMMON FAILURE MODES TO AVOID:
- Not tying embeddings and output weights → slower convergence
- Too many layers → OOM or slow training
- d_model not divisible by n_heads → shape mismatch
- Missing final LayerNorm → unstable outputs
- Forgetting to handle padding mask → attending to padding
"""

import torch
import torch.nn as nn

from .embeddings import TokenPositionalEmbedding
from .decoder_block import TransformerDecoderBlock
from .attention import create_causal_mask


class GPTTransformer(nn.Module):
    """
    GPT-style Causal Language Model.

    This is the COMPLETE model. Everything comes together here.

    Architecture:
        1. Token + Positional Embeddings
        2. N x Decoder Blocks
        3. Final LayerNorm
        4. LM Head (projects to vocab)

    Training objective: Next-token prediction
        - Given tokens [t0, t1, t2, ..., tn]
        - Predict [t1, t2, t3, ..., tn+1]

    Tensor flow:
        Input: [batch_size, seq_len] token IDs
        Embed: [batch_size, seq_len, d_model]
        Blocks: [batch_size, seq_len, d_model]
        Logits: [batch_size, seq_len, vocab_size]
    """

    def __init__(self, config):
        """
        Args:
            config: ModelConfig object with hyperparameters
        """
        super().__init__()

        self.config = config

        # 1. Embeddings (token + positional)
        self.embeddings = TokenPositionalEmbedding(
            vocab_size=config.vocab_size,
            d_model=config.d_model,
            max_seq_len=config.max_seq_len,
            dropout=config.dropout,
        )

        # 2. Stack of decoder blocks
        self.decoder_blocks = nn.ModuleList(
            [
                TransformerDecoderBlock(
                    d_model=config.d_model,
                    n_heads=config.n_heads,
                    d_ff=config.d_ff,
                    dropout=config.dropout,
                )
                for _ in range(config.n_layers)
            ]
        )

        # 3. Final layer normalization
        self.final_norm = nn.LayerNorm(config.d_model)

        # 4. Language modeling head (output projection)
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Weight tying: Share weights between embeddings and output projection
        # This is standard practice in language models
        # Why? Reduces parameters and improves generalization
        self.lm_head.weight = self.embeddings.token_embedding.embedding.weight

        # Initialize weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """
        Initialize weights with a normal distribution (mean 0.0, std 0.02), as in GPT-2.

        This is CRITICAL for training stability.
        Poor initialization → vanishing/exploding gradients
        """
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.ones_(module.weight)
            torch.nn.init.zeros_(module.bias)

    def forward(self, input_ids: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
        """
        Forward pass through the model.

        Args:
            input_ids: Token IDs [batch_size, seq_len]
            mask: Optional causal mask [seq_len, seq_len]

        Returns:
            Logits [batch_size, seq_len, vocab_size]

        Step-by-step:
            1. Convert token IDs to embeddings
            2. Pass through each decoder block
            3. Apply final normalization
            4. Project to vocabulary
        """
        batch_size, seq_len = input_ids.size()

        # Create causal mask if not provided
        if mask is None:
            mask = create_causal_mask(seq_len, device=input_ids.device)

        # 1. Embeddings: [batch_size, seq_len] → [batch_size, seq_len, d_model]
        x = self.embeddings(input_ids)

        # 2. Pass through decoder blocks
        for block in self.decoder_blocks:
            x = block(x, mask)

        # 3. Final layer norm
        x = self.final_norm(x)

        # 4. Project to vocabulary: [batch_size, seq_len, d_model] → [batch_size, seq_len, vocab_size]
        logits = self.lm_head(x)

        return logits

    def count_parameters(self) -> int:
        """Count total trainable parameters."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

    def get_num_params(self, non_embedding: bool = False) -> int:
        """
        Get parameter count.

        Args:
            non_embedding: If True, exclude embedding parameters
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.embeddings.token_embedding.embedding.weight.numel()
            n_params -= self.embeddings.positional_embedding.embedding.weight.numel()
        return n_params
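
An end-to-end sketch of a forward pass through GPTTransformer with freshly (randomly) initialized, untrained weights. This is illustrative only: it assumes the installed gptmed wheel, the corrected import path in gptmed/model/configs/__init__.py shown below, and arbitrary batch/sequence sizes.

import torch
from gptmed.model.configs import get_small_config
from gptmed.model.architecture.transformer import GPTTransformer

config = get_small_config()                       # d_model=256, n_layers=4, vocab_size=8000, ...
model = GPTTransformer(config)                    # untrained; weights are randomly initialized

input_ids = torch.randint(0, config.vocab_size, (2, 128))   # [batch_size=2, seq_len=128]
logits = model(input_ids)                         # causal mask is built internally when mask=None
assert logits.shape == (2, 128, config.vocab_size)

print(f"{model.count_parameters():,} trainable parameters")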

gptmed/model/configs/__init__.py
@@ -0,0 +1,17 @@
"""
Model configuration module
"""

from gptmed.model.configs.model_config import (
    ModelConfig,
    get_tiny_config,
    get_small_config,
    get_medium_config,
)

__all__ = [
    "ModelConfig",
    "get_tiny_config",
    "get_small_config",
    "get_medium_config",
]

gptmed/model/configs/model_config.py
@@ -0,0 +1,155 @@
"""
Model Configuration

PURPOSE:
Central place to store all model hyperparameters. Makes it easy to experiment
with different model sizes without changing code.

WHAT THIS FILE CONTAINS:
1. ModelConfig dataclass with all hyperparameters:
   - vocab_size: From tokenizer (8000)
   - d_model: Embedding/hidden dimension
   - n_layers: Number of transformer blocks
   - n_heads: Number of attention heads
   - d_ff: Feed-forward hidden dimension
   - dropout: Dropout probability
   - max_seq_len: Maximum sequence length

2. Predefined configurations:
   - Tiny: For quick testing (d_model=128, n_layers=2)
   - Small: For GTX 1080 training (d_model=256, n_layers=4)
   - Medium: Larger if memory allows (d_model=512, n_layers=6)

PACKAGES USED:
- dataclasses: For clean config structure
- json: For saving/loading configs

FILES FROM THIS PROJECT:
- None (this defines configs for other files to use)

DESIGN DECISIONS:
- d_model must be divisible by n_heads
- d_ff typically 4 * d_model (expansion ratio)
- dropout 0.1-0.2 (too high → underfitting, too low → overfitting)
- max_seq_len matches tokenization (512)

MEMORY ESTIMATION (approximate):
- Model parameters ≈ 12 * n_layers * d_model^2
- Small config: ~10M parameters (~40MB)
- Medium config: ~40M parameters (~160MB)
- Fits comfortably in 8GB VRAM with batch_size=16-32
"""

from dataclasses import dataclass
import json
from pathlib import Path


@dataclass
class ModelConfig:
    """
    Transformer model configuration.

    All hyperparameters in one place for easy experimentation.
    """

    # Vocabulary
    vocab_size: int = 8000  # From SentencePiece tokenizer

    # Architecture
    d_model: int = 256  # Embedding/hidden dimension
    n_layers: int = 4  # Number of transformer blocks
    n_heads: int = 4  # Number of attention heads
    d_ff: int = 1024  # FFN hidden dimension (4 * d_model)

    # Regularization
    dropout: float = 0.1  # Dropout probability

    # Sequence
    max_seq_len: int = 512  # Maximum sequence length

    # Special tokens (from tokenizer)
    pad_token_id: int = 0
    unk_token_id: int = 1
    bos_token_id: int = 2
    eos_token_id: int = 3

    def __post_init__(self):
        """Validate configuration."""
        assert (
            self.d_model % self.n_heads == 0
        ), f"d_model ({self.d_model}) must be divisible by n_heads ({self.n_heads})"
        assert self.d_model > 0, "d_model must be positive"
        assert self.n_layers > 0, "n_layers must be positive"
        assert 0.0 <= self.dropout < 1.0, "dropout must be in [0, 1)"

    @property
    def d_head(self) -> int:
        """Dimension per attention head."""
        return self.d_model // self.n_heads

    def to_dict(self) -> dict:
        """Convert to dictionary."""
        return {
            "vocab_size": self.vocab_size,
            "d_model": self.d_model,
            "n_layers": self.n_layers,
            "n_heads": self.n_heads,
            "d_ff": self.d_ff,
            "dropout": self.dropout,
            "max_seq_len": self.max_seq_len,
            "pad_token_id": self.pad_token_id,
            "unk_token_id": self.unk_token_id,
            "bos_token_id": self.bos_token_id,
            "eos_token_id": self.eos_token_id,
        }

    def save(self, path: Path):
        """Save config to JSON file."""
        with open(path, "w") as f:
            json.dump(self.to_dict(), f, indent=2)

    @classmethod
    def from_dict(cls, config_dict: dict) -> "ModelConfig":
        """Load from dictionary."""
        return cls(**config_dict)

    @classmethod
    def from_file(cls, path: Path) -> "ModelConfig":
        """Load from JSON file."""
        with open(path, "r") as f:
            config_dict = json.load(f)
        return cls.from_dict(config_dict)


# Predefined configurations for different use cases


def get_tiny_config() -> ModelConfig:
    """
    Tiny model for quick testing and debugging.
    ~2M parameters, very fast training.
    """
    return ModelConfig(
        vocab_size=8000, d_model=128, n_layers=2, n_heads=4, d_ff=512, dropout=0.1, max_seq_len=512
    )


def get_small_config() -> ModelConfig:
    """
    Small model for GTX 1080 (8GB VRAM).
    ~10M parameters, good balance of speed and capacity.
    """
    return ModelConfig(
        vocab_size=8000, d_model=256, n_layers=4, n_heads=4, d_ff=1024, dropout=0.1, max_seq_len=512
    )


def get_medium_config() -> ModelConfig:
    """
    Medium model for GPUs with more memory.
    ~40M parameters, better quality but slower.
    """
    return ModelConfig(
        vocab_size=8000, d_model=512, n_layers=6, n_heads=8, d_ff=2048, dropout=0.1, max_seq_len=512
    )
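
A save/load round-trip sketch for ModelConfig, illustrative only; it assumes the installed gptmed wheel and the corrected configs import above, and the JSON path is a hypothetical example.

from pathlib import Path
from gptmed.model.configs.model_config import ModelConfig, get_medium_config

config = get_medium_config()        # d_model=512, n_layers=6, n_heads=8
assert config.d_head == 64          # 512 // 8, via the d_head property

path = Path("model_config.json")    # hypothetical output path
config.save(path)                   # writes to_dict() as indented JSON
restored = ModelConfig.from_file(path)
assert restored.to_dict() == config.to_dict()   # validation in __post_init__ also re-runs on load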