gptmed-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. gptmed/__init__.py +37 -0
  2. gptmed/configs/__init__.py +1 -0
  3. gptmed/configs/train_config.py +154 -0
  4. gptmed/data/__init__.py +5 -0
  5. gptmed/data/parsers/__init__.py +10 -0
  6. gptmed/data/parsers/medquad_parser.py +257 -0
  7. gptmed/data/parsers/text_formatter.py +148 -0
  8. gptmed/inference/__init__.py +1 -0
  9. gptmed/inference/decoding_utils.py +190 -0
  10. gptmed/inference/generation_config.py +83 -0
  11. gptmed/inference/generator.py +253 -0
  12. gptmed/inference/sampling.py +261 -0
  13. gptmed/model/__init__.py +9 -0
  14. gptmed/model/architecture/__init__.py +35 -0
  15. gptmed/model/architecture/attention.py +188 -0
  16. gptmed/model/architecture/decoder_block.py +130 -0
  17. gptmed/model/architecture/embeddings.py +146 -0
  18. gptmed/model/architecture/feedforward.py +109 -0
  19. gptmed/model/architecture/transformer.py +204 -0
  20. gptmed/model/configs/__init__.py +17 -0
  21. gptmed/model/configs/model_config.py +155 -0
  22. gptmed/tokenizer/__init__.py +7 -0
  23. gptmed/tokenizer/tokenize_data.py +286 -0
  24. gptmed/tokenizer/train_tokenizer.py +218 -0
  25. gptmed/training/__init__.py +1 -0
  26. gptmed/training/dataset.py +183 -0
  27. gptmed/training/train.py +272 -0
  28. gptmed/training/trainer.py +331 -0
  29. gptmed/training/utils.py +212 -0
  30. gptmed/utils/__init__.py +1 -0
  31. gptmed/utils/checkpoints.py +224 -0
  32. gptmed/utils/logging.py +189 -0
  33. gptmed-0.0.1.dist-info/METADATA +325 -0
  34. gptmed-0.0.1.dist-info/RECORD +38 -0
  35. gptmed-0.0.1.dist-info/WHEEL +5 -0
  36. gptmed-0.0.1.dist-info/entry_points.txt +3 -0
  37. gptmed-0.0.1.dist-info/licenses/LICENSE +21 -0
  38. gptmed-0.0.1.dist-info/top_level.txt +1 -0
gptmed/model/architecture/embeddings.py
@@ -0,0 +1,146 @@
+"""
+Token and Positional Embeddings
+
+PURPOSE:
+This file converts token IDs (integers) into continuous vector representations
+that the Transformer can process. It also adds positional information so the
+model knows the order of tokens (since attention has no inherent notion of position).
+
+WHAT THIS STEP DOES:
+1. Token Embedding: Maps each token ID to a learned vector of size d_model
+   - Input: [batch_size, seq_len] of token IDs
+   - Output: [batch_size, seq_len, d_model] of vectors
+
+2. Positional Embedding: Adds position information to each token
+   - Two approaches: learned embeddings or sinusoidal encoding
+   - Same shape as token embeddings: [batch_size, seq_len, d_model]
+
+3. Combines both: token_emb + pos_emb
+   - Final output: [batch_size, seq_len, d_model]
+
+PACKAGES USED:
+- torch: PyTorch tensors and neural network modules
+- torch.nn: Embedding layer, Dropout
+
+FILES FROM THIS PROJECT:
+- None (this is a base component)
+
+TENSOR SHAPES EXPLAINED:
+- batch_size: Number of sequences processed together
+- seq_len: Length of each sequence (512 in our case)
+- vocab_size: Size of tokenizer vocabulary (8000)
+- d_model: Embedding dimension (256-512)
+
+COMMON FAILURE MODES TO AVOID:
+- Forgetting dropout → overfitting
+- Wrong positional encoding dimension → shape mismatch
+- Not scaling embeddings → training instability
+- Using fixed positions > max_seq_len → index out of bounds
+"""
+
+import torch
+import torch.nn as nn
+import math
+
+
+class TokenEmbedding(nn.Module):
+    """
+    Convert token IDs to embeddings.
+
+    Tensor shape flow:
+        Input: [batch_size, seq_len] (token IDs)
+        Output: [batch_size, seq_len, d_model] (embeddings)
+    """
+
+    def __init__(self, vocab_size: int, d_model: int):
+        super().__init__()
+        self.embedding = nn.Embedding(vocab_size, d_model)
+        self.d_model = d_model
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x: Token IDs [batch_size, seq_len]
+
+        Returns:
+            Embeddings [batch_size, seq_len, d_model]
+
+        Note: We scale by sqrt(d_model) following the original Transformer paper.
+        This prevents embeddings from being too small relative to positional encodings.
+        """
+        return self.embedding(x) * math.sqrt(self.d_model)
+
+
+class PositionalEmbedding(nn.Module):
+    """
+    Learned positional embeddings.
+
+    Alternative to sinusoidal encoding - the model learns optimal position representations.
+    GPT uses learned embeddings, so we follow that.
+
+    Tensor shape flow:
+        Input: [batch_size, seq_len]
+        Output: [batch_size, seq_len, d_model]
+    """
+
+    def __init__(self, max_seq_len: int, d_model: int):
+        super().__init__()
+        self.embedding = nn.Embedding(max_seq_len, d_model)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x: Token IDs [batch_size, seq_len] (we only use seq_len from this)
+
+        Returns:
+            Position embeddings [batch_size, seq_len, d_model]
+        """
+        batch_size, seq_len = x.size()
+
+        # Create position indices: [0, 1, 2, ..., seq_len-1]
+        positions = torch.arange(seq_len, device=x.device).unsqueeze(0)  # [1, seq_len]
+
+        # Expand to batch: [batch_size, seq_len]
+        positions = positions.expand(batch_size, seq_len)
+
+        return self.embedding(positions)
+
+
+class TokenPositionalEmbedding(nn.Module):
+    """
+    Combined token + positional embeddings.
+
+    This is what the transformer actually uses.
+
+    Tensor shape flow:
+        Input: [batch_size, seq_len] (token IDs)
+        Output: [batch_size, seq_len, d_model] (combined embeddings)
+    """

+    def __init__(self, vocab_size: int, d_model: int, max_seq_len: int, dropout: float = 0.1):
+        super().__init__()
+        self.token_embedding = TokenEmbedding(vocab_size, d_model)
+        self.positional_embedding = PositionalEmbedding(max_seq_len, d_model)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x: Token IDs [batch_size, seq_len]
+
+        Returns:
+            Combined embeddings [batch_size, seq_len, d_model]
+        """
+        # Get token embeddings: [batch_size, seq_len, d_model]
+        token_emb = self.token_embedding(x)
+
+        # Get positional embeddings: [batch_size, seq_len, d_model]
+        pos_emb = self.positional_embedding(x)
+
+        # Combine by addition
+        # Why addition? Because we want the model to learn relationships
+        # between both "what" (token) and "where" (position)
+        embeddings = token_emb + pos_emb
+
+        # Apply dropout for regularization
+        return self.dropout(embeddings)
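
To make the shape flow concrete, here is a minimal usage sketch (editorial, not part of the wheel); it only assumes TokenPositionalEmbedding behaves as defined in the hunk above.

import torch
from gptmed.model.architecture.embeddings import TokenPositionalEmbedding

# Toy sizes for illustration; the package defaults are vocab_size=8000, d_model=256, max_seq_len=512.
embed = TokenPositionalEmbedding(vocab_size=100, d_model=16, max_seq_len=32, dropout=0.1)

# A batch of 2 sequences, 8 tokens each (IDs must be < vocab_size, seq_len <= max_seq_len).
token_ids = torch.randint(0, 100, (2, 8))   # [batch_size, seq_len]
out = embed(token_ids)                       # token + position embeddings, with dropout
print(out.shape)                             # torch.Size([2, 8, 16])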
gptmed/model/architecture/feedforward.py
@@ -0,0 +1,109 @@
+"""
+Feed-Forward Network (FFN)
+
+PURPOSE:
+Applies position-wise transformations to each token independently.
+This adds non-linear processing power to the model beyond what attention provides.
+
+WHAT THIS STEP DOES:
+1. First linear projection: Expand dimension
+   - Input: [batch_size, seq_len, d_model]
+   - Output: [batch_size, seq_len, d_ff]
+   - Typically d_ff = 4 * d_model (expansion)
+
+2. Non-linear activation (GELU or ReLU)
+   - Introduces non-linearity
+   - GELU is smoother than ReLU, often better for transformers
+
+3. Second linear projection: Project back
+   - Input: [batch_size, seq_len, d_ff]
+   - Output: [batch_size, seq_len, d_model]
+
+4. Dropout for regularization
+
+PACKAGES USED:
+- torch: PyTorch tensors
+- torch.nn: Linear, Dropout, GELU/ReLU
+
+FILES FROM THIS PROJECT:
+- None (this is a base component)
+
+TENSOR SHAPES EXPLAINED:
+- d_ff: Hidden dimension in FFN (usually 4 * d_model)
+- For d_model=256, d_ff=1024
+- For d_model=512, d_ff=2048
+
+COMMON FAILURE MODES TO AVOID:
+- d_ff too small → insufficient expressiveness
+- d_ff too large → OOM on GPU, overfitting
+- Forgetting activation → just linear transformation (useless)
+- Using ReLU with high learning rate → dead neurons
+"""
+
+import torch
+import torch.nn as nn
+
+
+class FeedForward(nn.Module):
+    """
+    Position-wise Feed-Forward Network.
+
+    Applies the same transformation to each position independently.
+    This is why it's called "position-wise" - no interaction between positions.
+
+    Architecture:
+        Linear(d_model → d_ff) → GELU → Linear(d_ff → d_model) → Dropout
+
+    Tensor shape flow:
+        Input: [batch_size, seq_len, d_model]
+        Hidden: [batch_size, seq_len, d_ff]
+        Output: [batch_size, seq_len, d_model]
+    """
+
+    def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1):
+        """
+        Args:
+            d_model: Model dimension (embedding size)
+            d_ff: Feed-forward hidden dimension (typically 4 * d_model)
+            dropout: Dropout probability
+        """
+        super().__init__()
+
+        # First linear projection: expand
+        self.linear1 = nn.Linear(d_model, d_ff)
+
+        # Non-linear activation
+        # GELU (Gaussian Error Linear Unit) - smoother than ReLU
+        # Used in GPT-2, BERT, and most modern transformers
+        self.activation = nn.GELU()
+
+        # Second linear projection: compress back
+        self.linear2 = nn.Linear(d_ff, d_model)
+
+        # Dropout for regularization
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x: Input tensor [batch_size, seq_len, d_model]
+
+        Returns:
+            Output tensor [batch_size, seq_len, d_model]
+
+        Note: Each position is processed identically and independently.
+        No attention or interaction between different positions here.
+        """
+        # Expand: [batch_size, seq_len, d_model] → [batch_size, seq_len, d_ff]
+        x = self.linear1(x)
+
+        # Non-linearity
+        x = self.activation(x)
+
+        # Compress back: [batch_size, seq_len, d_ff] → [batch_size, seq_len, d_model]
+        x = self.linear2(x)
+
+        # Dropout
+        x = self.dropout(x)
+
+        return x
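
A short sketch of the expand-then-project shape flow (editorial, not shipped in the wheel), assuming FeedForward works as defined above:

import torch
from gptmed.model.architecture.feedforward import FeedForward

# d_ff = 4 * d_model, following the convention in the docstring.
ffn = FeedForward(d_model=16, d_ff=64, dropout=0.1)

x = torch.randn(2, 8, 16)   # [batch_size, seq_len, d_model]
y = ffn(x)                  # expands to [2, 8, 64] internally, then projects back
print(y.shape)              # torch.Size([2, 8, 16])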
gptmed/model/architecture/transformer.py
@@ -0,0 +1,204 @@
+"""
+Full GPT-Style Transformer Model
+
+PURPOSE:
+Assembles all components (embeddings, decoder blocks, output projection)
+into a complete causal language model for next-token prediction.
+
+WHAT THIS STEP DOES:
+1. Token + Positional Embeddings
+   - Convert input IDs to vectors: [batch_size, seq_len, d_model]
+
+2. Stack of N Decoder Blocks
+   - Each block applies self-attention + FFN
+   - Typically N = 4-6 layers for our hardware
+
+3. Final Layer Normalization
+   - Stabilize outputs before projection
+
+4. Output Projection (Language Modeling Head)
+   - Project to vocabulary: [batch_size, seq_len, vocab_size]
+   - No activation (raw logits for CrossEntropyLoss)
+
+5. Forward Pass
+   - Input: token IDs [batch_size, seq_len]
+   - Output: logits [batch_size, seq_len, vocab_size]
+
+PACKAGES USED:
+- torch: PyTorch tensors
+- torch.nn: Module, Linear, LayerNorm, ModuleList
+
+FILES FROM THIS PROJECT:
+- architecture/embeddings.py: TokenPositionalEmbedding
+- architecture/decoder_block.py: TransformerDecoderBlock
+- configs/model_config.py: Hyperparameters (d_model, n_layers, etc.)
+
+TENSOR SHAPES:
+- Input IDs: [batch_size, seq_len] (integers)
+- Embeddings: [batch_size, seq_len, d_model]
+- After blocks: [batch_size, seq_len, d_model]
+- Logits: [batch_size, seq_len, vocab_size]
+
+HYPERPARAMETERS (for GTX 1080):
+- vocab_size: 8000 (from tokenizer)
+- d_model: 256-512 (embedding dimension)
+- n_layers: 4-6 (number of transformer blocks)
+- n_heads: 4-8 (attention heads)
+- d_ff: 4 * d_model (FFN hidden size)
+- dropout: 0.1-0.2
+- max_seq_len: 512
+
+COMMON FAILURE MODES TO AVOID:
+- Not tying embeddings and output weights → slower convergence
+- Too many layers → OOM or slow training
+- d_model not divisible by n_heads → shape mismatch
+- Missing final LayerNorm → unstable outputs
+- Forgetting to handle padding mask → attending to padding
+"""
+
+import torch
+import torch.nn as nn
+
+from .embeddings import TokenPositionalEmbedding
+from .decoder_block import TransformerDecoderBlock
+from .attention import create_causal_mask
+
+
+class GPTTransformer(nn.Module):
+    """
+    GPT-style Causal Language Model.
+
+    This is the COMPLETE model. Everything comes together here.
+
+    Architecture:
+        1. Token + Positional Embeddings
+        2. N x Decoder Blocks
+        3. Final LayerNorm
+        4. LM Head (projects to vocab)
+
+    Training objective: Next-token prediction
+        - Given tokens [t0, t1, t2, ..., tn]
+        - Predict [t1, t2, t3, ..., tn+1]
+
+    Tensor flow:
+        Input: [batch_size, seq_len] token IDs
+        Embed: [batch_size, seq_len, d_model]
+        Blocks: [batch_size, seq_len, d_model]
+        Logits: [batch_size, seq_len, vocab_size]
+    """
+
+    def __init__(self, config):
+        """
+        Args:
+            config: ModelConfig object with hyperparameters
+        """
+        super().__init__()
+
+        self.config = config
+
+        # 1. Embeddings (token + positional)
+        self.embeddings = TokenPositionalEmbedding(
+            vocab_size=config.vocab_size,
+            d_model=config.d_model,
+            max_seq_len=config.max_seq_len,
+            dropout=config.dropout,
+        )
+
+        # 2. Stack of decoder blocks
+        self.decoder_blocks = nn.ModuleList(
+            [
+                TransformerDecoderBlock(
+                    d_model=config.d_model,
+                    n_heads=config.n_heads,
+                    d_ff=config.d_ff,
+                    dropout=config.dropout,
+                )
+                for _ in range(config.n_layers)
+            ]
+        )
+
+        # 3. Final layer normalization
+        self.final_norm = nn.LayerNorm(config.d_model)
+
+        # 4. Language modeling head (output projection)
+        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+
+        # Weight tying: Share weights between embeddings and output projection
+        # This is standard practice in language models
+        # Why? Reduces parameters and improves generalization
+        self.lm_head.weight = self.embeddings.token_embedding.embedding.weight
+
+        # Initialize weights
+        self.apply(self._init_weights)
+
+    def _init_weights(self, module):
+ """
136
+ Initialize weights using Xavier/Glorot initialization.
137
+
138
+ This is CRITICAL for training stability.
139
+ Poor initialization → vanishing/exploding gradients
140
+ """
+        if isinstance(module, nn.Linear):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+        elif isinstance(module, nn.LayerNorm):
+            torch.nn.init.ones_(module.weight)
+            torch.nn.init.zeros_(module.bias)
+
+    def forward(self, input_ids: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
+        """
+        Forward pass through the model.
+
+        Args:
+            input_ids: Token IDs [batch_size, seq_len]
+            mask: Optional causal mask [seq_len, seq_len]
+
+        Returns:
+            Logits [batch_size, seq_len, vocab_size]
+
+        Step-by-step:
+            1. Convert token IDs to embeddings
+            2. Pass through each decoder block
+            3. Apply final normalization
+            4. Project to vocabulary
+        """
+        batch_size, seq_len = input_ids.size()
+
+        # Create causal mask if not provided
+        if mask is None:
+            mask = create_causal_mask(seq_len, device=input_ids.device)
+
+        # 1. Embeddings: [batch_size, seq_len] → [batch_size, seq_len, d_model]
+        x = self.embeddings(input_ids)
+
+        # 2. Pass through decoder blocks
+        for block in self.decoder_blocks:
+            x = block(x, mask)
+
+        # 3. Final layer norm
+        x = self.final_norm(x)
+
+        # 4. Project to vocabulary: [batch_size, seq_len, d_model] → [batch_size, seq_len, vocab_size]
+        logits = self.lm_head(x)
+
+        return logits
+
+    def count_parameters(self) -> int:
+        """Count total trainable parameters."""
+        return sum(p.numel() for p in self.parameters() if p.requires_grad)
+
+    def get_num_params(self, non_embedding: bool = False) -> int:
+        """
+        Get parameter count.
+
+        Args:
+            non_embedding: If True, exclude embedding parameters
+        """
+        n_params = sum(p.numel() for p in self.parameters())
+        if non_embedding:
+            n_params -= self.embeddings.token_embedding.embedding.weight.numel()
+            n_params -= self.embeddings.positional_embedding.embedding.weight.numel()
+        return n_params
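
The forward pass and the next-token objective described in the docstring can be exercised with the sketch below (editorial, not part of the wheel). It assumes the sibling modules decoder_block.py and attention.py, which are listed in this diff but not shown in this excerpt, behave as their imports suggest.

import torch
import torch.nn.functional as F
from gptmed.model.configs.model_config import get_tiny_config
from gptmed.model.architecture.transformer import GPTTransformer

config = get_tiny_config()                 # d_model=128, n_layers=2, vocab_size=8000
model = GPTTransformer(config)
print(f"{model.count_parameters():,} trainable parameters")

# Next-token prediction: the target for each position is the following token.
input_ids = torch.randint(0, config.vocab_size, (2, 16))  # [batch_size, seq_len]
logits = model(input_ids)                                 # [batch_size, seq_len, vocab_size]
loss = F.cross_entropy(
    logits[:, :-1, :].reshape(-1, config.vocab_size),     # predictions for positions 0..seq_len-2
    input_ids[:, 1:].reshape(-1),                         # targets: tokens 1..seq_len-1
)
print(loss.item())                                        # roughly ln(8000) ≈ 9.0 at initialization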
gptmed/model/configs/__init__.py
@@ -0,0 +1,17 @@
+"""
+Model configuration module
+"""
+
+from gptmed.model.configs.model_config import (
+    ModelConfig,
+    get_tiny_config,
+    get_small_config,
+    get_medium_config,
+)
+
+__all__ = [
+    "ModelConfig",
+    "get_tiny_config",
+    "get_small_config",
+    "get_medium_config",
+]
gptmed/model/configs/model_config.py
@@ -0,0 +1,155 @@
+"""
+Model Configuration
+
+PURPOSE:
+Central place to store all model hyperparameters. Makes it easy to experiment
+with different model sizes without changing code.
+
+WHAT THIS FILE CONTAINS:
+1. ModelConfig dataclass with all hyperparameters:
+   - vocab_size: From tokenizer (8000)
+   - d_model: Embedding/hidden dimension
+   - n_layers: Number of transformer blocks
+   - n_heads: Number of attention heads
+   - d_ff: Feed-forward hidden dimension
+   - dropout: Dropout probability
+   - max_seq_len: Maximum sequence length
+
+2. Predefined configurations:
+   - Tiny: For quick testing (d_model=128, n_layers=2)
+   - Small: For GTX 1080 training (d_model=256, n_layers=4)
+   - Medium: Larger if memory allows (d_model=512, n_layers=6)
+
+PACKAGES USED:
+- dataclasses: For clean config structure
+- json: For saving/loading configs
+
+FILES FROM THIS PROJECT:
+- None (this defines configs for other files to use)
+
+DESIGN DECISIONS:
+- d_model must be divisible by n_heads
+- d_ff typically 4 * d_model (expansion ratio)
+- dropout 0.1-0.2 (too high → underfitting, too low → overfitting)
+- max_seq_len matches tokenization (512)
+
+MEMORY ESTIMATION (approximate):
+- Model parameters ≈ 12 * n_layers * d_model^2
+- Small config: ~10M parameters (~40MB)
+- Medium config: ~40M parameters (~160MB)
+- Fits comfortably in 8GB VRAM with batch_size=16-32
+"""
+
+from dataclasses import dataclass
+import json
+from pathlib import Path
+
+
+@dataclass
+class ModelConfig:
+    """
+    Transformer model configuration.
+
+    All hyperparameters in one place for easy experimentation.
+    """
+
+    # Vocabulary
+    vocab_size: int = 8000  # From SentencePiece tokenizer
+
+    # Architecture
+    d_model: int = 256  # Embedding/hidden dimension
+    n_layers: int = 4  # Number of transformer blocks
+    n_heads: int = 4  # Number of attention heads
+    d_ff: int = 1024  # FFN hidden dimension (4 * d_model)
+
+    # Regularization
+    dropout: float = 0.1  # Dropout probability
+
+    # Sequence
+    max_seq_len: int = 512  # Maximum sequence length
+
+    # Special tokens (from tokenizer)
+    pad_token_id: int = 0
+    unk_token_id: int = 1
+    bos_token_id: int = 2
+    eos_token_id: int = 3
+
+    def __post_init__(self):
+        """Validate configuration."""
+        assert (
+            self.d_model % self.n_heads == 0
+        ), f"d_model ({self.d_model}) must be divisible by n_heads ({self.n_heads})"
+        assert self.d_model > 0, "d_model must be positive"
+        assert self.n_layers > 0, "n_layers must be positive"
+        assert 0.0 <= self.dropout < 1.0, "dropout must be in [0, 1)"
+
+    @property
+    def d_head(self) -> int:
+        """Dimension per attention head."""
+        return self.d_model // self.n_heads
+
+    def to_dict(self) -> dict:
+        """Convert to dictionary."""
+        return {
+            "vocab_size": self.vocab_size,
+            "d_model": self.d_model,
+            "n_layers": self.n_layers,
+            "n_heads": self.n_heads,
+            "d_ff": self.d_ff,
+            "dropout": self.dropout,
+            "max_seq_len": self.max_seq_len,
+            "pad_token_id": self.pad_token_id,
+            "unk_token_id": self.unk_token_id,
+            "bos_token_id": self.bos_token_id,
+            "eos_token_id": self.eos_token_id,
+        }
+
+    def save(self, path: Path):
+        """Save config to JSON file."""
+        with open(path, "w") as f:
+            json.dump(self.to_dict(), f, indent=2)
+
+    @classmethod
+    def from_dict(cls, config_dict: dict) -> "ModelConfig":
+        """Load from dictionary."""
+        return cls(**config_dict)
+
+    @classmethod
+    def from_file(cls, path: Path) -> "ModelConfig":
+        """Load from JSON file."""
+        with open(path, "r") as f:
+            config_dict = json.load(f)
+        return cls.from_dict(config_dict)
+
+
+# Predefined configurations for different use cases
+
+
+def get_tiny_config() -> ModelConfig:
+    """
+    Tiny model for quick testing and debugging.
+    ~2M parameters, very fast training.
+    """
+    return ModelConfig(
+        vocab_size=8000, d_model=128, n_layers=2, n_heads=4, d_ff=512, dropout=0.1, max_seq_len=512
+    )
+
+
+def get_small_config() -> ModelConfig:
+    """
+    Small model for GTX 1080 (8GB VRAM).
+    ~10M parameters, good balance of speed and capacity.
+    """
+    return ModelConfig(
+        vocab_size=8000, d_model=256, n_layers=4, n_heads=4, d_ff=1024, dropout=0.1, max_seq_len=512
+    )
+
+
+def get_medium_config() -> ModelConfig:
+    """
+    Medium model for GPUs with more memory.
+    ~40M parameters, better quality but slower.
+    """
+    return ModelConfig(
+        vocab_size=8000, d_model=512, n_layers=6, n_heads=8, d_ff=2048, dropout=0.1, max_seq_len=512
+    )
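
A brief usage sketch for the config round trip (editorial, not part of the wheel), relying only on the methods defined above:

from pathlib import Path
from gptmed.model.configs.model_config import ModelConfig, get_small_config

config = get_small_config()
print(config.d_head)                      # 256 // 4 = 64 dimensions per attention head

# save() writes JSON; from_file() rebuilds an equal dataclass instance.
path = Path("model_config.json")
config.save(path)
restored = ModelConfig.from_file(path)
assert restored == config

# Validation happens in __post_init__, so bad combinations fail fast.
try:
    ModelConfig(d_model=250, n_heads=4)   # 250 is not divisible by 4
except AssertionError as err:
    print(err)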
gptmed/tokenizer/__init__.py
@@ -0,0 +1,7 @@
+"""
+Tokenizer module for MedLLM
+"""
+
+from gptmed.tokenizer.train_tokenizer import train_sentencepiece_tokenizer
+
+__all__ = ["train_sentencepiece_tokenizer"]