gptmed 0.0.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gptmed/__init__.py CHANGED
@@ -1,21 +1,32 @@
1
1
  """
2
- llm-med: A lightweight medical question-answering language model
2
+ GptMed: A lightweight GPT-based language model framework
3
3
 
4
- This package provides a GPT-based transformer architecture trained on the MedQuAD dataset
5
- for medical domain question answering.
4
+ A domain-agnostic framework for training custom question-answering models.
5
+ Train your own GPT model on any Q&A dataset - medical, technical support,
6
+ education, or any other domain.
6
7
 
7
- Main Components:
8
- - model: GPT transformer architecture
9
- - inference: Text generation and sampling
10
- - training: Training loop and utilities
11
- - tokenizer: SentencePiece tokenizer
12
- - configs: Configuration management
13
- - utils: Utility functions
8
+ Quick Start:
9
+ >>> import gptmed
10
+ >>>
11
+ >>> # 1. Create a config file
12
+ >>> gptmed.create_config('my_config.yaml')
13
+ >>>
14
+ >>> # 2. Edit my_config.yaml with your settings
15
+ >>>
16
+ >>> # 3. Train your model
17
+ >>> results = gptmed.train_from_config('my_config.yaml')
18
+ >>>
19
+ >>> # 4. Generate answers
20
+ >>> answer = gptmed.generate(
21
+ ... checkpoint=results['best_checkpoint'],
22
+ ... tokenizer='tokenizer/my_tokenizer.model',
23
+ ... prompt='Your question here?'
24
+ ... )
14
25
 
15
- Example:
16
- >>> from llm_med.model.architecture import GPTTransformer
17
- >>> from llm_med.model.configs.model_config import get_small_config
18
- >>> from llm_med.inference.generator import TextGenerator
26
+ Advanced Usage:
27
+ >>> from gptmed.model.architecture import GPTTransformer
28
+ >>> from gptmed.model.configs.model_config import get_small_config
29
+ >>> from gptmed.inference.generator import TextGenerator
19
30
  >>>
20
31
  >>> config = get_small_config()
21
32
  >>> model = GPTTransformer(config)
@@ -25,11 +36,23 @@ __version__ = "0.2.0"
25
36
  __author__ = "Sanjog Sigdel"
26
37
  __email__ = "sigdelsanjog@gmail.com"
27
38
 
39
+ # High-level API - Main user interface
40
+ from gptmed.api import (
41
+ create_config,
42
+ train_from_config,
43
+ generate,
44
+ )
45
+
28
46
  # Expose main components at package level for convenience
29
- from llm_med.model.architecture import GPTTransformer
30
- from llm_med.model.configs.model_config import ModelConfig, get_small_config, get_tiny_config
47
+ from gptmed.model.architecture import GPTTransformer
48
+ from gptmed.model.configs.model_config import ModelConfig, get_small_config, get_tiny_config
31
49
 
32
50
  __all__ = [
51
+ # Simple API
52
+ "create_config",
53
+ "train_from_config",
54
+ "generate",
55
+ # Advanced API
33
56
  "GPTTransformer",
34
57
  "ModelConfig",
35
58
  "get_small_config",
gptmed/api.py ADDED
@@ -0,0 +1,352 @@
1
+ """
2
+ High-Level API for GptMed
3
+
4
+ Simple, user-friendly functions to train and use GPT models.
5
+ This is the main interface users should use.
6
+
7
+ Example:
8
+ >>> import gptmed
9
+ >>>
10
+ >>> # Create a config file
11
+ >>> gptmed.create_config('my_config.yaml')
12
+ >>>
13
+ >>> # Edit my_config.yaml with your settings
14
+ >>>
15
+ >>> # Train the model
16
+ >>> gptmed.train_from_config('my_config.yaml')
17
+ >>>
18
+ >>> # Generate text
19
+ >>> answer = gptmed.generate(
20
+ ... checkpoint='model/checkpoints/best_model.pt',
21
+ ... prompt='Your question?',
22
+ ... tokenizer='tokenizer/my_tokenizer.model'
23
+ ... )
24
+ """
25
+
26
+ import torch
27
+ from pathlib import Path
28
+ from typing import Optional, Dict, Any
29
+
30
+ from gptmed.configs.config_loader import (
31
+ load_yaml_config,
32
+ validate_config,
33
+ config_to_args,
34
+ create_default_config_file
35
+ )
36
+ from gptmed.model.architecture import GPTTransformer
37
+ from gptmed.model.configs.model_config import get_tiny_config, get_small_config, get_medium_config
38
+ from gptmed.configs.train_config import TrainingConfig
39
+ from gptmed.training.dataset import create_dataloaders
40
+ from gptmed.training.trainer import Trainer
41
+ from gptmed.inference.generator import TextGenerator
42
+
43
+
44
+ def create_config(output_path: str = 'training_config.yaml') -> None:
45
+ """
46
+ Create a default training configuration file.
47
+
48
+ This creates a YAML file that you can edit with your training settings.
49
+
50
+ Args:
51
+ output_path: Where to save the config file (default: 'training_config.yaml')
52
+
53
+ Example:
54
+ >>> import gptmed
55
+ >>> gptmed.create_config('my_training_config.yaml')
56
+ >>> # Now edit my_training_config.yaml with your settings
57
+ """
58
+ create_default_config_file(output_path)
59
+
60
+
61
+ def train_from_config(config_path: str, verbose: bool = True) -> Dict[str, Any]:
62
+ """
63
+ Train a GPT model using a YAML configuration file.
64
+
65
+ This is the simplest way to train a model. Just create a config file
66
+ with create_config(), edit it with your settings, and pass it here.
67
+
68
+ Args:
69
+ config_path: Path to YAML configuration file
70
+ verbose: Whether to print training progress (default: True)
71
+
72
+ Returns:
73
+ Dictionary with training results:
74
+ - best_checkpoint: Path to best model checkpoint
75
+ - final_val_loss: Final validation loss
76
+ - total_epochs: Number of epochs trained
77
+
78
+ Example:
79
+ >>> import gptmed
80
+ >>>
81
+ >>> # Create and edit config file
82
+ >>> gptmed.create_config('config.yaml')
83
+ >>> # ... edit config.yaml ...
84
+ >>>
85
+ >>> # Train the model
86
+ >>> results = gptmed.train_from_config('config.yaml')
87
+ >>> print(f"Best model: {results['best_checkpoint']}")
88
+
89
+ Raises:
90
+ FileNotFoundError: If config file or data files don't exist
91
+ ValueError: If configuration is invalid
92
+ """
93
+ if verbose:
94
+ print("=" * 60)
95
+ print("GptMed Training from Configuration File")
96
+ print("=" * 60)
97
+
98
+ # Load and validate config
99
+ if verbose:
100
+ print(f"\n📄 Loading configuration from: {config_path}")
101
+ config = load_yaml_config(config_path)
102
+
103
+ if verbose:
104
+ print("✓ Configuration loaded")
105
+ print("\n🔍 Validating configuration...")
106
+ validate_config(config)
107
+
108
+ if verbose:
109
+ print("✓ Configuration valid")
110
+
111
+ # Convert to arguments
112
+ args = config_to_args(config)
113
+
114
+ # Import here to avoid circular imports
115
+ import random
116
+ import numpy as np
117
+
118
+ # Set random seed
119
+ def set_seed(seed: int):
120
+ random.seed(seed)
121
+ np.random.seed(seed)
122
+ torch.manual_seed(seed)
123
+ if torch.cuda.is_available():
124
+ torch.cuda.manual_seed(seed)
125
+ torch.cuda.manual_seed_all(seed)
126
+ torch.backends.cudnn.deterministic = True
127
+ torch.backends.cudnn.benchmark = False
128
+
129
+ if verbose:
130
+ print(f"\n🎲 Setting random seed: {args['seed']}")
131
+ set_seed(args['seed'])
132
+
133
+ # Check device
134
+ device = args['device']
135
+ if device == 'cuda' and not torch.cuda.is_available():
136
+ if verbose:
137
+ print("⚠️ CUDA not available, using CPU")
138
+ device = 'cpu'
139
+
140
+ # Load model config
141
+ if verbose:
142
+ print(f"\n🧠 Creating model: {args['model_size']}")
143
+
144
+ if args['model_size'] == 'tiny':
145
+ model_config = get_tiny_config()
146
+ elif args['model_size'] == 'small':
147
+ model_config = get_small_config()
148
+ elif args['model_size'] == 'medium':
149
+ model_config = get_medium_config()
150
+ else:
151
+ raise ValueError(f"Unknown model size: {args['model_size']}")
152
+
153
+ # Create model
154
+ model = GPTTransformer(model_config)
155
+ total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
156
+
157
+ if verbose:
158
+ print(f" Model size: {args['model_size']}")
159
+ print(f" Parameters: {total_params:,}")
160
+ print(f" Memory: ~{total_params * 4 / 1024 / 1024:.2f} MB")
161
+
162
+ # Load data
163
+ if verbose:
164
+ print(f"\n📊 Loading data...")
165
+ print(f" Train: {args['train_data']}")
166
+ print(f" Val: {args['val_data']}")
167
+
168
+ train_loader, val_loader = create_dataloaders(
169
+ train_path=Path(args['train_data']),
170
+ val_path=Path(args['val_data']),
171
+ batch_size=args['batch_size'],
172
+ num_workers=0,
173
+ )
174
+
175
+ if verbose:
176
+ print(f" Train batches: {len(train_loader)}")
177
+ print(f" Val batches: {len(val_loader)}")
178
+
179
+ # Create training config
180
+ train_config = TrainingConfig(
181
+ batch_size=args['batch_size'],
182
+ learning_rate=args['learning_rate'],
183
+ num_epochs=args['num_epochs'],
184
+ warmup_steps=args['warmup_steps'],
185
+ grad_clip=args['grad_clip'],
186
+ weight_decay=args['weight_decay'],
187
+ betas=args['betas'],
188
+ eps=args['eps'],
189
+ max_steps=args['max_steps'],
190
+ save_every=args['save_every'],
191
+ eval_every=args['eval_every'],
192
+ log_every=args['log_every'],
193
+ keep_last_n=args['keep_last_n'],
194
+ train_data_path=args['train_data'],
195
+ val_data_path=args['val_data'],
196
+ checkpoint_dir=args['checkpoint_dir'],
197
+ log_dir=args['log_dir'],
198
+ device=device,
199
+ seed=args['seed'],
200
+ )
201
+
202
+ # Create optimizer
203
+ if verbose:
204
+ print(f"\n⚙️ Setting up optimizer...")
205
+ print(f" Learning rate: {args['learning_rate']}")
206
+ print(f" Weight decay: {args['weight_decay']}")
207
+
208
+ optimizer = torch.optim.AdamW(
209
+ model.parameters(),
210
+ lr=args['learning_rate'],
211
+ betas=args['betas'],
212
+ eps=args['eps'],
213
+ weight_decay=args['weight_decay'],
214
+ )
215
+
216
+ # Create trainer
217
+ if verbose:
218
+ print(f"\n🎯 Initializing trainer...")
219
+
220
+ trainer = Trainer(
221
+ model=model,
222
+ train_loader=train_loader,
223
+ val_loader=val_loader,
224
+ optimizer=optimizer,
225
+ config=train_config,
226
+ device=device,
227
+ )
228
+
229
+ # Resume if requested
230
+ if args['resume_from'] is not None:
231
+ if verbose:
232
+ print(f"\n📥 Resuming from checkpoint: {args['resume_from']}")
233
+ trainer.resume_from_checkpoint(Path(args['resume_from']))
234
+
235
+ # Start training
236
+ if verbose:
237
+ print(f"\n{'='*60}")
238
+ print("🚀 Starting Training!")
239
+ print(f"{'='*60}\n")
240
+
241
+ try:
242
+ trainer.train()
243
+ except KeyboardInterrupt:
244
+ if verbose:
245
+ print("\n\n⏸️ Training interrupted by user")
246
+ print("💾 Saving checkpoint...")
247
+ trainer.checkpoint_manager.save_checkpoint(
248
+ model=model,
249
+ optimizer=optimizer,
250
+ step=trainer.global_step,
251
+ epoch=trainer.current_epoch,
252
+ val_loss=trainer.best_val_loss,
253
+ model_config=model_config.to_dict(),
254
+ train_config=train_config.to_dict(),
255
+ )
256
+ if verbose:
257
+ print("✓ Checkpoint saved. Resume with resume_from in config.")
258
+
259
+ # Return results
260
+ best_checkpoint = Path(train_config.checkpoint_dir) / "best_model.pt"
261
+
262
+ results = {
263
+ 'best_checkpoint': str(best_checkpoint),
264
+ 'final_val_loss': trainer.best_val_loss,
265
+ 'total_epochs': trainer.current_epoch,
266
+ 'checkpoint_dir': train_config.checkpoint_dir,
267
+ 'log_dir': train_config.log_dir,
268
+ }
269
+
270
+ if verbose:
271
+ print(f"\n{'='*60}")
272
+ print("✅ Training Complete!")
273
+ print(f"{'='*60}")
274
+ print(f"\n📁 Results:")
275
+ print(f" Best checkpoint: {results['best_checkpoint']}")
276
+ print(f" Best val loss: {results['final_val_loss']:.4f}")
277
+ print(f" Total epochs: {results['total_epochs']}")
278
+ print(f" Logs: {results['log_dir']}")
279
+
280
+ return results
281
+
282
+
283
+ def generate(
284
+ checkpoint: str,
285
+ tokenizer: str,
286
+ prompt: str,
287
+ max_length: int = 100,
288
+ temperature: float = 0.7,
289
+ top_k: int = 50,
290
+ top_p: float = 0.9,
291
+ device: str = "cuda"
292
+ ) -> str:
293
+ """
294
+ Generate text using a trained model.
295
+
296
+ Args:
297
+ checkpoint: Path to model checkpoint (.pt file)
298
+ tokenizer: Path to tokenizer model (.model file)
299
+ prompt: Input text/question
300
+ max_length: Maximum tokens to generate
301
+ temperature: Sampling temperature (higher = more random)
302
+ top_k: Top-k sampling parameter
303
+ top_p: Nucleus sampling parameter
304
+ device: Device to use ('cuda' or 'cpu')
305
+
306
+ Returns:
307
+ Generated text
308
+
309
+ Example:
310
+ >>> import gptmed
311
+ >>>
312
+ >>> answer = gptmed.generate(
313
+ ... checkpoint='model/checkpoints/best_model.pt',
314
+ ... tokenizer='tokenizer/my_tokenizer.model',
315
+ ... prompt='What is machine learning?',
316
+ ... max_length=150,
317
+ ... temperature=0.7
318
+ ... )
319
+ >>> print(answer)
320
+ """
321
+ # Load checkpoint
322
+ checkpoint_path = Path(checkpoint)
323
+ if not checkpoint_path.exists():
324
+ raise FileNotFoundError(f"Checkpoint not found: {checkpoint}")
325
+
326
+ checkpoint_data = torch.load(checkpoint_path, map_location=device)
327
+
328
+ # Load model config
329
+ from gptmed.model.configs.model_config import ModelConfig
330
+ model_config = ModelConfig.from_dict(checkpoint_data['model_config'])
331
+
332
+ # Create and load model
333
+ model = GPTTransformer(model_config)
334
+ model.load_state_dict(checkpoint_data['model_state_dict'])
335
+
336
+ # Create generator
337
+ generator = TextGenerator(
338
+ model=model,
339
+ tokenizer_path=tokenizer,
340
+ device=device
341
+ )
342
+
343
+ # Generate
344
+ output = generator.generate(
345
+ prompt=prompt,
346
+ max_length=max_length,
347
+ temperature=temperature,
348
+ top_k=top_k,
349
+ top_p=top_p
350
+ )
351
+
352
+ return output
@@ -0,0 +1,191 @@
1
+ """
2
+ Configuration File Loader
3
+
4
+ Load training configuration from YAML file for easy user customization.
5
+ """
6
+
7
+ import yaml
8
+ from pathlib import Path
9
+ from typing import Dict, Any, Optional
10
+ from dataclasses import dataclass
11
+
12
+
13
+ def load_yaml_config(config_path: str) -> Dict[str, Any]:
14
+ """
15
+ Load configuration from YAML file.
16
+
17
+ Args:
18
+ config_path: Path to YAML configuration file
19
+
20
+ Returns:
21
+ Dictionary with configuration parameters
22
+
23
+ Raises:
24
+ FileNotFoundError: If config file doesn't exist
25
+ yaml.YAMLError: If YAML parsing fails
26
+ """
27
+ config_path = Path(config_path)
28
+
29
+ if not config_path.exists():
30
+ raise FileNotFoundError(f"Configuration file not found: {config_path}")
31
+
32
+ with open(config_path, 'r') as f:
33
+ try:
34
+ config = yaml.safe_load(f)
35
+ except yaml.YAMLError as e:
36
+ raise ValueError(f"Error parsing YAML configuration: {e}")
37
+
38
+ return config
39
+
40
+
41
+ def validate_config(config: Dict[str, Any]) -> None:
42
+ """
43
+ Validate configuration parameters.
44
+
45
+ Args:
46
+ config: Configuration dictionary
47
+
48
+ Raises:
49
+ ValueError: If configuration is invalid
50
+ """
51
+ # Check required sections
52
+ required_sections = ['model', 'data', 'training']
53
+ for section in required_sections:
54
+ if section not in config:
55
+ raise ValueError(f"Missing required section: {section}")
56
+
57
+ # Validate model size
58
+ valid_sizes = ['tiny', 'small', 'medium']
59
+ if config['model']['size'] not in valid_sizes:
60
+ raise ValueError(f"Invalid model size: {config['model']['size']}. "
61
+ f"Must be one of {valid_sizes}")
62
+
63
+ # Validate data paths
64
+ train_path = Path(config['data']['train_data'])
65
+ val_path = Path(config['data']['val_data'])
66
+
67
+ if not train_path.exists():
68
+ raise FileNotFoundError(f"Training data not found: {train_path}")
69
+ if not val_path.exists():
70
+ raise FileNotFoundError(f"Validation data not found: {val_path}")
71
+
72
+ # Validate training parameters
73
+ if config['training']['num_epochs'] <= 0:
74
+ raise ValueError("num_epochs must be positive")
75
+ if config['training']['batch_size'] <= 0:
76
+ raise ValueError("batch_size must be positive")
77
+ if config['training']['learning_rate'] <= 0:
78
+ raise ValueError("learning_rate must be positive")
79
+
80
+
81
+ def config_to_args(config: Dict[str, Any]) -> Dict[str, Any]:
82
+ """
83
+ Convert YAML config to training arguments.
84
+
85
+ Args:
86
+ config: Configuration dictionary from YAML
87
+
88
+ Returns:
89
+ Flattened dictionary suitable for training
90
+ """
91
+ args = {
92
+ # Model
93
+ 'model_size': config['model']['size'],
94
+
95
+ # Data
96
+ 'train_data': config['data']['train_data'],
97
+ 'val_data': config['data']['val_data'],
98
+
99
+ # Training
100
+ 'num_epochs': config['training']['num_epochs'],
101
+ 'batch_size': config['training']['batch_size'],
102
+ 'learning_rate': config['training']['learning_rate'],
103
+ 'weight_decay': config['training']['weight_decay'],
104
+ 'grad_clip': config['training']['grad_clip'],
105
+ 'warmup_steps': config['training']['warmup_steps'],
106
+
107
+ # Optimizer
108
+ 'betas': tuple(config['optimizer']['betas']),
109
+ 'eps': config['optimizer']['eps'],
110
+
111
+ # Checkpointing
112
+ 'checkpoint_dir': config['checkpointing']['checkpoint_dir'],
113
+ 'save_every': config['checkpointing']['save_every'],
114
+ 'keep_last_n': config['checkpointing']['keep_last_n'],
115
+
116
+ # Logging
117
+ 'log_dir': config['logging']['log_dir'],
118
+ 'eval_every': config['logging']['eval_every'],
119
+ 'log_every': config['logging']['log_every'],
120
+
121
+ # Device
122
+ 'device': config['device']['device'],
123
+ 'seed': config['device']['seed'],
124
+
125
+ # Advanced
126
+ 'max_steps': config.get('advanced', {}).get('max_steps', -1),
127
+ 'resume_from': config.get('advanced', {}).get('resume_from'),
128
+ 'quick_test': config.get('advanced', {}).get('quick_test', False),
129
+ }
130
+
131
+ return args
132
+
133
+
134
+ def create_default_config_file(output_path: str = 'training_config.yaml') -> None:
135
+ """
136
+ Create a default configuration file template.
137
+
138
+ Args:
139
+ output_path: Path where to save the config file
140
+ """
141
+ default_config = {
142
+ 'model': {
143
+ 'size': 'small'
144
+ },
145
+ 'data': {
146
+ 'train_data': './data/tokenized/train.npy',
147
+ 'val_data': './data/tokenized/val.npy'
148
+ },
149
+ 'training': {
150
+ 'num_epochs': 10,
151
+ 'batch_size': 16,
152
+ 'learning_rate': 0.0003,
153
+ 'weight_decay': 0.01,
154
+ 'grad_clip': 1.0,
155
+ 'warmup_steps': 100
156
+ },
157
+ 'optimizer': {
158
+ 'betas': [0.9, 0.95],
159
+ 'eps': 1.0e-8
160
+ },
161
+ 'checkpointing': {
162
+ 'checkpoint_dir': './model/checkpoints',
163
+ 'save_every': 1,
164
+ 'keep_last_n': 3
165
+ },
166
+ 'logging': {
167
+ 'log_dir': './logs',
168
+ 'eval_every': 100,
169
+ 'log_every': 10
170
+ },
171
+ 'device': {
172
+ 'device': 'cuda',
173
+ 'seed': 42
174
+ },
175
+ 'advanced': {
176
+ 'max_steps': -1,
177
+ 'resume_from': None,
178
+ 'quick_test': False
179
+ }
180
+ }
181
+
182
+ output_path = Path(output_path)
183
+
184
+ # Create directory if it doesn't exist
185
+ output_path.parent.mkdir(parents=True, exist_ok=True)
186
+
187
+ with open(output_path, 'w') as f:
188
+ yaml.dump(default_config, f, default_flow_style=False, sort_keys=False)
189
+
190
+ print(f"✓ Created default configuration file: {output_path}")
191
+ print(f" Edit this file and then run: gptmed.train_from_config('{output_path}')")
@@ -0,0 +1,64 @@
1
+ # GptMed Training Configuration File
2
+ # Edit these parameters for your training needs
3
+
4
+ # ============================================================
5
+ # MODEL CONFIGURATION
6
+ # ============================================================
7
+ model:
8
+ size: small # Options: tiny, small, medium
9
+ # tiny: ~2M params (testing)
10
+ # small: ~10M params (recommended)
11
+ # medium: ~50M params (high quality)
12
+
13
+ # ============================================================
14
+ # DATA PATHS
15
+ # ============================================================
16
+ data:
17
+ train_data: ./data/tokenized/train.npy # Path to training data (.npy file)
18
+ val_data: ./data/tokenized/val.npy # Path to validation data (.npy file)
19
+
20
+ # ============================================================
21
+ # TRAINING HYPERPARAMETERS
22
+ # ============================================================
23
+ training:
24
+ num_epochs: 10 # Number of training epochs
25
+ batch_size: 16 # Batch size (reduce if OOM: 8, 4)
26
+ learning_rate: 0.0003 # Learning rate (3e-4)
27
+ weight_decay: 0.01 # Weight decay for regularization
28
+ grad_clip: 1.0 # Gradient clipping value
29
+ warmup_steps: 100 # Learning rate warmup steps
30
+
31
+ # ============================================================
32
+ # OPTIMIZER SETTINGS
33
+ # ============================================================
34
+ optimizer:
35
+ betas: [0.9, 0.95] # Adam beta parameters
36
+ eps: 1.0e-8 # Adam epsilon
37
+
38
+ # ============================================================
39
+ # CHECKPOINTING & LOGGING
40
+ # ============================================================
41
+ checkpointing:
42
+ checkpoint_dir: ./model/checkpoints # Directory to save checkpoints
43
+ save_every: 1 # Save checkpoint every N epochs
44
+ keep_last_n: 3 # Keep last N checkpoints
45
+
46
+ logging:
47
+ log_dir: ./logs # Directory for training logs
48
+ eval_every: 100 # Evaluate every N steps
49
+ log_every: 10 # Log metrics every N steps
50
+
51
+ # ============================================================
52
+ # DEVICE & PERFORMANCE
53
+ # ============================================================
54
+ device:
55
+ device: cuda # Options: cuda, cpu
56
+ seed: 42 # Random seed for reproducibility
57
+
58
+ # ============================================================
59
+ # ADVANCED OPTIONS (optional)
60
+ # ============================================================
61
+ advanced:
62
+ max_steps: -1 # Max training steps (-1 = use num_epochs)
63
+ resume_from: null # Path to checkpoint to resume from (null = start fresh)
64
+ quick_test: false # Use quick test config for debugging
@@ -38,11 +38,11 @@ import sentencepiece as spm
38
38
  from pathlib import Path
39
39
  from typing import List, Optional
40
40
 
41
- from llm_med.model.architecture import GPTTransformer
42
- from llm_med.model.configs.model_config import ModelConfig
43
- from llm_med.inference.generation_config import GenerationConfig
44
- from llm_med.inference.sampling import sample_next_token
45
- from llm_med.inference.decoding_utils import (
41
+ from gptmed.model.architecture import GPTTransformer
42
+ from gptmed.model.configs.model_config import ModelConfig
43
+ from gptmed.inference.generation_config import GenerationConfig
44
+ from gptmed.inference.sampling import sample_next_token
45
+ from gptmed.inference.decoding_utils import (
46
46
  apply_repetition_penalty,
47
47
  block_ngram_repeats,
48
48
  should_stop_generation,
gptmed/model/__init__.py CHANGED
@@ -4,6 +4,6 @@ MedLLM Model Package
4
4
  This package contains the GPT-based transformer architecture for medical QA.
5
5
  """
6
6
 
7
- from llm_med.model.architecture import GPTTransformer
7
+ from gptmed.model.architecture import GPTTransformer
8
8
 
9
9
  __all__ = ["GPTTransformer"]
@@ -2,7 +2,7 @@
2
2
  Model configuration module
3
3
  """
4
4
 
5
- from llm_med.model.configs.model_config import (
5
+ from gptmed.model.configs.model_config import (
6
6
  ModelConfig,
7
7
  get_tiny_config,
8
8
  get_small_config,
@@ -2,6 +2,6 @@
2
2
  Tokenizer module for MedLLM
3
3
  """
4
4
 
5
- from llm_med.tokenizer.train_tokenizer import train_sentencepiece_tokenizer
5
+ from gptmed.tokenizer.train_tokenizer import train_sentencepiece_tokenizer
6
6
 
7
7
  __all__ = ["train_sentencepiece_tokenizer"]
gptmed/training/train.py CHANGED
@@ -46,11 +46,11 @@ import sys
46
46
  # Add parent directory to path for imports
47
47
  sys.path.insert(0, str(Path(__file__).parent.parent))
48
48
 
49
- from llm_med.model.architecture import GPTTransformer
50
- from llm_med.model.configs.model_config import get_small_config, get_tiny_config
51
- from llm_med.configs.train_config import get_default_config, get_quick_test_config
52
- from llm_med.training.dataset import create_dataloaders
53
- from llm_med.training.trainer import Trainer
49
+ from gptmed.model.architecture import GPTTransformer
50
+ from gptmed.model.configs.model_config import get_small_config, get_tiny_config
51
+ from gptmed.configs.train_config import get_default_config, get_quick_test_config
52
+ from gptmed.training.dataset import create_dataloaders
53
+ from gptmed.training.trainer import Trainer
54
54
 
55
55
 
56
56
  def set_seed(seed: int):
@@ -83,7 +83,7 @@ def count_parameters(model):
83
83
 
84
84
 
85
85
  def main():
86
- parser = argparse.ArgumentParser(description="Train GPT model on MedQuAD")
86
+ parser = argparse.ArgumentParser(description="Train your GPT model")
87
87
 
88
88
  # Model config
89
89
  parser.add_argument(
@@ -136,7 +136,7 @@ def main():
136
136
  args = parser.parse_args()
137
137
 
138
138
  print("=" * 60)
139
- print("GPT Training - MedQuAD")
139
+ print("GPT Training Script")
140
140
  print("=" * 60)
141
141
 
142
142
  # Check CUDA availability
@@ -170,7 +170,6 @@ def main():
170
170
  print("Using quick test config (fast debugging)")
171
171
  else:
172
172
  train_config = get_default_config()
173
-
174
173
  # Override with command-line args
175
174
  if args.batch_size is not None:
176
175
  train_config.batch_size = args.batch_size
@@ -51,16 +51,16 @@ import time
51
51
  from pathlib import Path
52
52
  from typing import Optional
53
53
 
54
- from llm_med.model.architecture import GPTTransformer
55
- from llm_med.training.utils import (
54
+ from gptmed.model.architecture import GPTTransformer
55
+ from gptmed.training.utils import (
56
56
  clip_grad_norm,
57
57
  get_lr_with_warmup,
58
58
  set_learning_rate,
59
59
  estimate_loss_dataloader,
60
60
  compute_perplexity,
61
61
  )
62
- from llm_med.utils.logging import MetricsLogger, log_training_step, log_validation
63
- from llm_med.utils.checkpoints import CheckpointManager
62
+ from gptmed.utils.logging import MetricsLogger, log_training_step, log_validation
63
+ from gptmed.utils.checkpoints import CheckpointManager
64
64
 
65
65
 
66
66
  class Trainer:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gptmed
3
- Version: 0.0.1
3
+ Version: 0.1.2
4
4
  Summary: A lightweight GPT-based language model framework for training custom question-answering models on any domain
5
5
  Author-email: Sanjog Sigdel <sigdelsanjog@gmail.com>
6
6
  Maintainer-email: Sanjog Sigdel <sigdelsanjog@gmail.com>
@@ -51,6 +51,7 @@ Requires-Dist: torch>=2.0.0
51
51
  Requires-Dist: sentencepiece>=0.1.99
52
52
  Requires-Dist: numpy>=1.24.0
53
53
  Requires-Dist: tqdm>=4.65.0
54
+ Requires-Dist: pyyaml>=6.0
54
55
  Provides-Extra: dev
55
56
  Requires-Dist: pytest>=7.0.0; extra == "dev"
56
57
  Requires-Dist: black>=22.0.0; extra == "dev"
@@ -69,6 +70,10 @@ A lightweight GPT-based language model framework for training custom question-an
69
70
  [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
70
71
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
71
72
 
73
+ ## 📖 [Complete User Manual](USER_MANUAL.md) | [Quick Start](#quick-start)
74
+
75
+ > **New to GptMed?** Check out the [**step-by-step User Manual**](USER_MANUAL.md) for a complete guide on training your own model!
76
+
72
77
  ## Features
73
78
 
74
79
  - 🧠 **Custom GPT Architecture**: Lightweight transformer model for any Q&A domain
@@ -78,6 +83,27 @@ A lightweight GPT-based language model framework for training custom question-an
78
83
  - 📦 **Lightweight**: Small model size suitable for edge deployment
79
84
  - 🛠️ **Complete Toolkit**: Includes tokenizer training, model training, and inference utilities
80
85
 
86
+ ## Table of Contents
87
+
88
+ - [Features](#features)
89
+ - [Installation](#installation)
90
+ - [Quick Start](#quick-start)
91
+ - [Package Structure](#package-structure)
92
+ - [Core Modules](#core-modules)
93
+ - [Model Components](#model-components)
94
+ - [Training Components](#training-components)
95
+ - [Inference Components](#inference-components)
96
+ - [Data Processing](#data-processing)
97
+ - [Utilities](#utilities)
98
+ - [Model Architecture](#model-architecture)
99
+ - [Configuration](#configuration)
100
+ - [Documentation](#documentation)
101
+ - [Performance](#performance)
102
+ - [Examples](#examples)
103
+ - [Contributing](#contributing)
104
+ - [License](#license)
105
+ - [Support](#support)
106
+
81
107
  ## Installation
82
108
 
83
109
  ### From PyPI (Recommended)
@@ -204,27 +230,134 @@ config = TrainingConfig(
204
230
  )
205
231
  ```
206
232
 
207
- ## Project Structure
233
+ ## Package Structure
234
+
235
+ ### Core Modules
236
+
237
+ The `gptmed` package contains the following main modules:
238
+
239
+ ```
240
+ gptmed/
241
+ ├── model/ # Model architecture and configurations
242
+ ├── inference/ # Text generation and sampling
243
+ ├── training/ # Training loops and datasets
244
+ ├── tokenizer/ # Tokenizer training and data processing
245
+ ├── data/ # Data parsers and formatters
246
+ ├── configs/ # Training configurations
247
+ └── utils/ # Utilities (checkpoints, logging)
248
+ ```
249
+
250
+ ### Model Components
251
+
252
+ **`gptmed.model.architecture`** - GPT Transformer Implementation
253
+
254
+ - `GPTTransformer` - Main model class
255
+ - `TransformerBlock` - Individual transformer layers
256
+ - `MultiHeadAttention` - Attention mechanism
257
+ - `FeedForward` - Feed-forward networks
258
+ - `RoPEPositionalEncoding` - Rotary position embeddings
259
+
260
+ **`gptmed.model.configs`** - Model Configurations
261
+
262
+ - `get_tiny_config()` - ~2M parameters (testing)
263
+ - `get_small_config()` - ~10M parameters (recommended)
264
+ - `get_medium_config()` - ~50M parameters (high quality)
265
+ - `ModelConfig` - Custom configuration class
266
+
267
+ ### Training Components
268
+
269
+ **`gptmed.training`** - Training Pipeline
270
+
271
+ - `train.py` - Main training script (CLI: `gptmed-train`)
272
+ - `Trainer` - Training loop with checkpointing
273
+ - `TokenizedDataset` - PyTorch dataset for tokenized data
274
+ - `create_dataloaders()` - DataLoader creation utilities
275
+
276
+ **`gptmed.configs`** - Training Configurations
277
+
278
+ - `TrainingConfig` - Training hyperparameters
279
+ - `get_default_config()` - Default training settings
280
+ - `get_quick_test_config()` - Fast testing configuration
281
+
282
+ ### Inference Components
283
+
284
+ **`gptmed.inference`** - Text Generation
285
+
286
+ - `TextGenerator` - Main generation class
287
+ - `generator.py` - CLI command (CLI: `gptmed-generate`)
288
+ - `sampling.py` - Sampling strategies (top-k, top-p, temperature)
289
+ - `decoding_utils.py` - Decoding utilities
290
+ - `GenerationConfig` - Generation parameters
291
+
292
+ ### Data Processing
293
+
294
+ **`gptmed.tokenizer`** - Tokenizer Training & Data Processing
295
+
296
+ - `train_tokenizer.py` - Train SentencePiece tokenizer
297
+ - `tokenize_data.py` - Convert text to token sequences
298
+ - SentencePiece BPE tokenizer support
299
+
300
+ **`gptmed.data.parsers`** - Data Parsing & Formatting
301
+
302
+ - `MedQuADParser` - XML Q&A parser (example)
303
+ - `CausalTextFormatter` - Format Q&A pairs for training
304
+ - `FormatConfig` - Formatting configuration
305
+
306
+ ### Utilities
307
+
308
+ **`gptmed.utils`** - Helper Functions
309
+
310
+ - `checkpoints.py` - Model checkpoint management
311
+ - `logging.py` - Training metrics logging
312
+
313
+ ---
314
+
315
+ ## Detailed Project Structure
208
316
 
209
317
  ```
210
318
  gptmed/
211
319
  ├── model/
212
- │ ├── architecture/ # GPT transformer implementation
213
- └── configs/ # Model configurations
320
+ │ ├── architecture/
321
+ │ ├── gpt.py # GPT transformer model
322
+ │ │ ├── attention.py # Multi-head attention
323
+ │ │ ├── feedforward.py # Feed-forward networks
324
+ │ │ └── embeddings.py # Token + positional embeddings
325
+ │ └── configs/
326
+ │ └── model_config.py # Model size configurations
214
327
  ├── inference/
215
- │ ├── generator.py # Text generation
216
- └── sampling.py # Sampling strategies
328
+ │ ├── generator.py # Text generation (CLI command)
329
+ │ ├── sampling.py # Sampling strategies
330
+ │ ├── decoding_utils.py # Decoding utilities
331
+ │ └── generation_config.py # Generation parameters
217
332
  ├── training/
218
- │ ├── train.py # Training script
219
- │ ├── trainer.py # Training loop
220
- └── dataset.py # Data loading
333
+ │ ├── train.py # Main training script (CLI command)
334
+ │ ├── trainer.py # Training loop
335
+ │ ├── dataset.py # PyTorch dataset
336
+ │ └── utils.py # Training utilities
221
337
  ├── tokenizer/
222
- └── train_tokenizer.py # SentencePiece tokenizer
338
+ │ ├── train_tokenizer.py # Train SentencePiece tokenizer
339
+ │ └── tokenize_data.py # Tokenize text data
340
+ ├── data/
341
+ │ └── parsers/
342
+ │ ├── medquad_parser.py # Example XML parser
343
+ │ └── text_formatter.py # Q&A text formatter
223
344
  ├── configs/
224
- │ └── train_config.py # Training configurations
345
+ │ └── train_config.py # Training configurations
225
346
  └── utils/
226
- ├── checkpoints.py # Model checkpointing
227
- └── logging.py # Training logging
347
+ ├── checkpoints.py # Model checkpointing
348
+ └── logging.py # Training logging
349
+ ```
350
+
351
+ ### Command-Line Interface
352
+
353
+ The package provides two main CLI commands:
354
+
355
+ ```bash
356
+ # Train a model
357
+ gptmed-train --model-size small --num-epochs 10 --batch-size 16
358
+
359
+ # Generate text
360
+ gptmed-generate --prompt "Your question?" --max-length 100
228
361
  ```
229
362
 
230
363
  ## Requirements
@@ -237,14 +370,14 @@ gptmed/
237
370
 
238
371
  ## Documentation
239
372
 
240
- For detailed documentation, visit [GitHub Repository](https://github.com/yourusername/medllm).
373
+ 📚 **[Complete User Manual](USER_MANUAL.md)** - Step-by-step guide for training your own model
241
374
 
242
- ### Key Guides
375
+ ### Quick Links
243
376
 
244
- - [Training Guide](docs/training.md)
245
- - [Inference Guide](docs/inference.md)
246
- - [Model Architecture](docs/architecture.md)
247
- - [API Reference](docs/api.md)
377
+ - [User Manual](USER_MANUAL.md) - **Start here!** Complete training pipeline guide
378
+ - [Architecture Guide](ARCHITECTURE_EXTENSION_GUIDE.md) - Understanding the model architecture
379
+ - [Deployment Guide](DEPLOYMENT_GUIDE.md) - Publishing to PyPI
380
+ - [Changelog](CHANGELOG.md) - Version history
248
381
 
249
382
  ## Performance
250
383
 
@@ -312,7 +445,8 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file
312
445
 
313
446
  ## Support
314
447
 
315
- - 📫 Issues: [GitHub Issues](https://github.com/sigdelsanjog/gptmed/issues)
448
+ - **[User Manual](USER_MANUAL.md)** - Complete step-by-step training guide
449
+ - �📫 Issues: [GitHub Issues](https://github.com/sigdelsanjog/gptmed/issues)
316
450
  - 💬 Discussions: [GitHub Discussions](https://github.com/sigdelsanjog/gptmed/discussions)
317
451
  - 📧 Email: sanjog.sigdel@ku.edu.np
318
452
 
@@ -1,6 +1,9 @@
1
- gptmed/__init__.py,sha256=Hj1lpVY8kVBnVZMpz2Dk_9bQM-pgSpWaim4aYPG4i08,1130
1
+ gptmed/__init__.py,sha256=RoPQCLWrTlvpaKgboeoH9OXn8BB3gmZDUFAytvL8lVU,1676
2
+ gptmed/api.py,sha256=IU5r9ujg3S-Lem5-FOGDDLdh1UJ_FqCbaQayzyJez5c,10774
2
3
  gptmed/configs/__init__.py,sha256=yRa-zgPQ-OCzu8fvCrfWMG-CjF3dru3PZzknzm0oUaQ,23
4
+ gptmed/configs/config_loader.py,sha256=hkJRN-Rh6dlzOf97yOjPCVdy-Es5YjgKo3-iOwAeXEI,5833
3
5
  gptmed/configs/train_config.py,sha256=KqfNBh9hdTTd_6gEAlrClU8sVFSlVDmZJOrf3cPwFe8,4657
6
+ gptmed/configs/training_config.yaml,sha256=EEZZa3kcsZr3g-_fKDPYZt4_NTpmS-3NvJrTYSWNc8g,2874
4
7
  gptmed/data/__init__.py,sha256=iAHeakB5pBAd7MkmarPPY0UKS9bTaO_winLZ23Y2O90,54
5
8
  gptmed/data/parsers/__init__.py,sha256=BgVzXuZgeE5DUCC4SzN7vflL40wQ4Q4_4DmJ1Y43_nw,211
6
9
  gptmed/data/parsers/medquad_parser.py,sha256=g3QCRiVBdcq8RdyuYH_qKFrHgU5KkHY59WfWxUwspP0,7974
@@ -8,31 +11,31 @@ gptmed/data/parsers/text_formatter.py,sha256=tVmnDBT54BbxX9BPKMXSPzzLmM39frDxKRK
8
11
  gptmed/inference/__init__.py,sha256=NDPViXhOgpItC8n13T9axX4UH1E7mrjt6kJ5OfIwvMs,25
9
12
  gptmed/inference/decoding_utils.py,sha256=zTDZYdl2jcGwSrcINXMw-5uoYuF4A9TSushhPxJi1o0,5041
10
13
  gptmed/inference/generation_config.py,sha256=hpPyZUk1K6qGSBAoQx3Jm0_ZrrYld77ACxbIlCCCcVU,2813
11
- gptmed/inference/generator.py,sha256=ZVd4sPSr6l4ov3AhNJib8YDNBERrs39-zdFZumluVnI,7889
14
+ gptmed/inference/generator.py,sha256=6JFmDPQF4btau_Gp5pfk8a5G0Iyg6QsB9Y8Oo4ygH-4,7884
12
15
  gptmed/inference/sampling.py,sha256=B6fRlJafypuBMKJ0rTbsk6k8KXloXiIvroi7rN6ekBA,7947
13
- gptmed/model/__init__.py,sha256=V44r-TSMaZObgHgeXRl2Ec9pkqWUkUVAf37xgtlZQO8,192
16
+ gptmed/model/__init__.py,sha256=brAE8ZhCDeQlU13YSqiBZTrTE8SHL_3bvFhZMzZnh3A,191
14
17
  gptmed/model/architecture/__init__.py,sha256=9MpSAYwwZY-t1vBLIupuRtLD7CaOLJRENMh3zKx3M-4,970
15
18
  gptmed/model/architecture/attention.py,sha256=Qk1eGl9glKWQbhcXJWmFkO5U3VHBq7OrsjVG0tPmgnY,6420
16
19
  gptmed/model/architecture/decoder_block.py,sha256=n-Uo09TDcirKeWTWTNumldGOrx-b2Elb25lbF6cTYwg,3879
17
20
  gptmed/model/architecture/embeddings.py,sha256=GoVXctC21MsNwyoIiOq7NX-v_DzYkbFcQAfvZ2fg66s,4717
18
21
  gptmed/model/architecture/feedforward.py,sha256=uJ5QOlWX0ritKDQLUE7GPmMojelR9-sTI_BaYc4Ehfw,3232
19
22
  gptmed/model/architecture/transformer.py,sha256=H1njPoy0Uam59JbA24C0olEDwPfhh3ev4HsUFRIC_0Y,6626
20
- gptmed/model/configs/__init__.py,sha256=0ZfBO0k4yFaqh2yO7eVSQUZjHKp-Esjpdn4m6zwhLig,276
23
+ gptmed/model/configs/__init__.py,sha256=LDCWhlCDOU7490wcfSId_jXBPfQrtYQEw8FoD67rqBs,275
21
24
  gptmed/model/configs/model_config.py,sha256=wI-i2Dw_pTdIKCDe1pqLvP3ky3YedEy7DwZYN5lwmKE,4673
22
- gptmed/tokenizer/__init__.py,sha256=Cs6h9mtmh0hbqq1qvawRdggb7-GPKMnISOeNJFEuVqo,158
25
+ gptmed/tokenizer/__init__.py,sha256=KhLAHPmQyoWhnKDenyIJRxgFflKI7xklip28j4cKfKw,157
23
26
  gptmed/tokenizer/tokenize_data.py,sha256=KgMtMfaz_RtOhN_CrvC267k9ujxRdO89rToVJ6nzdwg,9139
24
27
  gptmed/tokenizer/train_tokenizer.py,sha256=f0Hucyft9e8LU2RtpTqg8h_0SpOC_oMABl0_me-wfL8,7068
25
28
  gptmed/training/__init__.py,sha256=6G0_gdlwBnQBG8wZlTm2NtgkXZJcXRfLMDQ2iu6O3U4,24
26
29
  gptmed/training/dataset.py,sha256=QbNVTN4Og5gqMAV2ckjRX8W_k9aUc9IZJDcu0u9U8t0,5347
27
- gptmed/training/train.py,sha256=d--RS5v8ZAWlqux74YHnX-HAmJB1WveK38VxAZ8x2Bo,8157
28
- gptmed/training/trainer.py,sha256=qOOn5oUVvqQMAbO5KWddngk1QzxdHdyTXEwXrL8uS40,10732
30
+ gptmed/training/train.py,sha256=sp4-1WpEXUTA9V0GUYAgSvMd2aaPkt1aq2PepQFLXD8,8142
31
+ gptmed/training/trainer.py,sha256=asOKT9d7lvmtEm5PIcMHg8iUdulNJpobNFNwOjdkeEg,10728
29
32
  gptmed/training/utils.py,sha256=pJxCwneNr2STITIYwIDCxRzIICDFOxOMzK8DT7ck2oQ,5651
30
33
  gptmed/utils/__init__.py,sha256=XuMhIqOXF7mjnog_6Iky-hSbwvFb0iK42B4iDUpgi0U,44
31
34
  gptmed/utils/checkpoints.py,sha256=L4q1-_4GbHCoD7QuEKYeQ-xXDTF-6sqZOxKQ_LT8YmQ,7112
32
35
  gptmed/utils/logging.py,sha256=7dJc1tayMxCBjFSDXe4r9ACUTpoPTTGsJ0UZMTqZIDY,5303
33
- gptmed-0.0.1.dist-info/licenses/LICENSE,sha256=v2spsd7N1pKFFh2G8wGP_45iwe5S0DYiJzG4im8Rupc,1066
34
- gptmed-0.0.1.dist-info/METADATA,sha256=tVtGIXe76Iq0IKrHfS0FsFVuJ1_wlLcrejQzg2N6qyA,10196
35
- gptmed-0.0.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
36
- gptmed-0.0.1.dist-info/entry_points.txt,sha256=ATqOzTtPVdUiFX5ZSeo3n9JkUCqocUxEXTgy1CfNRZE,110
37
- gptmed-0.0.1.dist-info/top_level.txt,sha256=mhyEq3rG33t21ziJz5w3TPgx0RjPf4zXMNUx2JTiNmE,7
38
- gptmed-0.0.1.dist-info/RECORD,,
36
+ gptmed-0.1.2.dist-info/licenses/LICENSE,sha256=v2spsd7N1pKFFh2G8wGP_45iwe5S0DYiJzG4im8Rupc,1066
37
+ gptmed-0.1.2.dist-info/METADATA,sha256=XEC0i9WiztPA54N0KnTJL9fBuuoAdmUhR5GeENivki8,14876
38
+ gptmed-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
39
+ gptmed-0.1.2.dist-info/entry_points.txt,sha256=ATqOzTtPVdUiFX5ZSeo3n9JkUCqocUxEXTgy1CfNRZE,110
40
+ gptmed-0.1.2.dist-info/top_level.txt,sha256=mhyEq3rG33t21ziJz5w3TPgx0RjPf4zXMNUx2JTiNmE,7
41
+ gptmed-0.1.2.dist-info/RECORD,,
File without changes