gptmed 0.0.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
- gptmed/__init__.py +39 -16
- gptmed/api.py +352 -0
- gptmed/configs/config_loader.py +191 -0
- gptmed/configs/training_config.yaml +64 -0
- gptmed/inference/generator.py +5 -5
- gptmed/model/__init__.py +1 -1
- gptmed/model/configs/__init__.py +1 -1
- gptmed/tokenizer/__init__.py +1 -1
- gptmed/training/train.py +7 -8
- gptmed/training/trainer.py +4 -4
- {gptmed-0.0.1.dist-info → gptmed-0.1.2.dist-info}/METADATA +154 -20
- {gptmed-0.0.1.dist-info → gptmed-0.1.2.dist-info}/RECORD +16 -13
- {gptmed-0.0.1.dist-info → gptmed-0.1.2.dist-info}/WHEEL +0 -0
- {gptmed-0.0.1.dist-info → gptmed-0.1.2.dist-info}/entry_points.txt +0 -0
- {gptmed-0.0.1.dist-info → gptmed-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {gptmed-0.0.1.dist-info → gptmed-0.1.2.dist-info}/top_level.txt +0 -0
gptmed/__init__.py
CHANGED

@@ -1,21 +1,32 @@
  """
- 
+ GptMed: A lightweight GPT-based language model framework
  
- 
- 
+ A domain-agnostic framework for training custom question-answering models.
+ Train your own GPT model on any Q&A dataset - medical, technical support,
+ education, or any other domain.
  
- 
- 
- 
- 
- 
- 
- 
+ Quick Start:
+     >>> import gptmed
+     >>>
+     >>> # 1. Create a config file
+     >>> gptmed.create_config('my_config.yaml')
+     >>>
+     >>> # 2. Edit my_config.yaml with your settings
+     >>>
+     >>> # 3. Train your model
+     >>> results = gptmed.train_from_config('my_config.yaml')
+     >>>
+     >>> # 4. Generate answers
+     >>> answer = gptmed.generate(
+     ...     checkpoint=results['best_checkpoint'],
+     ...     tokenizer='tokenizer/my_tokenizer.model',
+     ...     prompt='Your question here?'
+     ... )
  
- 
- >>> from
- >>> from
- >>> from
+ Advanced Usage:
+     >>> from gptmed.model.architecture import GPTTransformer
+     >>> from gptmed.model.configs.model_config import get_small_config
+     >>> from gptmed.inference.generator import TextGenerator
  >>>
  >>> config = get_small_config()
  >>> model = GPTTransformer(config)

@@ -25,11 +36,23 @@ __version__ = "0.2.0"
  __author__ = "Sanjog Sigdel"
  __email__ = "sigdelsanjog@gmail.com"
  
+ # High-level API - Main user interface
+ from gptmed.api import (
+     create_config,
+     train_from_config,
+     generate,
+ )
+ 
  # Expose main components at package level for convenience
- from
- from
+ from gptmed.model.architecture import GPTTransformer
+ from gptmed.model.configs.model_config import ModelConfig, get_small_config, get_tiny_config
  
  __all__ = [
+     # Simple API
+     "create_config",
+     "train_from_config",
+     "generate",
+     # Advanced API
      "GPTTransformer",
      "ModelConfig",
      "get_small_config",
gptmed/api.py
ADDED

@@ -0,0 +1,352 @@
+ """
+ High-Level API for GptMed
+
+ Simple, user-friendly functions to train and use GPT models.
+ This is the main interface users should use.
+
+ Example:
+     >>> import gptmed
+     >>>
+     >>> # Create a config file
+     >>> gptmed.create_config('my_config.yaml')
+     >>>
+     >>> # Edit my_config.yaml with your settings
+     >>>
+     >>> # Train the model
+     >>> gptmed.train_from_config('my_config.yaml')
+     >>>
+     >>> # Generate text
+     >>> answer = gptmed.generate(
+     ...     checkpoint='model/checkpoints/best_model.pt',
+     ...     prompt='Your question?',
+     ...     tokenizer='tokenizer/my_tokenizer.model'
+     ... )
+ """
+
+ import torch
+ from pathlib import Path
+ from typing import Optional, Dict, Any
+
+ from gptmed.configs.config_loader import (
+     load_yaml_config,
+     validate_config,
+     config_to_args,
+     create_default_config_file
+ )
+ from gptmed.model.architecture import GPTTransformer
+ from gptmed.model.configs.model_config import get_tiny_config, get_small_config, get_medium_config
+ from gptmed.configs.train_config import TrainingConfig
+ from gptmed.training.dataset import create_dataloaders
+ from gptmed.training.trainer import Trainer
+ from gptmed.inference.generator import TextGenerator
+
+
+ def create_config(output_path: str = 'training_config.yaml') -> None:
+     """
+     Create a default training configuration file.
+
+     This creates a YAML file that you can edit with your training settings.
+
+     Args:
+         output_path: Where to save the config file (default: 'training_config.yaml')
+
+     Example:
+         >>> import gptmed
+         >>> gptmed.create_config('my_training_config.yaml')
+         >>> # Now edit my_training_config.yaml with your settings
+     """
+     create_default_config_file(output_path)
+
+
+ def train_from_config(config_path: str, verbose: bool = True) -> Dict[str, Any]:
+     """
+     Train a GPT model using a YAML configuration file.
+
+     This is the simplest way to train a model. Just create a config file
+     with create_config(), edit it with your settings, and pass it here.
+
+     Args:
+         config_path: Path to YAML configuration file
+         verbose: Whether to print training progress (default: True)
+
+     Returns:
+         Dictionary with training results:
+         - best_checkpoint: Path to best model checkpoint
+         - final_val_loss: Final validation loss
+         - total_epochs: Number of epochs trained
+
+     Example:
+         >>> import gptmed
+         >>>
+         >>> # Create and edit config file
+         >>> gptmed.create_config('config.yaml')
+         >>> # ... edit config.yaml ...
+         >>>
+         >>> # Train the model
+         >>> results = gptmed.train_from_config('config.yaml')
+         >>> print(f"Best model: {results['best_checkpoint']}")
+
+     Raises:
+         FileNotFoundError: If config file or data files don't exist
+         ValueError: If configuration is invalid
+     """
+     if verbose:
+         print("=" * 60)
+         print("GptMed Training from Configuration File")
+         print("=" * 60)
+
+     # Load and validate config
+     if verbose:
+         print(f"\n📄 Loading configuration from: {config_path}")
+     config = load_yaml_config(config_path)
+
+     if verbose:
+         print("✓ Configuration loaded")
+         print("\n🔍 Validating configuration...")
+     validate_config(config)
+
+     if verbose:
+         print("✓ Configuration valid")
+
+     # Convert to arguments
+     args = config_to_args(config)
+
+     # Import here to avoid circular imports
+     import random
+     import numpy as np
+
+     # Set random seed
+     def set_seed(seed: int):
+         random.seed(seed)
+         np.random.seed(seed)
+         torch.manual_seed(seed)
+         if torch.cuda.is_available():
+             torch.cuda.manual_seed(seed)
+             torch.cuda.manual_seed_all(seed)
+         torch.backends.cudnn.deterministic = True
+         torch.backends.cudnn.benchmark = False
+
+     if verbose:
+         print(f"\n🎲 Setting random seed: {args['seed']}")
+     set_seed(args['seed'])
+
+     # Check device
+     device = args['device']
+     if device == 'cuda' and not torch.cuda.is_available():
+         if verbose:
+             print("⚠️ CUDA not available, using CPU")
+         device = 'cpu'
+
+     # Load model config
+     if verbose:
+         print(f"\n🧠 Creating model: {args['model_size']}")
+
+     if args['model_size'] == 'tiny':
+         model_config = get_tiny_config()
+     elif args['model_size'] == 'small':
+         model_config = get_small_config()
+     elif args['model_size'] == 'medium':
+         model_config = get_medium_config()
+     else:
+         raise ValueError(f"Unknown model size: {args['model_size']}")
+
+     # Create model
+     model = GPTTransformer(model_config)
+     total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+     if verbose:
+         print(f"  Model size: {args['model_size']}")
+         print(f"  Parameters: {total_params:,}")
+         print(f"  Memory: ~{total_params * 4 / 1024 / 1024:.2f} MB")
+
+     # Load data
+     if verbose:
+         print(f"\n📊 Loading data...")
+         print(f"  Train: {args['train_data']}")
+         print(f"  Val: {args['val_data']}")
+
+     train_loader, val_loader = create_dataloaders(
+         train_path=Path(args['train_data']),
+         val_path=Path(args['val_data']),
+         batch_size=args['batch_size'],
+         num_workers=0,
+     )
+
+     if verbose:
+         print(f"  Train batches: {len(train_loader)}")
+         print(f"  Val batches: {len(val_loader)}")
+
+     # Create training config
+     train_config = TrainingConfig(
+         batch_size=args['batch_size'],
+         learning_rate=args['learning_rate'],
+         num_epochs=args['num_epochs'],
+         warmup_steps=args['warmup_steps'],
+         grad_clip=args['grad_clip'],
+         weight_decay=args['weight_decay'],
+         betas=args['betas'],
+         eps=args['eps'],
+         max_steps=args['max_steps'],
+         save_every=args['save_every'],
+         eval_every=args['eval_every'],
+         log_every=args['log_every'],
+         keep_last_n=args['keep_last_n'],
+         train_data_path=args['train_data'],
+         val_data_path=args['val_data'],
+         checkpoint_dir=args['checkpoint_dir'],
+         log_dir=args['log_dir'],
+         device=device,
+         seed=args['seed'],
+     )
+
+     # Create optimizer
+     if verbose:
+         print(f"\n⚙️ Setting up optimizer...")
+         print(f"  Learning rate: {args['learning_rate']}")
+         print(f"  Weight decay: {args['weight_decay']}")
+
+     optimizer = torch.optim.AdamW(
+         model.parameters(),
+         lr=args['learning_rate'],
+         betas=args['betas'],
+         eps=args['eps'],
+         weight_decay=args['weight_decay'],
+     )
+
+     # Create trainer
+     if verbose:
+         print(f"\n🎯 Initializing trainer...")
+
+     trainer = Trainer(
+         model=model,
+         train_loader=train_loader,
+         val_loader=val_loader,
+         optimizer=optimizer,
+         config=train_config,
+         device=device,
+     )
+
+     # Resume if requested
+     if args['resume_from'] is not None:
+         if verbose:
+             print(f"\n📥 Resuming from checkpoint: {args['resume_from']}")
+         trainer.resume_from_checkpoint(Path(args['resume_from']))
+
+     # Start training
+     if verbose:
+         print(f"\n{'='*60}")
+         print("🚀 Starting Training!")
+         print(f"{'='*60}\n")
+
+     try:
+         trainer.train()
+     except KeyboardInterrupt:
+         if verbose:
+             print("\n\n⏸️ Training interrupted by user")
+             print("💾 Saving checkpoint...")
+         trainer.checkpoint_manager.save_checkpoint(
+             model=model,
+             optimizer=optimizer,
+             step=trainer.global_step,
+             epoch=trainer.current_epoch,
+             val_loss=trainer.best_val_loss,
+             model_config=model_config.to_dict(),
+             train_config=train_config.to_dict(),
+         )
+         if verbose:
+             print("✓ Checkpoint saved. Resume with resume_from in config.")
+
+     # Return results
+     best_checkpoint = Path(train_config.checkpoint_dir) / "best_model.pt"
+
+     results = {
+         'best_checkpoint': str(best_checkpoint),
+         'final_val_loss': trainer.best_val_loss,
+         'total_epochs': trainer.current_epoch,
+         'checkpoint_dir': train_config.checkpoint_dir,
+         'log_dir': train_config.log_dir,
+     }
+
+     if verbose:
+         print(f"\n{'='*60}")
+         print("✅ Training Complete!")
+         print(f"{'='*60}")
+         print(f"\n📁 Results:")
+         print(f"  Best checkpoint: {results['best_checkpoint']}")
+         print(f"  Best val loss: {results['final_val_loss']:.4f}")
+         print(f"  Total epochs: {results['total_epochs']}")
+         print(f"  Logs: {results['log_dir']}")
+
+     return results
+
+
+ def generate(
+     checkpoint: str,
+     tokenizer: str,
+     prompt: str,
+     max_length: int = 100,
+     temperature: float = 0.7,
+     top_k: int = 50,
+     top_p: float = 0.9,
+     device: str = "cuda"
+ ) -> str:
+     """
+     Generate text using a trained model.
+
+     Args:
+         checkpoint: Path to model checkpoint (.pt file)
+         tokenizer: Path to tokenizer model (.model file)
+         prompt: Input text/question
+         max_length: Maximum tokens to generate
+         temperature: Sampling temperature (higher = more random)
+         top_k: Top-k sampling parameter
+         top_p: Nucleus sampling parameter
+         device: Device to use ('cuda' or 'cpu')
+
+     Returns:
+         Generated text
+
+     Example:
+         >>> import gptmed
+         >>>
+         >>> answer = gptmed.generate(
+         ...     checkpoint='model/checkpoints/best_model.pt',
+         ...     tokenizer='tokenizer/my_tokenizer.model',
+         ...     prompt='What is machine learning?',
+         ...     max_length=150,
+         ...     temperature=0.7
+         ... )
+         >>> print(answer)
+     """
+     # Load checkpoint
+     checkpoint_path = Path(checkpoint)
+     if not checkpoint_path.exists():
+         raise FileNotFoundError(f"Checkpoint not found: {checkpoint}")
+
+     checkpoint_data = torch.load(checkpoint_path, map_location=device)
+
+     # Load model config
+     from gptmed.model.configs.model_config import ModelConfig
+     model_config = ModelConfig.from_dict(checkpoint_data['model_config'])
+
+     # Create and load model
+     model = GPTTransformer(model_config)
+     model.load_state_dict(checkpoint_data['model_state_dict'])
+
+     # Create generator
+     generator = TextGenerator(
+         model=model,
+         tokenizer_path=tokenizer,
+         device=device
+     )
+
+     # Generate
+     output = generator.generate(
+         prompt=prompt,
+         max_length=max_length,
+         temperature=temperature,
+         top_k=top_k,
+         top_p=top_p
+     )
+
+     return output
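Taken together, the three functions in api.py cover the whole 0.1.2 workflow. Below is a minimal end-to-end sketch of that flow; the tokenizer path and the prompt are placeholders, and it assumes the edited config points at existing tokenized .npy files:

```python
# Minimal sketch of the 0.1.2 high-level API (paths are placeholders).
import gptmed

# 1. Write the default YAML template, then edit data paths, model size, etc.
gptmed.create_config('my_config.yaml')

# 2. Train; returns a dict with 'best_checkpoint', 'final_val_loss', 'total_epochs', ...
results = gptmed.train_from_config('my_config.yaml', verbose=True)

# 3. Answer a question with the best checkpoint; device='cpu' works when no GPU is present.
answer = gptmed.generate(
    checkpoint=results['best_checkpoint'],
    tokenizer='tokenizer/my_tokenizer.model',  # placeholder SentencePiece model path
    prompt='What is machine learning?',
    max_length=150,
    temperature=0.7,
    device='cpu',
)
print(answer)
```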
gptmed/configs/config_loader.py
ADDED

@@ -0,0 +1,191 @@
+ """
+ Configuration File Loader
+
+ Load training configuration from YAML file for easy user customization.
+ """
+
+ import yaml
+ from pathlib import Path
+ from typing import Dict, Any, Optional
+ from dataclasses import dataclass
+
+
+ def load_yaml_config(config_path: str) -> Dict[str, Any]:
+     """
+     Load configuration from YAML file.
+
+     Args:
+         config_path: Path to YAML configuration file
+
+     Returns:
+         Dictionary with configuration parameters
+
+     Raises:
+         FileNotFoundError: If config file doesn't exist
+         yaml.YAMLError: If YAML parsing fails
+     """
+     config_path = Path(config_path)
+
+     if not config_path.exists():
+         raise FileNotFoundError(f"Configuration file not found: {config_path}")
+
+     with open(config_path, 'r') as f:
+         try:
+             config = yaml.safe_load(f)
+         except yaml.YAMLError as e:
+             raise ValueError(f"Error parsing YAML configuration: {e}")
+
+     return config
+
+
+ def validate_config(config: Dict[str, Any]) -> None:
+     """
+     Validate configuration parameters.
+
+     Args:
+         config: Configuration dictionary
+
+     Raises:
+         ValueError: If configuration is invalid
+     """
+     # Check required sections
+     required_sections = ['model', 'data', 'training']
+     for section in required_sections:
+         if section not in config:
+             raise ValueError(f"Missing required section: {section}")
+
+     # Validate model size
+     valid_sizes = ['tiny', 'small', 'medium']
+     if config['model']['size'] not in valid_sizes:
+         raise ValueError(f"Invalid model size: {config['model']['size']}. "
+                          f"Must be one of {valid_sizes}")
+
+     # Validate data paths
+     train_path = Path(config['data']['train_data'])
+     val_path = Path(config['data']['val_data'])
+
+     if not train_path.exists():
+         raise FileNotFoundError(f"Training data not found: {train_path}")
+     if not val_path.exists():
+         raise FileNotFoundError(f"Validation data not found: {val_path}")
+
+     # Validate training parameters
+     if config['training']['num_epochs'] <= 0:
+         raise ValueError("num_epochs must be positive")
+     if config['training']['batch_size'] <= 0:
+         raise ValueError("batch_size must be positive")
+     if config['training']['learning_rate'] <= 0:
+         raise ValueError("learning_rate must be positive")
+
+
+ def config_to_args(config: Dict[str, Any]) -> Dict[str, Any]:
+     """
+     Convert YAML config to training arguments.
+
+     Args:
+         config: Configuration dictionary from YAML
+
+     Returns:
+         Flattened dictionary suitable for training
+     """
+     args = {
+         # Model
+         'model_size': config['model']['size'],
+
+         # Data
+         'train_data': config['data']['train_data'],
+         'val_data': config['data']['val_data'],
+
+         # Training
+         'num_epochs': config['training']['num_epochs'],
+         'batch_size': config['training']['batch_size'],
+         'learning_rate': config['training']['learning_rate'],
+         'weight_decay': config['training']['weight_decay'],
+         'grad_clip': config['training']['grad_clip'],
+         'warmup_steps': config['training']['warmup_steps'],
+
+         # Optimizer
+         'betas': tuple(config['optimizer']['betas']),
+         'eps': config['optimizer']['eps'],
+
+         # Checkpointing
+         'checkpoint_dir': config['checkpointing']['checkpoint_dir'],
+         'save_every': config['checkpointing']['save_every'],
+         'keep_last_n': config['checkpointing']['keep_last_n'],
+
+         # Logging
+         'log_dir': config['logging']['log_dir'],
+         'eval_every': config['logging']['eval_every'],
+         'log_every': config['logging']['log_every'],
+
+         # Device
+         'device': config['device']['device'],
+         'seed': config['device']['seed'],
+
+         # Advanced
+         'max_steps': config.get('advanced', {}).get('max_steps', -1),
+         'resume_from': config.get('advanced', {}).get('resume_from'),
+         'quick_test': config.get('advanced', {}).get('quick_test', False),
+     }
+
+     return args
+
+
+ def create_default_config_file(output_path: str = 'training_config.yaml') -> None:
+     """
+     Create a default configuration file template.
+
+     Args:
+         output_path: Path where to save the config file
+     """
+     default_config = {
+         'model': {
+             'size': 'small'
+         },
+         'data': {
+             'train_data': './data/tokenized/train.npy',
+             'val_data': './data/tokenized/val.npy'
+         },
+         'training': {
+             'num_epochs': 10,
+             'batch_size': 16,
+             'learning_rate': 0.0003,
+             'weight_decay': 0.01,
+             'grad_clip': 1.0,
+             'warmup_steps': 100
+         },
+         'optimizer': {
+             'betas': [0.9, 0.95],
+             'eps': 1.0e-8
+         },
+         'checkpointing': {
+             'checkpoint_dir': './model/checkpoints',
+             'save_every': 1,
+             'keep_last_n': 3
+         },
+         'logging': {
+             'log_dir': './logs',
+             'eval_every': 100,
+             'log_every': 10
+         },
+         'device': {
+             'device': 'cuda',
+             'seed': 42
+         },
+         'advanced': {
+             'max_steps': -1,
+             'resume_from': None,
+             'quick_test': False
+         }
+     }
+
+     output_path = Path(output_path)
+
+     # Create directory if it doesn't exist
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+
+     with open(output_path, 'w') as f:
+         yaml.dump(default_config, f, default_flow_style=False, sort_keys=False)
+
+     print(f"✓ Created default configuration file: {output_path}")
+     print(f"  Edit this file and then run: gptmed.train_from_config('{output_path}')")

gptmed/configs/training_config.yaml
ADDED

@@ -0,0 +1,64 @@
+ # GptMed Training Configuration File
+ # Edit these parameters for your training needs
+
+ # ============================================================
+ # MODEL CONFIGURATION
+ # ============================================================
+ model:
+   size: small                # Options: tiny, small, medium
+                              # tiny: ~2M params (testing)
+                              # small: ~10M params (recommended)
+                              # medium: ~50M params (high quality)
+
+ # ============================================================
+ # DATA PATHS
+ # ============================================================
+ data:
+   train_data: ./data/tokenized/train.npy   # Path to training data (.npy file)
+   val_data: ./data/tokenized/val.npy       # Path to validation data (.npy file)
+
+ # ============================================================
+ # TRAINING HYPERPARAMETERS
+ # ============================================================
+ training:
+   num_epochs: 10             # Number of training epochs
+   batch_size: 16             # Batch size (reduce if OOM: 8, 4)
+   learning_rate: 0.0003      # Learning rate (3e-4)
+   weight_decay: 0.01         # Weight decay for regularization
+   grad_clip: 1.0             # Gradient clipping value
+   warmup_steps: 100          # Learning rate warmup steps
+
+ # ============================================================
+ # OPTIMIZER SETTINGS
+ # ============================================================
+ optimizer:
+   betas: [0.9, 0.95]         # Adam beta parameters
+   eps: 1.0e-8                # Adam epsilon
+
+ # ============================================================
+ # CHECKPOINTING & LOGGING
+ # ============================================================
+ checkpointing:
+   checkpoint_dir: ./model/checkpoints   # Directory to save checkpoints
+   save_every: 1              # Save checkpoint every N epochs
+   keep_last_n: 3             # Keep last N checkpoints
+
+ logging:
+   log_dir: ./logs            # Directory for training logs
+   eval_every: 100            # Evaluate every N steps
+   log_every: 10              # Log metrics every N steps
+
+ # ============================================================
+ # DEVICE & PERFORMANCE
+ # ============================================================
+ device:
+   device: cuda               # Options: cuda, cpu
+   seed: 42                   # Random seed for reproducibility
+
+ # ============================================================
+ # ADVANCED OPTIONS (optional)
+ # ============================================================
+ advanced:
+   max_steps: -1              # Max training steps (-1 = use num_epochs)
+   resume_from: null          # Path to checkpoint to resume from (null = start fresh)
+   quick_test: false          # Use quick test config for debugging
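The YAML template above is what the config_loader helpers consume. A short sketch of that round trip, including a programmatic tweak before training (the file name and overrides are only examples; validation requires the data paths in the file to exist):

```python
# Sketch: generate the template, tweak it, and feed it back through the loader helpers.
import yaml
import gptmed
from gptmed.configs.config_loader import load_yaml_config, validate_config, config_to_args

gptmed.create_config('run_config.yaml')          # writes the default template shown above

cfg = load_yaml_config('run_config.yaml')        # nested dict: model / data / training / ...
cfg['model']['size'] = 'tiny'                    # ~2M params, quick smoke test
cfg['training']['batch_size'] = 4                # reduce if running out of memory
cfg['device']['device'] = 'cpu'

with open('run_config.yaml', 'w') as f:
    yaml.dump(cfg, f, default_flow_style=False, sort_keys=False)

validate_config(cfg)                             # raises if sections, sizes, or data paths are bad
print(config_to_args(cfg)['learning_rate'])      # flat dict consumed by train_from_config()

results = gptmed.train_from_config('run_config.yaml')
```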
gptmed/inference/generator.py
CHANGED

@@ -38,11 +38,11 @@ import sentencepiece as spm
  from pathlib import Path
  from typing import List, Optional
  
- from
- from
- from
- from
- from
+ from gptmed.model.architecture import GPTTransformer
+ from gptmed.model.configs.model_config import ModelConfig
+ from gptmed.inference.generation_config import GenerationConfig
+ from gptmed.inference.sampling import sample_next_token
+ from gptmed.inference.decoding_utils import (
      apply_repetition_penalty,
      block_ngram_repeats,
      should_stop_generation,
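For reference, gptmed.generate() in api.py drives this module roughly as sketched below; the checkpoint keys and TextGenerator arguments are taken from that function, while the file paths are placeholders:

```python
# Sketch: using TextGenerator directly, mirroring gptmed.api.generate().
import torch
from gptmed.model.architecture import GPTTransformer
from gptmed.model.configs.model_config import ModelConfig
from gptmed.inference.generator import TextGenerator

ckpt = torch.load('model/checkpoints/best_model.pt', map_location='cpu')  # placeholder path
model = GPTTransformer(ModelConfig.from_dict(ckpt['model_config']))
model.load_state_dict(ckpt['model_state_dict'])

generator = TextGenerator(model=model, tokenizer_path='tokenizer/my_tokenizer.model', device='cpu')
print(generator.generate(prompt='Your question?', max_length=100,
                         temperature=0.7, top_k=50, top_p=0.9))
```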
gptmed/model/__init__.py
CHANGED
gptmed/model/configs/__init__.py
CHANGED
gptmed/tokenizer/__init__.py
CHANGED
gptmed/training/train.py
CHANGED

@@ -46,11 +46,11 @@ import sys
  # Add parent directory to path for imports
  sys.path.insert(0, str(Path(__file__).parent.parent))
  
- from
- from
- from
- from
- from
+ from gptmed.model.architecture import GPTTransformer
+ from gptmed.model.configs.model_config import get_small_config, get_tiny_config
+ from gptmed.configs.train_config import get_default_config, get_quick_test_config
+ from gptmed.training.dataset import create_dataloaders
+ from gptmed.training.trainer import Trainer
  
  
  def set_seed(seed: int):

@@ -83,7 +83,7 @@ def count_parameters(model):
  
  
  def main():
-     parser = argparse.ArgumentParser(description="Train GPT model
+     parser = argparse.ArgumentParser(description="Train your GPT model")
  
      # Model config
      parser.add_argument(

@@ -136,7 +136,7 @@ def main():
      args = parser.parse_args()
  
      print("=" * 60)
-     print("GPT Training
+     print("GPT Training Script")
      print("=" * 60)
  
      # Check CUDA availability

@@ -170,7 +170,6 @@ def main():
          print("Using quick test config (fast debugging)")
      else:
          train_config = get_default_config()
- 
      # Override with command-line args
      if args.batch_size is not None:
          train_config.batch_size = args.batch_size
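The corrected imports above pull in the same pieces that api.train_from_config() wires together by hand. A condensed sketch of that assembly (argument names follow api.py; data paths and hyperparameters are placeholders):

```python
# Sketch: manual Trainer assembly, condensed from gptmed.api.train_from_config().
from pathlib import Path
import torch
from gptmed.model.architecture import GPTTransformer
from gptmed.model.configs.model_config import get_small_config
from gptmed.configs.train_config import get_default_config
from gptmed.training.dataset import create_dataloaders
from gptmed.training.trainer import Trainer

model = GPTTransformer(get_small_config())
train_loader, val_loader = create_dataloaders(
    train_path=Path('./data/tokenized/train.npy'),   # placeholder paths
    val_path=Path('./data/tokenized/val.npy'),
    batch_size=16,
    num_workers=0,
)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4,
                              betas=(0.9, 0.95), eps=1e-8, weight_decay=0.01)

trainer = Trainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    config=get_default_config(),
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
trainer.train()
```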
gptmed/training/trainer.py
CHANGED

@@ -51,16 +51,16 @@ import time
  from pathlib import Path
  from typing import Optional
  
- from
- from
+ from gptmed.model.architecture import GPTTransformer
+ from gptmed.training.utils import (
      clip_grad_norm,
      get_lr_with_warmup,
      set_learning_rate,
      estimate_loss_dataloader,
      compute_perplexity,
  )
- from
- from
+ from gptmed.utils.logging import MetricsLogger, log_training_step, log_validation
+ from gptmed.utils.checkpoints import CheckpointManager
  
  
  class Trainer:

{gptmed-0.0.1.dist-info → gptmed-0.1.2.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: gptmed
- Version: 0.
+ Version: 0.1.2
  Summary: A lightweight GPT-based language model framework for training custom question-answering models on any domain
  Author-email: Sanjog Sigdel <sigdelsanjog@gmail.com>
  Maintainer-email: Sanjog Sigdel <sigdelsanjog@gmail.com>

@@ -51,6 +51,7 @@ Requires-Dist: torch>=2.0.0
  Requires-Dist: sentencepiece>=0.1.99
  Requires-Dist: numpy>=1.24.0
  Requires-Dist: tqdm>=4.65.0
+ Requires-Dist: pyyaml>=6.0
  Provides-Extra: dev
  Requires-Dist: pytest>=7.0.0; extra == "dev"
  Requires-Dist: black>=22.0.0; extra == "dev"

@@ -69,6 +70,10 @@ A lightweight GPT-based language model framework for training custom question-an
  [](https://www.python.org/downloads/)
  [](https://opensource.org/licenses/MIT)
  
+ ## 📖 [Complete User Manual](USER_MANUAL.md) | [Quick Start](#quick-start)
+ 
+ > **New to GptMed?** Check out the [**step-by-step User Manual**](USER_MANUAL.md) for a complete guide on training your own model!
+ 
  ## Features
  
  - 🧠 **Custom GPT Architecture**: Lightweight transformer model for any Q&A domain

@@ -78,6 +83,27 @@ A lightweight GPT-based language model framework for training custom question-an
  - 📦 **Lightweight**: Small model size suitable for edge deployment
  - 🛠️ **Complete Toolkit**: Includes tokenizer training, model training, and inference utilities
  
+ ## Table of Contents
+ 
+ - [Features](#features)
+ - [Installation](#installation)
+ - [Quick Start](#quick-start)
+ - [Package Structure](#package-structure)
+ - [Core Modules](#core-modules)
+ - [Model Components](#model-components)
+ - [Training Components](#training-components)
+ - [Inference Components](#inference-components)
+ - [Data Processing](#data-processing)
+ - [Utilities](#utilities)
+ - [Model Architecture](#model-architecture)
+ - [Configuration](#configuration)
+ - [Documentation](#documentation)
+ - [Performance](#performance)
+ - [Examples](#examples)
+ - [Contributing](#contributing)
+ - [License](#license)
+ - [Support](#support)
+ 
  ## Installation
  
  ### From PyPI (Recommended)

@@ -204,27 +230,134 @@ config = TrainingConfig(
  )
  ```
  
- ##
+ ## Package Structure
+ 
+ ### Core Modules
+ 
+ The `gptmed` package contains the following main modules:
+ 
+ ```
+ gptmed/
+ ├── model/        # Model architecture and configurations
+ ├── inference/    # Text generation and sampling
+ ├── training/     # Training loops and datasets
+ ├── tokenizer/    # Tokenizer training and data processing
+ ├── data/         # Data parsers and formatters
+ ├── configs/      # Training configurations
+ └── utils/        # Utilities (checkpoints, logging)
+ ```
+ 
+ ### Model Components
+ 
+ **`gptmed.model.architecture`** - GPT Transformer Implementation
+ 
+ - `GPTTransformer` - Main model class
+ - `TransformerBlock` - Individual transformer layers
+ - `MultiHeadAttention` - Attention mechanism
+ - `FeedForward` - Feed-forward networks
+ - `RoPEPositionalEncoding` - Rotary position embeddings
+ 
+ **`gptmed.model.configs`** - Model Configurations
+ 
+ - `get_tiny_config()` - ~2M parameters (testing)
+ - `get_small_config()` - ~10M parameters (recommended)
+ - `get_medium_config()` - ~50M parameters (high quality)
+ - `ModelConfig` - Custom configuration class
+ 
+ ### Training Components
+ 
+ **`gptmed.training`** - Training Pipeline
+ 
+ - `train.py` - Main training script (CLI: `gptmed-train`)
+ - `Trainer` - Training loop with checkpointing
+ - `TokenizedDataset` - PyTorch dataset for tokenized data
+ - `create_dataloaders()` - DataLoader creation utilities
+ 
+ **`gptmed.configs`** - Training Configurations
+ 
+ - `TrainingConfig` - Training hyperparameters
+ - `get_default_config()` - Default training settings
+ - `get_quick_test_config()` - Fast testing configuration
+ 
+ ### Inference Components
+ 
+ **`gptmed.inference`** - Text Generation
+ 
+ - `TextGenerator` - Main generation class
+ - `generator.py` - CLI command (CLI: `gptmed-generate`)
+ - `sampling.py` - Sampling strategies (top-k, top-p, temperature)
+ - `decoding_utils.py` - Decoding utilities
+ - `GenerationConfig` - Generation parameters
+ 
+ ### Data Processing
+ 
+ **`gptmed.tokenizer`** - Tokenizer Training & Data Processing
+ 
+ - `train_tokenizer.py` - Train SentencePiece tokenizer
+ - `tokenize_data.py` - Convert text to token sequences
+ - SentencePiece BPE tokenizer support
+ 
+ **`gptmed.data.parsers`** - Data Parsing & Formatting
+ 
+ - `MedQuADParser` - XML Q&A parser (example)
+ - `CausalTextFormatter` - Format Q&A pairs for training
+ - `FormatConfig` - Formatting configuration
+ 
+ ### Utilities
+ 
+ **`gptmed.utils`** - Helper Functions
+ 
+ - `checkpoints.py` - Model checkpoint management
+ - `logging.py` - Training metrics logging
+ 
+ ---
+ 
+ ## Detailed Project Structure
  
  ```
  gptmed/
  ├── model/
- │   ├── architecture/
- │
+ │   ├── architecture/
+ │   │   ├── gpt.py              # GPT transformer model
+ │   │   ├── attention.py        # Multi-head attention
+ │   │   ├── feedforward.py      # Feed-forward networks
+ │   │   └── embeddings.py       # Token + positional embeddings
+ │   └── configs/
+ │       └── model_config.py     # Model size configurations
  ├── inference/
- │   ├── generator.py
- │
+ │   ├── generator.py            # Text generation (CLI command)
+ │   ├── sampling.py             # Sampling strategies
+ │   ├── decoding_utils.py       # Decoding utilities
+ │   └── generation_config.py    # Generation parameters
  ├── training/
- │   ├── train.py
- │   ├── trainer.py
- │
+ │   ├── train.py                # Main training script (CLI command)
+ │   ├── trainer.py              # Training loop
+ │   ├── dataset.py              # PyTorch dataset
+ │   └── utils.py                # Training utilities
  ├── tokenizer/
- │
+ │   ├── train_tokenizer.py      # Train SentencePiece tokenizer
+ │   └── tokenize_data.py        # Tokenize text data
+ ├── data/
+ │   └── parsers/
+ │       ├── medquad_parser.py   # Example XML parser
+ │       └── text_formatter.py   # Q&A text formatter
  ├── configs/
- │   └── train_config.py
+ │   └── train_config.py         # Training configurations
  └── utils/
-     ├── checkpoints.py
-     └── logging.py
+     ├── checkpoints.py          # Model checkpointing
+     └── logging.py              # Training logging
+ ```
+ 
+ ### Command-Line Interface
+ 
+ The package provides two main CLI commands:
+ 
+ ```bash
+ # Train a model
+ gptmed-train --model-size small --num-epochs 10 --batch-size 16
+ 
+ # Generate text
+ gptmed-generate --prompt "Your question?" --max-length 100
  ```
  
  ## Requirements

@@ -237,14 +370,14 @@ gptmed/
  
  ## Documentation
  
- 
+ 📚 **[Complete User Manual](USER_MANUAL.md)** - Step-by-step guide for training your own model
  
- ###
+ ### Quick Links
  
- - [
- - [
- - [
- - [
+ - [User Manual](USER_MANUAL.md) - **Start here!** Complete training pipeline guide
+ - [Architecture Guide](ARCHITECTURE_EXTENSION_GUIDE.md) - Understanding the model architecture
+ - [Deployment Guide](DEPLOYMENT_GUIDE.md) - Publishing to PyPI
+ - [Changelog](CHANGELOG.md) - Version history
  
  ## Performance
  

@@ -312,7 +445,8 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file
  
  ## Support
  
- -
+ - 📖 **[User Manual](USER_MANUAL.md)** - Complete step-by-step training guide
+ - 📫 Issues: [GitHub Issues](https://github.com/sigdelsanjog/gptmed/issues)
  - 💬 Discussions: [GitHub Discussions](https://github.com/sigdelsanjog/gptmed/discussions)
  - 📧 Email: sanjog.sigdel@ku.edu.np
  

{gptmed-0.0.1.dist-info → gptmed-0.1.2.dist-info}/RECORD
CHANGED

@@ -1,6 +1,9 @@
- gptmed/__init__.py,sha256=
+ gptmed/__init__.py,sha256=RoPQCLWrTlvpaKgboeoH9OXn8BB3gmZDUFAytvL8lVU,1676
+ gptmed/api.py,sha256=IU5r9ujg3S-Lem5-FOGDDLdh1UJ_FqCbaQayzyJez5c,10774
  gptmed/configs/__init__.py,sha256=yRa-zgPQ-OCzu8fvCrfWMG-CjF3dru3PZzknzm0oUaQ,23
+ gptmed/configs/config_loader.py,sha256=hkJRN-Rh6dlzOf97yOjPCVdy-Es5YjgKo3-iOwAeXEI,5833
  gptmed/configs/train_config.py,sha256=KqfNBh9hdTTd_6gEAlrClU8sVFSlVDmZJOrf3cPwFe8,4657
+ gptmed/configs/training_config.yaml,sha256=EEZZa3kcsZr3g-_fKDPYZt4_NTpmS-3NvJrTYSWNc8g,2874
  gptmed/data/__init__.py,sha256=iAHeakB5pBAd7MkmarPPY0UKS9bTaO_winLZ23Y2O90,54
  gptmed/data/parsers/__init__.py,sha256=BgVzXuZgeE5DUCC4SzN7vflL40wQ4Q4_4DmJ1Y43_nw,211
  gptmed/data/parsers/medquad_parser.py,sha256=g3QCRiVBdcq8RdyuYH_qKFrHgU5KkHY59WfWxUwspP0,7974

@@ -8,31 +11,31 @@ gptmed/data/parsers/text_formatter.py,sha256=tVmnDBT54BbxX9BPKMXSPzzLmM39frDxKRK
  gptmed/inference/__init__.py,sha256=NDPViXhOgpItC8n13T9axX4UH1E7mrjt6kJ5OfIwvMs,25
  gptmed/inference/decoding_utils.py,sha256=zTDZYdl2jcGwSrcINXMw-5uoYuF4A9TSushhPxJi1o0,5041
  gptmed/inference/generation_config.py,sha256=hpPyZUk1K6qGSBAoQx3Jm0_ZrrYld77ACxbIlCCCcVU,2813
- gptmed/inference/generator.py,sha256=
+ gptmed/inference/generator.py,sha256=6JFmDPQF4btau_Gp5pfk8a5G0Iyg6QsB9Y8Oo4ygH-4,7884
  gptmed/inference/sampling.py,sha256=B6fRlJafypuBMKJ0rTbsk6k8KXloXiIvroi7rN6ekBA,7947
- gptmed/model/__init__.py,sha256=
+ gptmed/model/__init__.py,sha256=brAE8ZhCDeQlU13YSqiBZTrTE8SHL_3bvFhZMzZnh3A,191
  gptmed/model/architecture/__init__.py,sha256=9MpSAYwwZY-t1vBLIupuRtLD7CaOLJRENMh3zKx3M-4,970
  gptmed/model/architecture/attention.py,sha256=Qk1eGl9glKWQbhcXJWmFkO5U3VHBq7OrsjVG0tPmgnY,6420
  gptmed/model/architecture/decoder_block.py,sha256=n-Uo09TDcirKeWTWTNumldGOrx-b2Elb25lbF6cTYwg,3879
  gptmed/model/architecture/embeddings.py,sha256=GoVXctC21MsNwyoIiOq7NX-v_DzYkbFcQAfvZ2fg66s,4717
  gptmed/model/architecture/feedforward.py,sha256=uJ5QOlWX0ritKDQLUE7GPmMojelR9-sTI_BaYc4Ehfw,3232
  gptmed/model/architecture/transformer.py,sha256=H1njPoy0Uam59JbA24C0olEDwPfhh3ev4HsUFRIC_0Y,6626
- gptmed/model/configs/__init__.py,sha256=
+ gptmed/model/configs/__init__.py,sha256=LDCWhlCDOU7490wcfSId_jXBPfQrtYQEw8FoD67rqBs,275
  gptmed/model/configs/model_config.py,sha256=wI-i2Dw_pTdIKCDe1pqLvP3ky3YedEy7DwZYN5lwmKE,4673
- gptmed/tokenizer/__init__.py,sha256=
+ gptmed/tokenizer/__init__.py,sha256=KhLAHPmQyoWhnKDenyIJRxgFflKI7xklip28j4cKfKw,157
  gptmed/tokenizer/tokenize_data.py,sha256=KgMtMfaz_RtOhN_CrvC267k9ujxRdO89rToVJ6nzdwg,9139
  gptmed/tokenizer/train_tokenizer.py,sha256=f0Hucyft9e8LU2RtpTqg8h_0SpOC_oMABl0_me-wfL8,7068
  gptmed/training/__init__.py,sha256=6G0_gdlwBnQBG8wZlTm2NtgkXZJcXRfLMDQ2iu6O3U4,24
  gptmed/training/dataset.py,sha256=QbNVTN4Og5gqMAV2ckjRX8W_k9aUc9IZJDcu0u9U8t0,5347
- gptmed/training/train.py,sha256=
- gptmed/training/trainer.py,sha256=
+ gptmed/training/train.py,sha256=sp4-1WpEXUTA9V0GUYAgSvMd2aaPkt1aq2PepQFLXD8,8142
+ gptmed/training/trainer.py,sha256=asOKT9d7lvmtEm5PIcMHg8iUdulNJpobNFNwOjdkeEg,10728
  gptmed/training/utils.py,sha256=pJxCwneNr2STITIYwIDCxRzIICDFOxOMzK8DT7ck2oQ,5651
  gptmed/utils/__init__.py,sha256=XuMhIqOXF7mjnog_6Iky-hSbwvFb0iK42B4iDUpgi0U,44
  gptmed/utils/checkpoints.py,sha256=L4q1-_4GbHCoD7QuEKYeQ-xXDTF-6sqZOxKQ_LT8YmQ,7112
  gptmed/utils/logging.py,sha256=7dJc1tayMxCBjFSDXe4r9ACUTpoPTTGsJ0UZMTqZIDY,5303
- gptmed-0.
- gptmed-0.
- gptmed-0.
- gptmed-0.
- gptmed-0.
- gptmed-0.
+ gptmed-0.1.2.dist-info/licenses/LICENSE,sha256=v2spsd7N1pKFFh2G8wGP_45iwe5S0DYiJzG4im8Rupc,1066
+ gptmed-0.1.2.dist-info/METADATA,sha256=XEC0i9WiztPA54N0KnTJL9fBuuoAdmUhR5GeENivki8,14876
+ gptmed-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ gptmed-0.1.2.dist-info/entry_points.txt,sha256=ATqOzTtPVdUiFX5ZSeo3n9JkUCqocUxEXTgy1CfNRZE,110
+ gptmed-0.1.2.dist-info/top_level.txt,sha256=mhyEq3rG33t21ziJz5w3TPgx0RjPf4zXMNUx2JTiNmE,7
+ gptmed-0.1.2.dist-info/RECORD,,

{gptmed-0.0.1.dist-info → gptmed-0.1.2.dist-info}/WHEEL
File without changes

{gptmed-0.0.1.dist-info → gptmed-0.1.2.dist-info}/entry_points.txt
File without changes

{gptmed-0.0.1.dist-info → gptmed-0.1.2.dist-info}/licenses/LICENSE
File without changes

{gptmed-0.0.1.dist-info → gptmed-0.1.2.dist-info}/top_level.txt
File without changes