gptmed 0.0.1__tar.gz → 0.1.2__tar.gz
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in the public registry.
- {gptmed-0.0.1/gptmed.egg-info → gptmed-0.1.2}/PKG-INFO +154 -20
- {gptmed-0.0.1 → gptmed-0.1.2}/README.md +152 -19
- gptmed-0.1.2/gptmed/__init__.py +60 -0
- gptmed-0.1.2/gptmed/api.py +352 -0
- gptmed-0.1.2/gptmed/configs/config_loader.py +191 -0
- gptmed-0.1.2/gptmed/configs/training_config.yaml +64 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/inference/generator.py +5 -5
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/model/__init__.py +1 -1
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/model/configs/__init__.py +1 -1
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/tokenizer/__init__.py +1 -1
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/training/train.py +7 -8
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/training/trainer.py +4 -4
- {gptmed-0.0.1 → gptmed-0.1.2/gptmed.egg-info}/PKG-INFO +154 -20
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed.egg-info/SOURCES.txt +3 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed.egg-info/requires.txt +1 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/pyproject.toml +2 -1
- gptmed-0.0.1/gptmed/__init__.py +0 -37
- {gptmed-0.0.1 → gptmed-0.1.2}/LICENSE +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/MANIFEST.in +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/configs/__init__.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/configs/train_config.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/data/__init__.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/data/parsers/__init__.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/data/parsers/medquad_parser.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/data/parsers/text_formatter.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/inference/__init__.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/inference/decoding_utils.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/inference/generation_config.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/inference/sampling.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/model/architecture/__init__.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/model/architecture/attention.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/model/architecture/decoder_block.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/model/architecture/embeddings.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/model/architecture/feedforward.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/model/architecture/transformer.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/model/configs/model_config.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/tokenizer/tokenize_data.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/tokenizer/train_tokenizer.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/training/__init__.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/training/dataset.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/training/utils.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/utils/__init__.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/utils/checkpoints.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed/utils/logging.py +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed.egg-info/dependency_links.txt +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed.egg-info/entry_points.txt +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/gptmed.egg-info/top_level.txt +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/requirements.txt +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/setup.cfg +0 -0
- {gptmed-0.0.1 → gptmed-0.1.2}/setup.py +0 -0
{gptmed-0.0.1/gptmed.egg-info → gptmed-0.1.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gptmed
-Version: 0.0.1
+Version: 0.1.2
 Summary: A lightweight GPT-based language model framework for training custom question-answering models on any domain
 Author-email: Sanjog Sigdel <sigdelsanjog@gmail.com>
 Maintainer-email: Sanjog Sigdel <sigdelsanjog@gmail.com>

@@ -51,6 +51,7 @@ Requires-Dist: torch>=2.0.0
 Requires-Dist: sentencepiece>=0.1.99
 Requires-Dist: numpy>=1.24.0
 Requires-Dist: tqdm>=4.65.0
+Requires-Dist: pyyaml>=6.0
 Provides-Extra: dev
 Requires-Dist: pytest>=7.0.0; extra == "dev"
 Requires-Dist: black>=22.0.0; extra == "dev"

@@ -69,6 +70,10 @@ A lightweight GPT-based language model framework for training custom question-an
 [](https://www.python.org/downloads/)
 [](https://opensource.org/licenses/MIT)
 
+## 📖 [Complete User Manual](USER_MANUAL.md) | [Quick Start](#quick-start)
+
+> **New to GptMed?** Check out the [**step-by-step User Manual**](USER_MANUAL.md) for a complete guide on training your own model!
+
 ## Features
 
 - 🧠 **Custom GPT Architecture**: Lightweight transformer model for any Q&A domain

@@ -78,6 +83,27 @@ A lightweight GPT-based language model framework for training custom question-an
 - 📦 **Lightweight**: Small model size suitable for edge deployment
 - 🛠️ **Complete Toolkit**: Includes tokenizer training, model training, and inference utilities
 
+## Table of Contents
+
+- [Features](#features)
+- [Installation](#installation)
+- [Quick Start](#quick-start)
+- [Package Structure](#package-structure)
+- [Core Modules](#core-modules)
+- [Model Components](#model-components)
+- [Training Components](#training-components)
+- [Inference Components](#inference-components)
+- [Data Processing](#data-processing)
+- [Utilities](#utilities)
+- [Model Architecture](#model-architecture)
+- [Configuration](#configuration)
+- [Documentation](#documentation)
+- [Performance](#performance)
+- [Examples](#examples)
+- [Contributing](#contributing)
+- [License](#license)
+- [Support](#support)
+
 ## Installation
 
 ### From PyPI (Recommended)

@@ -204,27 +230,134 @@ config = TrainingConfig(
 )
 ```
 
-##
+## Package Structure
+
+### Core Modules
+
+The `gptmed` package contains the following main modules:
+
+```
+gptmed/
+├── model/       # Model architecture and configurations
+├── inference/   # Text generation and sampling
+├── training/    # Training loops and datasets
+├── tokenizer/   # Tokenizer training and data processing
+├── data/        # Data parsers and formatters
+├── configs/     # Training configurations
+└── utils/       # Utilities (checkpoints, logging)
+```
+
+### Model Components
+
+**`gptmed.model.architecture`** - GPT Transformer Implementation
+
+- `GPTTransformer` - Main model class
+- `TransformerBlock` - Individual transformer layers
+- `MultiHeadAttention` - Attention mechanism
+- `FeedForward` - Feed-forward networks
+- `RoPEPositionalEncoding` - Rotary position embeddings
+
+**`gptmed.model.configs`** - Model Configurations
+
+- `get_tiny_config()` - ~2M parameters (testing)
+- `get_small_config()` - ~10M parameters (recommended)
+- `get_medium_config()` - ~50M parameters (high quality)
+- `ModelConfig` - Custom configuration class
+
+### Training Components
+
+**`gptmed.training`** - Training Pipeline
+
+- `train.py` - Main training script (CLI: `gptmed-train`)
+- `Trainer` - Training loop with checkpointing
+- `TokenizedDataset` - PyTorch dataset for tokenized data
+- `create_dataloaders()` - DataLoader creation utilities
+
+**`gptmed.configs`** - Training Configurations
+
+- `TrainingConfig` - Training hyperparameters
+- `get_default_config()` - Default training settings
+- `get_quick_test_config()` - Fast testing configuration
+
+### Inference Components
+
+**`gptmed.inference`** - Text Generation
+
+- `TextGenerator` - Main generation class
+- `generator.py` - CLI command (CLI: `gptmed-generate`)
+- `sampling.py` - Sampling strategies (top-k, top-p, temperature)
+- `decoding_utils.py` - Decoding utilities
+- `GenerationConfig` - Generation parameters
+
+### Data Processing
+
+**`gptmed.tokenizer`** - Tokenizer Training & Data Processing
+
+- `train_tokenizer.py` - Train SentencePiece tokenizer
+- `tokenize_data.py` - Convert text to token sequences
+- SentencePiece BPE tokenizer support
+
+**`gptmed.data.parsers`** - Data Parsing & Formatting
+
+- `MedQuADParser` - XML Q&A parser (example)
+- `CausalTextFormatter` - Format Q&A pairs for training
+- `FormatConfig` - Formatting configuration
+
+### Utilities
+
+**`gptmed.utils`** - Helper Functions
+
+- `checkpoints.py` - Model checkpoint management
+- `logging.py` - Training metrics logging
+
+---
+
+## Detailed Project Structure
 
 ```
 gptmed/
 ├── model/
-│   ├── architecture/
-│
+│   ├── architecture/
+│   │   ├── gpt.py               # GPT transformer model
+│   │   ├── attention.py         # Multi-head attention
+│   │   ├── feedforward.py       # Feed-forward networks
+│   │   └── embeddings.py        # Token + positional embeddings
+│   └── configs/
+│       └── model_config.py      # Model size configurations
 ├── inference/
-│   ├── generator.py
-│
+│   ├── generator.py             # Text generation (CLI command)
+│   ├── sampling.py              # Sampling strategies
+│   ├── decoding_utils.py        # Decoding utilities
+│   └── generation_config.py     # Generation parameters
 ├── training/
-│   ├── train.py
-│   ├── trainer.py
-│
+│   ├── train.py                 # Main training script (CLI command)
+│   ├── trainer.py               # Training loop
+│   ├── dataset.py               # PyTorch dataset
+│   └── utils.py                 # Training utilities
 ├── tokenizer/
-│
+│   ├── train_tokenizer.py       # Train SentencePiece tokenizer
+│   └── tokenize_data.py         # Tokenize text data
+├── data/
+│   └── parsers/
+│       ├── medquad_parser.py    # Example XML parser
+│       └── text_formatter.py    # Q&A text formatter
 ├── configs/
-│   └── train_config.py
+│   └── train_config.py          # Training configurations
 └── utils/
-    ├── checkpoints.py
-    └── logging.py
+    ├── checkpoints.py           # Model checkpointing
+    └── logging.py               # Training logging
+```
+
+### Command-Line Interface
+
+The package provides two main CLI commands:
+
+```bash
+# Train a model
+gptmed-train --model-size small --num-epochs 10 --batch-size 16
+
+# Generate text
+gptmed-generate --prompt "Your question?" --max-length 100
 ```
 
 ## Requirements

@@ -237,14 +370,14 @@ gptmed/
 
 ## Documentation
 
-
+📚 **[Complete User Manual](USER_MANUAL.md)** - Step-by-step guide for training your own model
 
-###
+### Quick Links
 
-- [
-- [
-- [
-- [
+- [User Manual](USER_MANUAL.md) - **Start here!** Complete training pipeline guide
+- [Architecture Guide](ARCHITECTURE_EXTENSION_GUIDE.md) - Understanding the model architecture
+- [Deployment Guide](DEPLOYMENT_GUIDE.md) - Publishing to PyPI
+- [Changelog](CHANGELOG.md) - Version history
 
 ## Performance

@@ -312,7 +445,8 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file
 
 ## Support
 
--
+- 📖 **[User Manual](USER_MANUAL.md)** - Complete step-by-step training guide
+- 📫 Issues: [GitHub Issues](https://github.com/sigdelsanjog/gptmed/issues)
 - 💬 Discussions: [GitHub Discussions](https://github.com/sigdelsanjog/gptmed/discussions)
 - 📧 Email: sanjog.sigdel@ku.edu.np
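The PKG-INFO changes above embed the package's new README, which documents a three-size model-configuration API (`get_tiny_config`, `get_small_config`, `get_medium_config`) alongside the `GPTTransformer` class. As orientation, here is a minimal sketch of how those pieces compose. The import paths and the `GPTTransformer(config)` call appear verbatim in the package docstring later in this diff; the parameter count is an illustrative check that assumes the model is a standard `torch.nn.Module`.

```python
# Minimal sketch: build a model from one of the preset configs.
from gptmed.model.architecture import GPTTransformer
from gptmed.model.configs.model_config import get_small_config, get_tiny_config

config = get_small_config()               # ~10M-parameter preset (recommended)
model = GPTTransformer(config)

tiny = GPTTransformer(get_tiny_config())  # ~2M-parameter preset, for quick tests
n_params = sum(p.numel() for p in tiny.parameters())  # assumes torch.nn.Module
print(f"tiny model: ~{n_params / 1e6:.1f}M parameters")
```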
{gptmed-0.0.1 → gptmed-0.1.2}/README.md

@@ -6,6 +6,10 @@ A lightweight GPT-based language model framework for training custom question-an
 [](https://www.python.org/downloads/)
 [](https://opensource.org/licenses/MIT)
 
+## 📖 [Complete User Manual](USER_MANUAL.md) | [Quick Start](#quick-start)
+
+> **New to GptMed?** Check out the [**step-by-step User Manual**](USER_MANUAL.md) for a complete guide on training your own model!
+
 ## Features
 
 - 🧠 **Custom GPT Architecture**: Lightweight transformer model for any Q&A domain

@@ -15,6 +19,27 @@ A lightweight GPT-based language model framework for training custom question-an
 - 📦 **Lightweight**: Small model size suitable for edge deployment
 - 🛠️ **Complete Toolkit**: Includes tokenizer training, model training, and inference utilities
 
+## Table of Contents
+
+- [Features](#features)
+- [Installation](#installation)
+- [Quick Start](#quick-start)
+- [Package Structure](#package-structure)
+- [Core Modules](#core-modules)
+- [Model Components](#model-components)
+- [Training Components](#training-components)
+- [Inference Components](#inference-components)
+- [Data Processing](#data-processing)
+- [Utilities](#utilities)
+- [Model Architecture](#model-architecture)
+- [Configuration](#configuration)
+- [Documentation](#documentation)
+- [Performance](#performance)
+- [Examples](#examples)
+- [Contributing](#contributing)
+- [License](#license)
+- [Support](#support)
+
 ## Installation
 
 ### From PyPI (Recommended)

@@ -141,27 +166,134 @@ config = TrainingConfig(
 )
 ```
 
-##
+## Package Structure
+
+### Core Modules
+
+The `gptmed` package contains the following main modules:
+
+```
+gptmed/
+├── model/       # Model architecture and configurations
+├── inference/   # Text generation and sampling
+├── training/    # Training loops and datasets
+├── tokenizer/   # Tokenizer training and data processing
+├── data/        # Data parsers and formatters
+├── configs/     # Training configurations
+└── utils/       # Utilities (checkpoints, logging)
+```
+
+### Model Components
+
+**`gptmed.model.architecture`** - GPT Transformer Implementation
+
+- `GPTTransformer` - Main model class
+- `TransformerBlock` - Individual transformer layers
+- `MultiHeadAttention` - Attention mechanism
+- `FeedForward` - Feed-forward networks
+- `RoPEPositionalEncoding` - Rotary position embeddings
+
+**`gptmed.model.configs`** - Model Configurations
+
+- `get_tiny_config()` - ~2M parameters (testing)
+- `get_small_config()` - ~10M parameters (recommended)
+- `get_medium_config()` - ~50M parameters (high quality)
+- `ModelConfig` - Custom configuration class
+
+### Training Components
+
+**`gptmed.training`** - Training Pipeline
+
+- `train.py` - Main training script (CLI: `gptmed-train`)
+- `Trainer` - Training loop with checkpointing
+- `TokenizedDataset` - PyTorch dataset for tokenized data
+- `create_dataloaders()` - DataLoader creation utilities
+
+**`gptmed.configs`** - Training Configurations
+
+- `TrainingConfig` - Training hyperparameters
+- `get_default_config()` - Default training settings
+- `get_quick_test_config()` - Fast testing configuration
+
+### Inference Components
+
+**`gptmed.inference`** - Text Generation
+
+- `TextGenerator` - Main generation class
+- `generator.py` - CLI command (CLI: `gptmed-generate`)
+- `sampling.py` - Sampling strategies (top-k, top-p, temperature)
+- `decoding_utils.py` - Decoding utilities
+- `GenerationConfig` - Generation parameters
+
+### Data Processing
+
+**`gptmed.tokenizer`** - Tokenizer Training & Data Processing
+
+- `train_tokenizer.py` - Train SentencePiece tokenizer
+- `tokenize_data.py` - Convert text to token sequences
+- SentencePiece BPE tokenizer support
+
+**`gptmed.data.parsers`** - Data Parsing & Formatting
+
+- `MedQuADParser` - XML Q&A parser (example)
+- `CausalTextFormatter` - Format Q&A pairs for training
+- `FormatConfig` - Formatting configuration
+
+### Utilities
+
+**`gptmed.utils`** - Helper Functions
+
+- `checkpoints.py` - Model checkpoint management
+- `logging.py` - Training metrics logging
+
+---
+
+## Detailed Project Structure
 
 ```
 gptmed/
 ├── model/
-│   ├── architecture/
-│
+│   ├── architecture/
+│   │   ├── gpt.py               # GPT transformer model
+│   │   ├── attention.py         # Multi-head attention
+│   │   ├── feedforward.py       # Feed-forward networks
+│   │   └── embeddings.py        # Token + positional embeddings
+│   └── configs/
+│       └── model_config.py      # Model size configurations
 ├── inference/
-│   ├── generator.py
-│
+│   ├── generator.py             # Text generation (CLI command)
+│   ├── sampling.py              # Sampling strategies
+│   ├── decoding_utils.py        # Decoding utilities
+│   └── generation_config.py     # Generation parameters
 ├── training/
-│   ├── train.py
-│   ├── trainer.py
-│
+│   ├── train.py                 # Main training script (CLI command)
+│   ├── trainer.py               # Training loop
+│   ├── dataset.py               # PyTorch dataset
+│   └── utils.py                 # Training utilities
 ├── tokenizer/
-│
+│   ├── train_tokenizer.py       # Train SentencePiece tokenizer
+│   └── tokenize_data.py         # Tokenize text data
+├── data/
+│   └── parsers/
+│       ├── medquad_parser.py    # Example XML parser
+│       └── text_formatter.py    # Q&A text formatter
 ├── configs/
-│   └── train_config.py
+│   └── train_config.py          # Training configurations
 └── utils/
-    ├── checkpoints.py
-    └── logging.py
+    ├── checkpoints.py           # Model checkpointing
+    └── logging.py               # Training logging
+```
+
+### Command-Line Interface
+
+The package provides two main CLI commands:
+
+```bash
+# Train a model
+gptmed-train --model-size small --num-epochs 10 --batch-size 16
+
+# Generate text
+gptmed-generate --prompt "Your question?" --max-length 100
 ```
 
 ## Requirements

@@ -174,14 +306,14 @@ gptmed/
 
 ## Documentation
 
-
+📚 **[Complete User Manual](USER_MANUAL.md)** - Step-by-step guide for training your own model
 
-###
+### Quick Links
 
-- [
-- [
-- [
-- [
+- [User Manual](USER_MANUAL.md) - **Start here!** Complete training pipeline guide
+- [Architecture Guide](ARCHITECTURE_EXTENSION_GUIDE.md) - Understanding the model architecture
+- [Deployment Guide](DEPLOYMENT_GUIDE.md) - Publishing to PyPI
+- [Changelog](CHANGELOG.md) - Version history
 
 ## Performance

@@ -249,7 +381,8 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file
 
 ## Support
 
--
+- 📖 **[User Manual](USER_MANUAL.md)** - Complete step-by-step training guide
+- 📫 Issues: [GitHub Issues](https://github.com/sigdelsanjog/gptmed/issues)
 - 💬 Discussions: [GitHub Discussions](https://github.com/sigdelsanjog/gptmed/discussions)
 - 📧 Email: sanjog.sigdel@ku.edu.np
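The README above names the decoding strategies in `gptmed/inference/sampling.py` (temperature, top-k, top-p) without showing them. For readers unfamiliar with the terms, here is a generic PyTorch reference implementation of temperature scaling plus top-k filtering; it is not gptmed's code, and `sample_next_token` is a hypothetical name.

```python
import torch

def sample_next_token(logits: torch.Tensor, temperature: float = 1.0, top_k: int = 50) -> int:
    """Pick one token id from a [vocab_size] vector of next-token logits."""
    logits = logits / max(temperature, 1e-6)      # temperature: flatten or sharpen the distribution
    if top_k > 0:
        vals, idx = torch.topk(logits, k=min(top_k, logits.size(-1)))
        probs = torch.softmax(vals, dim=-1)       # renormalize over the k best tokens
        return int(idx[torch.multinomial(probs, num_samples=1)])
    probs = torch.softmax(logits, dim=-1)
    return int(torch.multinomial(probs, num_samples=1))

# Example with random logits over a 32k vocabulary:
next_id = sample_next_token(torch.randn(32_000), temperature=0.8, top_k=40)
```

Top-p (nucleus) sampling is analogous: instead of a fixed k, it keeps the smallest set of tokens whose cumulative probability exceeds p before renormalizing.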
gptmed-0.1.2/gptmed/__init__.py

@@ -0,0 +1,60 @@
+"""
+GptMed: A lightweight GPT-based language model framework
+
+A domain-agnostic framework for training custom question-answering models.
+Train your own GPT model on any Q&A dataset - medical, technical support,
+education, or any other domain.
+
+Quick Start:
+    >>> import gptmed
+    >>>
+    >>> # 1. Create a config file
+    >>> gptmed.create_config('my_config.yaml')
+    >>>
+    >>> # 2. Edit my_config.yaml with your settings
+    >>>
+    >>> # 3. Train your model
+    >>> results = gptmed.train_from_config('my_config.yaml')
+    >>>
+    >>> # 4. Generate answers
+    >>> answer = gptmed.generate(
+    ...     checkpoint=results['best_checkpoint'],
+    ...     tokenizer='tokenizer/my_tokenizer.model',
+    ...     prompt='Your question here?'
+    ... )
+
+Advanced Usage:
+    >>> from gptmed.model.architecture import GPTTransformer
+    >>> from gptmed.model.configs.model_config import get_small_config
+    >>> from gptmed.inference.generator import TextGenerator
+    >>>
+    >>> config = get_small_config()
+    >>> model = GPTTransformer(config)
+"""
+
+__version__ = "0.2.0"
+__author__ = "Sanjog Sigdel"
+__email__ = "sigdelsanjog@gmail.com"
+
+# High-level API - Main user interface
+from gptmed.api import (
+    create_config,
+    train_from_config,
+    generate,
+)
+
+# Expose main components at package level for convenience
+from gptmed.model.architecture import GPTTransformer
+from gptmed.model.configs.model_config import ModelConfig, get_small_config, get_tiny_config
+
+__all__ = [
+    # Simple API
+    "create_config",
+    "train_from_config",
+    "generate",
+    # Advanced API
+    "GPTTransformer",
+    "ModelConfig",
+    "get_small_config",
+    "get_tiny_config",
+]
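Taken together, the new `gptmed.api` exports give the package a config-file-driven workflow. Here is the docstring's Quick Start collapsed into a plain script; the keyword arguments and the `results['best_checkpoint']` key are exactly as documented above, while the file paths are placeholders to adapt.

```python
import gptmed

# 1. Scaffold a training config (path is a placeholder)
gptmed.create_config("my_config.yaml")

# 2. Edit my_config.yaml with your dataset paths and hyperparameters,
#    then train; per the docstring, the result includes the best checkpoint.
results = gptmed.train_from_config("my_config.yaml")

# 3. Ask the trained model a question.
answer = gptmed.generate(
    checkpoint=results["best_checkpoint"],
    tokenizer="tokenizer/my_tokenizer.model",
    prompt="Your question here?",
)
print(answer)
```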