gpt-simple-lm 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpt_simple_lm-0.1.0/LICENSE +21 -0
- gpt_simple_lm-0.1.0/PKG-INFO +231 -0
- gpt_simple_lm-0.1.0/README.md +193 -0
- gpt_simple_lm-0.1.0/pyproject.toml +66 -0
- gpt_simple_lm-0.1.0/setup.cfg +4 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/__init__.py +41 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/__main__.py +15 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/_checkpoint.py +781 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/_logging.py +69 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/_run_state.py +115 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/_shutdown.py +353 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/_streaming.py +971 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/_train_loop.py +627 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/_train_setup.py +765 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/cli/__init__.py +0 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/cli/batch_generate_cmd.py +495 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/cli/generate_cmd.py +197 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/cli/init_cmd.py +187 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/cli/main.py +71 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/cli/status_cmd.py +403 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/cli/stop_cmd.py +112 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/cli/tokenize_cmd.py +78 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/cli/train_cmd.py +415 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/cli/validate_cmd.py +124 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/config.py +485 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/data.py +1207 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/errors.py +26 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/generate.py +384 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/model.py +892 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/pretokenize.py +383 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/tokenizer.py +282 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/train.py +329 -0
- gpt_simple_lm-0.1.0/src/gpt_simple/validate.py +1373 -0
- gpt_simple_lm-0.1.0/src/gpt_simple_lm.egg-info/PKG-INFO +231 -0
- gpt_simple_lm-0.1.0/src/gpt_simple_lm.egg-info/SOURCES.txt +49 -0
- gpt_simple_lm-0.1.0/src/gpt_simple_lm.egg-info/dependency_links.txt +1 -0
- gpt_simple_lm-0.1.0/src/gpt_simple_lm.egg-info/entry_points.txt +2 -0
- gpt_simple_lm-0.1.0/src/gpt_simple_lm.egg-info/requires.txt +18 -0
- gpt_simple_lm-0.1.0/src/gpt_simple_lm.egg-info/top_level.txt +1 -0
- gpt_simple_lm-0.1.0/tests/test_bucket_exhaustion.py +150 -0
- gpt_simple_lm-0.1.0/tests/test_checkpoint.py +563 -0
- gpt_simple_lm-0.1.0/tests/test_cli.py +1704 -0
- gpt_simple_lm-0.1.0/tests/test_data_resume.py +854 -0
- gpt_simple_lm-0.1.0/tests/test_dataloader_clamp.py +190 -0
- gpt_simple_lm-0.1.0/tests/test_distributed_invariants.py +333 -0
- gpt_simple_lm-0.1.0/tests/test_e2e_resume.py +478 -0
- gpt_simple_lm-0.1.0/tests/test_initialization.py +88 -0
- gpt_simple_lm-0.1.0/tests/test_model_arch.py +184 -0
- gpt_simple_lm-0.1.0/tests/test_pretokenized.py +802 -0
- gpt_simple_lm-0.1.0/tests/test_shutdown.py +367 -0
- gpt_simple_lm-0.1.0/tests/test_validate.py +498 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Louis Bertucci
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gpt-simple-lm
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A clean, efficient framework for pretraining language models from scratch
|
|
5
|
+
Author: Louis Bertucci
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/lb-off/gpt-simple
|
|
8
|
+
Project-URL: Documentation, https://lb-off.github.io/gpt-simple/
|
|
9
|
+
Project-URL: Repository, https://github.com/lb-off/gpt-simple
|
|
10
|
+
Project-URL: Issues, https://github.com/lb-off/gpt-simple/issues
|
|
11
|
+
Keywords: llm,pretraining,gpt,pytorch,transformer,language-model
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: torch>=2.0.0
|
|
23
|
+
Requires-Dist: numpy<2.0,>=1.24.0
|
|
24
|
+
Requires-Dist: transformers>=4.36.0
|
|
25
|
+
Requires-Dist: accelerate>=0.20.0
|
|
26
|
+
Requires-Dist: sentencepiece>=0.1.99
|
|
27
|
+
Requires-Dist: pyyaml>=6.0
|
|
28
|
+
Provides-Extra: wandb
|
|
29
|
+
Requires-Dist: wandb>=0.15.0; extra == "wandb"
|
|
30
|
+
Provides-Extra: cli
|
|
31
|
+
Requires-Dist: rich>=13.0; extra == "cli"
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
34
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
35
|
+
Requires-Dist: wandb>=0.15.0; extra == "dev"
|
|
36
|
+
Requires-Dist: rich>=13.0; extra == "dev"
|
|
37
|
+
Dynamic: license-file
|
|
38
|
+
|
|
39
|
+
# GPT-Simple
|
|
40
|
+
|
|
41
|
+
[![CI](https://github.com/lb-off/gpt-simple/actions/workflows/ci.yml/badge.svg)](https://github.com/lb-off/gpt-simple/actions/workflows/ci.yml)
|
|
42
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
43
|
+
[License: MIT](LICENSE)
|
|
44
|
+
|
|
45
|
+
A clean, efficient framework for pretraining language models from scratch.
|
|
46
|
+
|
|
47
|
+
GPT-Simple handles the full LLM pretraining workflow — tokenization,
|
|
48
|
+
streaming data loading, multi-GPU training, checkpointing, and inference —
|
|
49
|
+
through a single YAML config and a small CLI. It ships with a modern GPT
|
|
50
|
+
architecture ready to train out of the box.
|
|
51
|
+
|
|
52
|
+
## Features
|
|
53
|
+
|
|
54
|
+
- **Single YAML config + CLI** — `init` / `tokenize` / `train` / `status`
|
|
55
|
+
/ `stop` / `validate` / `generate` / `batch-generate`.
|
|
56
|
+
- **Multi-GPU out of the box** — `--nproc_per_node N` launches `torchrun`
|
|
57
|
+
automatically (Accelerate, bf16, `torch.compile`, gradient
|
|
58
|
+
checkpointing).
|
|
59
|
+
- **Pretokenized streaming** — memory-mapped `.bin/.idx` shards with
|
|
60
|
+
sequence packing; a raw-JSONL fallback for quick experiments.
|
|
61
|
+
- **Deterministic stop/resume** — walltime- and signal-aware checkpoints
|
|
62
|
+
with topology-agnostic data cursors, so *N* short jobs equal one long
|
|
63
|
+
job (every document seen exactly once, even if `world_size` /
|
|
64
|
+
`num_workers` change between restarts).
|
|
65
|
+
- **Orchestrator-friendly** — runs under SLURM, Kubernetes, or a local
|
|
66
|
+
loop; templates in [`examples/orchestrators/`](examples/orchestrators/).
|
|
67
|
+
- **Curriculum learning** — phase-based mixing across named data buckets.
|
|
68
|
+
- **Modern architecture** — pre-norm decoder with RoPE, RMSNorm, and a
|
|
69
|
+
gated (SwiGLU) MLP; also expresses GQA/MQA, vanilla MLPs, and untied
|
|
70
|
+
heads via config.
|
|
71
|
+
- **Python API** — `import gpt_simple; gpt_simple.train(config="config.yaml")`.
|
|
72
|
+
|
|
73
|
+
## Installation
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pip install -e ".[dev]" # from source (development)
|
|
77
|
+
pip install . # core only
|
|
78
|
+
pip install ".[wandb]" # optional: Weights & Biases logging
|
|
79
|
+
pip install ".[cli]" # optional: rich-formatted CLI output
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Quick start
|
|
83
|
+
|
|
84
|
+
### 1. Generate a config
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
gpt-simple init -o config.yaml
|
|
88
|
+
gpt-simple init --preset small -o config.yaml # ~125M (small | medium | large)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### 2. Pretokenize your data
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
gpt-simple tokenize \
|
|
95
|
+
--input_dir ./raw_data \
|
|
96
|
+
--output_dir ./data/tokenized \
|
|
97
|
+
--tokenizer_path gpt2 \
|
|
98
|
+
--max_length 2048 \
|
|
99
|
+
--num_workers 8
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Converts `.jsonl`/`.txt` into memory-mapped `.bin/.idx` shards. See the
|
|
103
|
+
[data pipeline guide](docs/data.md).
|
|
104
|
+
|
|
105
|
+
### 3. Train
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
gpt-simple train --config config.yaml # single GPU
|
|
109
|
+
gpt-simple train --config config.yaml --nproc_per_node 4 # 4 GPUs
|
|
110
|
+
|
|
111
|
+
# override any config value; start fresh with --force
|
|
112
|
+
gpt-simple train --config config.yaml --training.max_steps 5000 --force
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
See the [training guide](docs/training.md).
|
|
116
|
+
|
|
117
|
+
### 4. Monitor and control
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
gpt-simple status # training progress
|
|
121
|
+
gpt-simple stop # graceful shutdown (saves a checkpoint)
|
|
122
|
+
gpt-simple stop --force # immediate SIGKILL
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### 5. Generate
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
gpt-simple generate --output-dir ./outputs --prompt "Once upon a time" --max-new-tokens 200
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
`--output-dir` auto-picks the latest checkpoint. For multi-model /
|
|
132
|
+
multi-sampling batches and a `--dry-run` submission gate, use
|
|
133
|
+
`batch-generate` — see the [inference guide](docs/inference.md).
|
|
134
|
+
|
|
135
|
+
## Long runs with stop/resume
|
|
136
|
+
|
|
137
|
+
The trainer targets clusters with a hard per-job wall-clock cap. With
|
|
138
|
+
`resume: auto` (the default), re-running the same command resumes the
|
|
139
|
+
latest checkpoint, and the trainer saves and exits cleanly before a
|
|
140
|
+
walltime deadline or on `SIGTERM`/`SIGUSR1` — so an orchestrator just
|
|
141
|
+
re-queues the job.
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
gpt-simple train --config config.yaml # resume is automatic on every restart
|
|
145
|
+
gpt-simple status
|
|
146
|
+
gpt-simple stop # or let walltime/SIGUSR1 do it
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
Templates: [`slurm_resume_chain.sh`](examples/orchestrators/slurm_resume_chain.sh),
|
|
150
|
+
[`kubernetes_job.yaml`](examples/orchestrators/kubernetes_job.yaml),
|
|
151
|
+
[`local_loop.sh`](examples/orchestrators/local_loop.sh). See the
|
|
152
|
+
[checkpointing & resume](docs/checkpointing-and-resume.md) and
|
|
153
|
+
[orchestration](docs/orchestration.md) guides.
|
|
154
|
+
|
|
155
|
+
## Configuration
|
|
156
|
+
|
|
157
|
+
All settings live in one YAML file with four sections — `model`, `data`,
|
|
158
|
+
`optimizer`, `training`:
|
|
159
|
+
|
|
160
|
+
```yaml
|
|
161
|
+
model:
|
|
162
|
+
n_embd: 768
|
|
163
|
+
n_layer: 12
|
|
164
|
+
n_head: 12
|
|
165
|
+
n_positions: 2048
|
|
166
|
+
|
|
167
|
+
data:
|
|
168
|
+
path: ./data/tokenized
|
|
169
|
+
tokenizer: gpt2
|
|
170
|
+
format: pretokenized # pretokenized | jsonl
|
|
171
|
+
max_length: 2048
|
|
172
|
+
|
|
173
|
+
optimizer:
|
|
174
|
+
learning_rate: 3.0e-4
|
|
175
|
+
warmup_steps: 100
|
|
176
|
+
|
|
177
|
+
training:
|
|
178
|
+
per_device_batch_size: 4
|
|
179
|
+
gradient_accumulation_steps: 4
|
|
180
|
+
max_steps: 1000
|
|
181
|
+
output_dir: ./outputs
|
|
182
|
+
# wandb_project: my-project # uncomment to enable W&B
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
`gpt-simple init` writes a fully commented template. Every field is
|
|
186
|
+
documented in the [configuration reference](docs/configuration.md), and
|
|
187
|
+
curriculum learning in the [data pipeline guide](docs/data.md).
|
|
188
|
+
|
|
189
|
+
## Python API
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
import gpt_simple
|
|
193
|
+
|
|
194
|
+
result = gpt_simple.train(
|
|
195
|
+
model=gpt_simple.ModelConfig(n_embd=768, n_layer=12, n_head=12),
|
|
196
|
+
data=gpt_simple.DataConfig(path="./data/tokenized", tokenizer="gpt2"),
|
|
197
|
+
optimizer=gpt_simple.OptimizerConfig(learning_rate=3e-4),
|
|
198
|
+
training=gpt_simple.TrainingConfig(max_steps=1000, output_dir="./outputs"),
|
|
199
|
+
)
|
|
200
|
+
print(result.final_loss, result.total_tokens, result.checkpoint_path)
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
Or `gpt_simple.train(config="config.yaml")`; sub-configs passed
|
|
204
|
+
explicitly override the matching section from the file.
|
|
205
|
+
|
|
206
|
+
## Documentation
|
|
207
|
+
|
|
208
|
+
Full guides live in [`docs/`](docs/README.md):
|
|
209
|
+
|
|
210
|
+
- [Architecture](docs/architecture.md) — the built-in model.
|
|
211
|
+
- [Configuration](docs/configuration.md) — every config field.
|
|
212
|
+
- [Data pipeline](docs/data.md) — tokenization, packing, curriculum.
|
|
213
|
+
- [Training](docs/training.md) — multi-GPU, precision, compile.
|
|
214
|
+
- [Checkpointing & resume](docs/checkpointing-and-resume.md) — the
|
|
215
|
+
stop/resume model.
|
|
216
|
+
- [Orchestration](docs/orchestration.md) — running under any scheduler.
|
|
217
|
+
- [Inference](docs/inference.md) — `generate` / `batch-generate`.
|
|
218
|
+
- [Hardware tuning](docs/hardware-tuning.md) — peak GPU throughput.
|
|
219
|
+
- [Performance](docs/performance.md) — measured 2.8B throughput and MFU.
|
|
220
|
+
|
|
221
|
+
## Development
|
|
222
|
+
|
|
223
|
+
```bash
|
|
224
|
+
pip install -e ".[dev]"
|
|
225
|
+
pytest tests/
|
|
226
|
+
ruff check src/ tests/
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
## License
|
|
230
|
+
|
|
231
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
# GPT-Simple
|
|
2
|
+
|
|
3
|
+
[![CI](https://github.com/lb-off/gpt-simple/actions/workflows/ci.yml/badge.svg)](https://github.com/lb-off/gpt-simple/actions/workflows/ci.yml)
|
|
4
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
5
|
+
[License: MIT](LICENSE)
|
|
6
|
+
|
|
7
|
+
A clean, efficient framework for pretraining language models from scratch.
|
|
8
|
+
|
|
9
|
+
GPT-Simple handles the full LLM pretraining workflow — tokenization,
|
|
10
|
+
streaming data loading, multi-GPU training, checkpointing, and inference —
|
|
11
|
+
through a single YAML config and a small CLI. It ships with a modern GPT
|
|
12
|
+
architecture ready to train out of the box.
|
|
13
|
+
|
|
14
|
+
## Features
|
|
15
|
+
|
|
16
|
+
- **Single YAML config + CLI** — `init` / `tokenize` / `train` / `status`
|
|
17
|
+
/ `stop` / `validate` / `generate` / `batch-generate`.
|
|
18
|
+
- **Multi-GPU out of the box** — `--nproc_per_node N` launches `torchrun`
|
|
19
|
+
automatically (Accelerate, bf16, `torch.compile`, gradient
|
|
20
|
+
checkpointing).
|
|
21
|
+
- **Pretokenized streaming** — memory-mapped `.bin/.idx` shards with
|
|
22
|
+
sequence packing; a raw-JSONL fallback for quick experiments.
|
|
23
|
+
- **Deterministic stop/resume** — walltime- and signal-aware checkpoints
|
|
24
|
+
with topology-agnostic data cursors, so *N* short jobs equal one long
|
|
25
|
+
job (every document seen exactly once, even if `world_size` /
|
|
26
|
+
`num_workers` change between restarts).
|
|
27
|
+
- **Orchestrator-friendly** — runs under SLURM, Kubernetes, or a local
|
|
28
|
+
loop; templates in [`examples/orchestrators/`](examples/orchestrators/).
|
|
29
|
+
- **Curriculum learning** — phase-based mixing across named data buckets.
|
|
30
|
+
- **Modern architecture** — pre-norm decoder with RoPE, RMSNorm, and a
|
|
31
|
+
gated (SwiGLU) MLP; also expresses GQA/MQA, vanilla MLPs, and untied
|
|
32
|
+
heads via config.
|
|
33
|
+
- **Python API** — `import gpt_simple; gpt_simple.train(config="config.yaml")`.
|
|
34
|
+
|
|
35
|
+
## Installation
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install -e ".[dev]" # from source (development)
|
|
39
|
+
pip install . # core only
|
|
40
|
+
pip install ".[wandb]" # optional: Weights & Biases logging
|
|
41
|
+
pip install ".[cli]" # optional: rich-formatted CLI output
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Quick start
|
|
45
|
+
|
|
46
|
+
### 1. Generate a config
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
gpt-simple init -o config.yaml
|
|
50
|
+
gpt-simple init --preset small -o config.yaml # ~125M (small | medium | large)
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### 2. Pretokenize your data
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
gpt-simple tokenize \
|
|
57
|
+
--input_dir ./raw_data \
|
|
58
|
+
--output_dir ./data/tokenized \
|
|
59
|
+
--tokenizer_path gpt2 \
|
|
60
|
+
--max_length 2048 \
|
|
61
|
+
--num_workers 8
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Converts `.jsonl`/`.txt` into memory-mapped `.bin/.idx` shards. See the
|
|
65
|
+
[data pipeline guide](docs/data.md).
|
|
66
|
+
|
|
67
|
+
### 3. Train
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
gpt-simple train --config config.yaml # single GPU
|
|
71
|
+
gpt-simple train --config config.yaml --nproc_per_node 4 # 4 GPUs
|
|
72
|
+
|
|
73
|
+
# override any config value; start fresh with --force
|
|
74
|
+
gpt-simple train --config config.yaml --training.max_steps 5000 --force
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
See the [training guide](docs/training.md).
|
|
78
|
+
|
|
79
|
+
### 4. Monitor and control
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
gpt-simple status # training progress
|
|
83
|
+
gpt-simple stop # graceful shutdown (saves a checkpoint)
|
|
84
|
+
gpt-simple stop --force # immediate SIGKILL
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### 5. Generate
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
gpt-simple generate --output-dir ./outputs --prompt "Once upon a time" --max-new-tokens 200
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
`--output-dir` auto-picks the latest checkpoint. For multi-model /
|
|
94
|
+
multi-sampling batches and a `--dry-run` submission gate, use
|
|
95
|
+
`batch-generate` — see the [inference guide](docs/inference.md).
|
|
96
|
+
|
|
97
|
+
## Long runs with stop/resume
|
|
98
|
+
|
|
99
|
+
The trainer targets clusters with a hard per-job wall-clock cap. With
|
|
100
|
+
`resume: auto` (the default), re-running the same command resumes the
|
|
101
|
+
latest checkpoint, and the trainer saves and exits cleanly before a
|
|
102
|
+
walltime deadline or on `SIGTERM`/`SIGUSR1` — so an orchestrator just
|
|
103
|
+
re-queues the job.
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
gpt-simple train --config config.yaml # resume is automatic on every restart
|
|
107
|
+
gpt-simple status
|
|
108
|
+
gpt-simple stop # or let walltime/SIGUSR1 do it
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
Templates: [`slurm_resume_chain.sh`](examples/orchestrators/slurm_resume_chain.sh),
|
|
112
|
+
[`kubernetes_job.yaml`](examples/orchestrators/kubernetes_job.yaml),
|
|
113
|
+
[`local_loop.sh`](examples/orchestrators/local_loop.sh). See the
|
|
114
|
+
[checkpointing & resume](docs/checkpointing-and-resume.md) and
|
|
115
|
+
[orchestration](docs/orchestration.md) guides.
|
|
116
|
+
|
|
117
|
+
## Configuration
|
|
118
|
+
|
|
119
|
+
All settings live in one YAML file with four sections — `model`, `data`,
|
|
120
|
+
`optimizer`, `training`:
|
|
121
|
+
|
|
122
|
+
```yaml
|
|
123
|
+
model:
|
|
124
|
+
n_embd: 768
|
|
125
|
+
n_layer: 12
|
|
126
|
+
n_head: 12
|
|
127
|
+
n_positions: 2048
|
|
128
|
+
|
|
129
|
+
data:
|
|
130
|
+
path: ./data/tokenized
|
|
131
|
+
tokenizer: gpt2
|
|
132
|
+
format: pretokenized # pretokenized | jsonl
|
|
133
|
+
max_length: 2048
|
|
134
|
+
|
|
135
|
+
optimizer:
|
|
136
|
+
learning_rate: 3.0e-4
|
|
137
|
+
warmup_steps: 100
|
|
138
|
+
|
|
139
|
+
training:
|
|
140
|
+
per_device_batch_size: 4
|
|
141
|
+
gradient_accumulation_steps: 4
|
|
142
|
+
max_steps: 1000
|
|
143
|
+
output_dir: ./outputs
|
|
144
|
+
# wandb_project: my-project # uncomment to enable W&B
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
`gpt-simple init` writes a fully commented template. Every field is
|
|
148
|
+
documented in the [configuration reference](docs/configuration.md), and
|
|
149
|
+
curriculum learning in the [data pipeline guide](docs/data.md).
|
|
150
|
+
|
|
151
|
+
## Python API
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
import gpt_simple
|
|
155
|
+
|
|
156
|
+
result = gpt_simple.train(
|
|
157
|
+
model=gpt_simple.ModelConfig(n_embd=768, n_layer=12, n_head=12),
|
|
158
|
+
data=gpt_simple.DataConfig(path="./data/tokenized", tokenizer="gpt2"),
|
|
159
|
+
optimizer=gpt_simple.OptimizerConfig(learning_rate=3e-4),
|
|
160
|
+
training=gpt_simple.TrainingConfig(max_steps=1000, output_dir="./outputs"),
|
|
161
|
+
)
|
|
162
|
+
print(result.final_loss, result.total_tokens, result.checkpoint_path)
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
Or `gpt_simple.train(config="config.yaml")`; sub-configs passed
|
|
166
|
+
explicitly override the matching section from the file.
|
|
167
|
+
|
|
168
|
+
## Documentation
|
|
169
|
+
|
|
170
|
+
Full guides live in [`docs/`](docs/README.md):
|
|
171
|
+
|
|
172
|
+
- [Architecture](docs/architecture.md) — the built-in model.
|
|
173
|
+
- [Configuration](docs/configuration.md) — every config field.
|
|
174
|
+
- [Data pipeline](docs/data.md) — tokenization, packing, curriculum.
|
|
175
|
+
- [Training](docs/training.md) — multi-GPU, precision, compile.
|
|
176
|
+
- [Checkpointing & resume](docs/checkpointing-and-resume.md) — the
|
|
177
|
+
stop/resume model.
|
|
178
|
+
- [Orchestration](docs/orchestration.md) — running under any scheduler.
|
|
179
|
+
- [Inference](docs/inference.md) — `generate` / `batch-generate`.
|
|
180
|
+
- [Hardware tuning](docs/hardware-tuning.md) — peak GPU throughput.
|
|
181
|
+
- [Performance](docs/performance.md) — measured 2.8B throughput and MFU.
|
|
182
|
+
|
|
183
|
+
## Development
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
pip install -e ".[dev]"
|
|
187
|
+
pytest tests/
|
|
188
|
+
ruff check src/ tests/
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## License
|
|
192
|
+
|
|
193
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "gpt-simple-lm"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A clean, efficient framework for pretraining language models from scratch"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [{ name = "Louis Bertucci" }]
|
|
13
|
+
keywords = ["llm", "pretraining", "gpt", "pytorch", "transformer", "language-model"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Science/Research",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3.10",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
22
|
+
]
|
|
23
|
+
dependencies = [
|
|
24
|
+
"torch>=2.0.0",
|
|
25
|
+
"numpy>=1.24.0,<2.0",
|
|
26
|
+
"transformers>=4.36.0",
|
|
27
|
+
"accelerate>=0.20.0",
|
|
28
|
+
"sentencepiece>=0.1.99",
|
|
29
|
+
"pyyaml>=6.0",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.optional-dependencies]
|
|
33
|
+
wandb = ["wandb>=0.15.0"]
|
|
34
|
+
cli = ["rich>=13.0"]
|
|
35
|
+
dev = [
|
|
36
|
+
"pytest>=7.0",
|
|
37
|
+
"ruff>=0.1.0",
|
|
38
|
+
"wandb>=0.15.0",
|
|
39
|
+
"rich>=13.0",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[project.scripts]
|
|
43
|
+
gpt-simple = "gpt_simple.cli.main:main"
|
|
44
|
+
|
|
45
|
+
[project.urls]
|
|
46
|
+
Homepage = "https://github.com/lb-off/gpt-simple"
|
|
47
|
+
Documentation = "https://lb-off.github.io/gpt-simple/"
|
|
48
|
+
Repository = "https://github.com/lb-off/gpt-simple"
|
|
49
|
+
Issues = "https://github.com/lb-off/gpt-simple/issues"
|
|
50
|
+
|
|
51
|
+
[tool.setuptools.packages.find]
|
|
52
|
+
where = ["src"]
|
|
53
|
+
|
|
54
|
+
[tool.ruff]
|
|
55
|
+
line-length = 120
|
|
56
|
+
target-version = "py310"
|
|
57
|
+
|
|
58
|
+
[tool.ruff.lint.per-file-ignores]
|
|
59
|
+
# Tests prepend the project root to sys.path before importing the package,
|
|
60
|
+
# so imports legitimately follow a statement (E402).
|
|
61
|
+
"tests/*" = ["E402"]
|
|
62
|
+
|
|
63
|
+
[tool.pytest.ini_options]
|
|
64
|
+
markers = [
|
|
65
|
+
"e2e: end-to-end tests that spawn the gpt-simple CLI as a subprocess (slow, default-included)",
|
|
66
|
+
]
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""
|
|
2
|
+
gpt_simple — a clean, hackable GPT pretraining library.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
6
|
+
|
|
7
|
+
from gpt_simple.config import Config, CurriculumPhase, DataConfig, ModelConfig, OptimizerConfig, TrainingConfig
|
|
8
|
+
from gpt_simple.errors import CheckpointError, ConfigError, DataError, GptSimpleError
|
|
9
|
+
from gpt_simple.generate import generate, load_for_inference, validate_checkpoint
|
|
10
|
+
from gpt_simple.model import SimpleLLM
|
|
11
|
+
from gpt_simple.tokenizer import SimpleLLMTokenizer
|
|
12
|
+
from gpt_simple.train import TrainingResult, train
|
|
13
|
+
|
|
14
|
+
# Single source of truth is pyproject.toml; read it from the installed
|
|
15
|
+
# metadata so the two can't drift. Falls back when running from a source
|
|
16
|
+
# tree that hasn't been installed.
|
|
17
|
+
try:
|
|
18
|
+
__version__ = version("gpt-simple-lm")  # distribution name from pyproject.toml, not the import package name
|
|
19
|
+
except PackageNotFoundError: # pragma: no cover
|
|
20
|
+
__version__ = "0.0.0+unknown"
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"__version__",
|
|
24
|
+
"train",
|
|
25
|
+
"generate",
|
|
26
|
+
"load_for_inference",
|
|
27
|
+
"validate_checkpoint",
|
|
28
|
+
"Config",
|
|
29
|
+
"CurriculumPhase",
|
|
30
|
+
"TrainingResult",
|
|
31
|
+
"ModelConfig",
|
|
32
|
+
"DataConfig",
|
|
33
|
+
"OptimizerConfig",
|
|
34
|
+
"TrainingConfig",
|
|
35
|
+
"SimpleLLM",
|
|
36
|
+
"SimpleLLMTokenizer",
|
|
37
|
+
"GptSimpleError",
|
|
38
|
+
"ConfigError",
|
|
39
|
+
"DataError",
|
|
40
|
+
"CheckpointError",
|
|
41
|
+
]
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Entry point for ``python -m gpt_simple``.
|
|
2
|
+
|
|
3
|
+
Kept separate from the library modules on purpose: ``gpt_simple/__init__.py``
|
|
4
|
+
re-exports the public API (including ``train``), so launching a re-exported
|
|
5
|
+
submodule with ``-m`` would make runpy import it twice and emit a
|
|
6
|
+
RuntimeWarning. ``__main__`` is never imported by ``__init__``, so running
|
|
7
|
+
``-m gpt_simple`` loads the package once and dispatches cleanly.
|
|
8
|
+
|
|
9
|
+
The distributed launcher (``cli/train_cmd.py``) shells out to this entry point.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from gpt_simple.train import _module_main
|
|
13
|
+
|
|
14
|
+
if __name__ == "__main__":
|
|
15
|
+
_module_main()
|