gpt-simple-lm 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. gpt_simple_lm-0.1.0/LICENSE +21 -0
  2. gpt_simple_lm-0.1.0/PKG-INFO +231 -0
  3. gpt_simple_lm-0.1.0/README.md +193 -0
  4. gpt_simple_lm-0.1.0/pyproject.toml +66 -0
  5. gpt_simple_lm-0.1.0/setup.cfg +4 -0
  6. gpt_simple_lm-0.1.0/src/gpt_simple/__init__.py +41 -0
  7. gpt_simple_lm-0.1.0/src/gpt_simple/__main__.py +15 -0
  8. gpt_simple_lm-0.1.0/src/gpt_simple/_checkpoint.py +781 -0
  9. gpt_simple_lm-0.1.0/src/gpt_simple/_logging.py +69 -0
  10. gpt_simple_lm-0.1.0/src/gpt_simple/_run_state.py +115 -0
  11. gpt_simple_lm-0.1.0/src/gpt_simple/_shutdown.py +353 -0
  12. gpt_simple_lm-0.1.0/src/gpt_simple/_streaming.py +971 -0
  13. gpt_simple_lm-0.1.0/src/gpt_simple/_train_loop.py +627 -0
  14. gpt_simple_lm-0.1.0/src/gpt_simple/_train_setup.py +765 -0
  15. gpt_simple_lm-0.1.0/src/gpt_simple/cli/__init__.py +0 -0
  16. gpt_simple_lm-0.1.0/src/gpt_simple/cli/batch_generate_cmd.py +495 -0
  17. gpt_simple_lm-0.1.0/src/gpt_simple/cli/generate_cmd.py +197 -0
  18. gpt_simple_lm-0.1.0/src/gpt_simple/cli/init_cmd.py +187 -0
  19. gpt_simple_lm-0.1.0/src/gpt_simple/cli/main.py +71 -0
  20. gpt_simple_lm-0.1.0/src/gpt_simple/cli/status_cmd.py +403 -0
  21. gpt_simple_lm-0.1.0/src/gpt_simple/cli/stop_cmd.py +112 -0
  22. gpt_simple_lm-0.1.0/src/gpt_simple/cli/tokenize_cmd.py +78 -0
  23. gpt_simple_lm-0.1.0/src/gpt_simple/cli/train_cmd.py +415 -0
  24. gpt_simple_lm-0.1.0/src/gpt_simple/cli/validate_cmd.py +124 -0
  25. gpt_simple_lm-0.1.0/src/gpt_simple/config.py +485 -0
  26. gpt_simple_lm-0.1.0/src/gpt_simple/data.py +1207 -0
  27. gpt_simple_lm-0.1.0/src/gpt_simple/errors.py +26 -0
  28. gpt_simple_lm-0.1.0/src/gpt_simple/generate.py +384 -0
  29. gpt_simple_lm-0.1.0/src/gpt_simple/model.py +892 -0
  30. gpt_simple_lm-0.1.0/src/gpt_simple/pretokenize.py +383 -0
  31. gpt_simple_lm-0.1.0/src/gpt_simple/tokenizer.py +282 -0
  32. gpt_simple_lm-0.1.0/src/gpt_simple/train.py +329 -0
  33. gpt_simple_lm-0.1.0/src/gpt_simple/validate.py +1373 -0
  34. gpt_simple_lm-0.1.0/src/gpt_simple_lm.egg-info/PKG-INFO +231 -0
  35. gpt_simple_lm-0.1.0/src/gpt_simple_lm.egg-info/SOURCES.txt +49 -0
  36. gpt_simple_lm-0.1.0/src/gpt_simple_lm.egg-info/dependency_links.txt +1 -0
  37. gpt_simple_lm-0.1.0/src/gpt_simple_lm.egg-info/entry_points.txt +2 -0
  38. gpt_simple_lm-0.1.0/src/gpt_simple_lm.egg-info/requires.txt +18 -0
  39. gpt_simple_lm-0.1.0/src/gpt_simple_lm.egg-info/top_level.txt +1 -0
  40. gpt_simple_lm-0.1.0/tests/test_bucket_exhaustion.py +150 -0
  41. gpt_simple_lm-0.1.0/tests/test_checkpoint.py +563 -0
  42. gpt_simple_lm-0.1.0/tests/test_cli.py +1704 -0
  43. gpt_simple_lm-0.1.0/tests/test_data_resume.py +854 -0
  44. gpt_simple_lm-0.1.0/tests/test_dataloader_clamp.py +190 -0
  45. gpt_simple_lm-0.1.0/tests/test_distributed_invariants.py +333 -0
  46. gpt_simple_lm-0.1.0/tests/test_e2e_resume.py +478 -0
  47. gpt_simple_lm-0.1.0/tests/test_initialization.py +88 -0
  48. gpt_simple_lm-0.1.0/tests/test_model_arch.py +184 -0
  49. gpt_simple_lm-0.1.0/tests/test_pretokenized.py +802 -0
  50. gpt_simple_lm-0.1.0/tests/test_shutdown.py +367 -0
  51. gpt_simple_lm-0.1.0/tests/test_validate.py +498 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Louis Bertucci
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,231 @@
1
+ Metadata-Version: 2.4
2
+ Name: gpt-simple-lm
3
+ Version: 0.1.0
4
+ Summary: A clean, efficient framework for pretraining language models from scratch
5
+ Author: Louis Bertucci
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/lb-off/gpt-simple
8
+ Project-URL: Documentation, https://lb-off.github.io/gpt-simple/
9
+ Project-URL: Repository, https://github.com/lb-off/gpt-simple
10
+ Project-URL: Issues, https://github.com/lb-off/gpt-simple/issues
11
+ Keywords: llm,pretraining,gpt,pytorch,transformer,language-model
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: torch>=2.0.0
23
+ Requires-Dist: numpy<2.0,>=1.24.0
24
+ Requires-Dist: transformers>=4.36.0
25
+ Requires-Dist: accelerate>=0.20.0
26
+ Requires-Dist: sentencepiece>=0.1.99
27
+ Requires-Dist: pyyaml>=6.0
28
+ Provides-Extra: wandb
29
+ Requires-Dist: wandb>=0.15.0; extra == "wandb"
30
+ Provides-Extra: cli
31
+ Requires-Dist: rich>=13.0; extra == "cli"
32
+ Provides-Extra: dev
33
+ Requires-Dist: pytest>=7.0; extra == "dev"
34
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
35
+ Requires-Dist: wandb>=0.15.0; extra == "dev"
36
+ Requires-Dist: rich>=13.0; extra == "dev"
37
+ Dynamic: license-file
38
+
39
+ # GPT-Simple
40
+
41
+ [![CI](https://github.com/lb-off/gpt-simple/actions/workflows/ci.yml/badge.svg)](https://github.com/lb-off/gpt-simple/actions/workflows/ci.yml)
42
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
43
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
44
+
45
+ A clean, efficient framework for pretraining language models from scratch.
46
+
47
+ GPT-Simple handles the full LLM pretraining workflow — tokenization,
48
+ streaming data loading, multi-GPU training, checkpointing, and inference —
49
+ through a single YAML config and a small CLI. It ships with a modern GPT
50
+ architecture ready to train out of the box.
51
+
52
+ ## Features
53
+
54
+ - **Single YAML config + CLI** — `init` / `tokenize` / `train` / `status`
55
+ / `stop` / `validate` / `generate` / `batch-generate`.
56
+ - **Multi-GPU out of the box** — `--nproc_per_node N` launches `torchrun`
57
+ automatically (Accelerate, bf16, `torch.compile`, gradient
58
+ checkpointing).
59
+ - **Pretokenized streaming** — memory-mapped `.bin/.idx` shards with
60
+ sequence packing; a raw-JSONL fallback for quick experiments.
61
+ - **Deterministic stop/resume** — walltime- and signal-aware checkpoints
62
+ with topology-agnostic data cursors, so *N* short jobs equal one long
63
+ job (every document seen exactly once, even if `world_size` /
64
+ `num_workers` change between restarts).
65
+ - **Orchestrator-friendly** — runs under SLURM, Kubernetes, or a local
66
+ loop; templates in [`examples/orchestrators/`](examples/orchestrators/).
67
+ - **Curriculum learning** — phase-based mixing across named data buckets.
68
+ - **Modern architecture** — pre-norm decoder with RoPE, RMSNorm, and a
69
+ gated (SwiGLU) MLP; also expresses GQA/MQA, vanilla MLPs, and untied
70
+ heads via config.
71
+ - **Python API** — `import gpt_simple; gpt_simple.train(config="config.yaml")`.
72
+
73
+ ## Installation
74
+
75
+ ```bash
76
+ pip install -e ".[dev]" # from source (development)
77
+ pip install . # core only
78
+ pip install ".[wandb]" # optional: Weights & Biases logging
79
+ pip install ".[cli]" # optional: rich-formatted CLI output
80
+ ```
81
+
82
+ ## Quick start
83
+
84
+ ### 1. Generate a config
85
+
86
+ ```bash
87
+ gpt-simple init -o config.yaml
88
+ gpt-simple init --preset small -o config.yaml # ~125M (small | medium | large)
89
+ ```
90
+
91
+ ### 2. Pretokenize your data
92
+
93
+ ```bash
94
+ gpt-simple tokenize \
95
+ --input_dir ./raw_data \
96
+ --output_dir ./data/tokenized \
97
+ --tokenizer_path gpt2 \
98
+ --max_length 2048 \
99
+ --num_workers 8
100
+ ```
101
+
102
+ Converts `.jsonl`/`.txt` into memory-mapped `.bin/.idx` shards. See the
103
+ [data pipeline guide](docs/data.md).
104
+
105
+ ### 3. Train
106
+
107
+ ```bash
108
+ gpt-simple train --config config.yaml # single GPU
109
+ gpt-simple train --config config.yaml --nproc_per_node 4 # 4 GPUs
110
+
111
+ # override any config value; start fresh with --force
112
+ gpt-simple train --config config.yaml --training.max_steps 5000 --force
113
+ ```
114
+
115
+ See the [training guide](docs/training.md).
116
+
117
+ ### 4. Monitor and control
118
+
119
+ ```bash
120
+ gpt-simple status # training progress
121
+ gpt-simple stop # graceful shutdown (saves a checkpoint)
122
+ gpt-simple stop --force # immediate SIGKILL
123
+ ```
124
+
125
+ ### 5. Generate
126
+
127
+ ```bash
128
+ gpt-simple generate --output-dir ./outputs --prompt "Once upon a time" --max-new-tokens 200
129
+ ```
130
+
131
+ `--output-dir` auto-picks the latest checkpoint. For multi-model /
132
+ multi-sampling batches and a `--dry-run` submission gate, use
133
+ `batch-generate` — see the [inference guide](docs/inference.md).
134
+
135
+ ## Long runs with stop/resume
136
+
137
+ The trainer targets clusters with a hard per-job wall-clock cap. With
138
+ `resume: auto` (the default), re-running the same command resumes the
139
+ latest checkpoint, and the trainer saves and exits cleanly before a
140
+ walltime deadline or on `SIGTERM`/`SIGUSR1` — so an orchestrator just
141
+ re-queues the job.
142
+
143
+ ```bash
144
+ gpt-simple train --config config.yaml # resume is automatic on every restart
145
+ gpt-simple status
146
+ gpt-simple stop # or let walltime/SIGUSR1 do it
147
+ ```
148
+
149
+ Templates: [`slurm_resume_chain.sh`](examples/orchestrators/slurm_resume_chain.sh),
150
+ [`kubernetes_job.yaml`](examples/orchestrators/kubernetes_job.yaml),
151
+ [`local_loop.sh`](examples/orchestrators/local_loop.sh). See the
152
+ [checkpointing & resume](docs/checkpointing-and-resume.md) and
153
+ [orchestration](docs/orchestration.md) guides.
154
+
155
+ ## Configuration
156
+
157
+ All settings live in one YAML file with four sections — `model`, `data`,
158
+ `optimizer`, `training`:
159
+
160
+ ```yaml
161
+ model:
162
+ n_embd: 768
163
+ n_layer: 12
164
+ n_head: 12
165
+ n_positions: 2048
166
+
167
+ data:
168
+ path: ./data/tokenized
169
+ tokenizer: gpt2
170
+ format: pretokenized # pretokenized | jsonl
171
+ max_length: 2048
172
+
173
+ optimizer:
174
+ learning_rate: 3.0e-4
175
+ warmup_steps: 100
176
+
177
+ training:
178
+ per_device_batch_size: 4
179
+ gradient_accumulation_steps: 4
180
+ max_steps: 1000
181
+ output_dir: ./outputs
182
+ # wandb_project: my-project # uncomment to enable W&B
183
+ ```
184
+
185
+ `gpt-simple init` writes a fully commented template. Every field is
186
+ documented in the [configuration reference](docs/configuration.md), and
187
+ curriculum learning in the [data pipeline guide](docs/data.md).
188
+
189
+ ## Python API
190
+
191
+ ```python
192
+ import gpt_simple
193
+
194
+ result = gpt_simple.train(
195
+ model=gpt_simple.ModelConfig(n_embd=768, n_layer=12, n_head=12),
196
+ data=gpt_simple.DataConfig(path="./data/tokenized", tokenizer="gpt2"),
197
+ optimizer=gpt_simple.OptimizerConfig(learning_rate=3e-4),
198
+ training=gpt_simple.TrainingConfig(max_steps=1000, output_dir="./outputs"),
199
+ )
200
+ print(result.final_loss, result.total_tokens, result.checkpoint_path)
201
+ ```
202
+
203
+ Or `gpt_simple.train(config="config.yaml")`; sub-configs passed
204
+ explicitly override the matching section from the file.
205
+
206
+ ## Documentation
207
+
208
+ Full guides live in [`docs/`](docs/README.md):
209
+
210
+ - [Architecture](docs/architecture.md) — the built-in model.
211
+ - [Configuration](docs/configuration.md) — every config field.
212
+ - [Data pipeline](docs/data.md) — tokenization, packing, curriculum.
213
+ - [Training](docs/training.md) — multi-GPU, precision, compile.
214
+ - [Checkpointing & resume](docs/checkpointing-and-resume.md) — the
215
+ stop/resume model.
216
+ - [Orchestration](docs/orchestration.md) — running under any scheduler.
217
+ - [Inference](docs/inference.md) — `generate` / `batch-generate`.
218
+ - [Hardware tuning](docs/hardware-tuning.md) — peak GPU throughput.
219
+ - [Performance](docs/performance.md) — measured 2.8B throughput and MFU.
220
+
221
+ ## Development
222
+
223
+ ```bash
224
+ pip install -e ".[dev]"
225
+ pytest tests/
226
+ ruff check src/ tests/
227
+ ```
228
+
229
+ ## License
230
+
231
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,193 @@
1
+ # GPT-Simple
2
+
3
+ [![CI](https://github.com/lb-off/gpt-simple/actions/workflows/ci.yml/badge.svg)](https://github.com/lb-off/gpt-simple/actions/workflows/ci.yml)
4
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
6
+
7
+ A clean, efficient framework for pretraining language models from scratch.
8
+
9
+ GPT-Simple handles the full LLM pretraining workflow — tokenization,
10
+ streaming data loading, multi-GPU training, checkpointing, and inference —
11
+ through a single YAML config and a small CLI. It ships with a modern GPT
12
+ architecture ready to train out of the box.
13
+
14
+ ## Features
15
+
16
+ - **Single YAML config + CLI** — `init` / `tokenize` / `train` / `status`
17
+ / `stop` / `validate` / `generate` / `batch-generate`.
18
+ - **Multi-GPU out of the box** — `--nproc_per_node N` launches `torchrun`
19
+ automatically (Accelerate, bf16, `torch.compile`, gradient
20
+ checkpointing).
21
+ - **Pretokenized streaming** — memory-mapped `.bin/.idx` shards with
22
+ sequence packing; a raw-JSONL fallback for quick experiments.
23
+ - **Deterministic stop/resume** — walltime- and signal-aware checkpoints
24
+ with topology-agnostic data cursors, so *N* short jobs equal one long
25
+ job (every document seen exactly once, even if `world_size` /
26
+ `num_workers` change between restarts).
27
+ - **Orchestrator-friendly** — runs under SLURM, Kubernetes, or a local
28
+ loop; templates in [`examples/orchestrators/`](examples/orchestrators/).
29
+ - **Curriculum learning** — phase-based mixing across named data buckets.
30
+ - **Modern architecture** — pre-norm decoder with RoPE, RMSNorm, and a
31
+ gated (SwiGLU) MLP; also expresses GQA/MQA, vanilla MLPs, and untied
32
+ heads via config.
33
+ - **Python API** — `import gpt_simple; gpt_simple.train(config="config.yaml")`.
34
+
35
+ ## Installation
36
+
37
+ ```bash
38
+ pip install -e ".[dev]" # from source (development)
39
+ pip install . # core only
40
+ pip install ".[wandb]" # optional: Weights & Biases logging
41
+ pip install ".[cli]" # optional: rich-formatted CLI output
42
+ ```
43
+
44
+ ## Quick start
45
+
46
+ ### 1. Generate a config
47
+
48
+ ```bash
49
+ gpt-simple init -o config.yaml
50
+ gpt-simple init --preset small -o config.yaml # ~125M (small | medium | large)
51
+ ```
52
+
53
+ ### 2. Pretokenize your data
54
+
55
+ ```bash
56
+ gpt-simple tokenize \
57
+ --input_dir ./raw_data \
58
+ --output_dir ./data/tokenized \
59
+ --tokenizer_path gpt2 \
60
+ --max_length 2048 \
61
+ --num_workers 8
62
+ ```
63
+
64
+ Converts `.jsonl`/`.txt` into memory-mapped `.bin/.idx` shards. See the
65
+ [data pipeline guide](docs/data.md).
66
+
67
+ ### 3. Train
68
+
69
+ ```bash
70
+ gpt-simple train --config config.yaml # single GPU
71
+ gpt-simple train --config config.yaml --nproc_per_node 4 # 4 GPUs
72
+
73
+ # override any config value; start fresh with --force
74
+ gpt-simple train --config config.yaml --training.max_steps 5000 --force
75
+ ```
76
+
77
+ See the [training guide](docs/training.md).
78
+
79
+ ### 4. Monitor and control
80
+
81
+ ```bash
82
+ gpt-simple status # training progress
83
+ gpt-simple stop # graceful shutdown (saves a checkpoint)
84
+ gpt-simple stop --force # immediate SIGKILL
85
+ ```
86
+
87
+ ### 5. Generate
88
+
89
+ ```bash
90
+ gpt-simple generate --output-dir ./outputs --prompt "Once upon a time" --max-new-tokens 200
91
+ ```
92
+
93
+ `--output-dir` auto-picks the latest checkpoint. For multi-model /
94
+ multi-sampling batches and a `--dry-run` submission gate, use
95
+ `batch-generate` — see the [inference guide](docs/inference.md).
96
+
97
+ ## Long runs with stop/resume
98
+
99
+ The trainer targets clusters with a hard per-job wall-clock cap. With
100
+ `resume: auto` (the default), re-running the same command resumes the
101
+ latest checkpoint, and the trainer saves and exits cleanly before a
102
+ walltime deadline or on `SIGTERM`/`SIGUSR1` — so an orchestrator just
103
+ re-queues the job.
104
+
105
+ ```bash
106
+ gpt-simple train --config config.yaml # resume is automatic on every restart
107
+ gpt-simple status
108
+ gpt-simple stop # or let walltime/SIGUSR1 do it
109
+ ```
110
+
111
+ Templates: [`slurm_resume_chain.sh`](examples/orchestrators/slurm_resume_chain.sh),
112
+ [`kubernetes_job.yaml`](examples/orchestrators/kubernetes_job.yaml),
113
+ [`local_loop.sh`](examples/orchestrators/local_loop.sh). See the
114
+ [checkpointing & resume](docs/checkpointing-and-resume.md) and
115
+ [orchestration](docs/orchestration.md) guides.
116
+
117
+ ## Configuration
118
+
119
+ All settings live in one YAML file with four sections — `model`, `data`,
120
+ `optimizer`, `training`:
121
+
122
+ ```yaml
123
+ model:
124
+ n_embd: 768
125
+ n_layer: 12
126
+ n_head: 12
127
+ n_positions: 2048
128
+
129
+ data:
130
+ path: ./data/tokenized
131
+ tokenizer: gpt2
132
+ format: pretokenized # pretokenized | jsonl
133
+ max_length: 2048
134
+
135
+ optimizer:
136
+ learning_rate: 3.0e-4
137
+ warmup_steps: 100
138
+
139
+ training:
140
+ per_device_batch_size: 4
141
+ gradient_accumulation_steps: 4
142
+ max_steps: 1000
143
+ output_dir: ./outputs
144
+ # wandb_project: my-project # uncomment to enable W&B
145
+ ```
146
+
147
+ `gpt-simple init` writes a fully commented template. Every field is
148
+ documented in the [configuration reference](docs/configuration.md), and
149
+ curriculum learning in the [data pipeline guide](docs/data.md).
150
+
151
+ ## Python API
152
+
153
+ ```python
154
+ import gpt_simple
155
+
156
+ result = gpt_simple.train(
157
+ model=gpt_simple.ModelConfig(n_embd=768, n_layer=12, n_head=12),
158
+ data=gpt_simple.DataConfig(path="./data/tokenized", tokenizer="gpt2"),
159
+ optimizer=gpt_simple.OptimizerConfig(learning_rate=3e-4),
160
+ training=gpt_simple.TrainingConfig(max_steps=1000, output_dir="./outputs"),
161
+ )
162
+ print(result.final_loss, result.total_tokens, result.checkpoint_path)
163
+ ```
164
+
165
+ Or `gpt_simple.train(config="config.yaml")`; sub-configs passed
166
+ explicitly override the matching section from the file.
167
+
168
+ ## Documentation
169
+
170
+ Full guides live in [`docs/`](docs/README.md):
171
+
172
+ - [Architecture](docs/architecture.md) — the built-in model.
173
+ - [Configuration](docs/configuration.md) — every config field.
174
+ - [Data pipeline](docs/data.md) — tokenization, packing, curriculum.
175
+ - [Training](docs/training.md) — multi-GPU, precision, compile.
176
+ - [Checkpointing & resume](docs/checkpointing-and-resume.md) — the
177
+ stop/resume model.
178
+ - [Orchestration](docs/orchestration.md) — running under any scheduler.
179
+ - [Inference](docs/inference.md) — `generate` / `batch-generate`.
180
+ - [Hardware tuning](docs/hardware-tuning.md) — peak GPU throughput.
181
+ - [Performance](docs/performance.md) — measured 2.8B throughput and MFU.
182
+
183
+ ## Development
184
+
185
+ ```bash
186
+ pip install -e ".[dev]"
187
+ pytest tests/
188
+ ruff check src/ tests/
189
+ ```
190
+
191
+ ## License
192
+
193
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,66 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "gpt-simple-lm"
7
+ version = "0.1.0"
8
+ description = "A clean, efficient framework for pretraining language models from scratch"
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ requires-python = ">=3.10"
12
+ authors = [{ name = "Louis Bertucci" }]
13
+ keywords = ["llm", "pretraining", "gpt", "pytorch", "transformer", "language-model"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Intended Audience :: Science/Research",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Programming Language :: Python :: 3.10",
19
+ "Programming Language :: Python :: 3.11",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
22
+ ]
23
+ dependencies = [
24
+ "torch>=2.0.0",
25
+ "numpy>=1.24.0,<2.0",
26
+ "transformers>=4.36.0",
27
+ "accelerate>=0.20.0",
28
+ "sentencepiece>=0.1.99",
29
+ "pyyaml>=6.0",
30
+ ]
31
+
32
+ [project.optional-dependencies]
33
+ wandb = ["wandb>=0.15.0"]
34
+ cli = ["rich>=13.0"]
35
+ dev = [
36
+ "pytest>=7.0",
37
+ "ruff>=0.1.0",
38
+ "wandb>=0.15.0",
39
+ "rich>=13.0",
40
+ ]
41
+
42
+ [project.scripts]
43
+ gpt-simple = "gpt_simple.cli.main:main"
44
+
45
+ [project.urls]
46
+ Homepage = "https://github.com/lb-off/gpt-simple"
47
+ Documentation = "https://lb-off.github.io/gpt-simple/"
48
+ Repository = "https://github.com/lb-off/gpt-simple"
49
+ Issues = "https://github.com/lb-off/gpt-simple/issues"
50
+
51
+ [tool.setuptools.packages.find]
52
+ where = ["src"]
53
+
54
+ [tool.ruff]
55
+ line-length = 120
56
+ target-version = "py310"
57
+
58
+ [tool.ruff.lint.per-file-ignores]
59
+ # Tests prepend the project root to sys.path before importing the package,
60
+ # so imports legitimately follow a statement (E402).
61
+ "tests/*" = ["E402"]
62
+
63
+ [tool.pytest.ini_options]
64
+ markers = [
65
+ "e2e: end-to-end tests that spawn the gpt-simple CLI as a subprocess (slow, default-included)",
66
+ ]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,41 @@
1
+ """
2
+ gpt_simple — a clean, hackable GPT pretraining library.
3
+ """
4
+
5
+ from importlib.metadata import PackageNotFoundError, version
6
+
7
+ from gpt_simple.config import Config, CurriculumPhase, DataConfig, ModelConfig, OptimizerConfig, TrainingConfig
8
+ from gpt_simple.errors import CheckpointError, ConfigError, DataError, GptSimpleError
9
+ from gpt_simple.generate import generate, load_for_inference, validate_checkpoint
10
+ from gpt_simple.model import SimpleLLM
11
+ from gpt_simple.tokenizer import SimpleLLMTokenizer
12
+ from gpt_simple.train import TrainingResult, train
13
+
14
+ # Single source of truth is pyproject.toml; read it from the installed
15
+ # metadata so the two can't drift. Falls back when running from a source
16
+ # tree that hasn't been installed.
17
+ try:
18
+ __version__ = version("gpt-simple")
19
+ except PackageNotFoundError: # pragma: no cover
20
+ __version__ = "0.0.0+unknown"
21
+
22
+ __all__ = [ # names re-exported as the package's public API
23
+ "__version__", # set at import time from the installed distribution metadata
24
+ "train", # from gpt_simple.train
25
+ "generate", # from gpt_simple.generate
26
+ "load_for_inference", # from gpt_simple.generate
27
+ "validate_checkpoint", # from gpt_simple.generate
28
+ "Config", # from gpt_simple.config
29
+ "CurriculumPhase", # from gpt_simple.config
30
+ "TrainingResult", # from gpt_simple.train
31
+ "ModelConfig", # from gpt_simple.config
32
+ "DataConfig", # from gpt_simple.config
33
+ "OptimizerConfig", # from gpt_simple.config
34
+ "TrainingConfig", # from gpt_simple.config
35
+ "SimpleLLM", # from gpt_simple.model
36
+ "SimpleLLMTokenizer", # from gpt_simple.tokenizer
37
+ "GptSimpleError", # from gpt_simple.errors
38
+ "ConfigError", # from gpt_simple.errors
39
+ "DataError", # from gpt_simple.errors
40
+ "CheckpointError", # from gpt_simple.errors
41
+ ]
@@ -0,0 +1,15 @@
1
+ """Entry point for ``python -m gpt_simple``.
2
+
3
+ Kept separate from the library modules on purpose: ``gpt_simple/__init__.py``
4
+ re-exports the public API (including ``train``), so launching a re-exported
5
+ submodule with ``-m`` would make runpy import it twice and emit a
6
+ RuntimeWarning. ``__main__`` is never imported by ``__init__``, so running
7
+ ``-m gpt_simple`` loads the package once and dispatches cleanly.
8
+
9
+ The distributed launcher (``cli/train_cmd.py``) shells out to this entry point.
10
+ """
11
+
12
+ from gpt_simple.train import _module_main
13
+
14
+ if __name__ == "__main__": # dispatch only when executed (e.g. ``python -m gpt_simple``), never on plain import
15
+ _module_main()