olaverse-foundry 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- olaverse_foundry-0.1.0/PKG-INFO +282 -0
- olaverse_foundry-0.1.0/README.md +236 -0
- olaverse_foundry-0.1.0/foundry/__init__.py +70 -0
- olaverse_foundry-0.1.0/foundry/backends.py +100 -0
- olaverse_foundry-0.1.0/foundry/cli.py +238 -0
- olaverse_foundry-0.1.0/foundry/contracts/__init__.py +8 -0
- olaverse_foundry-0.1.0/foundry/contracts/protocols.py +168 -0
- olaverse_foundry-0.1.0/foundry/data/__init__.py +3 -0
- olaverse_foundry-0.1.0/foundry/data/pipeline.py +247 -0
- olaverse_foundry-0.1.0/foundry/fusion/__init__.py +15 -0
- olaverse_foundry-0.1.0/foundry/fusion/align.py +157 -0
- olaverse_foundry-0.1.0/foundry/fusion/kernel.py +112 -0
- olaverse_foundry-0.1.0/foundry/fusion/strategies.py +88 -0
- olaverse_foundry-0.1.0/foundry/fusion/vocab_map.py +223 -0
- olaverse_foundry-0.1.0/foundry/growth/__init__.py +23 -0
- olaverse_foundry-0.1.0/foundry/growth/mergekit_backend.py +214 -0
- olaverse_foundry-0.1.0/foundry/growth/planner.py +155 -0
- olaverse_foundry-0.1.0/foundry/io/__init__.py +7 -0
- olaverse_foundry-0.1.0/foundry/io/loader.py +125 -0
- olaverse_foundry-0.1.0/foundry/io/seed.py +163 -0
- olaverse_foundry-0.1.0/foundry/recipes/__init__.py +19 -0
- olaverse_foundry-0.1.0/foundry/recipes/recipe.py +227 -0
- olaverse_foundry-0.1.0/foundry/recipes/schema.py +200 -0
- olaverse_foundry-0.1.0/foundry/skillpacks/__init__.py +10 -0
- olaverse_foundry-0.1.0/foundry/skillpacks/pack.py +160 -0
- olaverse_foundry-0.1.0/foundry/skillpacks/peft_bridge.py +248 -0
- olaverse_foundry-0.1.0/foundry/teachers/__init__.py +4 -0
- olaverse_foundry-0.1.0/foundry/teachers/cache.py +195 -0
- olaverse_foundry-0.1.0/foundry/teachers/registry.py +222 -0
- olaverse_foundry-0.1.0/foundry/training/__init__.py +13 -0
- olaverse_foundry-0.1.0/foundry/training/_logger.py +84 -0
- olaverse_foundry-0.1.0/foundry/training/_scheduler.py +52 -0
- olaverse_foundry-0.1.0/foundry/training/accelerate_distill.py +448 -0
- olaverse_foundry-0.1.0/foundry/training/distill.py +158 -0
- olaverse_foundry-0.1.0/foundry/training/embed_distill.py +561 -0
- olaverse_foundry-0.1.0/foundry/training/torch_distill.py +432 -0
- olaverse_foundry-0.1.0/olaverse_foundry.egg-info/PKG-INFO +282 -0
- olaverse_foundry-0.1.0/olaverse_foundry.egg-info/SOURCES.txt +49 -0
- olaverse_foundry-0.1.0/olaverse_foundry.egg-info/dependency_links.txt +1 -0
- olaverse_foundry-0.1.0/olaverse_foundry.egg-info/entry_points.txt +2 -0
- olaverse_foundry-0.1.0/olaverse_foundry.egg-info/requires.txt +32 -0
- olaverse_foundry-0.1.0/olaverse_foundry.egg-info/top_level.txt +1 -0
- olaverse_foundry-0.1.0/pyproject.toml +100 -0
- olaverse_foundry-0.1.0/setup.cfg +4 -0
- olaverse_foundry-0.1.0/tests/test_foundry.py +224 -0
- olaverse_foundry-0.1.0/tests/test_m1.py +258 -0
- olaverse_foundry-0.1.0/tests/test_m2.py +351 -0
- olaverse_foundry-0.1.0/tests/test_m3.py +308 -0
- olaverse_foundry-0.1.0/tests/test_m4.py +332 -0
- olaverse_foundry-0.1.0/tests/test_m5.py +306 -0
- olaverse_foundry-0.1.0/tests/test_prod.py +710 -0
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: olaverse-foundry
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A toolkit for building model families — seed, grow, fuse, freeze, extend.
|
|
5
|
+
Author-email: Olaverse Labs <hello@olaverse.co.uk>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://olaverse.co.uk
|
|
8
|
+
Project-URL: Repository, https://github.com/Olaverse-Labs/olaverse-foundry
|
|
9
|
+
Project-URL: Issues, https://github.com/Olaverse-Labs/olaverse-foundry/issues
|
|
10
|
+
Keywords: llm,distillation,model-merging,lora,skill-packs,african-nlp,olaverse,foundry
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
Requires-Dist: numpy>=1.24.0
|
|
23
|
+
Requires-Dist: pydantic>=2.0.0
|
|
24
|
+
Requires-Dist: pyyaml>=6.0
|
|
25
|
+
Provides-Extra: torch
|
|
26
|
+
Requires-Dist: torch>=2.0.0; extra == "torch"
|
|
27
|
+
Requires-Dist: transformers>=4.40.0; extra == "torch"
|
|
28
|
+
Requires-Dist: huggingface_hub>=0.20.0; extra == "torch"
|
|
29
|
+
Requires-Dist: safetensors>=0.4.0; extra == "torch"
|
|
30
|
+
Requires-Dist: accelerate>=0.27.0; extra == "torch"
|
|
31
|
+
Provides-Extra: lego
|
|
32
|
+
Requires-Dist: peft>=0.10.0; extra == "lego"
|
|
33
|
+
Provides-Extra: merge
|
|
34
|
+
Requires-Dist: mergekit; extra == "merge"
|
|
35
|
+
Provides-Extra: data
|
|
36
|
+
Requires-Dist: datasets>=2.18.0; extra == "data"
|
|
37
|
+
Provides-Extra: align
|
|
38
|
+
Requires-Dist: rapidfuzz>=3.0.0; extra == "align"
|
|
39
|
+
Provides-Extra: logging
|
|
40
|
+
Requires-Dist: wandb>=0.16.0; extra == "logging"
|
|
41
|
+
Provides-Extra: all
|
|
42
|
+
Requires-Dist: olaverse-foundry[align,data,lego,logging,merge,torch]; extra == "all"
|
|
43
|
+
Provides-Extra: dev
|
|
44
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
45
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
46
|
+
|
|
47
|
+
# olaverse-foundry
|
|
48
|
+
|
|
49
|
+
**Build model families from a single pretrained seed.**
|
|
50
|
+
|
|
51
|
+
`olaverse-foundry` is the training and model-factory layer of the Olaverse ecosystem. Where `olaverse` gives you ready-to-use NLP models, `foundry` lets you build new ones — distilling, growing, fusing, and adapting them for production.
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
seed → grow → distil / fuse → freeze → skill packs
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
## Install
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
# Core (schema validation, growth planning — no GPU required)
|
|
63
|
+
pip install olaverse-foundry
|
|
64
|
+
|
|
65
|
+
# GPU training
|
|
66
|
+
pip install olaverse-foundry[torch]
|
|
67
|
+
|
|
68
|
+
# LoRA skill packs
|
|
69
|
+
pip install olaverse-foundry[torch,lego]
|
|
70
|
+
|
|
71
|
+
# Everything
|
|
72
|
+
pip install olaverse-foundry[all]
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Quick start — embedding distillation (200M student)
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from foundry import DataPipeline, EmbeddingDistillTrainer, EmbeddingDistillConfig
|
|
81
|
+
from transformers import AutoModel, AutoTokenizer
|
|
82
|
+
|
|
83
|
+
# Load student and teacher
|
|
84
|
+
student = AutoModel.from_pretrained("microsoft/deberta-v3-base")
|
|
85
|
+
teacher = AutoModel.from_pretrained("BAAI/bge-large-en-v1.5")
|
|
86
|
+
tok = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
|
|
87
|
+
|
|
88
|
+
# Stream data
|
|
89
|
+
pipe = DataPipeline(
|
|
90
|
+
source = my_hf_dataset,
|
|
91
|
+
tokenizer = tok,
|
|
92
|
+
batch_size = 32,
|
|
93
|
+
max_length = 128,
|
|
94
|
+
mode = "embed",
|
|
95
|
+
shuffle_buffer = 10_000,
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
# Train
|
|
99
|
+
trainer = EmbeddingDistillTrainer(
|
|
100
|
+
student = student,
|
|
101
|
+
teacher = teacher,
|
|
102
|
+
config = EmbeddingDistillConfig(
|
|
103
|
+
loss = "cosine",
|
|
104
|
+
pool = "mean",
|
|
105
|
+
epochs = 3,
|
|
106
|
+
lr_scheduler = "cosine",
|
|
107
|
+
warmup_steps = 200,
|
|
108
|
+
torch_dtype = "bfloat16",
|
|
109
|
+
save_every = 1000,
|
|
110
|
+
save_dir = "/checkpoints/embed-200m",
|
|
111
|
+
log_backend = "wandb",
|
|
112
|
+
),
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
result = trainer.train(pipe, eval_dataset=eval_pipe)
|
|
116
|
+
print(result["eval_losses"])
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## Quick start — causal LM distillation with multiple teachers
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from foundry import (
|
|
125
|
+
DataPipeline, TorchDistillTrainer, TorchTrainConfig,
|
|
126
|
+
TeacherRegistry, FoundryRecipe,
|
|
127
|
+
)
|
|
128
|
+
|
|
129
|
+
# Build a registry of teachers
|
|
130
|
+
teachers = TeacherRegistry.from_names(
|
|
131
|
+
["meta-llama/Llama-3.1-70B", "Qwen/Qwen2-72B-Instruct"],
|
|
132
|
+
weights=[1.0, 0.8],
|
|
133
|
+
)
|
|
134
|
+
teachers.load_all()
|
|
135
|
+
|
|
136
|
+
# Stream training data
|
|
137
|
+
pipe = DataPipeline(
|
|
138
|
+
source = my_dataset,
|
|
139
|
+
tokenizer = tok,
|
|
140
|
+
batch_size = 8,
|
|
141
|
+
max_length = 2048,
|
|
142
|
+
mode = "lm",
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
trainer = TorchDistillTrainer(
|
|
146
|
+
student = my_3b_model,
|
|
147
|
+
teachers = teachers,
|
|
148
|
+
config = TorchTrainConfig(
|
|
149
|
+
epochs = 1,
|
|
150
|
+
lr_scheduler = "cosine",
|
|
151
|
+
warmup_steps = 500,
|
|
152
|
+
torch_dtype = "bfloat16",
|
|
153
|
+
grad_accumulation_steps = 8,
|
|
154
|
+
save_every = 500,
|
|
155
|
+
save_dir = "/checkpoints/run1",
|
|
156
|
+
eval_every = 100,
|
|
157
|
+
log_backend = "wandb",
|
|
158
|
+
),
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
result = trainer.train(pipe, eval_dataset=eval_pipe)
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
---
|
|
165
|
+
|
|
166
|
+
## Key components
|
|
167
|
+
|
|
168
|
+
| Module | What it does |
|
|
169
|
+
|---|---|
|
|
170
|
+
| `DataPipeline` | Converts HF datasets, string lists, or numpy arrays into trainer-ready batches. Supports streaming and reservoir shuffle. |
|
|
171
|
+
| `TorchDistillTrainer` | Single-GPU distillation: CE + KL loss against one or more teachers. |
|
|
172
|
+
| `CachedDistillTrainer` | Like `TorchDistillTrainer` but caches teacher logits on disk after the first pass. Subsequent epochs are free. Supports `accelerate` for multi-GPU. |
|
|
173
|
+
| `EmbeddingDistillTrainer` | MSE / cosine loss on pooled sentence vectors. Use for bi-encoder / reranker distillation. |
|
|
174
|
+
| `TeacherRegistry` | Pool of HF teacher models with relative weights. Handles `AutoModelForCausalLM` and `AutoModel` (encoders). |
|
|
175
|
+
| `LogitCache` | In-memory + on-disk cache for top-k teacher logit distributions. |
|
|
176
|
+
| `GrowthPlan` / `plan_growth` | Depth up-scaling via SOLAR-style layer duplication. Generates mergekit-compatible YAML. |
|
|
177
|
+
| `SkillPack` / `SkillRegistry` | Detachable LoRA adapters bound to a specific base model hash. |
|
|
178
|
+
| `save_as_peft` / `load_from_peft` | PEFT-format adapter round-trip (no peft library required). |
|
|
179
|
+
| `MinEDAlignment` | Cross-tokenizer vocabulary alignment via edit distance. |
|
|
180
|
+
| `DataPipeline` | Unified dataset adapter — HF datasets, streaming, raw text, numpy. |
|
|
181
|
+
| `FoundryRecipe` / `EmbedRecipe` | Pydantic-validated YAML recipes — fail fast before GPU spend. |
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## Training features
|
|
186
|
+
|
|
187
|
+
All trainers share the same production-ready feature set:
|
|
188
|
+
|
|
189
|
+
- **Mixed precision** — `torch_dtype="bfloat16"` or `"float16"`
|
|
190
|
+
- **Gradient accumulation** — `grad_accumulation_steps=N`
|
|
191
|
+
- **LR scheduler** — `"cosine"` / `"linear"` / `"constant"` with linear warmup
|
|
192
|
+
- **Reproducibility** — `seed=42` sets torch + numpy + random before training
|
|
193
|
+
- **Checkpointing** — `save_checkpoint(path)` / `resume_from_checkpoint(path)`
|
|
194
|
+
- **Auto-checkpoint** — `save_every=N, save_dir="/path"` saves every N steps
|
|
195
|
+
- **Eval loop** — `eval_every=N` evaluates on a held-out set every N steps
|
|
196
|
+
- **W&B / TensorBoard** — `log_backend="wandb"` or `"tensorboard"`
|
|
197
|
+
- **OOM handling** — CUDA OOM raises with actionable suggestions
|
|
198
|
+
- **Streaming datasets** — `DataPipeline` wraps any HF `IterableDataset`
|
|
199
|
+
- **Dataset shuffling** — `shuffle=True` or `shuffle_buffer=N` for streaming
|
|
200
|
+
|
|
201
|
+
---
|
|
202
|
+
|
|
203
|
+
## CLI
|
|
204
|
+
|
|
205
|
+
```bash
|
|
206
|
+
# Check your environment
|
|
207
|
+
foundry doctor
|
|
208
|
+
|
|
209
|
+
# Preview a recipe plan (no GPU spend)
|
|
210
|
+
foundry plan recipe.yaml
|
|
211
|
+
|
|
212
|
+
# Run a recipe
|
|
213
|
+
foundry run recipe.yaml
|
|
214
|
+
|
|
215
|
+
# Run an embedding distillation recipe
|
|
216
|
+
foundry embed recipe.yaml
|
|
217
|
+
|
|
218
|
+
# List fusion strategies
|
|
219
|
+
foundry strategies
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
---
|
|
223
|
+
|
|
224
|
+
## Recipe YAML
|
|
225
|
+
|
|
226
|
+
```yaml
|
|
227
|
+
# recipe.yaml — full causal-LM factory
|
|
228
|
+
seed:
|
|
229
|
+
model: meta-llama/Llama-3.1-8B
|
|
230
|
+
init: pretrained
|
|
231
|
+
|
|
232
|
+
grow:
|
|
233
|
+
method: depth_upscale
|
|
234
|
+
to_params: 15B
|
|
235
|
+
|
|
236
|
+
teachers:
|
|
237
|
+
- role: reasoning
|
|
238
|
+
model: meta-llama/Llama-3.1-70B
|
|
239
|
+
weight: 1.0
|
|
240
|
+
|
|
241
|
+
fusion:
|
|
242
|
+
strategy: min_ce
|
|
243
|
+
align: min_ed
|
|
244
|
+
cache: topk_64
|
|
245
|
+
|
|
246
|
+
heal:
|
|
247
|
+
tokens: 100B
|
|
248
|
+
alpha: 0.3
|
|
249
|
+
|
|
250
|
+
output:
|
|
251
|
+
freeze_base: true
|
|
252
|
+
skillpacks: [ola_math, ola_code]
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
---
|
|
256
|
+
|
|
257
|
+
## Optional extras
|
|
258
|
+
|
|
259
|
+
| Extra | Installs | When to use |
|
|
260
|
+
|---|---|---|
|
|
261
|
+
| `[torch]` | torch, transformers, safetensors, accelerate | Real training |
|
|
262
|
+
| `[lego]` | peft | LoRA skill packs |
|
|
263
|
+
| `[merge]` | mergekit | SOLAR depth up-scaling |
|
|
264
|
+
| `[data]` | datasets | HuggingFace dataset streaming |
|
|
265
|
+
| `[align]` | rapidfuzz | Fast cross-tokenizer alignment (100× speedup) |
|
|
266
|
+
| `[logging]` | wandb | Experiment tracking |
|
|
267
|
+
| `[all]` | everything | Full setup |
|
|
268
|
+
|
|
269
|
+
---
|
|
270
|
+
|
|
271
|
+
## Links
|
|
272
|
+
|
|
273
|
+
- **Main SDK** — [olaverse](https://pypi.org/project/olaverse/) — ready-to-use African NLP models
|
|
274
|
+
- **Homepage** — [olaverse.co.uk](https://olaverse.co.uk)
|
|
275
|
+
- **GitHub** — [Olaverse-Labs/olaverse-foundry](https://github.com/Olaverse-Labs/olaverse-foundry)
|
|
276
|
+
- **Issues** — [GitHub Issues](https://github.com/Olaverse-Labs/olaverse-foundry/issues)
|
|
277
|
+
|
|
278
|
+
---
|
|
279
|
+
|
|
280
|
+
## License
|
|
281
|
+
|
|
282
|
+
Apache 2.0 — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
# olaverse-foundry
|
|
2
|
+
|
|
3
|
+
**Build model families from a single pretrained seed.**
|
|
4
|
+
|
|
5
|
+
`olaverse-foundry` is the training and model-factory layer of the Olaverse ecosystem. Where `olaverse` gives you ready-to-use NLP models, `foundry` lets you build new ones — distilling, growing, fusing, and adapting them for production.
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
seed → grow → distil / fuse → freeze → skill packs
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Install
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
# Core (schema validation, growth planning — no GPU required)
|
|
17
|
+
pip install olaverse-foundry
|
|
18
|
+
|
|
19
|
+
# GPU training
|
|
20
|
+
pip install olaverse-foundry[torch]
|
|
21
|
+
|
|
22
|
+
# LoRA skill packs
|
|
23
|
+
pip install olaverse-foundry[torch,lego]
|
|
24
|
+
|
|
25
|
+
# Everything
|
|
26
|
+
pip install olaverse-foundry[all]
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## Quick start — embedding distillation (200M student)
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
from foundry import DataPipeline, EmbeddingDistillTrainer, EmbeddingDistillConfig
|
|
35
|
+
from transformers import AutoModel, AutoTokenizer
|
|
36
|
+
|
|
37
|
+
# Load student and teacher
|
|
38
|
+
student = AutoModel.from_pretrained("microsoft/deberta-v3-base")
|
|
39
|
+
teacher = AutoModel.from_pretrained("BAAI/bge-large-en-v1.5")
|
|
40
|
+
tok = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
|
|
41
|
+
|
|
42
|
+
# Stream data
|
|
43
|
+
pipe = DataPipeline(
|
|
44
|
+
source = my_hf_dataset,
|
|
45
|
+
tokenizer = tok,
|
|
46
|
+
batch_size = 32,
|
|
47
|
+
max_length = 128,
|
|
48
|
+
mode = "embed",
|
|
49
|
+
shuffle_buffer = 10_000,
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
# Train
|
|
53
|
+
trainer = EmbeddingDistillTrainer(
|
|
54
|
+
student = student,
|
|
55
|
+
teacher = teacher,
|
|
56
|
+
config = EmbeddingDistillConfig(
|
|
57
|
+
loss = "cosine",
|
|
58
|
+
pool = "mean",
|
|
59
|
+
epochs = 3,
|
|
60
|
+
lr_scheduler = "cosine",
|
|
61
|
+
warmup_steps = 200,
|
|
62
|
+
torch_dtype = "bfloat16",
|
|
63
|
+
save_every = 1000,
|
|
64
|
+
save_dir = "/checkpoints/embed-200m",
|
|
65
|
+
log_backend = "wandb",
|
|
66
|
+
),
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
result = trainer.train(pipe, eval_dataset=eval_pipe)
|
|
70
|
+
print(result["eval_losses"])
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## Quick start — causal LM distillation with multiple teachers
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from foundry import (
|
|
79
|
+
DataPipeline, TorchDistillTrainer, TorchTrainConfig,
|
|
80
|
+
TeacherRegistry, FoundryRecipe,
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
# Build a registry of teachers
|
|
84
|
+
teachers = TeacherRegistry.from_names(
|
|
85
|
+
["meta-llama/Llama-3.1-70B", "Qwen/Qwen2-72B-Instruct"],
|
|
86
|
+
weights=[1.0, 0.8],
|
|
87
|
+
)
|
|
88
|
+
teachers.load_all()
|
|
89
|
+
|
|
90
|
+
# Stream training data
|
|
91
|
+
pipe = DataPipeline(
|
|
92
|
+
source = my_dataset,
|
|
93
|
+
tokenizer = tok,
|
|
94
|
+
batch_size = 8,
|
|
95
|
+
max_length = 2048,
|
|
96
|
+
mode = "lm",
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
trainer = TorchDistillTrainer(
|
|
100
|
+
student = my_3b_model,
|
|
101
|
+
teachers = teachers,
|
|
102
|
+
config = TorchTrainConfig(
|
|
103
|
+
epochs = 1,
|
|
104
|
+
lr_scheduler = "cosine",
|
|
105
|
+
warmup_steps = 500,
|
|
106
|
+
torch_dtype = "bfloat16",
|
|
107
|
+
grad_accumulation_steps = 8,
|
|
108
|
+
save_every = 500,
|
|
109
|
+
save_dir = "/checkpoints/run1",
|
|
110
|
+
eval_every = 100,
|
|
111
|
+
log_backend = "wandb",
|
|
112
|
+
),
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
result = trainer.train(pipe, eval_dataset=eval_pipe)
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## Key components
|
|
121
|
+
|
|
122
|
+
| Module | What it does |
|
|
123
|
+
|---|---|
|
|
124
|
+
| `DataPipeline` | Converts HF datasets, string lists, or numpy arrays into trainer-ready batches. Supports streaming and reservoir shuffle. |
|
|
125
|
+
| `TorchDistillTrainer` | Single-GPU distillation: CE + KL loss against one or more teachers. |
|
|
126
|
+
| `CachedDistillTrainer` | Like `TorchDistillTrainer` but caches teacher logits on disk after the first pass, so subsequent epochs reuse the cached logits instead of re-running the teachers. Supports `accelerate` for multi-GPU. |
|
|
127
|
+
| `EmbeddingDistillTrainer` | MSE / cosine loss on pooled sentence vectors. Use for bi-encoder / reranker distillation. |
|
|
128
|
+
| `TeacherRegistry` | Pool of HF teacher models with relative weights. Handles `AutoModelForCausalLM` and `AutoModel` (encoders). |
|
|
129
|
+
| `LogitCache` | In-memory + on-disk cache for top-k teacher logit distributions. |
|
|
130
|
+
| `GrowthPlan` / `plan_growth` | Depth up-scaling via SOLAR-style layer duplication. Generates mergekit-compatible YAML. |
|
|
131
|
+
| `SkillPack` / `SkillRegistry` | Detachable LoRA adapters bound to a specific base model hash. |
|
|
132
|
+
| `save_as_peft` / `load_from_peft` | PEFT-format adapter round-trip (no peft library required). |
|
|
133
|
+
| `MinEDAlignment` | Cross-tokenizer vocabulary alignment via edit distance. |
|
|
134
|
+
| `DataPipeline` | Unified dataset adapter — HF datasets, streaming, raw text, numpy. |
|
|
135
|
+
| `FoundryRecipe` / `EmbedRecipe` | Pydantic-validated YAML recipes — fail fast before GPU spend. |
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## Training features
|
|
140
|
+
|
|
141
|
+
All trainers share the same production-ready feature set:
|
|
142
|
+
|
|
143
|
+
- **Mixed precision** — `torch_dtype="bfloat16"` or `"float16"`
|
|
144
|
+
- **Gradient accumulation** — `grad_accumulation_steps=N`
|
|
145
|
+
- **LR scheduler** — `"cosine"` / `"linear"` / `"constant"` with linear warmup
|
|
146
|
+
- **Reproducibility** — `seed=42` sets torch + numpy + random before training
|
|
147
|
+
- **Checkpointing** — `save_checkpoint(path)` / `resume_from_checkpoint(path)`
|
|
148
|
+
- **Auto-checkpoint** — `save_every=N, save_dir="/path"` saves every N steps
|
|
149
|
+
- **Eval loop** — `eval_every=N` evaluates on a held-out set every N steps
|
|
150
|
+
- **W&B / TensorBoard** — `log_backend="wandb"` or `"tensorboard"`
|
|
151
|
+
- **OOM handling** — CUDA OOM raises with actionable suggestions
|
|
152
|
+
- **Streaming datasets** — `DataPipeline` wraps any HF `IterableDataset`
|
|
153
|
+
- **Dataset shuffling** — `shuffle=True` or `shuffle_buffer=N` for streaming
|
|
154
|
+
|
|
155
|
+
---
|
|
156
|
+
|
|
157
|
+
## CLI
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
# Check your environment
|
|
161
|
+
foundry doctor
|
|
162
|
+
|
|
163
|
+
# Preview a recipe plan (no GPU spend)
|
|
164
|
+
foundry plan recipe.yaml
|
|
165
|
+
|
|
166
|
+
# Run a recipe
|
|
167
|
+
foundry run recipe.yaml
|
|
168
|
+
|
|
169
|
+
# Run an embedding distillation recipe
|
|
170
|
+
foundry embed recipe.yaml
|
|
171
|
+
|
|
172
|
+
# List fusion strategies
|
|
173
|
+
foundry strategies
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
## Recipe YAML
|
|
179
|
+
|
|
180
|
+
```yaml
|
|
181
|
+
# recipe.yaml — full causal-LM factory
|
|
182
|
+
seed:
|
|
183
|
+
model: meta-llama/Llama-3.1-8B
|
|
184
|
+
init: pretrained
|
|
185
|
+
|
|
186
|
+
grow:
|
|
187
|
+
method: depth_upscale
|
|
188
|
+
to_params: 15B
|
|
189
|
+
|
|
190
|
+
teachers:
|
|
191
|
+
- role: reasoning
|
|
192
|
+
model: meta-llama/Llama-3.1-70B
|
|
193
|
+
weight: 1.0
|
|
194
|
+
|
|
195
|
+
fusion:
|
|
196
|
+
strategy: min_ce
|
|
197
|
+
align: min_ed
|
|
198
|
+
cache: topk_64
|
|
199
|
+
|
|
200
|
+
heal:
|
|
201
|
+
tokens: 100B
|
|
202
|
+
alpha: 0.3
|
|
203
|
+
|
|
204
|
+
output:
|
|
205
|
+
freeze_base: true
|
|
206
|
+
skillpacks: [ola_math, ola_code]
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
---
|
|
210
|
+
|
|
211
|
+
## Optional extras
|
|
212
|
+
|
|
213
|
+
| Extra | Installs | When to use |
|
|
214
|
+
|---|---|---|
|
|
215
|
+
| `[torch]` | torch, transformers, huggingface_hub, safetensors, accelerate | Real training |
|
|
216
|
+
| `[lego]` | peft | LoRA skill packs |
|
|
217
|
+
| `[merge]` | mergekit | SOLAR depth up-scaling |
|
|
218
|
+
| `[data]` | datasets | HuggingFace dataset streaming |
|
|
219
|
+
| `[align]` | rapidfuzz | Fast cross-tokenizer alignment (100× speedup) |
|
|
220
|
+
| `[logging]` | wandb | Experiment tracking |
|
|
221
|
+
| `[all]` | everything | Full setup |
|
|
222
|
+
|
|
223
|
+
---
|
|
224
|
+
|
|
225
|
+
## Links
|
|
226
|
+
|
|
227
|
+
- **Main SDK** — [olaverse](https://pypi.org/project/olaverse/) — ready-to-use African NLP models
|
|
228
|
+
- **Homepage** — [olaverse.co.uk](https://olaverse.co.uk)
|
|
229
|
+
- **GitHub** — [Olaverse-Labs/olaverse-foundry](https://github.com/Olaverse-Labs/olaverse-foundry)
|
|
230
|
+
- **Issues** — [GitHub Issues](https://github.com/Olaverse-Labs/olaverse-foundry/issues)
|
|
231
|
+
|
|
232
|
+
---
|
|
233
|
+
|
|
234
|
+
## License
|
|
235
|
+
|
|
236
|
+
Apache 2.0 — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""
|
|
2
|
+
olaverse-foundry — a toolkit for building model families.
|
|
3
|
+
|
|
4
|
+
One expensive ancestor, many cheap descendants.
|
|
5
|
+
|
|
6
|
+
seed → grow → fuse/heal → freeze → skill packs
|
|
7
|
+
|
|
8
|
+
Quick start::
|
|
9
|
+
|
|
10
|
+
from foundry import Recipe
|
|
11
|
+
|
|
12
|
+
recipe = Recipe.load("my_recipe.yaml")
|
|
13
|
+
for line in recipe.plan():
|
|
14
|
+
print(line)
|
|
15
|
+
|
|
16
|
+
base = recipe.run()
|
|
17
|
+
"""
|
|
18
|
+
from foundry.contracts import ArchConfig, Student, Teacher, TokenizerAlignment
|
|
19
|
+
from foundry.fusion import FusionKernel, IdentityAlignment, EMAlignment, MinEDAlignment
|
|
20
|
+
from foundry.fusion import STRATEGY_REGISTRY, min_ce, mean_ce
|
|
21
|
+
from foundry.growth import (
|
|
22
|
+
GrowthPlan, upscale_layer_map, layers_for_param_target, plan_growth,
|
|
23
|
+
growth_plan_to_mergekit_yaml, save_mergekit_config, run_merge,
|
|
24
|
+
)
|
|
25
|
+
from foundry.skillpacks import (
|
|
26
|
+
SkillPack, SkillRegistry,
|
|
27
|
+
save_as_peft, load_from_peft, peft_config_dict,
|
|
28
|
+
)
|
|
29
|
+
from foundry.teachers import TeacherRegistry, ToyTeacher, HFTeacher, LogitCache
|
|
30
|
+
from foundry.training import (
|
|
31
|
+
DistillTrainer, TrainConfig,
|
|
32
|
+
TorchDistillTrainer, TorchTrainConfig,
|
|
33
|
+
CachedDistillTrainer, CachedDistillConfig,
|
|
34
|
+
EmbeddingDistillTrainer, EmbeddingDistillConfig, ToyEmbeddingTeacher,
|
|
35
|
+
)
|
|
36
|
+
from foundry.io import SeedResult, load_seed
|
|
37
|
+
from foundry.recipes import Recipe, FoundryRecipe, EmbedRecipe, EmbedFusionConfig
|
|
38
|
+
from foundry.backends import detect_backend
|
|
39
|
+
from foundry.data import DataPipeline
|
|
40
|
+
|
|
41
|
+
__version__ = "0.1.0"
|
|
42
|
+
|
|
43
|
+
__all__ = [
|
|
44
|
+
# Contracts
|
|
45
|
+
"ArchConfig", "Student", "Teacher", "TokenizerAlignment",
|
|
46
|
+
# Fusion
|
|
47
|
+
"FusionKernel", "IdentityAlignment", "EMAlignment", "MinEDAlignment",
|
|
48
|
+
"STRATEGY_REGISTRY", "min_ce", "mean_ce",
|
|
49
|
+
# Growth
|
|
50
|
+
"GrowthPlan", "upscale_layer_map", "layers_for_param_target", "plan_growth",
|
|
51
|
+
"growth_plan_to_mergekit_yaml", "save_mergekit_config", "run_merge",
|
|
52
|
+
# Skill packs
|
|
53
|
+
"SkillPack", "SkillRegistry",
|
|
54
|
+
"save_as_peft", "load_from_peft", "peft_config_dict",
|
|
55
|
+
# Teachers
|
|
56
|
+
"TeacherRegistry", "ToyTeacher", "HFTeacher", "LogitCache",
|
|
57
|
+
# Training
|
|
58
|
+
"DistillTrainer", "TrainConfig",
|
|
59
|
+
"TorchDistillTrainer", "TorchTrainConfig",
|
|
60
|
+
"CachedDistillTrainer", "CachedDistillConfig",
|
|
61
|
+
"EmbeddingDistillTrainer", "EmbeddingDistillConfig", "ToyEmbeddingTeacher",
|
|
62
|
+
# IO / Seed
|
|
63
|
+
"SeedResult", "load_seed",
|
|
64
|
+
# Recipes
|
|
65
|
+
"Recipe", "FoundryRecipe", "EmbedRecipe", "EmbedFusionConfig",
|
|
66
|
+
# Backends
|
|
67
|
+
"detect_backend",
|
|
68
|
+
# Data
|
|
69
|
+
"DataPipeline",
|
|
70
|
+
]
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Backend capability detection — what's available on this machine?
|
|
3
|
+
"""
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def detect_backend() -> dict:
    """
    Check which optional backends are installed and available.

    Probing is best-effort: a package that cannot be imported is reported
    as unavailable rather than raising. This includes installs whose native
    libraries fail to load (``OSError``), so the report can still be
    produced on a machine with a broken torch or extension install.

    Returns:
        dict with keys:
            torch, cuda, mps            -- bool availability flags
            peft, accelerate, mergekit,
            safetensors, rapidfuzz,
            wandb                       -- bool, True when the import succeeds
            python_version              -- "major.minor.micro" string
            torch_version               -- torch.__version__, or None
            cuda_version                -- torch.version.cuda, or None
            gpu_count                   -- number of CUDA devices (0 without CUDA)
            gpu_vram_gb                 -- total memory per device in GB (1e9 bytes)
            summary                     -- one-line human-readable description
    """
    # Single source of truth for the optional packages: the same tuple
    # drives the result keys, the import probe and the summary order.
    optional_pkgs = (
        "peft", "accelerate", "mergekit", "safetensors", "rapidfuzz", "wandb",
    )

    version = sys.version_info
    result: dict = {
        "torch": False,
        "cuda": False,
        "mps": False,
        **dict.fromkeys(optional_pkgs, False),
        "python_version": f"{version.major}.{version.minor}.{version.micro}",
        "torch_version": None,
        "cuda_version": None,
        "gpu_count": 0,
        "gpu_vram_gb": [],  # list of VRAM per device in GB
    }

    try:
        import torch
        result["torch"] = True
        result["torch_version"] = torch.__version__
        result["cuda"] = torch.cuda.is_available()
        # Older torch builds have no torch.backends.mps attribute at all.
        result["mps"] = (
            hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
        )
        if result["cuda"]:
            result["cuda_version"] = torch.version.cuda
            device_count = torch.cuda.device_count()
            result["gpu_count"] = device_count
            result["gpu_vram_gb"] = [
                round(torch.cuda.get_device_properties(idx).total_memory / 1e9, 1)
                for idx in range(device_count)
            ]
    except (ImportError, OSError):
        # ImportError: torch is not installed.
        # OSError: torch is installed but a shared library failed to load.
        # Fields recorded before the failure are kept as-is.
        pass

    for pkg in optional_pkgs:
        try:
            __import__(pkg)
        except (ImportError, OSError, TypeError):
            # Treated as "not available"; the flag stays False.
            continue
        result[pkg] = True

    # Build summary string
    if not result["torch"]:
        parts = ["no torch — toy/numpy backend only"]
    elif result["cuda"]:
        vram = result["gpu_vram_gb"]
        vram_str = "+".join(f"{v}GB" for v in vram) if vram else "?"
        parts = [
            f"torch {result['torch_version']} "
            f"(CUDA {result['cuda_version']}, {result['gpu_count']}× GPU, {vram_str} VRAM)"
        ]
    elif result["mps"]:
        parts = [f"torch {result['torch_version']} (MPS/Apple Silicon)"]
    else:
        parts = [f"torch {result['torch_version']} (CPU only)"]

    parts.extend(pkg for pkg in optional_pkgs if result[pkg])

    result["summary"] = " ".join(parts)
    return result
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def require_torch(feature: str = "this feature") -> None:
    """Raise a helpful ImportError if torch is not installed.

    Only ``torch`` itself is probed. A full backend scan is deliberately
    avoided here: it would import every optional package and query CUDA
    device properties just to answer a yes/no question about one package.

    Args:
        feature: Name of the capability that needs torch; used as the
            prefix of the error message.

    Raises:
        ImportError: if ``import torch`` fails with ImportError. The
            original error is attached as ``__cause__``.
    """
    try:
        import torch  # noqa: F401 -- imported only to test availability
    except ImportError as err:
        raise ImportError(
            f"{feature} requires torch. "
            "Install with: pip install olaverse-foundry[torch]"
        ) from err
|