olaverse-foundry 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. olaverse_foundry-0.1.0/PKG-INFO +282 -0
  2. olaverse_foundry-0.1.0/README.md +236 -0
  3. olaverse_foundry-0.1.0/foundry/__init__.py +70 -0
  4. olaverse_foundry-0.1.0/foundry/backends.py +100 -0
  5. olaverse_foundry-0.1.0/foundry/cli.py +238 -0
  6. olaverse_foundry-0.1.0/foundry/contracts/__init__.py +8 -0
  7. olaverse_foundry-0.1.0/foundry/contracts/protocols.py +168 -0
  8. olaverse_foundry-0.1.0/foundry/data/__init__.py +3 -0
  9. olaverse_foundry-0.1.0/foundry/data/pipeline.py +247 -0
  10. olaverse_foundry-0.1.0/foundry/fusion/__init__.py +15 -0
  11. olaverse_foundry-0.1.0/foundry/fusion/align.py +157 -0
  12. olaverse_foundry-0.1.0/foundry/fusion/kernel.py +112 -0
  13. olaverse_foundry-0.1.0/foundry/fusion/strategies.py +88 -0
  14. olaverse_foundry-0.1.0/foundry/fusion/vocab_map.py +223 -0
  15. olaverse_foundry-0.1.0/foundry/growth/__init__.py +23 -0
  16. olaverse_foundry-0.1.0/foundry/growth/mergekit_backend.py +214 -0
  17. olaverse_foundry-0.1.0/foundry/growth/planner.py +155 -0
  18. olaverse_foundry-0.1.0/foundry/io/__init__.py +7 -0
  19. olaverse_foundry-0.1.0/foundry/io/loader.py +125 -0
  20. olaverse_foundry-0.1.0/foundry/io/seed.py +163 -0
  21. olaverse_foundry-0.1.0/foundry/recipes/__init__.py +19 -0
  22. olaverse_foundry-0.1.0/foundry/recipes/recipe.py +227 -0
  23. olaverse_foundry-0.1.0/foundry/recipes/schema.py +200 -0
  24. olaverse_foundry-0.1.0/foundry/skillpacks/__init__.py +10 -0
  25. olaverse_foundry-0.1.0/foundry/skillpacks/pack.py +160 -0
  26. olaverse_foundry-0.1.0/foundry/skillpacks/peft_bridge.py +248 -0
  27. olaverse_foundry-0.1.0/foundry/teachers/__init__.py +4 -0
  28. olaverse_foundry-0.1.0/foundry/teachers/cache.py +195 -0
  29. olaverse_foundry-0.1.0/foundry/teachers/registry.py +222 -0
  30. olaverse_foundry-0.1.0/foundry/training/__init__.py +13 -0
  31. olaverse_foundry-0.1.0/foundry/training/_logger.py +84 -0
  32. olaverse_foundry-0.1.0/foundry/training/_scheduler.py +52 -0
  33. olaverse_foundry-0.1.0/foundry/training/accelerate_distill.py +448 -0
  34. olaverse_foundry-0.1.0/foundry/training/distill.py +158 -0
  35. olaverse_foundry-0.1.0/foundry/training/embed_distill.py +561 -0
  36. olaverse_foundry-0.1.0/foundry/training/torch_distill.py +432 -0
  37. olaverse_foundry-0.1.0/olaverse_foundry.egg-info/PKG-INFO +282 -0
  38. olaverse_foundry-0.1.0/olaverse_foundry.egg-info/SOURCES.txt +49 -0
  39. olaverse_foundry-0.1.0/olaverse_foundry.egg-info/dependency_links.txt +1 -0
  40. olaverse_foundry-0.1.0/olaverse_foundry.egg-info/entry_points.txt +2 -0
  41. olaverse_foundry-0.1.0/olaverse_foundry.egg-info/requires.txt +32 -0
  42. olaverse_foundry-0.1.0/olaverse_foundry.egg-info/top_level.txt +1 -0
  43. olaverse_foundry-0.1.0/pyproject.toml +100 -0
  44. olaverse_foundry-0.1.0/setup.cfg +4 -0
  45. olaverse_foundry-0.1.0/tests/test_foundry.py +224 -0
  46. olaverse_foundry-0.1.0/tests/test_m1.py +258 -0
  47. olaverse_foundry-0.1.0/tests/test_m2.py +351 -0
  48. olaverse_foundry-0.1.0/tests/test_m3.py +308 -0
  49. olaverse_foundry-0.1.0/tests/test_m4.py +332 -0
  50. olaverse_foundry-0.1.0/tests/test_m5.py +306 -0
  51. olaverse_foundry-0.1.0/tests/test_prod.py +710 -0
@@ -0,0 +1,282 @@
1
+ Metadata-Version: 2.4
2
+ Name: olaverse-foundry
3
+ Version: 0.1.0
4
+ Summary: A toolkit for building model families — seed, grow, fuse, freeze, extend.
5
+ Author-email: Olaverse Labs <hello@olaverse.co.uk>
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://olaverse.co.uk
8
+ Project-URL: Repository, https://github.com/Olaverse-Labs/olaverse-foundry
9
+ Project-URL: Issues, https://github.com/Olaverse-Labs/olaverse-foundry/issues
10
+ Keywords: llm,distillation,model-merging,lora,skill-packs,african-nlp,olaverse,foundry
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: License :: OSI Approved :: Apache Software License
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ Requires-Dist: numpy>=1.24.0
23
+ Requires-Dist: pydantic>=2.0.0
24
+ Requires-Dist: pyyaml>=6.0
25
+ Provides-Extra: torch
26
+ Requires-Dist: torch>=2.0.0; extra == "torch"
27
+ Requires-Dist: transformers>=4.40.0; extra == "torch"
28
+ Requires-Dist: huggingface_hub>=0.20.0; extra == "torch"
29
+ Requires-Dist: safetensors>=0.4.0; extra == "torch"
30
+ Requires-Dist: accelerate>=0.27.0; extra == "torch"
31
+ Provides-Extra: lego
32
+ Requires-Dist: peft>=0.10.0; extra == "lego"
33
+ Provides-Extra: merge
34
+ Requires-Dist: mergekit; extra == "merge"
35
+ Provides-Extra: data
36
+ Requires-Dist: datasets>=2.18.0; extra == "data"
37
+ Provides-Extra: align
38
+ Requires-Dist: rapidfuzz>=3.0.0; extra == "align"
39
+ Provides-Extra: logging
40
+ Requires-Dist: wandb>=0.16.0; extra == "logging"
41
+ Provides-Extra: all
42
+ Requires-Dist: olaverse-foundry[align,data,lego,logging,merge,torch]; extra == "all"
43
+ Provides-Extra: dev
44
+ Requires-Dist: pytest>=7.0; extra == "dev"
45
+ Requires-Dist: pytest-cov; extra == "dev"
46
+
47
+ # olaverse-foundry
48
+
49
+ **Build model families from a single pretrained seed.**
50
+
51
+ `olaverse-foundry` is the training and model-factory layer of the Olaverse ecosystem. Where `olaverse` gives you ready-to-use NLP models, `foundry` lets you build new ones — distilling, growing, fusing, and adapting them for production.
52
+
53
+ ```
54
+ seed → grow → distil / fuse → freeze → skill packs
55
+ ```
56
+
57
+ ---
58
+
59
+ ## Install
60
+
61
+ ```bash
62
+ # Core (schema validation, growth planning — no GPU required)
63
+ pip install olaverse-foundry
64
+
65
+ # GPU training
66
+ pip install olaverse-foundry[torch]
67
+
68
+ # LoRA skill packs
69
+ pip install olaverse-foundry[torch,lego]
70
+
71
+ # Everything
72
+ pip install olaverse-foundry[all]
73
+ ```
74
+
75
+ ---
76
+
77
+ ## Quick start — embedding distillation (200M student)
78
+
79
+ ```python
80
+ from foundry import DataPipeline, EmbeddingDistillTrainer, EmbeddingDistillConfig
81
+ from transformers import AutoModel, AutoTokenizer
82
+
83
+ # Load student and teacher
84
+ student = AutoModel.from_pretrained("microsoft/deberta-v3-base")
85
+ teacher = AutoModel.from_pretrained("BAAI/bge-large-en-v1.5")
86
+ tok = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
87
+
88
+ # Stream data
89
+ pipe = DataPipeline(
90
+ source = my_hf_dataset,
91
+ tokenizer = tok,
92
+ batch_size = 32,
93
+ max_length = 128,
94
+ mode = "embed",
95
+ shuffle_buffer = 10_000,
96
+ )
97
+
98
+ # Train
99
+ trainer = EmbeddingDistillTrainer(
100
+ student = student,
101
+ teacher = teacher,
102
+ config = EmbeddingDistillConfig(
103
+ loss = "cosine",
104
+ pool = "mean",
105
+ epochs = 3,
106
+ lr_scheduler = "cosine",
107
+ warmup_steps = 200,
108
+ torch_dtype = "bfloat16",
109
+ save_every = 1000,
110
+ save_dir = "/checkpoints/embed-200m",
111
+ log_backend = "wandb",
112
+ ),
113
+ )
114
+
115
+ result = trainer.train(pipe, eval_dataset=eval_pipe)
116
+ print(result["eval_losses"])
117
+ ```
118
+
119
+ ---
120
+
121
+ ## Quick start — causal LM distillation with multiple teachers
122
+
123
+ ```python
124
+ from foundry import (
125
+ DataPipeline, TorchDistillTrainer, TorchTrainConfig,
126
+ TeacherRegistry, FoundryRecipe,
127
+ )
128
+
129
+ # Build a registry of teachers
130
+ teachers = TeacherRegistry.from_names(
131
+ ["meta-llama/Llama-3.1-70B", "Qwen/Qwen2-72B-Instruct"],
132
+ weights=[1.0, 0.8],
133
+ )
134
+ teachers.load_all()
135
+
136
+ # Stream training data
137
+ pipe = DataPipeline(
138
+ source = my_dataset,
139
+ tokenizer = tok,
140
+ batch_size = 8,
141
+ max_length = 2048,
142
+ mode = "lm",
143
+ )
144
+
145
+ trainer = TorchDistillTrainer(
146
+ student = my_3b_model,
147
+ teachers = teachers,
148
+ config = TorchTrainConfig(
149
+ epochs = 1,
150
+ lr_scheduler = "cosine",
151
+ warmup_steps = 500,
152
+ torch_dtype = "bfloat16",
153
+ grad_accumulation_steps = 8,
154
+ save_every = 500,
155
+ save_dir = "/checkpoints/run1",
156
+ eval_every = 100,
157
+ log_backend = "wandb",
158
+ ),
159
+ )
160
+
161
+ result = trainer.train(pipe, eval_dataset=eval_pipe)
162
+ ```
163
+
164
+ ---
165
+
166
+ ## Key components
167
+
168
+ | Module | What it does |
169
+ |---|---|
170
+ | `DataPipeline` | Converts HF datasets, string lists, or numpy arrays into trainer-ready batches. Supports streaming and reservoir shuffle. |
171
+ | `TorchDistillTrainer` | Single-GPU distillation: CE + KL loss against one or more teachers. |
172
+ | `CachedDistillTrainer` | Like `TorchDistillTrainer` but caches teacher logits on disk after the first pass. Subsequent epochs reuse the cached logits instead of re-running the teachers. Supports `accelerate` for multi-GPU. |
173
+ | `EmbeddingDistillTrainer` | MSE / cosine loss on pooled sentence vectors. Use for bi-encoder / reranker distillation. |
174
+ | `TeacherRegistry` | Pool of HF teacher models with relative weights. Handles `AutoModelForCausalLM` and `AutoModel` (encoders). |
175
+ | `LogitCache` | In-memory + on-disk cache for top-k teacher logit distributions. |
176
+ | `GrowthPlan` / `plan_growth` | Depth up-scaling via SOLAR-style layer duplication. Generates mergekit-compatible YAML. |
177
+ | `SkillPack` / `SkillRegistry` | Detachable LoRA adapters bound to a specific base model hash. |
178
+ | `save_as_peft` / `load_from_peft` | PEFT-format adapter round-trip (no peft library required). |
179
+ | `MinEDAlignment` | Cross-tokenizer vocabulary alignment via edit distance. |
180
+ | `DataPipeline` | Unified dataset adapter — HF datasets, streaming, raw text, numpy. |
181
+ | `FoundryRecipe` / `EmbedRecipe` | Pydantic-validated YAML recipes — fail fast before GPU spend. |
182
+
183
+ ---
184
+
185
+ ## Training features
186
+
187
+ All trainers share the same production-ready feature set:
188
+
189
+ - **Mixed precision** — `torch_dtype="bfloat16"` or `"float16"`
190
+ - **Gradient accumulation** — `grad_accumulation_steps=N`
191
+ - **LR scheduler** — `"cosine"` / `"linear"` / `"constant"` with linear warmup
192
+ - **Reproducibility** — `seed=42` sets torch + numpy + random before training
193
+ - **Checkpointing** — `save_checkpoint(path)` / `resume_from_checkpoint(path)`
194
+ - **Auto-checkpoint** — `save_every=N, save_dir="/path"` saves every N steps
195
+ - **Eval loop** — `eval_every=N` evaluates on a held-out set every N steps
196
+ - **W&B / TensorBoard** — `log_backend="wandb"` or `"tensorboard"`
197
+ - **OOM handling** — CUDA OOM raises with actionable suggestions
198
+ - **Streaming datasets** — `DataPipeline` wraps any HF `IterableDataset`
199
+ - **Dataset shuffling** — `shuffle=True` or `shuffle_buffer=N` for streaming
200
+
201
+ ---
202
+
203
+ ## CLI
204
+
205
+ ```bash
206
+ # Check your environment
207
+ foundry doctor
208
+
209
+ # Preview a recipe plan (no GPU spend)
210
+ foundry plan recipe.yaml
211
+
212
+ # Run a recipe
213
+ foundry run recipe.yaml
214
+
215
+ # Run an embedding distillation recipe
216
+ foundry embed recipe.yaml
217
+
218
+ # List fusion strategies
219
+ foundry strategies
220
+ ```
221
+
222
+ ---
223
+
224
+ ## Recipe YAML
225
+
226
+ ```yaml
227
+ # recipe.yaml — full causal-LM factory
228
+ seed:
229
+ model: meta-llama/Llama-3.1-8B
230
+ init: pretrained
231
+
232
+ grow:
233
+ method: depth_upscale
234
+ to_params: 15B
235
+
236
+ teachers:
237
+ - role: reasoning
238
+ model: meta-llama/Llama-3.1-70B
239
+ weight: 1.0
240
+
241
+ fusion:
242
+ strategy: min_ce
243
+ align: min_ed
244
+ cache: topk_64
245
+
246
+ heal:
247
+ tokens: 100B
248
+ alpha: 0.3
249
+
250
+ output:
251
+ freeze_base: true
252
+ skillpacks: [ola_math, ola_code]
253
+ ```
254
+
255
+ ---
256
+
257
+ ## Optional extras
258
+
259
+ | Extra | Installs | When to use |
260
+ |---|---|---|
261
+ | `[torch]` | torch, transformers, huggingface_hub, safetensors, accelerate | Real training |
262
+ | `[lego]` | peft | LoRA skill packs |
263
+ | `[merge]` | mergekit | SOLAR depth up-scaling |
264
+ | `[data]` | datasets | HuggingFace dataset streaming |
265
+ | `[align]` | rapidfuzz | Fast cross-tokenizer alignment (100× speedup) |
266
+ | `[logging]` | wandb | Experiment tracking |
267
+ | `[all]` | everything | Full setup |
268
+
269
+ ---
270
+
271
+ ## Links
272
+
273
+ - **Main SDK** — [olaverse](https://pypi.org/project/olaverse/) — ready-to-use African NLP models
274
+ - **Homepage** — [olaverse.co.uk](https://olaverse.co.uk)
275
+ - **GitHub** — [Olaverse-Labs/olaverse-foundry](https://github.com/Olaverse-Labs/olaverse-foundry)
276
+ - **Issues** — [GitHub Issues](https://github.com/Olaverse-Labs/olaverse-foundry/issues)
277
+
278
+ ---
279
+
280
+ ## License
281
+
282
+ Apache 2.0 — see [LICENSE](LICENSE).
@@ -0,0 +1,236 @@
1
+ # olaverse-foundry
2
+
3
+ **Build model families from a single pretrained seed.**
4
+
5
+ `olaverse-foundry` is the training and model-factory layer of the Olaverse ecosystem. Where `olaverse` gives you ready-to-use NLP models, `foundry` lets you build new ones — distilling, growing, fusing, and adapting them for production.
6
+
7
+ ```
8
+ seed → grow → distil / fuse → freeze → skill packs
9
+ ```
10
+
11
+ ---
12
+
13
+ ## Install
14
+
15
+ ```bash
16
+ # Core (schema validation, growth planning — no GPU required)
17
+ pip install olaverse-foundry
18
+
19
+ # GPU training
20
+ pip install olaverse-foundry[torch]
21
+
22
+ # LoRA skill packs
23
+ pip install olaverse-foundry[torch,lego]
24
+
25
+ # Everything
26
+ pip install olaverse-foundry[all]
27
+ ```
28
+
29
+ ---
30
+
31
+ ## Quick start — embedding distillation (200M student)
32
+
33
+ ```python
34
+ from foundry import DataPipeline, EmbeddingDistillTrainer, EmbeddingDistillConfig
35
+ from transformers import AutoModel, AutoTokenizer
36
+
37
+ # Load student and teacher
38
+ student = AutoModel.from_pretrained("microsoft/deberta-v3-base")
39
+ teacher = AutoModel.from_pretrained("BAAI/bge-large-en-v1.5")
40
+ tok = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
41
+
42
+ # Stream data
43
+ pipe = DataPipeline(
44
+ source = my_hf_dataset,
45
+ tokenizer = tok,
46
+ batch_size = 32,
47
+ max_length = 128,
48
+ mode = "embed",
49
+ shuffle_buffer = 10_000,
50
+ )
51
+
52
+ # Train
53
+ trainer = EmbeddingDistillTrainer(
54
+ student = student,
55
+ teacher = teacher,
56
+ config = EmbeddingDistillConfig(
57
+ loss = "cosine",
58
+ pool = "mean",
59
+ epochs = 3,
60
+ lr_scheduler = "cosine",
61
+ warmup_steps = 200,
62
+ torch_dtype = "bfloat16",
63
+ save_every = 1000,
64
+ save_dir = "/checkpoints/embed-200m",
65
+ log_backend = "wandb",
66
+ ),
67
+ )
68
+
69
+ result = trainer.train(pipe, eval_dataset=eval_pipe)
70
+ print(result["eval_losses"])
71
+ ```
72
+
73
+ ---
74
+
75
+ ## Quick start — causal LM distillation with multiple teachers
76
+
77
+ ```python
78
+ from foundry import (
79
+ DataPipeline, TorchDistillTrainer, TorchTrainConfig,
80
+ TeacherRegistry, FoundryRecipe,
81
+ )
82
+
83
+ # Build a registry of teachers
84
+ teachers = TeacherRegistry.from_names(
85
+ ["meta-llama/Llama-3.1-70B", "Qwen/Qwen2-72B-Instruct"],
86
+ weights=[1.0, 0.8],
87
+ )
88
+ teachers.load_all()
89
+
90
+ # Stream training data
91
+ pipe = DataPipeline(
92
+ source = my_dataset,
93
+ tokenizer = tok,
94
+ batch_size = 8,
95
+ max_length = 2048,
96
+ mode = "lm",
97
+ )
98
+
99
+ trainer = TorchDistillTrainer(
100
+ student = my_3b_model,
101
+ teachers = teachers,
102
+ config = TorchTrainConfig(
103
+ epochs = 1,
104
+ lr_scheduler = "cosine",
105
+ warmup_steps = 500,
106
+ torch_dtype = "bfloat16",
107
+ grad_accumulation_steps = 8,
108
+ save_every = 500,
109
+ save_dir = "/checkpoints/run1",
110
+ eval_every = 100,
111
+ log_backend = "wandb",
112
+ ),
113
+ )
114
+
115
+ result = trainer.train(pipe, eval_dataset=eval_pipe)
116
+ ```
117
+
118
+ ---
119
+
120
+ ## Key components
121
+
122
+ | Module | What it does |
123
+ |---|---|
124
+ | `DataPipeline` | Converts HF datasets, string lists, or numpy arrays into trainer-ready batches. Supports streaming and reservoir shuffle. |
125
+ | `TorchDistillTrainer` | Single-GPU distillation: CE + KL loss against one or more teachers. |
126
+ | `CachedDistillTrainer` | Like `TorchDistillTrainer` but caches teacher logits on disk after the first pass. Subsequent epochs reuse the cached logits instead of re-running the teachers. Supports `accelerate` for multi-GPU. |
127
+ | `EmbeddingDistillTrainer` | MSE / cosine loss on pooled sentence vectors. Use for bi-encoder / reranker distillation. |
128
+ | `TeacherRegistry` | Pool of HF teacher models with relative weights. Handles `AutoModelForCausalLM` and `AutoModel` (encoders). |
129
+ | `LogitCache` | In-memory + on-disk cache for top-k teacher logit distributions. |
130
+ | `GrowthPlan` / `plan_growth` | Depth up-scaling via SOLAR-style layer duplication. Generates mergekit-compatible YAML. |
131
+ | `SkillPack` / `SkillRegistry` | Detachable LoRA adapters bound to a specific base model hash. |
132
+ | `save_as_peft` / `load_from_peft` | PEFT-format adapter round-trip (no peft library required). |
133
+ | `MinEDAlignment` | Cross-tokenizer vocabulary alignment via edit distance. |
134
+ | `DataPipeline` | Unified dataset adapter — HF datasets, streaming, raw text, numpy. |
135
+ | `FoundryRecipe` / `EmbedRecipe` | Pydantic-validated YAML recipes — fail fast before GPU spend. |
136
+
137
+ ---
138
+
139
+ ## Training features
140
+
141
+ All trainers share the same production-ready feature set:
142
+
143
+ - **Mixed precision** — `torch_dtype="bfloat16"` or `"float16"`
144
+ - **Gradient accumulation** — `grad_accumulation_steps=N`
145
+ - **LR scheduler** — `"cosine"` / `"linear"` / `"constant"` with linear warmup
146
+ - **Reproducibility** — `seed=42` sets torch + numpy + random before training
147
+ - **Checkpointing** — `save_checkpoint(path)` / `resume_from_checkpoint(path)`
148
+ - **Auto-checkpoint** — `save_every=N, save_dir="/path"` saves every N steps
149
+ - **Eval loop** — `eval_every=N` evaluates on a held-out set every N steps
150
+ - **W&B / TensorBoard** — `log_backend="wandb"` or `"tensorboard"`
151
+ - **OOM handling** — CUDA OOM raises with actionable suggestions
152
+ - **Streaming datasets** — `DataPipeline` wraps any HF `IterableDataset`
153
+ - **Dataset shuffling** — `shuffle=True` or `shuffle_buffer=N` for streaming
154
+
155
+ ---
156
+
157
+ ## CLI
158
+
159
+ ```bash
160
+ # Check your environment
161
+ foundry doctor
162
+
163
+ # Preview a recipe plan (no GPU spend)
164
+ foundry plan recipe.yaml
165
+
166
+ # Run a recipe
167
+ foundry run recipe.yaml
168
+
169
+ # Run an embedding distillation recipe
170
+ foundry embed recipe.yaml
171
+
172
+ # List fusion strategies
173
+ foundry strategies
174
+ ```
175
+
176
+ ---
177
+
178
+ ## Recipe YAML
179
+
180
+ ```yaml
181
+ # recipe.yaml — full causal-LM factory
182
+ seed:
183
+ model: meta-llama/Llama-3.1-8B
184
+ init: pretrained
185
+
186
+ grow:
187
+ method: depth_upscale
188
+ to_params: 15B
189
+
190
+ teachers:
191
+ - role: reasoning
192
+ model: meta-llama/Llama-3.1-70B
193
+ weight: 1.0
194
+
195
+ fusion:
196
+ strategy: min_ce
197
+ align: min_ed
198
+ cache: topk_64
199
+
200
+ heal:
201
+ tokens: 100B
202
+ alpha: 0.3
203
+
204
+ output:
205
+ freeze_base: true
206
+ skillpacks: [ola_math, ola_code]
207
+ ```
208
+
209
+ ---
210
+
211
+ ## Optional extras
212
+
213
+ | Extra | Installs | When to use |
214
+ |---|---|---|
215
+ | `[torch]` | torch, transformers, huggingface_hub, safetensors, accelerate | Real training |
216
+ | `[lego]` | peft | LoRA skill packs |
217
+ | `[merge]` | mergekit | SOLAR depth up-scaling |
218
+ | `[data]` | datasets | HuggingFace dataset streaming |
219
+ | `[align]` | rapidfuzz | Fast cross-tokenizer alignment (100× speedup) |
220
+ | `[logging]` | wandb | Experiment tracking |
221
+ | `[all]` | everything | Full setup |
222
+
223
+ ---
224
+
225
+ ## Links
226
+
227
+ - **Main SDK** — [olaverse](https://pypi.org/project/olaverse/) — ready-to-use African NLP models
228
+ - **Homepage** — [olaverse.co.uk](https://olaverse.co.uk)
229
+ - **GitHub** — [Olaverse-Labs/olaverse-foundry](https://github.com/Olaverse-Labs/olaverse-foundry)
230
+ - **Issues** — [GitHub Issues](https://github.com/Olaverse-Labs/olaverse-foundry/issues)
231
+
232
+ ---
233
+
234
+ ## License
235
+
236
+ Apache 2.0 — see [LICENSE](LICENSE).
@@ -0,0 +1,70 @@
1
+ """
2
+ olaverse-foundry — a toolkit for building model families.
3
+
4
+ One expensive ancestor, many cheap descendants.
5
+
6
+ seed → grow → fuse/heal → freeze → skill packs
7
+
8
+ Quick start::
9
+
10
+ from foundry import Recipe
11
+
12
+ recipe = Recipe.load("my_recipe.yaml")
13
+ for line in recipe.plan():
14
+ print(line)
15
+
16
+ base = recipe.run()
17
+ """
18
+ from foundry.contracts import ArchConfig, Student, Teacher, TokenizerAlignment
19
+ from foundry.fusion import FusionKernel, IdentityAlignment, EMAlignment, MinEDAlignment
20
+ from foundry.fusion import STRATEGY_REGISTRY, min_ce, mean_ce
21
+ from foundry.growth import (
22
+ GrowthPlan, upscale_layer_map, layers_for_param_target, plan_growth,
23
+ growth_plan_to_mergekit_yaml, save_mergekit_config, run_merge,
24
+ )
25
+ from foundry.skillpacks import (
26
+ SkillPack, SkillRegistry,
27
+ save_as_peft, load_from_peft, peft_config_dict,
28
+ )
29
+ from foundry.teachers import TeacherRegistry, ToyTeacher, HFTeacher, LogitCache
30
+ from foundry.training import (
31
+ DistillTrainer, TrainConfig,
32
+ TorchDistillTrainer, TorchTrainConfig,
33
+ CachedDistillTrainer, CachedDistillConfig,
34
+ EmbeddingDistillTrainer, EmbeddingDistillConfig, ToyEmbeddingTeacher,
35
+ )
36
+ from foundry.io import SeedResult, load_seed
37
+ from foundry.recipes import Recipe, FoundryRecipe, EmbedRecipe, EmbedFusionConfig
38
+ from foundry.backends import detect_backend
39
+ from foundry.data import DataPipeline
40
+
41
+ __version__ = "0.1.0"
42
+
43
+ __all__ = [
44
+ # Contracts
45
+ "ArchConfig", "Student", "Teacher", "TokenizerAlignment",
46
+ # Fusion
47
+ "FusionKernel", "IdentityAlignment", "EMAlignment", "MinEDAlignment",
48
+ "STRATEGY_REGISTRY", "min_ce", "mean_ce",
49
+ # Growth
50
+ "GrowthPlan", "upscale_layer_map", "layers_for_param_target", "plan_growth",
51
+ "growth_plan_to_mergekit_yaml", "save_mergekit_config", "run_merge",
52
+ # Skill packs
53
+ "SkillPack", "SkillRegistry",
54
+ "save_as_peft", "load_from_peft", "peft_config_dict",
55
+ # Teachers
56
+ "TeacherRegistry", "ToyTeacher", "HFTeacher", "LogitCache",
57
+ # Training
58
+ "DistillTrainer", "TrainConfig",
59
+ "TorchDistillTrainer", "TorchTrainConfig",
60
+ "CachedDistillTrainer", "CachedDistillConfig",
61
+ "EmbeddingDistillTrainer", "EmbeddingDistillConfig", "ToyEmbeddingTeacher",
62
+ # IO / Seed
63
+ "SeedResult", "load_seed",
64
+ # Recipes
65
+ "Recipe", "FoundryRecipe", "EmbedRecipe", "EmbedFusionConfig",
66
+ # Backends
67
+ "detect_backend",
68
+ # Data
69
+ "DataPipeline",
70
+ ]
@@ -0,0 +1,100 @@
1
+ """
2
+ Backend capability detection — what's available on this machine?
3
+ """
4
+ from __future__ import annotations
5
+
6
+ import sys
7
+
8
+
9
def detect_backend() -> dict:
    """
    Check which optional backends are installed and importable.

    Detection is best-effort: a package that is missing, or that is installed
    but raises while being imported, is reported as unavailable instead of
    aborting the whole probe.

    Returns:
        dict with keys: torch, cuda, mps, peft, accelerate, mergekit,
        safetensors, rapidfuzz, wandb, python_version, torch_version,
        cuda_version, gpu_count, gpu_vram_gb, summary.
    """
    import importlib
    import logging

    log = logging.getLogger(__name__)

    # Single source of truth for the optional packages: drives the default
    # flags, the import probe and the summary, so the three cannot drift apart.
    optional_packages = (
        "peft",
        "accelerate",
        "mergekit",
        "safetensors",
        "rapidfuzz",
        "wandb",
    )

    version = sys.version_info
    result: dict = {
        "torch": False,
        "cuda": False,
        "mps": False,
        **dict.fromkeys(optional_packages, False),
        "python_version": f"{version.major}.{version.minor}.{version.micro}",
        "torch_version": None,
        "cuda_version": None,
        "gpu_count": 0,
        "gpu_vram_gb": [],  # list of VRAM per device in GB
    }

    try:
        import torch
    except ImportError:
        pass  # torch is simply not installed
    except Exception as exc:
        # Installed but unusable (e.g. a native library failed to load):
        # report "no torch" rather than crash the environment check.
        log.debug("torch is installed but failed to import: %r", exc)
    else:
        result["torch"] = True
        result["torch_version"] = torch.__version__
        try:
            result["cuda"] = torch.cuda.is_available()
            result["mps"] = (
                hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
            )
            if result["cuda"]:
                result["cuda_version"] = torch.version.cuda
                n = torch.cuda.device_count()
                result["gpu_count"] = n
                # total_memory divided by 1e9, rounded to one decimal place.
                result["gpu_vram_gb"] = [
                    round(torch.cuda.get_device_properties(i).total_memory / 1e9, 1)
                    for i in range(n)
                ]
        except Exception as exc:
            # Keep whatever was detected so far; the summary below already
            # copes with an empty VRAM list.
            log.debug("accelerator probe failed: %r", exc)

    for pkg in optional_packages:
        try:
            importlib.import_module(pkg)
        except ImportError:
            continue  # not installed
        except Exception as exc:
            # Installed but broken on import; treat it as unavailable.
            log.debug("optional package %s failed to import: %r", pkg, exc)
            continue
        result[pkg] = True

    # Build summary string
    parts = []
    if result["torch"]:
        if result["cuda"]:
            vram = result["gpu_vram_gb"]
            vram_str = "+".join(f"{v}GB" for v in vram) if vram else "?"
            parts.append(
                f"torch {result['torch_version']} "
                f"(CUDA {result['cuda_version']}, {result['gpu_count']}× GPU, {vram_str} VRAM)"
            )
        elif result["mps"]:
            parts.append(f"torch {result['torch_version']} (MPS/Apple Silicon)")
        else:
            parts.append(f"torch {result['torch_version']} (CPU only)")
    else:
        parts.append("no torch — toy/numpy backend only")

    parts.extend(pkg for pkg in optional_packages if result[pkg])

    result["summary"] = " ".join(parts)
    return result
91
+
92
+
93
def require_torch(feature: str = "this feature") -> None:
    """
    Raise a helpful ImportError if torch is not installed.

    Only torch itself is checked; the other optional backends are not
    imported and no GPU is queried, so this stays cheap enough to use as a
    guard at the top of torch-only code paths.

    Args:
        feature: Human-readable name of the capability that needs torch;
            it is interpolated into the error message.

    Raises:
        ImportError: If ``import torch`` fails with ImportError. The original
            error is chained as ``__cause__``.
    """
    try:
        import torch  # noqa: F401 -- imported only to test availability
    except ImportError as err:
        raise ImportError(
            f"{feature} requires torch. "
            "Install with: pip install olaverse-foundry[torch]"
        ) from err