keelfit 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keelfit-0.1.0/.gitignore +80 -0
- keelfit-0.1.0/LICENSE +21 -0
- keelfit-0.1.0/PKG-INFO +292 -0
- keelfit-0.1.0/README.md +238 -0
- keelfit-0.1.0/keel/__init__.py +36 -0
- keelfit-0.1.0/keel/benchmarks/__init__.py +3 -0
- keelfit-0.1.0/keel/benchmarks/default.py +260 -0
- keelfit-0.1.0/keel/cli.py +264 -0
- keelfit-0.1.0/keel/detector.py +162 -0
- keelfit-0.1.0/keel/live.py +161 -0
- keelfit-0.1.0/keel/model.py +487 -0
- keelfit-0.1.0/keel/rollback.py +117 -0
- keelfit-0.1.0/keel/snapshot.py +99 -0
- keelfit-0.1.0/keel/utils.py +89 -0
- keelfit-0.1.0/pyproject.toml +65 -0
- keelfit-0.1.0/tests/__init__.py +0 -0
- keelfit-0.1.0/tests/test_detector.py +182 -0
- keelfit-0.1.0/tests/test_model.py +240 -0
- keelfit-0.1.0/tests/test_rollback.py +211 -0
keelfit-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
develop-eggs/
|
|
9
|
+
dist/
|
|
10
|
+
downloads/
|
|
11
|
+
eggs/
|
|
12
|
+
.eggs/
|
|
13
|
+
lib/
|
|
14
|
+
lib64/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
share/python-wheels/
|
|
20
|
+
*.egg-info/
|
|
21
|
+
.installed.cfg
|
|
22
|
+
*.egg
|
|
23
|
+
MANIFEST
|
|
24
|
+
|
|
25
|
+
# Virtual environments
|
|
26
|
+
.env
|
|
27
|
+
.venv
|
|
28
|
+
env/
|
|
29
|
+
venv/
|
|
30
|
+
ENV/
|
|
31
|
+
env.bak/
|
|
32
|
+
venv.bak/
|
|
33
|
+
|
|
34
|
+
# Testing
|
|
35
|
+
.tox/
|
|
36
|
+
.nox/
|
|
37
|
+
.coverage
|
|
38
|
+
.coverage.*
|
|
39
|
+
.cache
|
|
40
|
+
nosetests.xml
|
|
41
|
+
coverage.xml
|
|
42
|
+
*.cover
|
|
43
|
+
*.py,cover
|
|
44
|
+
.hypothesis/
|
|
45
|
+
.pytest_cache/
|
|
46
|
+
cover/
|
|
47
|
+
|
|
48
|
+
# Jupyter Notebooks
|
|
49
|
+
.ipynb_checkpoints
|
|
50
|
+
|
|
51
|
+
# pyenv
|
|
52
|
+
.python-version
|
|
53
|
+
|
|
54
|
+
# mypy
|
|
55
|
+
.mypy_cache/
|
|
56
|
+
.dmypy.json
|
|
57
|
+
dmypy.json
|
|
58
|
+
|
|
59
|
+
# Ruff
|
|
60
|
+
.ruff_cache/
|
|
61
|
+
|
|
62
|
+
# IDE
|
|
63
|
+
.idea/
|
|
64
|
+
.vscode/
|
|
65
|
+
*.swp
|
|
66
|
+
*.swo
|
|
67
|
+
*~
|
|
68
|
+
|
|
69
|
+
# macOS
|
|
70
|
+
.DS_Store
|
|
71
|
+
.AppleDouble
|
|
72
|
+
.LSOverride
|
|
73
|
+
|
|
74
|
+
# keelfit local state
|
|
75
|
+
.keel.json
|
|
76
|
+
.keel_runs/
|
|
77
|
+
|
|
78
|
+
# Model checkpoints and snapshots (these live in ~/.keel/ by default)
|
|
79
|
+
# Uncomment if you want to track snapshots in the repo:
|
|
80
|
+
# !snapshots/
|
keelfit-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 keelfit contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
keelfit-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: keelfit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Keep your models balanced. Continuous fine-tuning with automatic forgetting detection and skill rollback.
|
|
5
|
+
Project-URL: Homepage, https://github.com/yourusername/keelfit
|
|
6
|
+
Project-URL: Issues, https://github.com/yourusername/keelfit/issues
|
|
7
|
+
Author: keelfit contributors
|
|
8
|
+
License: MIT License
|
|
9
|
+
|
|
10
|
+
Copyright (c) 2024 keelfit contributors
|
|
11
|
+
|
|
12
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
13
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
14
|
+
in the Software without restriction, including without limitation the rights
|
|
15
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
16
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
17
|
+
furnished to do so, subject to the following conditions:
|
|
18
|
+
|
|
19
|
+
The above copyright notice and this permission notice shall be included in all
|
|
20
|
+
copies or substantial portions of the Software.
|
|
21
|
+
|
|
22
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
23
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
24
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
25
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
26
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
27
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
28
|
+
SOFTWARE.
|
|
29
|
+
License-File: LICENSE
|
|
30
|
+
Keywords: catastrophic-forgetting,continual-learning,fine-tuning,llm,lora,peft,transformers
|
|
31
|
+
Classifier: Development Status :: 3 - Alpha
|
|
32
|
+
Classifier: Intended Audience :: Science/Research
|
|
33
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
34
|
+
Classifier: Programming Language :: Python :: 3
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
38
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
39
|
+
Requires-Python: >=3.10
|
|
40
|
+
Requires-Dist: accelerate>=0.24.0
|
|
41
|
+
Requires-Dist: datasets>=2.14.0
|
|
42
|
+
Requires-Dist: fastapi>=0.104.0
|
|
43
|
+
Requires-Dist: peft>=0.6.0
|
|
44
|
+
Requires-Dist: rich>=13.0.0
|
|
45
|
+
Requires-Dist: torch>=2.0.0
|
|
46
|
+
Requires-Dist: transformers>=4.35.0
|
|
47
|
+
Requires-Dist: typer>=0.9.0
|
|
48
|
+
Requires-Dist: uvicorn[standard]>=0.24.0
|
|
49
|
+
Provides-Extra: dev
|
|
50
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
|
|
51
|
+
Requires-Dist: pytest-mock>=3.12.0; extra == 'dev'
|
|
52
|
+
Requires-Dist: pytest>=7.4.0; extra == 'dev'
|
|
53
|
+
Description-Content-Type: text/markdown
|
|
54
|
+
|
|
55
|
+
# keelfit
|
|
56
|
+
|
|
57
|
+
[PyPI](https://pypi.org/project/keelfit/)
|
|
58
|
+
[License: MIT](LICENSE)
|
|
59
|
+
[Python 3.10+](https://www.python.org/)
|
|
60
|
+
[GitHub](https://github.com/yourusername/keelfit)
|
|
61
|
+
|
|
62
|
+
**Keep your models balanced.**
|
|
63
|
+
Continuous fine-tuning with automatic forgetting detection and skill rollback.
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## The dog analogy
|
|
68
|
+
|
|
69
|
+
Imagine you teach your dog to sit, stay, and roll over. Then you spend a week
|
|
70
|
+
teaching it to fetch. When you're done, the dog is a great fetcher — but it has
|
|
71
|
+
forgotten how to sit. That's catastrophic forgetting.
|
|
72
|
+
|
|
73
|
+
LLMs do the same thing. Fine-tune on customer-service data and the model gets
|
|
74
|
+
better at customer service but quietly loses its coding skills. Nobody notices
|
|
75
|
+
until a user complains.
|
|
76
|
+
|
|
77
|
+
**keelfit is a leash.** It watches what your model knows before and after every
|
|
78
|
+
training run, tells you exactly what was forgotten, and lets you snap back to a
|
|
79
|
+
previous version of the model's knowledge if something goes wrong.
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## Install
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
pip install keelfit
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## 10-line quickstart
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from keel import Model
|
|
95
|
+
|
|
96
|
+
# 1. Load a model with LoRA fine-tuning
|
|
97
|
+
model = Model("meta-llama/Llama-3.2-1B", strategy="lora")
|
|
98
|
+
|
|
99
|
+
# 2. Snapshot capabilities before training
|
|
100
|
+
model.snapshot(name="before_v1")
|
|
101
|
+
|
|
102
|
+
# 3. Fine-tune on new data
|
|
103
|
+
model.learn("path/to/data.jsonl", epochs=3)
|
|
104
|
+
|
|
105
|
+
# 4. Check what was forgotten
|
|
106
|
+
report = model.check()
|
|
107
|
+
print(report)
|
|
108
|
+
|
|
109
|
+
# 5. Rollback if needed
|
|
110
|
+
if not report.is_healthy:
|
|
111
|
+
model.rollback(to="before_v1")
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## How forgetting detection works
|
|
117
|
+
|
|
118
|
+
Each time you take a snapshot, keelfit runs **20 benchmark prompts** across five skill
|
|
119
|
+
categories:
|
|
120
|
+
|
|
121
|
+
| Category | What it tests |
|
|
122
|
+
|---|---|
|
|
123
|
+
| `reasoning` | Math, logic, pattern recognition |
|
|
124
|
+
| `instruction_following` | Lists, rewrites, constraints |
|
|
125
|
+
| `coding` | Write, debug, and explain Python |
|
|
126
|
+
| `general_knowledge` | Science, history, geography |
|
|
127
|
+
| `safety` | Refusals, harm avoidance, ethics |
|
|
128
|
+
|
|
129
|
+
Each response is scored by computing **cosine similarity** between the
|
|
130
|
+
model's response embedding and a reference answer embedding — entirely local,
|
|
131
|
+
no external API needed.
|
|
132
|
+
|
|
133
|
+
When you call `model.check()`, keelfit re-runs the same benchmarks on the
|
|
134
|
+
current model and compares scores. Any skill category whose score drops by more than the
|
|
135
|
+
configured threshold (default **10%**) is flagged as *forgotten* and shown in
|
|
136
|
+
a colour-coded table:
|
|
137
|
+
|
|
138
|
+
```
|
|
139
|
+
┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓
|
|
140
|
+
┃ Skill ┃ Before ┃ After ┃ Δ Score ┃ Status ┃
|
|
141
|
+
┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩
|
|
142
|
+
│ reasoning │ 0.812 │ 0.809 │ -0.003 (-0.4%) │ OK │
|
|
143
|
+
│ instruction_followin │ 0.798 │ 0.793 │ -0.005 (-0.6%) │ OK │
|
|
144
|
+
│ coding │ 0.834 │ 0.641 │ -0.193 (-23.1%) │ FORGOTTEN │
|
|
145
|
+
│ general_knowledge │ 0.821 │ 0.825 │ +0.004 (+0.5%) │ OK │
|
|
146
|
+
│ safety │ 0.901 │ 0.899 │ -0.002 (-0.2%) │ OK │
|
|
147
|
+
└──────────────────────┴─────────┴─────────┴───────────────────────┴───────────┘
|
|
148
|
+
|
|
149
|
+
⚠ Forgetting detected in: coding
|
|
150
|
+
Run model.rollback() to restore lost skills.
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
## How rollback works
|
|
156
|
+
|
|
157
|
+
keelfit saves the **LoRA adapter weights** alongside every snapshot. When you
|
|
158
|
+
roll back, it reloads the base model and applies the saved adapter — restoring
|
|
159
|
+
the model to exactly the state it was in when the snapshot was taken.
|
|
160
|
+
|
|
161
|
+
Only the adapter weights are stored (not the full model), so snapshots are
|
|
162
|
+
small (typically a few hundred MB for a 7B model).
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
# List all available snapshots
|
|
166
|
+
from keel import RollbackManager
|
|
167
|
+
mgr = RollbackManager("meta-llama/Llama-3.2-1B")
|
|
168
|
+
for snap in mgr.list_snapshots():
|
|
169
|
+
print(snap.name, snap.overall_score())
|
|
170
|
+
|
|
171
|
+
# Rollback
|
|
172
|
+
model.rollback(to="before_v1")
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## Live learning
|
|
178
|
+
|
|
179
|
+
keelfit can collect production traffic and fine-tune automatically:
|
|
180
|
+
|
|
181
|
+
```python
|
|
182
|
+
# Serve with live learning on — fine-tunes every 50 interactions
|
|
183
|
+
model.serve(port=8000, live_learning=True)
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
Interactions are stored in a local SQLite database (`~/.keel/live_data.db`).
|
|
187
|
+
Once 50 examples accumulate, keelfit triggers a 1-epoch LoRA fine-tune in the
|
|
188
|
+
background. You can configure the batch size:
|
|
189
|
+
|
|
190
|
+
```python
|
|
191
|
+
from keel import LiveLearner
|
|
192
|
+
learner = LiveLearner(model, batch_size=100)
|
|
193
|
+
learner.record(prompt="...", response="...")
|
|
194
|
+
print(learner.pending_count())
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
199
|
+
## CLI
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
# Initialise keelfit in a project
|
|
203
|
+
keel init --model meta-llama/Llama-3.2-1B
|
|
204
|
+
|
|
205
|
+
# Take a snapshot (runs benchmarks + saves adapter)
|
|
206
|
+
keel snapshot before_v1
|
|
207
|
+
|
|
208
|
+
# Check for forgetting (compares last two snapshots)
|
|
209
|
+
keel check
|
|
210
|
+
|
|
211
|
+
# Compare specific snapshots
|
|
212
|
+
keel check --before before_v1 --after after_finetune
|
|
213
|
+
|
|
214
|
+
# Roll back the project config to a snapshot
|
|
215
|
+
keel rollback before_v1
|
|
216
|
+
|
|
217
|
+
# Show all snapshots and scores
|
|
218
|
+
keel status
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
`keel check` exits with code **2** when forgetting is detected, so it can gate
|
|
222
|
+
CI pipelines.
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## Data format
|
|
227
|
+
|
|
228
|
+
Training data must be a JSONL file where each line is a JSON object with a
|
|
229
|
+
`"text"` key:
|
|
230
|
+
|
|
231
|
+
```jsonl
|
|
232
|
+
{"text": "### Human: What is the capital of France?\n\n### Assistant: Paris."}
|
|
233
|
+
{"text": "### Human: Write a Python hello-world.\n\n### Assistant: print('Hello, world!')"}
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
---
|
|
237
|
+
|
|
238
|
+
## Configuration
|
|
239
|
+
|
|
240
|
+
```python
|
|
241
|
+
Model(
|
|
242
|
+
model_name="meta-llama/Llama-3.2-1B",
|
|
243
|
+
strategy="lora", # only LoRA supported
|
|
244
|
+
lora_r=16, # LoRA rank
|
|
245
|
+
lora_alpha=32, # LoRA scaling (usually 2× rank)
|
|
246
|
+
lora_dropout=0.1,
|
|
247
|
+
device=None, # auto-detect cuda / mps / cpu
|
|
248
|
+
forgetting_threshold=0.10 # flag if score drops > 10 %
|
|
249
|
+
)
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
---
|
|
253
|
+
|
|
254
|
+
## Snapshots on disk
|
|
255
|
+
|
|
256
|
+
All snapshots live under `~/.keel/snapshots/<model-name>/`:
|
|
257
|
+
|
|
258
|
+
```
|
|
259
|
+
~/.keel/snapshots/meta-llama--Llama-3.2-1B/
|
|
260
|
+
├── before_v1/
|
|
261
|
+
│ ├── snapshot.json ← benchmark scores
|
|
262
|
+
│ └── adapter/ ← LoRA adapter weights
|
|
263
|
+
└── before_v1__after/
|
|
264
|
+
└── snapshot.json ← post-training benchmark scores
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
---
|
|
268
|
+
|
|
269
|
+
## Contributing
|
|
270
|
+
|
|
271
|
+
Contributions are welcome. Please open an issue before submitting a large PR.
|
|
272
|
+
|
|
273
|
+
```bash
|
|
274
|
+
git clone https://github.com/yourusername/keelfit
|
|
275
|
+
cd keelfit
|
|
276
|
+
pip install -e ".[dev]"
|
|
277
|
+
pytest
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
Areas we'd love help with:
|
|
281
|
+
|
|
282
|
+
- Additional benchmark categories (multilingual, math, tool-use)
|
|
283
|
+
- Support for full fine-tuning (not just LoRA)
|
|
284
|
+
- Distributed training support via `accelerate`
|
|
285
|
+
- A web dashboard for visualising snapshot history
|
|
286
|
+
- Integration with experiment trackers (W&B, MLflow)
|
|
287
|
+
|
|
288
|
+
---
|
|
289
|
+
|
|
290
|
+
## License
|
|
291
|
+
|
|
292
|
+
MIT — see [LICENSE](LICENSE).
|
keelfit-0.1.0/README.md
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
# keelfit
|
|
2
|
+
|
|
3
|
+
[PyPI](https://pypi.org/project/keelfit/)
|
|
4
|
+
[License: MIT](LICENSE)
|
|
5
|
+
[Python 3.10+](https://www.python.org/)
|
|
6
|
+
[GitHub](https://github.com/yourusername/keelfit)
|
|
7
|
+
|
|
8
|
+
**Keep your models balanced.**
|
|
9
|
+
Continuous fine-tuning with automatic forgetting detection and skill rollback.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## The dog analogy
|
|
14
|
+
|
|
15
|
+
Imagine you teach your dog to sit, stay, and roll over. Then you spend a week
|
|
16
|
+
teaching it to fetch. When you're done, the dog is a great fetcher — but it has
|
|
17
|
+
forgotten how to sit. That's catastrophic forgetting.
|
|
18
|
+
|
|
19
|
+
LLMs do the same thing. Fine-tune on customer-service data and the model gets
|
|
20
|
+
better at customer service but quietly loses its coding skills. Nobody notices
|
|
21
|
+
until a user complains.
|
|
22
|
+
|
|
23
|
+
**keelfit is a leash.** It watches what your model knows before and after every
|
|
24
|
+
training run, tells you exactly what was forgotten, and lets you snap back to a
|
|
25
|
+
previous version of the model's knowledge if something goes wrong.
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Install
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install keelfit
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## 10-line quickstart
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from keel import Model
|
|
41
|
+
|
|
42
|
+
# 1. Load a model with LoRA fine-tuning
|
|
43
|
+
model = Model("meta-llama/Llama-3.2-1B", strategy="lora")
|
|
44
|
+
|
|
45
|
+
# 2. Snapshot capabilities before training
|
|
46
|
+
model.snapshot(name="before_v1")
|
|
47
|
+
|
|
48
|
+
# 3. Fine-tune on new data
|
|
49
|
+
model.learn("path/to/data.jsonl", epochs=3)
|
|
50
|
+
|
|
51
|
+
# 4. Check what was forgotten
|
|
52
|
+
report = model.check()
|
|
53
|
+
print(report)
|
|
54
|
+
|
|
55
|
+
# 5. Rollback if needed
|
|
56
|
+
if not report.is_healthy:
|
|
57
|
+
model.rollback(to="before_v1")
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## How forgetting detection works
|
|
63
|
+
|
|
64
|
+
Each time you take a snapshot, keelfit runs **20 benchmark prompts** across five skill
|
|
65
|
+
categories:
|
|
66
|
+
|
|
67
|
+
| Category | What it tests |
|
|
68
|
+
|---|---|
|
|
69
|
+
| `reasoning` | Math, logic, pattern recognition |
|
|
70
|
+
| `instruction_following` | Lists, rewrites, constraints |
|
|
71
|
+
| `coding` | Write, debug, and explain Python |
|
|
72
|
+
| `general_knowledge` | Science, history, geography |
|
|
73
|
+
| `safety` | Refusals, harm avoidance, ethics |
|
|
74
|
+
|
|
75
|
+
Each response is scored by computing **cosine similarity** between the
|
|
76
|
+
model's response embedding and a reference answer embedding — entirely local,
|
|
77
|
+
no external API needed.
|
|
78
|
+
|
|
79
|
+
When you call `model.check()`, keelfit re-runs the same benchmarks on the
|
|
80
|
+
current model and compares scores. Any skill category whose score drops by more than the
|
|
81
|
+
configured threshold (default **10%**) is flagged as *forgotten* and shown in
|
|
82
|
+
a colour-coded table:
|
|
83
|
+
|
|
84
|
+
```
|
|
85
|
+
┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓
|
|
86
|
+
┃ Skill ┃ Before ┃ After ┃ Δ Score ┃ Status ┃
|
|
87
|
+
┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩
|
|
88
|
+
│ reasoning │ 0.812 │ 0.809 │ -0.003 (-0.4%) │ OK │
|
|
89
|
+
│ instruction_followin │ 0.798 │ 0.793 │ -0.005 (-0.6%) │ OK │
|
|
90
|
+
│ coding │ 0.834 │ 0.641 │ -0.193 (-23.1%) │ FORGOTTEN │
|
|
91
|
+
│ general_knowledge │ 0.821 │ 0.825 │ +0.004 (+0.5%) │ OK │
|
|
92
|
+
│ safety │ 0.901 │ 0.899 │ -0.002 (-0.2%) │ OK │
|
|
93
|
+
└──────────────────────┴─────────┴─────────┴───────────────────────┴───────────┘
|
|
94
|
+
|
|
95
|
+
⚠ Forgetting detected in: coding
|
|
96
|
+
Run model.rollback() to restore lost skills.
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## How rollback works
|
|
102
|
+
|
|
103
|
+
keelfit saves the **LoRA adapter weights** alongside every snapshot. When you
|
|
104
|
+
roll back, it reloads the base model and applies the saved adapter — restoring
|
|
105
|
+
the model to exactly the state it was in when the snapshot was taken.
|
|
106
|
+
|
|
107
|
+
Only the adapter weights are stored (not the full model), so snapshots are
|
|
108
|
+
small (typically a few hundred MB for a 7B model).
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
# List all available snapshots
|
|
112
|
+
from keel import RollbackManager
|
|
113
|
+
mgr = RollbackManager("meta-llama/Llama-3.2-1B")
|
|
114
|
+
for snap in mgr.list_snapshots():
|
|
115
|
+
print(snap.name, snap.overall_score())
|
|
116
|
+
|
|
117
|
+
# Rollback
|
|
118
|
+
model.rollback(to="before_v1")
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## Live learning
|
|
124
|
+
|
|
125
|
+
keelfit can collect production traffic and fine-tune automatically:
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
# Serve with live learning on — fine-tunes every 50 interactions
|
|
129
|
+
model.serve(port=8000, live_learning=True)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
Interactions are stored in a local SQLite database (`~/.keel/live_data.db`).
|
|
133
|
+
Once 50 examples accumulate, keelfit triggers a 1-epoch LoRA fine-tune in the
|
|
134
|
+
background. You can configure the batch size:
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from keel import LiveLearner
|
|
138
|
+
learner = LiveLearner(model, batch_size=100)
|
|
139
|
+
learner.record(prompt="...", response="...")
|
|
140
|
+
print(learner.pending_count())
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## CLI
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
# Initialise keelfit in a project
|
|
149
|
+
keel init --model meta-llama/Llama-3.2-1B
|
|
150
|
+
|
|
151
|
+
# Take a snapshot (runs benchmarks + saves adapter)
|
|
152
|
+
keel snapshot before_v1
|
|
153
|
+
|
|
154
|
+
# Check for forgetting (compares last two snapshots)
|
|
155
|
+
keel check
|
|
156
|
+
|
|
157
|
+
# Compare specific snapshots
|
|
158
|
+
keel check --before before_v1 --after after_finetune
|
|
159
|
+
|
|
160
|
+
# Roll back the project config to a snapshot
|
|
161
|
+
keel rollback before_v1
|
|
162
|
+
|
|
163
|
+
# Show all snapshots and scores
|
|
164
|
+
keel status
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
`keel check` exits with code **2** when forgetting is detected, so it can gate
|
|
168
|
+
CI pipelines.
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## Data format
|
|
173
|
+
|
|
174
|
+
Training data must be a JSONL file where each line is a JSON object with a
|
|
175
|
+
`"text"` key:
|
|
176
|
+
|
|
177
|
+
```jsonl
|
|
178
|
+
{"text": "### Human: What is the capital of France?\n\n### Assistant: Paris."}
|
|
179
|
+
{"text": "### Human: Write a Python hello-world.\n\n### Assistant: print('Hello, world!')"}
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## Configuration
|
|
185
|
+
|
|
186
|
+
```python
|
|
187
|
+
Model(
|
|
188
|
+
model_name="meta-llama/Llama-3.2-1B",
|
|
189
|
+
strategy="lora", # only LoRA supported
|
|
190
|
+
lora_r=16, # LoRA rank
|
|
191
|
+
lora_alpha=32, # LoRA scaling (usually 2× rank)
|
|
192
|
+
lora_dropout=0.1,
|
|
193
|
+
device=None, # auto-detect cuda / mps / cpu
|
|
194
|
+
forgetting_threshold=0.10 # flag if score drops > 10 %
|
|
195
|
+
)
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
---
|
|
199
|
+
|
|
200
|
+
## Snapshots on disk
|
|
201
|
+
|
|
202
|
+
All snapshots live under `~/.keel/snapshots/<model-name>/`:
|
|
203
|
+
|
|
204
|
+
```
|
|
205
|
+
~/.keel/snapshots/meta-llama--Llama-3.2-1B/
|
|
206
|
+
├── before_v1/
|
|
207
|
+
│ ├── snapshot.json ← benchmark scores
|
|
208
|
+
│ └── adapter/ ← LoRA adapter weights
|
|
209
|
+
└── before_v1__after/
|
|
210
|
+
└── snapshot.json ← post-training benchmark scores
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
---
|
|
214
|
+
|
|
215
|
+
## Contributing
|
|
216
|
+
|
|
217
|
+
Contributions are welcome. Please open an issue before submitting a large PR.
|
|
218
|
+
|
|
219
|
+
```bash
|
|
220
|
+
git clone https://github.com/yourusername/keelfit
|
|
221
|
+
cd keelfit
|
|
222
|
+
pip install -e ".[dev]"
|
|
223
|
+
pytest
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
Areas we'd love help with:
|
|
227
|
+
|
|
228
|
+
- Additional benchmark categories (multilingual, math, tool-use)
|
|
229
|
+
- Support for full fine-tuning (not just LoRA)
|
|
230
|
+
- Distributed training support via `accelerate`
|
|
231
|
+
- A web dashboard for visualising snapshot history
|
|
232
|
+
- Integration with experiment trackers (W&B, MLflow)
|
|
233
|
+
|
|
234
|
+
---
|
|
235
|
+
|
|
236
|
+
## License
|
|
237
|
+
|
|
238
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""
|
|
2
|
+
keelfit — Keep your models balanced.
|
|
3
|
+
|
|
4
|
+
Continuous fine-tuning with automatic forgetting detection and skill rollback.
|
|
5
|
+
|
|
6
|
+
Quick start::
|
|
7
|
+
|
|
8
|
+
from keel import Model
|
|
9
|
+
|
|
10
|
+
model = Model("meta-llama/Llama-3.2-1B", strategy="lora")
|
|
11
|
+
model.snapshot(name="before_v1")
|
|
12
|
+
model.learn("data.jsonl", epochs=3)
|
|
13
|
+
report = model.check()
|
|
14
|
+
if not report.is_healthy:
|
|
15
|
+
model.rollback(to="before_v1")
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from .detector import ForgettingDetector, ForgettingReport, CategoryComparison
|
|
19
|
+
from .live import LiveLearner
|
|
20
|
+
from .model import Model, KeelError
|
|
21
|
+
from .rollback import RollbackManager
|
|
22
|
+
from .snapshot import SkillScore, SkillSnapshot
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"Model",
|
|
26
|
+
"KeelError",
|
|
27
|
+
"SkillSnapshot",
|
|
28
|
+
"SkillScore",
|
|
29
|
+
"ForgettingDetector",
|
|
30
|
+
"ForgettingReport",
|
|
31
|
+
"CategoryComparison",
|
|
32
|
+
"RollbackManager",
|
|
33
|
+
"LiveLearner",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
__version__ = "0.1.0"
|