vitalroute 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vitalroute-0.1.0/LICENSE +21 -0
- vitalroute-0.1.0/PKG-INFO +321 -0
- vitalroute-0.1.0/README.md +285 -0
- vitalroute-0.1.0/pyproject.toml +50 -0
- vitalroute-0.1.0/setup.cfg +4 -0
- vitalroute-0.1.0/tests/test_router.py +52 -0
- vitalroute-0.1.0/tests/test_stress.py +52 -0
- vitalroute-0.1.0/tests/test_torch.py +206 -0
- vitalroute-0.1.0/vitalroute/__init__.py +72 -0
- vitalroute-0.1.0/vitalroute/backbone/__init__.py +5 -0
- vitalroute-0.1.0/vitalroute/backbone/mlp.py +438 -0
- vitalroute-0.1.0/vitalroute/hard_samples.py +62 -0
- vitalroute-0.1.0/vitalroute/imbalance.py +82 -0
- vitalroute-0.1.0/vitalroute/lr_scale.py +129 -0
- vitalroute-0.1.0/vitalroute/router.py +329 -0
- vitalroute-0.1.0/vitalroute/torch_controller.py +298 -0
- vitalroute-0.1.0/vitalroute/torch_probe.py +386 -0
- vitalroute-0.1.0/vitalroute/torch_samplers.py +228 -0
- vitalroute-0.1.0/vitalroute/transfer.py +59 -0
- vitalroute-0.1.0/vitalroute/vitality.py +631 -0
- vitalroute-0.1.0/vitalroute.egg-info/PKG-INFO +321 -0
- vitalroute-0.1.0/vitalroute.egg-info/SOURCES.txt +23 -0
- vitalroute-0.1.0/vitalroute.egg-info/dependency_links.txt +1 -0
- vitalroute-0.1.0/vitalroute.egg-info/requires.txt +14 -0
- vitalroute-0.1.0/vitalroute.egg-info/top_level.txt +1 -0
vitalroute-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 VitalRoute Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vitalroute
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Task-aware training controller via layer vitality monitoring
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/vitalroute/vitalroute
|
|
7
|
+
Project-URL: Repository, https://github.com/vitalroute/vitalroute
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/vitalroute/vitalroute/issues
|
|
9
|
+
Project-URL: Changelog, https://github.com/vitalroute/vitalroute/blob/main/CHANGELOG.md
|
|
10
|
+
Keywords: imbalanced learning,neural network,dead neurons,adaptive training,class sampling,transfer learning,pytorch,machine learning
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Classifier: Operating System :: OS Independent
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: numpy>=1.24
|
|
25
|
+
Provides-Extra: demo
|
|
26
|
+
Requires-Dist: scikit-learn>=1.3; extra == "demo"
|
|
27
|
+
Provides-Extra: torch
|
|
28
|
+
Requires-Dist: torch>=2.0; extra == "torch"
|
|
29
|
+
Requires-Dist: torchvision>=0.15; extra == "torch"
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: scikit-learn>=1.3; extra == "dev"
|
|
32
|
+
Requires-Dist: torch>=2.0; extra == "dev"
|
|
33
|
+
Requires-Dist: torchvision>=0.15; extra == "dev"
|
|
34
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
35
|
+
Dynamic: license-file
|
|
36
|
+
|
|
37
|
+
# VitalRoute
|
|
38
|
+
|
|
39
|
+
[PyPI package](https://pypi.org/project/vitalroute/)
|
|
40
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
41
|
+
[License: MIT](LICENSE)
|
|
42
|
+
[Tests](tests/)
|
|
43
|
+
|
|
44
|
+
**Task-aware training controller** for feed-forward classifiers. It sits on top of
|
|
45
|
+
your normal optimizer (Adam, SGD, etc.) and decides *when* to apply training
|
|
46
|
+
tactics based only on **how your training set is shaped** — not by
|
|
47
|
+
hand-tuning flags for every dataset.
|
|
48
|
+
|
|
49
|
+
## Background
|
|
50
|
+
|
|
51
|
+
VitalRoute grew out of a research line that treated neural networks like
|
|
52
|
+
organisms: hidden units can show **stasis** (non-responding), **weak
|
|
53
|
+
coupling**, or **saturation**, and pretrained models can be **inherited**
|
|
54
|
+
into a child task the way biological structure carries over. The original
|
|
55
|
+
work framed those ideas as pathology and inheritance on a cell hierarchy;
|
|
56
|
+
here they are distilled into a small, practical library — vitality probes,
|
|
57
|
+
label-free parent choice, and class-aware sampling — without tying you to
|
|
58
|
+
any particular legacy codebase or naming scheme.
|
|
59
|
+
|
|
60
|
+
## Idea (plain language)
|
|
61
|
+
|
|
62
|
+
A classic biological metaphor inspired this work: treat the network like a
|
|
63
|
+
body you can **examine** while it learns.
|
|
64
|
+
|
|
65
|
+
| Signal | Meaning |
|
|
66
|
+
|---|---|
|
|
67
|
+
| **Stasis** | Hidden unit barely responds (dead ReLU, etc.) |
|
|
68
|
+
| **Weak weights** | Weight column has collapsed |
|
|
69
|
+
| **Weak input** | Incoming activations are tiny vs weights |
|
|
70
|
+
| **Saturation** | Unit stuck near a constant output |
|
|
71
|
+
|
|
72
|
+
From those readings, VitalRoute can:
|
|
73
|
+
|
|
74
|
+
1. **Vitality sampler** — For **imbalanced** data, oversample classes with high
|
|
75
|
+
**composite stress** (all four signals, not only stasis).
|
|
76
|
+
2. **Transfer pick** — For **scarce** data, choose the best pretrained parent by
|
|
77
|
+
lowest stasis on the new inputs (no labels needed), then warm-start weights.
|
|
78
|
+
3. **Hard-sample sampler** — When class rebalancing is off, oversample individual
|
|
79
|
+
examples with high per-sample stress (stasis + weak coupling + low confidence).
|
|
80
|
+
4. **LR scale** — Slow learning on layers with high stasis:
|
|
81
|
+
`lr_l = base_lr / (1 + α · stasis_l)` (helps on hard tasks at hot LR).
|
|
82
|
+
5. **Monitor** — Watch layer health; **reset** stuck units only when stasis is
|
|
83
|
+
high (skipped for CNN-style models with a separate head).
|
|
84
|
+
|
|
85
|
+
An **adaptive router** turns (1)–(4) on or off from class counts and dataset size.
|
|
86
|
+
|
|
87
|
+
## Install
|
|
88
|
+
|
|
89
|
+
```powershell
|
|
90
|
+
cd vitalroute
|
|
91
|
+
pip install -r requirements.txt
|
|
92
|
+
pip install -e .
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
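Requires Python 3.10+ and NumPy ≥ 1.24. PyTorch support is optional: install the `torch` extra (`pip install -e ".[torch]"`) to use `VitalityProbe` and the Torch samplers/controller.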
|
|
96
|
+
|
|
97
|
+
## Quick use
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
import numpy as np
|
|
101
|
+
from vitalroute import adaptive_controller, profile_task, route_plan
|
|
102
|
+
from vitalroute.backbone import MLP, LayerSpec, Adam
|
|
103
|
+
|
|
104
|
+
# Your training arrays
|
|
105
|
+
X_train, y_train = ...
|
|
106
|
+
num_classes = 10
|
|
107
|
+
|
|
108
|
+
# Preview routing (no training)
|
|
109
|
+
prof = profile_task(y_train, num_classes)
|
|
110
|
+
plan = route_plan(prof, parent_pool_available=False)
|
|
111
|
+
print(plan.label) # e.g. "imbalance", "transfer", "transfer+imbalance", "monitor"
|
|
112
|
+
|
|
113
|
+
# Attach to your training loop
|
|
114
|
+
ctrl = adaptive_controller(y_train, num_classes, parent_pool=None, verbose=True)
|
|
115
|
+
opt = ctrl.make_optimizer("adam", lr=1e-3) # vitality-scaled when route includes lr_scale
|
|
116
|
+
|
|
117
|
+
sampler, _ = ctrl.bootstrap(model, X_train, y_train, num_classes=num_classes)
|
|
118
|
+
|
|
119
|
+
for epoch in range(epochs):
|
|
120
|
+
ctrl.on_epoch_start(model, X_train, opt, epoch) # refresh LR scales if enabled
|
|
121
|
+
if sampler is not None:
|
|
122
|
+
idx = sampler.sample_indices(epoch, model, X_train, y_train, len(y_train))
|
|
123
|
+
X_ep, y_ep = X_train[idx], y_train[idx]
|
|
124
|
+
else:
|
|
125
|
+
X_ep, y_ep = X_train, y_train
|
|
126
|
+
# ... your batches, loss, optimizer step ...
|
|
127
|
+
ctrl.after_epoch(model, X_train, rng=np.random.default_rng(epoch))
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
See `examples/digits_imbalanced_demo.py` for a runnable sketch.
|
|
131
|
+
|
|
132
|
+
## Router rules (defaults)
|
|
133
|
+
|
|
134
|
+
| Condition | Enabled |
|
|
135
|
+
|---|---|
|
|
136
|
+
| `min_class / max_class < 0.25` and minority ≥ 15 samples | Vitality sampler (composite stress) |
|
|
137
|
+
| `n ≤ 200` or `min_class ≤ 12`, and parent pool provided | Transfer pick |
|
|
138
|
+
| Scarce **balanced** data | Transfer + hard-sample sampler (no class sampler) |
|
|
139
|
+
| `n ≥ 80`, sampler off | LR scale |
|
|
140
|
+
| `n ≥ 40`, sampler off | Hard-sample sampler |
|
|
141
|
+
| Always (when training) | Monitor (+ conditional reset) |
|
|
142
|
+
|
|
143
|
+
## What this project is / is not
|
|
144
|
+
|
|
145
|
+
**Is:**
|
|
146
|
+
|
|
147
|
+
- A small library (NumPy + optional PyTorch) extracted from a larger neural-network research codebase
|
|
148
|
+
- Evidence-backed on imbalanced digits, Fashion-MNIST long-tail, and scarce transfer tasks
|
|
149
|
+
- Compatible with any PyTorch `nn.Module` via `VitalityProbe` forward hooks
|
|
150
|
+
- Compatible with the custom NumPy backbone via `adaptive_controller`
|
|
151
|
+
|
|
152
|
+
**Is not:**
|
|
153
|
+
|
|
154
|
+
- A replacement for backprop or PyTorch
|
|
155
|
+
- A guarantee of SOTA accuracy on vision (use a real CNN framework for that)
|
|
156
|
+
- A claim of novelty vs all of ML — curriculum and transfer learning exist; the hook is **vitality-driven routing**
|
|
157
|
+
|
|
158
|
+
## How it compares to inverse-frequency weighting
|
|
159
|
+
|
|
160
|
+
On a clean long-tail benchmark, VitalRoute ≈ inverse-frequency (inv_freq). They converge to the same answer because rare classes and broken-neuron classes heavily overlap — the network sees minority classes less, so their neurons die more.
|
|
161
|
+
|
|
162
|
+
**Where VitalRoute has a real edge over inv_freq:**
|
|
163
|
+
|
|
164
|
+
| Scenario | Why VitalRoute helps |
|
|
165
|
+
|---|---|
|
|
166
|
+
| Imbalanced but not uniformly scarce | A class with enough samples but high confusability (broken neurons) gets oversampled; inv_freq ignores it |
|
|
167
|
+
| Difficulty shifts mid-training | VitalRoute refreshes stress every N epochs; inv_freq is static |
|
|
168
|
+
| Label-free transfer selection | Picks the best pretrained parent by stasis on new inputs — no labels needed. inv_freq has no equivalent |
|
|
169
|
+
| Hard-sample curriculum | Per-sample stress (stasis + weak coupling + low confidence) for scarce balanced data; inv_freq only works at class level |
|
|
170
|
+
|
|
171
|
+
If your problem is purely long-tail with clean class boundaries, inv_freq is simpler and nearly as good. If classes overlap, difficulty shifts, or you need transfer selection without labels, VitalRoute adds real value.
|
|
172
|
+
|
|
173
|
+
## Package layout
|
|
174
|
+
|
|
175
|
+
```text
|
|
176
|
+
vitalroute/
|
|
177
|
+
README.md
|
|
178
|
+
PAPER.md # research paper style writeup
|
|
179
|
+
INTEGRATION.md
|
|
180
|
+
pyproject.toml
|
|
181
|
+
vitalroute/
|
|
182
|
+
vitality.py # layer stress probes + per-class/per-sample stress
|
|
183
|
+
imbalance.py # composite vitality class sampler (NumPy)
|
|
184
|
+
hard_samples.py # per-sample stress sampler (NumPy)
|
|
185
|
+
lr_scale.py # vitality-scaled Adam / SGD (NumPy)
|
|
186
|
+
transfer.py # label-free parent pick
|
|
187
|
+
router.py # task profile + adaptive controller (NumPy)
|
|
188
|
+
torch_probe.py # VitalityProbe — forward hooks for any nn.Module
|
|
189
|
+
torch_samplers.py # TorchVitalitySampler + TorchHardSampleSampler
|
|
190
|
+
torch_controller.py # TorchTrainingController + torch_adaptive_controller
|
|
191
|
+
backbone/ # optional reference MLP for demos
|
|
192
|
+
examples/
|
|
193
|
+
digits_imbalanced_demo.py # NumPy backbone quick demo
|
|
194
|
+
benchmark_baselines.py # NumPy baseline comparison (digits)
|
|
195
|
+
torch_probe_demo.py # VitalityProbe on a PyTorch MLP
|
|
196
|
+
torch_benchmark_fmnist.py # PyTorch baseline comparison (Fashion-MNIST)
|
|
197
|
+
cifar10_resnet_benchmark.py # ResNet18 / CIFAR-10 (GPU recommended)
|
|
198
|
+
tests/
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
## Evidence summary
|
|
202
|
+
|
|
203
|
+
Measured on public-style benchmarks during development:
|
|
204
|
+
|
|
205
|
+
| Setting | Typical gain |
|
|
206
|
+
|---|---|
|
|
207
|
+
| Imbalanced digits / Fashion minority classes | +2–4% minority accuracy vs uniform |
|
|
208
|
+
| vs inverse-frequency baseline (same imbalanced digits) | +0.7% minority, lower variance |
|
|
209
|
+
| Scarce digit subset with parent pool | up to +10% vs cold start |
|
|
210
|
+
| Scarce cat/dog (MLP / small CNN) | +2–3% with transfer pick |
|
|
211
|
+
|
|
212
|
+
**NumPy backbone benchmark** (`examples/benchmark_baselines.py`), 3 seeds, 30 epochs, 5:1 imbalance on digits:
|
|
213
|
+
|
|
214
|
+
```
|
|
215
|
+
Method Overall Minority
|
|
216
|
+
uniform 93.7%±1.1% 87.9%±2.3%
|
|
217
|
+
inv_freq 94.4%±0.8% 90.1%±1.0%
|
|
218
|
+
vitalroute 95.1%±0.3% 90.8%±1.0% ← best overall + lowest variance
|
|
219
|
+
stasis_only 95.0%±0.7% 90.7%±1.5%
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
**PyTorch benchmark** (`examples/torch_benchmark_fmnist.py`), 3 seeds, 20 epochs, 10:1 imbalance on Fashion-MNIST MLP:
|
|
223
|
+
|
|
224
|
+
```
|
|
225
|
+
Method Overall Minority
|
|
226
|
+
uniform 80.1%±0.4% 72.8%±1.2%
|
|
227
|
+
inv_freq 81.7%±0.5% 77.6%±0.9%
|
|
228
|
+
focal 80.0%±0.3% 72.6%±0.2%
|
|
229
|
+
vitalroute 81.7%±0.2% 76.5%±0.6% ← matches inv_freq, beats focal/uniform
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
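On Fashion-MNIST, VitalRoute matches inverse-frequency on overall accuracy (81.7%) and trails it by about one point on minority accuracy (76.5% vs 77.6%), while showing the lowest overall-accuracy variance of the four methods (±0.2%). On the digits backbone it gains an additional +0.7% minority accuracy over inv_freq, with lower overall-accuracy variance (±0.3% vs ±0.8%) and equal minority variance (±1.0%).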
|
|
233
|
+
|
|
234
|
+
## PyTorch integration
|
|
235
|
+
|
|
236
|
+
### Probe only (read vitality signals)
|
|
237
|
+
|
|
238
|
+
`VitalityProbe` attaches to any `torch.nn.Module` via forward hooks — no
|
|
239
|
+
changes to your model or optimizer required:
|
|
240
|
+
|
|
241
|
+
```python
|
|
242
|
+
from vitalroute.torch_probe import VitalityProbe
|
|
243
|
+
|
|
244
|
+
probe = VitalityProbe(model) # attach once; pairs Linear→ReLU automatically
|
|
245
|
+
|
|
246
|
+
for epoch in range(epochs):
|
|
247
|
+
train_one_epoch(model, ...)
|
|
248
|
+
probe.observe(X_train) # one forward pass, no gradients
|
|
249
|
+
print(probe.summary()) # per-layer stasis + composite stress
|
|
250
|
+
print(f"mean stasis: {probe.mean_stasis():.3f}")
|
|
251
|
+
|
|
252
|
+
# Per-class and per-sample stress for custom sampling
|
|
253
|
+
class_scores = probe.per_class_stress(X_train, y_train, num_classes=10)
|
|
254
|
+
sample_scores = probe.per_sample_stress(X_train, y_train)
|
|
255
|
+
|
|
256
|
+
probe.detach() # clean up hooks
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
### Full adaptive controller
|
|
260
|
+
|
|
261
|
+
`torch_adaptive_controller` reads your class distribution and picks tactics automatically:
|
|
262
|
+
|
|
263
|
+
```python
|
|
264
|
+
from vitalroute.torch_controller import torch_adaptive_controller
|
|
265
|
+
from torch.utils.data import DataLoader
|
|
266
|
+
|
|
267
|
+
ctrl = torch_adaptive_controller(y_train, num_classes=10, verbose=True)
|
|
268
|
+
|
|
269
|
+
# X_probe/y_probe: small stratified batch (~50/class) for the probe
|
|
270
|
+
# y_full: full training labels for the sampler's class pools
|
|
271
|
+
sampler = ctrl.setup(model, X_probe, y_probe, y_full=y_train_full,
|
|
272
|
+
num_classes=10)
|
|
273
|
+
|
|
274
|
+
loader = DataLoader(dataset, sampler=sampler, batch_size=64)
|
|
275
|
+
|
|
276
|
+
for epoch in range(epochs):
|
|
277
|
+
ctrl.on_epoch_start(model, X_probe, optimizer, epoch)
|
|
278
|
+
for X_batch, y_batch in loader:
|
|
279
|
+
... # your normal loss + backward + step
|
|
280
|
+
ctrl.after_epoch(model, X_probe, y_probe)
|
|
281
|
+
|
|
282
|
+
ctrl.detach()
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
See `examples/torch_probe_demo.py` and `examples/torch_benchmark_fmnist.py` for runnable examples.
|
|
286
|
+
|
|
287
|
+
## License
|
|
288
|
+
|
|
289
|
+
MIT
|
|
290
|
+
|
|
291
|
+
## Related Work
|
|
292
|
+
|
|
293
|
+
VitalRoute draws on or is informed by the following lines of research. Where VitalRoute differs is noted.
|
|
294
|
+
|
|
295
|
+
**Adaptive class resampling**
|
|
296
|
+
- [ART: Adaptive Resampling-based Training for Imbalanced Classification](https://arxiv.org/abs/2509.00955) (2025) — periodically refreshes class sampling weights using class-wise F1 scores. VitalRoute uses internal neuron health signals instead of output metrics.
|
|
297
|
+
|
|
298
|
+
**Dead neuron analysis and pruning**
|
|
299
|
+
- [When to Prune? A Policy towards Early Structural Pruning](https://openreview.net/pdf?id=2wFXD2upSQ) — uses dead-neuron rates to guide structured pruning during training. VitalRoute uses the same signal to drive *sampling*, not pruning.
|
|
300
|
+
- [Dead neurons in Deep Learning (overview)](https://medium.com/@abhishekjainindore24/dead-neurons-in-deep-learning-their-effects-and-remedies-to-solve-it-e63da4dd9212)
|
|
301
|
+
|
|
302
|
+
**Dynamic network structure for imbalanced learning**
|
|
303
|
+
- [Adaptive Neuron Growth/Pruning for Imbalanced Classification](https://arxiv.org/abs/2507.09940) (2025) — adds/removes neurons per class using gradient magnitude. Orthogonal to VitalRoute: modifies architecture rather than sampling.
|
|
304
|
+
|
|
305
|
+
**Per-layer learning rate scaling**
|
|
306
|
+
- [LENA: Layer-wise Adaptive LR Scaling](https://dl.acm.org/doi/fullHtml/10.1145/3485447.3511989) — scales per-layer LR by gradient variance. VitalRoute scales by stasis (dead unit fraction), a complementary signal.
|
|
307
|
+
- [LLR: Heavy-Tail Guided Layerwise LR for LLMs](https://arxiv.org/html/2605.22297v1) (2026) — uses weight spectrum heavy-tailedness. Same goal, different diagnostic.
|
|
308
|
+
- [AdaLip: Adaptive LR per Layer via Lipschitz Estimation](https://d-nb.info/1283272997/34) — Lipschitz-constant-based per-layer LR.
|
|
309
|
+
- [LARS](https://arxiv.org/abs/1708.03888) / [LAMB](https://arxiv.org/abs/1904.00962) — weight/gradient ratio scaling; used in large-batch distributed training.
|
|
310
|
+
|
|
311
|
+
**Label-free transfer model selection**
|
|
312
|
+
- [TURTLE: Unsupervised Transfer Learning](https://arxiv.org/html/2406.07236v1) (2024) — selects pretrained models without labels via representation-level generalization objectives. VitalRoute uses stasis rate on new data — simpler, different rationale.
|
|
313
|
+
- [DISCO: Spectral Component Distribution for Transfer Assessment](https://arxiv.org/html/2412.19085v2) (2024) — SVD of feature distributions for transferability scoring.
|
|
314
|
+
|
|
315
|
+
**Focal Loss (baseline used in benchmarks)**
|
|
316
|
+
- [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002) — Lin et al., 2017. Standard hard-example weighting via loss modulation.
|
|
317
|
+
|
|
318
|
+
**Curriculum / hard-sample learning**
|
|
319
|
+
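- [Self-Paced Learning for Latent Variable Models](https://papers.nips.cc/paper_files/paper/2010/hash/e57c6b956a6521b28495f2886ca0977a-Abstract.html) — Kumar, Packer & Koller, 2010. Foundation for self-paced, curriculum-style training (building on Curriculum Learning, Bengio et al., 2009).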
|
|
320
|
+
- [Online Hard Example Mining](https://arxiv.org/abs/1604.03540) — Shrivastava et al., 2016. Per-sample difficulty weighting from loss values.
|
|
321
|
+
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
# VitalRoute
|
|
2
|
+
|
|
3
|
+
[PyPI package](https://pypi.org/project/vitalroute/)
|
|
4
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
5
|
+
[License: MIT](LICENSE)
|
|
6
|
+
[Tests](tests/)
|
|
7
|
+
|
|
8
|
+
**Task-aware training controller** for feed-forward classifiers. It sits on top of
|
|
9
|
+
your normal optimizer (Adam, SGD, etc.) and decides *when* to apply training
|
|
10
|
+
tactics based only on **how your training set is shaped** — not by
|
|
11
|
+
hand-tuning flags for every dataset.
|
|
12
|
+
|
|
13
|
+
## Background
|
|
14
|
+
|
|
15
|
+
VitalRoute grew out of a research line that treated neural networks like
|
|
16
|
+
organisms: hidden units can show **stasis** (non-responding), **weak
|
|
17
|
+
coupling**, or **saturation**, and pretrained models can be **inherited**
|
|
18
|
+
into a child task the way biological structure carries over. The original
|
|
19
|
+
work framed those ideas as pathology and inheritance on a cell hierarchy;
|
|
20
|
+
here they are distilled into a small, practical library — vitality probes,
|
|
21
|
+
label-free parent choice, and class-aware sampling — without tying you to
|
|
22
|
+
any particular legacy codebase or naming scheme.
|
|
23
|
+
|
|
24
|
+
## Idea (plain language)
|
|
25
|
+
|
|
26
|
+
A classic biological metaphor inspired this work: treat the network like a
|
|
27
|
+
body you can **examine** while it learns.
|
|
28
|
+
|
|
29
|
+
| Signal | Meaning |
|
|
30
|
+
|---|---|
|
|
31
|
+
| **Stasis** | Hidden unit barely responds (dead ReLU, etc.) |
|
|
32
|
+
| **Weak weights** | Weight column has collapsed |
|
|
33
|
+
| **Weak input** | Incoming activations are tiny vs weights |
|
|
34
|
+
| **Saturation** | Unit stuck near a constant output |
|
|
35
|
+
|
|
36
|
+
From those readings, VitalRoute can:
|
|
37
|
+
|
|
38
|
+
1. **Vitality sampler** — For **imbalanced** data, oversample classes with high
|
|
39
|
+
**composite stress** (all four signals, not only stasis).
|
|
40
|
+
2. **Transfer pick** — For **scarce** data, choose the best pretrained parent by
|
|
41
|
+
lowest stasis on the new inputs (no labels needed), then warm-start weights.
|
|
42
|
+
3. **Hard-sample sampler** — When class rebalancing is off, oversample individual
|
|
43
|
+
examples with high per-sample stress (stasis + weak coupling + low confidence).
|
|
44
|
+
4. **LR scale** — Slow learning on layers with high stasis:
|
|
45
|
+
`lr_l = base_lr / (1 + α · stasis_l)` (helps on hard tasks at hot LR).
|
|
46
|
+
5. **Monitor** — Watch layer health; **reset** stuck units only when stasis is
|
|
47
|
+
high (skipped for CNN-style models with a separate head).
|
|
48
|
+
|
|
49
|
+
An **adaptive router** turns (1)–(4) on or off from class counts and dataset size.
|
|
50
|
+
|
|
51
|
+
## Install
|
|
52
|
+
|
|
53
|
+
```powershell
|
|
54
|
+
cd vitalroute
|
|
55
|
+
pip install -r requirements.txt
|
|
56
|
+
pip install -e .
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
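Requires Python 3.10+ and NumPy ≥ 1.24. PyTorch support is optional: install the `torch` extra (`pip install -e ".[torch]"`) to use `VitalityProbe` and the Torch samplers/controller.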
|
|
60
|
+
|
|
61
|
+
## Quick use
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
import numpy as np
|
|
65
|
+
from vitalroute import adaptive_controller, profile_task, route_plan
|
|
66
|
+
from vitalroute.backbone import MLP, LayerSpec, Adam
|
|
67
|
+
|
|
68
|
+
# Your training arrays
|
|
69
|
+
X_train, y_train = ...
|
|
70
|
+
num_classes = 10
|
|
71
|
+
|
|
72
|
+
# Preview routing (no training)
|
|
73
|
+
prof = profile_task(y_train, num_classes)
|
|
74
|
+
plan = route_plan(prof, parent_pool_available=False)
|
|
75
|
+
print(plan.label) # e.g. "imbalance", "transfer", "transfer+imbalance", "monitor"
|
|
76
|
+
|
|
77
|
+
# Attach to your training loop
|
|
78
|
+
ctrl = adaptive_controller(y_train, num_classes, parent_pool=None, verbose=True)
|
|
79
|
+
opt = ctrl.make_optimizer("adam", lr=1e-3) # vitality-scaled when route includes lr_scale
|
|
80
|
+
|
|
81
|
+
sampler, _ = ctrl.bootstrap(model, X_train, y_train, num_classes=num_classes)
|
|
82
|
+
|
|
83
|
+
for epoch in range(epochs):
|
|
84
|
+
ctrl.on_epoch_start(model, X_train, opt, epoch) # refresh LR scales if enabled
|
|
85
|
+
if sampler is not None:
|
|
86
|
+
idx = sampler.sample_indices(epoch, model, X_train, y_train, len(y_train))
|
|
87
|
+
X_ep, y_ep = X_train[idx], y_train[idx]
|
|
88
|
+
else:
|
|
89
|
+
X_ep, y_ep = X_train, y_train
|
|
90
|
+
# ... your batches, loss, optimizer step ...
|
|
91
|
+
ctrl.after_epoch(model, X_train, rng=np.random.default_rng(epoch))
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
See `examples/digits_imbalanced_demo.py` for a runnable sketch.
|
|
95
|
+
|
|
96
|
+
## Router rules (defaults)
|
|
97
|
+
|
|
98
|
+
| Condition | Enabled |
|
|
99
|
+
|---|---|
|
|
100
|
+
| `min_class / max_class < 0.25` and minority ≥ 15 samples | Vitality sampler (composite stress) |
|
|
101
|
+
| `n ≤ 200` or `min_class ≤ 12`, and parent pool provided | Transfer pick |
|
|
102
|
+
| Scarce **balanced** data | Transfer + hard-sample sampler (no class sampler) |
|
|
103
|
+
| `n ≥ 80`, sampler off | LR scale |
|
|
104
|
+
| `n ≥ 40`, sampler off | Hard-sample sampler |
|
|
105
|
+
| Always (when training) | Monitor (+ conditional reset) |
|
|
106
|
+
|
|
107
|
+
## What this project is / is not
|
|
108
|
+
|
|
109
|
+
**Is:**
|
|
110
|
+
|
|
111
|
+
- A small library (NumPy + optional PyTorch) extracted from a larger neural-network research codebase
|
|
112
|
+
- Evidence-backed on imbalanced digits, Fashion-MNIST long-tail, and scarce transfer tasks
|
|
113
|
+
- Compatible with any PyTorch `nn.Module` via `VitalityProbe` forward hooks
|
|
114
|
+
- Compatible with the custom NumPy backbone via `adaptive_controller`
|
|
115
|
+
|
|
116
|
+
**Is not:**
|
|
117
|
+
|
|
118
|
+
- A replacement for backprop or PyTorch
|
|
119
|
+
- A guarantee of SOTA accuracy on vision (use a real CNN framework for that)
|
|
120
|
+
- A claim of novelty vs all of ML — curriculum and transfer learning exist; the hook is **vitality-driven routing**
|
|
121
|
+
|
|
122
|
+
## How it compares to inverse-frequency weighting
|
|
123
|
+
|
|
124
|
+
On a clean long-tail benchmark, VitalRoute ≈ inverse-frequency (inv_freq). They converge to the same answer because rare classes and broken-neuron classes heavily overlap — the network sees minority classes less, so their neurons die more.
|
|
125
|
+
|
|
126
|
+
**Where VitalRoute has a real edge over inv_freq:**
|
|
127
|
+
|
|
128
|
+
| Scenario | Why VitalRoute helps |
|
|
129
|
+
|---|---|
|
|
130
|
+
| Imbalanced but not uniformly scarce | A class with enough samples but high confusability (broken neurons) gets oversampled; inv_freq ignores it |
|
|
131
|
+
| Difficulty shifts mid-training | VitalRoute refreshes stress every N epochs; inv_freq is static |
|
|
132
|
+
| Label-free transfer selection | Picks the best pretrained parent by stasis on new inputs — no labels needed. inv_freq has no equivalent |
|
|
133
|
+
| Hard-sample curriculum | Per-sample stress (stasis + weak coupling + low confidence) for scarce balanced data; inv_freq only works at class level |
|
|
134
|
+
|
|
135
|
+
If your problem is purely long-tail with clean class boundaries, inv_freq is simpler and nearly as good. If classes overlap, difficulty shifts, or you need transfer selection without labels, VitalRoute adds real value.
|
|
136
|
+
|
|
137
|
+
## Package layout
|
|
138
|
+
|
|
139
|
+
```text
|
|
140
|
+
vitalroute/
|
|
141
|
+
README.md
|
|
142
|
+
PAPER.md # research paper style writeup
|
|
143
|
+
INTEGRATION.md
|
|
144
|
+
pyproject.toml
|
|
145
|
+
vitalroute/
|
|
146
|
+
vitality.py # layer stress probes + per-class/per-sample stress
|
|
147
|
+
imbalance.py # composite vitality class sampler (NumPy)
|
|
148
|
+
hard_samples.py # per-sample stress sampler (NumPy)
|
|
149
|
+
lr_scale.py # vitality-scaled Adam / SGD (NumPy)
|
|
150
|
+
transfer.py # label-free parent pick
|
|
151
|
+
router.py # task profile + adaptive controller (NumPy)
|
|
152
|
+
torch_probe.py # VitalityProbe — forward hooks for any nn.Module
|
|
153
|
+
torch_samplers.py # TorchVitalitySampler + TorchHardSampleSampler
|
|
154
|
+
torch_controller.py # TorchTrainingController + torch_adaptive_controller
|
|
155
|
+
backbone/ # optional reference MLP for demos
|
|
156
|
+
examples/
|
|
157
|
+
digits_imbalanced_demo.py # NumPy backbone quick demo
|
|
158
|
+
benchmark_baselines.py # NumPy baseline comparison (digits)
|
|
159
|
+
torch_probe_demo.py # VitalityProbe on a PyTorch MLP
|
|
160
|
+
torch_benchmark_fmnist.py # PyTorch baseline comparison (Fashion-MNIST)
|
|
161
|
+
cifar10_resnet_benchmark.py # ResNet18 / CIFAR-10 (GPU recommended)
|
|
162
|
+
tests/
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
## Evidence summary
|
|
166
|
+
|
|
167
|
+
Measured on public-style benchmarks during development:
|
|
168
|
+
|
|
169
|
+
| Setting | Typical gain |
|
|
170
|
+
|---|---|
|
|
171
|
+
| Imbalanced digits / Fashion minority classes | +2–4% minority accuracy vs uniform |
|
|
172
|
+
| vs inverse-frequency baseline (same imbalanced digits) | +0.7% minority, lower variance |
|
|
173
|
+
| Scarce digit subset with parent pool | up to +10% vs cold start |
|
|
174
|
+
| Scarce cat/dog (MLP / small CNN) | +2–3% with transfer pick |
|
|
175
|
+
|
|
176
|
+
**NumPy backbone benchmark** (`examples/benchmark_baselines.py`), 3 seeds, 30 epochs, 5:1 imbalance on digits:
|
|
177
|
+
|
|
178
|
+
```
|
|
179
|
+
Method Overall Minority
|
|
180
|
+
uniform 93.7%±1.1% 87.9%±2.3%
|
|
181
|
+
inv_freq 94.4%±0.8% 90.1%±1.0%
|
|
182
|
+
vitalroute 95.1%±0.3% 90.8%±1.0% ← best overall + lowest variance
|
|
183
|
+
stasis_only 95.0%±0.7% 90.7%±1.5%
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
**PyTorch benchmark** (`examples/torch_benchmark_fmnist.py`), 3 seeds, 20 epochs, 10:1 imbalance on Fashion-MNIST MLP:
|
|
187
|
+
|
|
188
|
+
```
|
|
189
|
+
Method Overall Minority
|
|
190
|
+
uniform 80.1%±0.4% 72.8%±1.2%
|
|
191
|
+
inv_freq 81.7%±0.5% 77.6%±0.9%
|
|
192
|
+
focal 80.0%±0.3% 72.6%±0.2%
|
|
193
|
+
vitalroute 81.7%±0.2% 76.5%±0.6% ← matches inv_freq, beats focal/uniform
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
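On Fashion-MNIST, VitalRoute matches inverse-frequency on overall accuracy (81.7%) and trails it by about one point on minority accuracy (76.5% vs 77.6%), while showing the lowest overall-accuracy variance of the four methods (±0.2%). On the digits backbone it gains an additional +0.7% minority accuracy over inv_freq, with lower overall-accuracy variance (±0.3% vs ±0.8%) and equal minority variance (±1.0%).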
|
|
197
|
+
|
|
198
|
+
## PyTorch integration
|
|
199
|
+
|
|
200
|
+
### Probe only (read vitality signals)
|
|
201
|
+
|
|
202
|
+
`VitalityProbe` attaches to any `torch.nn.Module` via forward hooks — no
|
|
203
|
+
changes to your model or optimizer required:
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
from vitalroute.torch_probe import VitalityProbe
|
|
207
|
+
|
|
208
|
+
probe = VitalityProbe(model) # attach once; pairs Linear→ReLU automatically
|
|
209
|
+
|
|
210
|
+
for epoch in range(epochs):
|
|
211
|
+
train_one_epoch(model, ...)
|
|
212
|
+
probe.observe(X_train) # one forward pass, no gradients
|
|
213
|
+
print(probe.summary()) # per-layer stasis + composite stress
|
|
214
|
+
print(f"mean stasis: {probe.mean_stasis():.3f}")
|
|
215
|
+
|
|
216
|
+
# Per-class and per-sample stress for custom sampling
|
|
217
|
+
class_scores = probe.per_class_stress(X_train, y_train, num_classes=10)
|
|
218
|
+
sample_scores = probe.per_sample_stress(X_train, y_train)
|
|
219
|
+
|
|
220
|
+
probe.detach() # clean up hooks
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
### Full adaptive controller
|
|
224
|
+
|
|
225
|
+
`torch_adaptive_controller` reads your class distribution and picks tactics automatically:
|
|
226
|
+
|
|
227
|
+
```python
|
|
228
|
+
from vitalroute.torch_controller import torch_adaptive_controller
|
|
229
|
+
from torch.utils.data import DataLoader
|
|
230
|
+
|
|
231
|
+
ctrl = torch_adaptive_controller(y_train, num_classes=10, verbose=True)
|
|
232
|
+
|
|
233
|
+
# X_probe/y_probe: small stratified batch (~50/class) for the probe
|
|
234
|
+
# y_full: full training labels for the sampler's class pools
|
|
235
|
+
sampler = ctrl.setup(model, X_probe, y_probe, y_full=y_train_full,
|
|
236
|
+
num_classes=10)
|
|
237
|
+
|
|
238
|
+
loader = DataLoader(dataset, sampler=sampler, batch_size=64)
|
|
239
|
+
|
|
240
|
+
for epoch in range(epochs):
|
|
241
|
+
ctrl.on_epoch_start(model, X_probe, optimizer, epoch)
|
|
242
|
+
for X_batch, y_batch in loader:
|
|
243
|
+
... # your normal loss + backward + step
|
|
244
|
+
ctrl.after_epoch(model, X_probe, y_probe)
|
|
245
|
+
|
|
246
|
+
ctrl.detach()
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
See `examples/torch_probe_demo.py` and `examples/torch_benchmark_fmnist.py` for runnable examples.
|
|
250
|
+
|
|
251
|
+
## License
|
|
252
|
+
|
|
253
|
+
MIT
|
|
254
|
+
|
|
255
|
+
## Related Work
|
|
256
|
+
|
|
257
|
+
VitalRoute draws on or is informed by the following lines of research. Where VitalRoute differs is noted.
|
|
258
|
+
|
|
259
|
+
**Adaptive class resampling**
|
|
260
|
+
- [ART: Adaptive Resampling-based Training for Imbalanced Classification](https://arxiv.org/abs/2509.00955) (2025) — periodically refreshes class sampling weights using class-wise F1 scores. VitalRoute uses internal neuron health signals instead of output metrics.
|
|
261
|
+
|
|
262
|
+
**Dead neuron analysis and pruning**
|
|
263
|
+
- [When to Prune? A Policy towards Early Structural Pruning](https://openreview.net/pdf?id=2wFXD2upSQ) — uses dead-neuron rates to guide structured pruning during training. VitalRoute uses the same signal to drive *sampling*, not pruning.
|
|
264
|
+
- [Dead neurons in Deep Learning (overview)](https://medium.com/@abhishekjainindore24/dead-neurons-in-deep-learning-their-effects-and-remedies-to-solve-it-e63da4dd9212)
|
|
265
|
+
|
|
266
|
+
**Dynamic network structure for imbalanced learning**
|
|
267
|
+
- [Adaptive Neuron Growth/Pruning for Imbalanced Classification](https://arxiv.org/abs/2507.09940) (2025) — adds/removes neurons per class using gradient magnitude. Orthogonal to VitalRoute: modifies architecture rather than sampling.
|
|
268
|
+
|
|
269
|
+
**Per-layer learning rate scaling**
|
|
270
|
+
- [LENA: Layer-wise Adaptive LR Scaling](https://dl.acm.org/doi/fullHtml/10.1145/3485447.3511989) — scales per-layer LR by gradient variance. VitalRoute scales by stasis (dead unit fraction), a complementary signal.
|
|
271
|
+
- [LLR: Heavy-Tail Guided Layerwise LR for LLMs](https://arxiv.org/html/2605.22297v1) (2026) — uses weight spectrum heavy-tailedness. Same goal, different diagnostic.
|
|
272
|
+
- [AdaLip: Adaptive LR per Layer via Lipschitz Estimation](https://d-nb.info/1283272997/34) — Lipschitz-constant-based per-layer LR.
|
|
273
|
+
- [LARS](https://arxiv.org/abs/1708.03888) / [LAMB](https://arxiv.org/abs/1904.00962) — weight/gradient ratio scaling; used in large-batch distributed training.
|
|
274
|
+
|
|
275
|
+
**Label-free transfer model selection**
|
|
276
|
+
- [TURTLE: Unsupervised Transfer Learning](https://arxiv.org/html/2406.07236v1) (2024) — selects pretrained models without labels via representation-level generalization objectives. VitalRoute uses stasis rate on new data — simpler, different rationale.
|
|
277
|
+
- [DISCO: Spectral Component Distribution for Transfer Assessment](https://arxiv.org/html/2412.19085v2) (2024) — SVD of feature distributions for transferability scoring.
|
|
278
|
+
|
|
279
|
+
**Focal Loss (baseline used in benchmarks)**
|
|
280
|
+
- [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002) — Lin et al., 2017. Standard hard-example weighting via loss modulation.
|
|
281
|
+
|
|
282
|
+
**Curriculum / hard-sample learning**
|
|
283
|
+
- [Self-Paced Learning](https://papers.nips.cc/paper_files/paper/2010/hash/e57c6b956a6521b28495f2886ca0977a-Abstract.html) — Kumar, Packer & Koller, 2010. Builds on curriculum learning (Bengio et al., 2009); foundation for curriculum-style training.
|
|
284
|
+
- [Online Hard Example Mining](https://arxiv.org/abs/1604.03540) — Shrivastava et al., 2016. Per-sample difficulty weighting from loss values.
|
|
285
|
+
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=77.0.3"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "vitalroute"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Task-aware training controller via layer vitality monitoring"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
license-files = ["LICENSE"]
|
|
13
|
+
dependencies = ["numpy>=1.24"]
|
|
14
|
+
keywords = [
|
|
15
|
+
"imbalanced learning",
|
|
16
|
+
"neural network",
|
|
17
|
+
"dead neurons",
|
|
18
|
+
"adaptive training",
|
|
19
|
+
"class sampling",
|
|
20
|
+
"transfer learning",
|
|
21
|
+
"pytorch",
|
|
22
|
+
"machine learning",
|
|
23
|
+
]
|
|
24
|
+
classifiers = [
|
|
25
|
+
"Development Status :: 3 - Alpha",
|
|
26
|
+
"Intended Audience :: Science/Research",
|
|
27
|
+
"Intended Audience :: Developers",
|
|
28
|
+
"Programming Language :: Python :: 3",
|
|
29
|
+
"Programming Language :: Python :: 3.10",
|
|
30
|
+
"Programming Language :: Python :: 3.11",
|
|
31
|
+
"Programming Language :: Python :: 3.12",
|
|
32
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
33
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
34
|
+
"Operating System :: OS Independent",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.urls]
|
|
38
|
+
Homepage = "https://github.com/vitalroute/vitalroute"
|
|
39
|
+
Repository = "https://github.com/vitalroute/vitalroute"
|
|
40
|
+
"Bug Tracker" = "https://github.com/vitalroute/vitalroute/issues"
|
|
41
|
+
Changelog = "https://github.com/vitalroute/vitalroute/blob/main/CHANGELOG.md"
|
|
42
|
+
|
|
43
|
+
[project.optional-dependencies]
|
|
44
|
+
demo = ["scikit-learn>=1.3"]
|
|
45
|
+
torch = ["torch>=2.0", "torchvision>=0.15"]
|
|
46
|
+
dev = ["scikit-learn>=1.3", "torch>=2.0", "torchvision>=0.15", "pytest>=7.0"]
|
|
47
|
+
|
|
48
|
+
[tool.setuptools.packages.find]
|
|
49
|
+
where = ["."]
|
|
50
|
+
include = ["vitalroute*"]
|