hotcb 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hotcb-0.1.0/LICENSE +0 -0
- hotcb-0.1.0/PKG-INFO +551 -0
- hotcb-0.1.0/README.md +519 -0
- hotcb-0.1.0/pyproject.toml +58 -0
- hotcb-0.1.0/setup.cfg +4 -0
- hotcb-0.1.0/src/hotcb/__init__.py +3 -0
- hotcb-0.1.0/src/hotcb/adapters/__init__.py +1 -0
- hotcb-0.1.0/src/hotcb/adapters/hf.py +390 -0
- hotcb-0.1.0/src/hotcb/adapters/lightning.py +348 -0
- hotcb-0.1.0/src/hotcb/callbacks/__init__.py +17 -0
- hotcb-0.1.0/src/hotcb/callbacks/anomaly_guard.py +124 -0
- hotcb-0.1.0/src/hotcb/callbacks/grad_stats.py +124 -0
- hotcb-0.1.0/src/hotcb/callbacks/heartbeat.py +98 -0
- hotcb-0.1.0/src/hotcb/callbacks/jsonl_logger.py +142 -0
- hotcb-0.1.0/src/hotcb/callbacks/system_stats.py +105 -0
- hotcb-0.1.0/src/hotcb/callbacks/tensor_stats.py +114 -0
- hotcb-0.1.0/src/hotcb/callbacks/timing.py +121 -0
- hotcb-0.1.0/src/hotcb/callbacks/utils.py +278 -0
- hotcb-0.1.0/src/hotcb/cli.py +385 -0
- hotcb-0.1.0/src/hotcb/config.py +147 -0
- hotcb-0.1.0/src/hotcb/controller.py +615 -0
- hotcb-0.1.0/src/hotcb/loader.py +265 -0
- hotcb-0.1.0/src/hotcb/ops.py +66 -0
- hotcb-0.1.0/src/hotcb/protocol.py +169 -0
- hotcb-0.1.0/src/hotcb/tests/conftest.py +15 -0
- hotcb-0.1.0/src/hotcb/tests/test_cli_parse_kv.py +26 -0
- hotcb-0.1.0/src/hotcb/tests/test_config_yaml_parse.py +18 -0
- hotcb-0.1.0/src/hotcb/tests/test_controller_aytoreload_file.py +72 -0
- hotcb-0.1.0/src/hotcb/tests/test_controller_core_ops.py +57 -0
- hotcb-0.1.0/src/hotcb/tests/test_controller_failure_isolation.py +66 -0
- hotcb-0.1.0/src/hotcb/tests/test_controller_param_mutation.py +69 -0
- hotcb-0.1.0/src/hotcb/tests/test_env_logger_access_hf.py +42 -0
- hotcb-0.1.0/src/hotcb/tests/test_env_logger_access_lightning.py +48 -0
- hotcb-0.1.0/src/hotcb/tests/test_util_jsonl_cursor.py +28 -0
- hotcb-0.1.0/src/hotcb/tests/test_util_logger_resolution.py +119 -0
- hotcb-0.1.0/src/hotcb/tests/test_util_logger_resolution_lightning_real_tb.py +22 -0
- hotcb-0.1.0/src/hotcb/util.py +468 -0
- hotcb-0.1.0/src/hotcb.egg-info/PKG-INFO +551 -0
- hotcb-0.1.0/src/hotcb.egg-info/SOURCES.txt +41 -0
- hotcb-0.1.0/src/hotcb.egg-info/dependency_links.txt +1 -0
- hotcb-0.1.0/src/hotcb.egg-info/entry_points.txt +2 -0
- hotcb-0.1.0/src/hotcb.egg-info/requires.txt +24 -0
- hotcb-0.1.0/src/hotcb.egg-info/top_level.txt +1 -0
hotcb-0.1.0/LICENSE
ADDED
|
File without changes
|
hotcb-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,551 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hotcb
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Hot-swappable callbacks for training loops (Lightning, HF, or bare PyTorch)
|
|
5
|
+
Author: Sidharth Pal
|
|
6
|
+
Keywords: pytorch,lightning,huggingface,callbacks,training,debugging
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Provides-Extra: yaml
|
|
14
|
+
Requires-Dist: pyyaml>=6.0; extra == "yaml"
|
|
15
|
+
Provides-Extra: lightning
|
|
16
|
+
Requires-Dist: lightning>=2.0; extra == "lightning"
|
|
17
|
+
Provides-Extra: hf
|
|
18
|
+
Requires-Dist: transformers>=4.30; extra == "hf"
|
|
19
|
+
Provides-Extra: all
|
|
20
|
+
Requires-Dist: pyyaml>=6.0; extra == "all"
|
|
21
|
+
Requires-Dist: lightning>=2.0; extra == "all"
|
|
22
|
+
Requires-Dist: transformers>=4.30; extra == "all"
|
|
23
|
+
Provides-Extra: docs
|
|
24
|
+
Requires-Dist: mkdocs; extra == "docs"
|
|
25
|
+
Requires-Dist: mkdocs-material; extra == "docs"
|
|
26
|
+
Requires-Dist: mkdocstrings[python]; extra == "docs"
|
|
27
|
+
Requires-Dist: mkdocs-awesome-pages-plugin; extra == "docs"
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
30
|
+
Requires-Dist: pytest-cov>=5.0; extra == "dev"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# hotcb 🔥
|
|
34
|
+
Hot-swappable callbacks for PyTorch Lightning, HuggingFace Trainer, or bare PyTorch.
|
|
35
|
+
|
|
36
|
+
Enable, disable, modify, or load new callbacks **while training is running** — without restarting.
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## ✨ Features
|
|
41
|
+
|
|
42
|
+
- ✅ Enable / disable callbacks live
|
|
43
|
+
- ✅ Update callback parameters at runtime
|
|
44
|
+
- ✅ Load callbacks from a new Python file path
|
|
45
|
+
- ✅ Works with:
|
|
46
|
+
- PyTorch Lightning
|
|
47
|
+
- HuggingFace Trainer
|
|
48
|
+
- Bare PyTorch loops
|
|
49
|
+
- ✅ No DDP required
|
|
50
|
+
- ✅ CLI helper included
|
|
51
|
+
- ✅ Minimal and framework-agnostic core
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## 📦 Installation
|
|
56
|
+
|
|
57
|
+
Core only:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install hotcb
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
With YAML support:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install "hotcb[yaml]"
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
With Lightning adapter:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pip install "hotcb[lightning]"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
With HuggingFace adapter:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
pip install "hotcb[hf]"
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Install everything:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
pip install "hotcb[all]"
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
# 🚀 Quickstart (PyTorch Lightning)
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
from hotcb import HotController
|
|
93
|
+
from hotcb.adapters.lightning import HotCallbackController
|
|
94
|
+
import lightning.pytorch as pl
|
|
95
|
+
|
|
96
|
+
controller = HotController(
|
|
97
|
+
config_path="runs/exp1/hotcb.yaml",
|
|
98
|
+
commands_path="runs/exp1/hotcb.commands.jsonl",
|
|
99
|
+
debounce_steps=5,
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
trainer = pl.Trainer(
|
|
103
|
+
callbacks=[HotCallbackController(controller)],
|
|
104
|
+
)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
---
|
|
108
|
+
|
|
109
|
+
# 🚀 Quickstart (HuggingFace Trainer)
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
from hotcb import HotController
|
|
113
|
+
from hotcb.adapters.hf import HotHFCallback
|
|
114
|
+
from transformers import Trainer
|
|
115
|
+
|
|
116
|
+
controller = HotController(
|
|
117
|
+
config_path="runs/exp1/hotcb.yaml",
|
|
118
|
+
commands_path="runs/exp1/hotcb.commands.jsonl",
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
trainer = Trainer(
|
|
122
|
+
...,
|
|
123
|
+
callbacks=[HotHFCallback(controller)],
|
|
124
|
+
)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
# 🚀 Quickstart (Bare PyTorch)
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
controller = HotController(
|
|
133
|
+
config_path="runs/exp1/hotcb.yaml",
|
|
134
|
+
commands_path="runs/exp1/hotcb.commands.jsonl",
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
for step, batch in enumerate(loader):
|
|
138
|
+
# training logic...
|
|
139
|
+
controller.apply(
|
|
140
|
+
env={
|
|
141
|
+
"step": step,
|
|
142
|
+
"phase": "train",
|
|
143
|
+
"model": model,
|
|
144
|
+
"log": print,
|
|
145
|
+
},
|
|
146
|
+
events=["train_step_end"],
|
|
147
|
+
)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
# 🧭 CLI Control (Live, No Restart)
|
|
151
|
+
|
|
152
|
+
`hotcb` includes a lightweight CLI to control callbacks while training is running.
|
|
153
|
+
|
|
154
|
+
First, initialize a run directory:
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
hotcb --dir runs/exp1 init
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
This creates:
|
|
161
|
+
|
|
162
|
+
```
|
|
163
|
+
runs/exp1/
|
|
164
|
+
hotcb.yaml
|
|
165
|
+
hotcb.commands.jsonl
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
### 🔥 Load a callback from a new file
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
hotcb --dir runs/exp1 load feat_viz \
|
|
174
|
+
--file /tmp/feat_viz.py \
|
|
175
|
+
--symbol FeatureVizCallback \
|
|
176
|
+
--enabled \
|
|
177
|
+
--init every=100 out_dir=debug/features
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
It starts running immediately (at the next safe step).
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
### ⚡ Enable / Disable instantly
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
hotcb --dir runs/exp1 enable feat_viz
|
|
188
|
+
hotcb --dir runs/exp1 disable feat_viz
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
Disable = soft remove (no restart required).
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
### 🎛 Adjust parameters live
|
|
196
|
+
|
|
197
|
+
```bash
|
|
198
|
+
hotcb --dir runs/exp1 set feat_viz every=25
|
|
199
|
+
hotcb --dir runs/exp1 set feat_viz threshold=30.5 prefix=[debug]
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
Changes are applied at the next safe point.
|
|
203
|
+
|
|
204
|
+
---
|
|
205
|
+
|
|
206
|
+
### 🧹 Unload completely (optional)
|
|
207
|
+
|
|
208
|
+
```bash
|
|
209
|
+
hotcb --dir runs/exp1 unload feat_viz
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
This disables and drops the instance.
|
|
213
|
+
|
|
214
|
+
---
|
|
215
|
+
|
|
216
|
+
### 💡 Typical Workflow
|
|
217
|
+
|
|
218
|
+
1. Start training once.
|
|
219
|
+
2. Notice something odd.
|
|
220
|
+
3. Drop a new `.py` diagnostic file.
|
|
221
|
+
4. `hotcb load ...`
|
|
222
|
+
5. Inspect.
|
|
223
|
+
6. `hotcb disable ...`
|
|
224
|
+
7. Continue training uninterrupted.
|
|
225
|
+
|
|
226
|
+
No restarts. No trainer hacks. No killing long runs.
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
---
|
|
230
|
+
|
|
231
|
+
# 🧠 Writing a Hot Callback
|
|
232
|
+
|
|
233
|
+
Minimal contract:
|
|
234
|
+
|
|
235
|
+
```python
|
|
236
|
+
class MyCallback:
|
|
237
|
+
def __init__(self, id: str, every: int = 50):
|
|
238
|
+
self.id = id
|
|
239
|
+
self.every = every
|
|
240
|
+
|
|
241
|
+
def set_params(self, **kwargs):
|
|
242
|
+
if "every" in kwargs:
|
|
243
|
+
self.every = int(kwargs["every"])
|
|
244
|
+
|
|
245
|
+
def handle(self, event: str, env: dict):
|
|
246
|
+
step = env.get("step", 0)
|
|
247
|
+
if step % self.every == 0:
|
|
248
|
+
env.get("log", print)(f"[{self.id}] step={step}")
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
That’s it.
|
|
252
|
+
|
|
253
|
+
---
|
|
254
|
+
|
|
255
|
+
### 📚 See Real Examples
|
|
256
|
+
|
|
257
|
+
For more complete examples (including file-based hot loading and artifact writing), check:
|
|
258
|
+
|
|
259
|
+
- `examples/callbacks/print_metrics.py` — minimal logging callback
|
|
260
|
+
- `examples/callbacks/feat_viz.py` — writes step-based artifacts to disk
|
|
261
|
+
- `examples/lightning_train.py` — Lightning integration example
|
|
262
|
+
- `examples/hf_train.py` — HuggingFace Trainer integration example
|
|
263
|
+
|
|
264
|
+
These examples are fully runnable and demonstrate live parameter updates via the CLI.
|
|
265
|
+
|
|
266
|
+
---
|
|
267
|
+
|
|
268
|
+
---
|
|
269
|
+
|
|
270
|
+
# 🧰 Included Diagnostic Callbacks
|
|
271
|
+
|
|
272
|
+
`hotcb` includes a lightweight built-in diagnostics pack so you can start instrumenting runs immediately:
|
|
273
|
+
|
|
274
|
+
- **HeartbeatCallback** — periodic “I’m alive” signal for long runs
|
|
275
|
+
- **TimingCallback** — step timing & throughput tracking
|
|
276
|
+
- **SystemStatsCallback** — CPU / RAM / (optional) GPU utilization
|
|
277
|
+
- **TensorStatsCallback** — tensor mean/std/min/max tracking
|
|
278
|
+
- **GradStatsCallback** — gradient norm & stability diagnostics
|
|
279
|
+
- **AnomalyGuardCallback** — basic NaN / Inf detection & auto-disable protection
|
|
280
|
+
- **JSONLLoggerCallback** — structured append-only JSONL event logging
|
|
281
|
+
|
|
282
|
+
These are intentionally minimal, composable, and safe to enable/disable at runtime.
|
|
283
|
+
|
|
284
|
+
Example:
|
|
285
|
+
|
|
286
|
+
```bash
|
|
287
|
+
hotcb --dir runs/exp1 load heartbeat \
|
|
288
|
+
--module hotcb.callbacks.heartbeat \
|
|
289
|
+
--symbol HeartbeatCallback \
|
|
290
|
+
--enabled \
|
|
291
|
+
--init every=100
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
Or enable a gradient monitor mid-training:
|
|
295
|
+
|
|
296
|
+
```bash
|
|
297
|
+
hotcb --dir runs/exp1 load grad_stats \
|
|
298
|
+
--module hotcb.callbacks.grad_stats \
|
|
299
|
+
--symbol GradStatsCallback \
|
|
300
|
+
--enabled \
|
|
301
|
+
--init every=50
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
All included callbacks support live parameter updates:
|
|
305
|
+
|
|
306
|
+
```bash
|
|
307
|
+
hotcb --dir runs/exp1 set grad_stats every=10
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
No restart required.
|
|
311
|
+
|
|
312
|
+
---
|
|
313
|
+
|
|
314
|
+
# 🔎 Intelligent Logging Resolvers
|
|
315
|
+
|
|
316
|
+
hotcb callbacks run inside the same Python process as your training loop.
|
|
317
|
+
That means they can often discover and reuse the logging infrastructure already configured by your framework (Lightning, HuggingFace Trainer, or custom code).
|
|
318
|
+
|
|
319
|
+
To support this cleanly and safely, hotcb provides logging resolvers — utilities that attempt to discover common logging backends from the runtime env passed to callbacks.
|
|
320
|
+
|
|
321
|
+
### Individual Resolvers — For Scalars, Images, Histograms, etc.
|
|
322
|
+
|
|
323
|
+
Resolvers allow a callback to “plug into” existing logging backends automatically.
|
|
324
|
+
|
|
325
|
+
- Discover logger candidates (no strict contract required) - Resolvers inspect the env dictionary and attempt to extract logger-like objects from common locations (No adapter-level constraints are imposed. This is purely best-effort introspection.)
|
|
326
|
+
|
|
327
|
+
- Or, resolve specific backends (official + heuristic detection) — resolvers find known official classes (when installed) plus safe attribute-based heuristics
|
|
328
|
+
|
|
329
|
+
|Supported backends| resolver function | Returns |
|
|
330
|
+
|------------------|-----------------------------------|----------------------|
|
|
331
|
+
|TensorBoard| `resolve_tensorboard_writer(env)` | `writer` |
|
|
332
|
+
|MLflow| `resolve_mlflow(env)` | `experiment, run_id` |
|
|
333
|
+
|Comet| `resolve_comet_experiment(env)` | `experiment` |
|
|
334
|
+
|
|
335
|
+
Typical sources:
|
|
336
|
+
|
|
337
|
+
- Lightning TensorBoardLogger, MLFlowLogger or CometLogger
|
|
338
|
+
- HF TensorBoardCallback, MLflowCallback or CometCallback (best effort)
|
|
339
|
+
- Direct SummaryWriter or (client, run_id) tuple passed in `env["mlflow"]` or object passed in `env["comet_experiment"]`
|
|
340
|
+
|
|
341
|
+
### Holistic Logging Convenience Helper — For Scalars Only
|
|
342
|
+
|
|
343
|
+
A unified helper is also provided:
|
|
344
|
+
```
|
|
345
|
+
log_scalar(env, key, value, step=None)
|
|
346
|
+
```
|
|
347
|
+
Behavior:
|
|
348
|
+
|
|
349
|
+
- Try TensorBoard (add_scalar)
|
|
350
|
+
- Try MLflow (log_metric)
|
|
351
|
+
- Try Comet (log_metric)
|
|
352
|
+
|
|
353
|
+
Returns `True` if logging succeeded to at least one backend.
|
|
354
|
+
|
|
355
|
+
Failures are swallowed — logging will never crash your training loop.
|
|
356
|
+
|
|
357
|
+
Use framework-native logging for training-critical metrics.
|
|
358
|
+
Use hotcb resolvers for live instrumentation, debugging, and temporary analytics.
|
|
359
|
+
|
|
360
|
+
---
|
|
361
|
+
|
|
362
|
+
# 🔌 Import Scope in Hot-Loaded Callbacks
|
|
363
|
+
|
|
364
|
+
Callbacks run in the same interpreter as your training job.
|
|
365
|
+
|
|
366
|
+
You can import from your training repo if:
|
|
367
|
+
- You run from repo root, or
|
|
368
|
+
- The project is installed (editable or normal), or
|
|
369
|
+
- PYTHONPATH is configured.
|
|
370
|
+
|
|
371
|
+
Prefer absolute imports in hot-loaded `.py` files.
|
|
372
|
+
|
|
373
|
+
---
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
# 🧬 A Unified Callback Model
|
|
377
|
+
|
|
378
|
+
`hotcb` is a thin portability layer: it lets you write one callback once, then run it across
|
|
379
|
+
PyTorch Lightning, HuggingFace Trainer, or bare PyTorch by mapping framework hook arguments into a
|
|
380
|
+
small, normalized `env` dictionary.
|
|
381
|
+
|
|
382
|
+
`env` is intentionally small and predictable. Adapters fill it from the native framework objects:
|
|
383
|
+
|
|
384
|
+
| `env` key | Lightning (source) | HF Trainer (source) | Bare PyTorch (source) |
|
|
385
|
+
|---|---|---|---|
|
|
386
|
+
| `env["step"]` | `trainer.global_step` | `state.global_step` | loop `step` |
|
|
387
|
+
| `env["epoch"]` | `trainer.current_epoch` | `state.epoch` | loop `epoch` |
|
|
388
|
+
| `env["phase"]` | adapter sets `"train"/"val"` | adapter sets `"train"/"eval"` | you set it |
|
|
389
|
+
| `env["model"]` | `pl_module` | *(adapter-provided)* | your model |
|
|
390
|
+
| `env["batch"]` | `batch` | *(adapter-provided)* | your batch |
|
|
391
|
+
| `env["outputs"]` | `outputs` | *(optional)* | your outputs |
|
|
392
|
+
| `env["log"]` | adapter wraps `trainer.print` | adapter wraps `print` | your logger |
|
|
393
|
+
|
|
394
|
+
> `env` is a portability contract. If you want extra fields, you can always include them in `env`
|
|
395
|
+
> from your own loop, or extend adapters later — but the minimal set above keeps callbacks simple
|
|
396
|
+
> and avoids accidental retention of large tensors.
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
---
|
|
400
|
+
|
|
401
|
+
# 🛠 Making existing callbacks Hot-Adjustable
|
|
402
|
+
|
|
403
|
+
All four variants below do the same thing:
|
|
404
|
+
- print once every `every` steps
|
|
405
|
+
- easy to tune `every`
|
|
406
|
+
- the hotcb version supports runtime updates via the CLI (`hotcb set ...`)
|
|
407
|
+
|
|
408
|
+
### 1) PyTorch Lightning callback
|
|
409
|
+
|
|
410
|
+
```python
|
|
411
|
+
import lightning.pytorch as pl
|
|
412
|
+
|
|
413
|
+
class PrintEveryN_Lightning(pl.Callback):
|
|
414
|
+
def __init__(self, every: int = 50, prefix: str = "[metrics]"):
|
|
415
|
+
self.every = int(every)
|
|
416
|
+
self.prefix = str(prefix)
|
|
417
|
+
|
|
418
|
+
def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
|
|
419
|
+
step = int(trainer.global_step)
|
|
420
|
+
if self.every > 0 and (step % self.every) == 0:
|
|
421
|
+
trainer.print(f"{self.prefix} step={step} batch_idx={batch_idx}")
|
|
422
|
+
```
|
|
423
|
+
|
|
424
|
+
### 2) HuggingFace Trainer callback
|
|
425
|
+
|
|
426
|
+
```python
|
|
427
|
+
from transformers import TrainerCallback
|
|
428
|
+
|
|
429
|
+
class PrintEveryN_HF(TrainerCallback):
|
|
430
|
+
def __init__(self, every: int = 50, prefix: str = "[metrics]"):
|
|
431
|
+
self.every = int(every)
|
|
432
|
+
self.prefix = str(prefix)
|
|
433
|
+
|
|
434
|
+
def on_step_end(self, args, state, control, **kwargs):
|
|
435
|
+
step = int(state.global_step)
|
|
436
|
+
if self.every > 0 and (step % self.every) == 0:
|
|
437
|
+
print(f"{self.prefix} step={step}")
|
|
438
|
+
return control
|
|
439
|
+
```
|
|
440
|
+
|
|
441
|
+
### 3) Bare PyTorch “hook style”
|
|
442
|
+
|
|
443
|
+
```python
|
|
444
|
+
class PrintEveryN_TorchHook:
|
|
445
|
+
def __init__(self, every: int = 50, prefix: str = "[metrics]"):
|
|
446
|
+
self.every = int(every)
|
|
447
|
+
self.prefix = str(prefix)
|
|
448
|
+
|
|
449
|
+
def on_step_end(self, step: int, batch_idx: int):
|
|
450
|
+
if self.every > 0 and (step % self.every) == 0:
|
|
451
|
+
print(f"{self.prefix} step={step} batch_idx={batch_idx}")
|
|
452
|
+
```
|
|
453
|
+
|
|
454
|
+
Usage:
|
|
455
|
+
|
|
456
|
+
```python
|
|
457
|
+
hook = PrintEveryN_TorchHook(every=50)
|
|
458
|
+
|
|
459
|
+
for step, batch in enumerate(loader):
|
|
460
|
+
# forward/backward/step...
|
|
461
|
+
hook.on_step_end(step=step, batch_idx=step)
|
|
462
|
+
```
|
|
463
|
+
|
|
464
|
+
### 4) hotcb callback (portable + hot-adjustable)
|
|
465
|
+
|
|
466
|
+
```python
|
|
467
|
+
class PrintEveryN_HotCB:
|
|
468
|
+
def __init__(self, id: str, every: int = 50, prefix: str = "[metrics]"):
|
|
469
|
+
self.id = id
|
|
470
|
+
self.every = int(every)
|
|
471
|
+
self.prefix = str(prefix)
|
|
472
|
+
|
|
473
|
+
def set_params(self, **kwargs):
|
|
474
|
+
if "every" in kwargs:
|
|
475
|
+
self.every = int(kwargs["every"])
|
|
476
|
+
if "prefix" in kwargs:
|
|
477
|
+
self.prefix = str(kwargs["prefix"])
|
|
478
|
+
|
|
479
|
+
def handle(self, event: str, env: dict):
|
|
480
|
+
step = int(env.get("step", 0))
|
|
481
|
+
batch_idx = env.get("batch_idx", None)
|
|
482
|
+
log = env.get("log", print)
|
|
483
|
+
|
|
484
|
+
if self.every > 0 and (step % self.every) == 0:
|
|
485
|
+
log(f"{self.prefix} id={self.id} step={step} event={event} batch_idx={batch_idx}")
|
|
486
|
+
```
|
|
487
|
+
|
|
488
|
+
Runtime tuning (no restart):
|
|
489
|
+
|
|
490
|
+
```bash
|
|
491
|
+
hotcb --dir runs/exp1 set print_metrics every=5
|
|
492
|
+
```
|
|
493
|
+
|
|
494
|
+
---
|
|
495
|
+
|
|
496
|
+
# 📡 How It Works
|
|
497
|
+
|
|
498
|
+
Two control layers:
|
|
499
|
+
|
|
500
|
+
1. `hotcb.yaml` — desired state (optional)
|
|
501
|
+
2. `hotcb.commands.jsonl` — append-only command stream
|
|
502
|
+
|
|
503
|
+
Changes are applied at safe adapter-defined boundaries:
|
|
504
|
+
- Lightning → end of batch
|
|
505
|
+
- HF → end of step / eval
|
|
506
|
+
- Bare torch → wherever you call `apply()`
|
|
507
|
+
|
|
508
|
+
---
|
|
509
|
+
|
|
510
|
+
# ❓ Why this exists
|
|
511
|
+
|
|
512
|
+
Hot-swappable callbacks are often used for:
|
|
513
|
+
|
|
514
|
+
- Temporary diagnostics
|
|
515
|
+
- Feature visualization
|
|
516
|
+
- Gradient/statistics inspection
|
|
517
|
+
- Mid-run debugging
|
|
518
|
+
- Experiment instrumentation
|
|
519
|
+
|
|
520
|
+
You don’t want to:
|
|
521
|
+
|
|
522
|
+
- Modify your Trainer code
|
|
523
|
+
- Restart a long training job
|
|
524
|
+
|
|
525
|
+
The in-built hot-reloaders and logging resolvers allow a callback to “plug into” an existing training run and its logging backends automatically.
|
|
526
|
+
|
|
527
|
+
Safely — without impacting your run, even when a callback fails.
|
|
528
|
+
|
|
529
|
+
---
|
|
530
|
+
|
|
531
|
+
# 🛡 Safety
|
|
532
|
+
|
|
533
|
+
- No training loop mutation
|
|
534
|
+
- No framework internals modified
|
|
535
|
+
- Fail-safe: crashing callbacks can auto-disable
|
|
536
|
+
- “Remove” = disable (optional unload supported)
|
|
537
|
+
|
|
538
|
+
---
|
|
539
|
+
|
|
540
|
+
# 🌱 Philosophy
|
|
541
|
+
|
|
542
|
+
Training shouldn’t require restarts for diagnostics.
|
|
543
|
+
|
|
544
|
+
`hotcb` treats debugging and visualization as live instrumentation — not static configuration.
|
|
545
|
+
|
|
546
|
+
---
|
|
547
|
+
|
|
548
|
+
# 📄 License
|
|
549
|
+
|
|
550
|
+
MIT License (see LICENSE file).
|
|
551
|
+
|