hotcb 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. hotcb-0.1.0/LICENSE +0 -0
  2. hotcb-0.1.0/PKG-INFO +551 -0
  3. hotcb-0.1.0/README.md +519 -0
  4. hotcb-0.1.0/pyproject.toml +58 -0
  5. hotcb-0.1.0/setup.cfg +4 -0
  6. hotcb-0.1.0/src/hotcb/__init__.py +3 -0
  7. hotcb-0.1.0/src/hotcb/adapters/__init__.py +1 -0
  8. hotcb-0.1.0/src/hotcb/adapters/hf.py +390 -0
  9. hotcb-0.1.0/src/hotcb/adapters/lightning.py +348 -0
  10. hotcb-0.1.0/src/hotcb/callbacks/__init__.py +17 -0
  11. hotcb-0.1.0/src/hotcb/callbacks/anomaly_guard.py +124 -0
  12. hotcb-0.1.0/src/hotcb/callbacks/grad_stats.py +124 -0
  13. hotcb-0.1.0/src/hotcb/callbacks/heartbeat.py +98 -0
  14. hotcb-0.1.0/src/hotcb/callbacks/jsonl_logger.py +142 -0
  15. hotcb-0.1.0/src/hotcb/callbacks/system_stats.py +105 -0
  16. hotcb-0.1.0/src/hotcb/callbacks/tensor_stats.py +114 -0
  17. hotcb-0.1.0/src/hotcb/callbacks/timing.py +121 -0
  18. hotcb-0.1.0/src/hotcb/callbacks/utils.py +278 -0
  19. hotcb-0.1.0/src/hotcb/cli.py +385 -0
  20. hotcb-0.1.0/src/hotcb/config.py +147 -0
  21. hotcb-0.1.0/src/hotcb/controller.py +615 -0
  22. hotcb-0.1.0/src/hotcb/loader.py +265 -0
  23. hotcb-0.1.0/src/hotcb/ops.py +66 -0
  24. hotcb-0.1.0/src/hotcb/protocol.py +169 -0
  25. hotcb-0.1.0/src/hotcb/tests/conftest.py +15 -0
  26. hotcb-0.1.0/src/hotcb/tests/test_cli_parse_kv.py +26 -0
  27. hotcb-0.1.0/src/hotcb/tests/test_config_yaml_parse.py +18 -0
  28. hotcb-0.1.0/src/hotcb/tests/test_controller_aytoreload_file.py +72 -0
  29. hotcb-0.1.0/src/hotcb/tests/test_controller_core_ops.py +57 -0
  30. hotcb-0.1.0/src/hotcb/tests/test_controller_failure_isolation.py +66 -0
  31. hotcb-0.1.0/src/hotcb/tests/test_controller_param_mutation.py +69 -0
  32. hotcb-0.1.0/src/hotcb/tests/test_env_logger_access_hf.py +42 -0
  33. hotcb-0.1.0/src/hotcb/tests/test_env_logger_access_lightning.py +48 -0
  34. hotcb-0.1.0/src/hotcb/tests/test_util_jsonl_cursor.py +28 -0
  35. hotcb-0.1.0/src/hotcb/tests/test_util_logger_resolution.py +119 -0
  36. hotcb-0.1.0/src/hotcb/tests/test_util_logger_resolution_lightning_real_tb.py +22 -0
  37. hotcb-0.1.0/src/hotcb/util.py +468 -0
  38. hotcb-0.1.0/src/hotcb.egg-info/PKG-INFO +551 -0
  39. hotcb-0.1.0/src/hotcb.egg-info/SOURCES.txt +41 -0
  40. hotcb-0.1.0/src/hotcb.egg-info/dependency_links.txt +1 -0
  41. hotcb-0.1.0/src/hotcb.egg-info/entry_points.txt +2 -0
  42. hotcb-0.1.0/src/hotcb.egg-info/requires.txt +24 -0
  43. hotcb-0.1.0/src/hotcb.egg-info/top_level.txt +1 -0
hotcb-0.1.0/LICENSE ADDED
File without changes
hotcb-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,551 @@
1
+ Metadata-Version: 2.4
2
+ Name: hotcb
3
+ Version: 0.1.0
4
+ Summary: Hot-swappable callbacks for training loops (Lightning, HF, or bare PyTorch)
5
+ Author: Sidharth Pal
6
+ Keywords: pytorch,lightning,huggingface,callbacks,training,debugging
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.9
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Provides-Extra: yaml
14
+ Requires-Dist: pyyaml>=6.0; extra == "yaml"
15
+ Provides-Extra: lightning
16
+ Requires-Dist: lightning>=2.0; extra == "lightning"
17
+ Provides-Extra: hf
18
+ Requires-Dist: transformers>=4.30; extra == "hf"
19
+ Provides-Extra: all
20
+ Requires-Dist: pyyaml>=6.0; extra == "all"
21
+ Requires-Dist: lightning>=2.0; extra == "all"
22
+ Requires-Dist: transformers>=4.30; extra == "all"
23
+ Provides-Extra: docs
24
+ Requires-Dist: mkdocs; extra == "docs"
25
+ Requires-Dist: mkdocs-material; extra == "docs"
26
+ Requires-Dist: mkdocstrings[python]; extra == "docs"
27
+ Requires-Dist: mkdocs-awesome-pages-plugin; extra == "docs"
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=8.0; extra == "dev"
30
+ Requires-Dist: pytest-cov>=5.0; extra == "dev"
31
+ Dynamic: license-file
32
+
33
+ # hotcb 🔥
34
+ Hot-swappable callbacks for PyTorch Lightning, HuggingFace Trainer, or bare PyTorch.
35
+
36
+ Enable, disable, modify, or load new callbacks **while training is running** — without restarting.
37
+
38
+ ---
39
+
40
+ ## ✨ Features
41
+
42
+ - ✅ Enable / disable callbacks live
43
+ - ✅ Update callback parameters at runtime
44
+ - ✅ Load callbacks from a new Python file path
45
+ - ✅ Works with:
46
+ - PyTorch Lightning
47
+ - HuggingFace Trainer
48
+ - Bare PyTorch loops
49
+ - ✅ No DDP required
50
+ - ✅ CLI helper included
51
+ - ✅ Minimal and framework-agnostic core
52
+
53
+ ---
54
+
55
+ ## 📦 Installation
56
+
57
+ Core only:
58
+
59
+ ```bash
60
+ pip install hotcb
61
+ ```
62
+
63
+ With YAML support:
64
+
65
+ ```bash
66
+ pip install "hotcb[yaml]"
67
+ ```
68
+
69
+ With Lightning adapter:
70
+
71
+ ```bash
72
+ pip install "hotcb[lightning]"
73
+ ```
74
+
75
+ With HuggingFace adapter:
76
+
77
+ ```bash
78
+ pip install "hotcb[hf]"
79
+ ```
80
+
81
+ Install everything:
82
+
83
+ ```bash
84
+ pip install "hotcb[all]"
85
+ ```
86
+
87
+ ---
88
+
89
+ # 🚀 Quickstart (PyTorch Lightning)
90
+
91
+ ```python
92
+ from hotcb import HotController
93
+ from hotcb.adapters.lightning import HotCallbackController
94
+ import lightning.pytorch as pl
95
+
96
+ controller = HotController(
97
+ config_path="runs/exp1/hotcb.yaml",
98
+ commands_path="runs/exp1/hotcb.commands.jsonl",
99
+ debounce_steps=5,
100
+ )
101
+
102
+ trainer = pl.Trainer(
103
+ callbacks=[HotCallbackController(controller)],
104
+ )
105
+ ```
106
+
107
+ ---
108
+
109
+ # 🚀 Quickstart (HuggingFace Trainer)
110
+
111
+ ```python
112
+ from hotcb import HotController
113
+ from hotcb.adapters.hf import HotHFCallback
114
+ from transformers import Trainer
115
+
116
+ controller = HotController(
117
+ config_path="runs/exp1/hotcb.yaml",
118
+ commands_path="runs/exp1/hotcb.commands.jsonl",
119
+ )
120
+
121
+ trainer = Trainer(
122
+ ...,
123
+ callbacks=[HotHFCallback(controller)],
124
+ )
125
+ ```
126
+
127
+ ---
128
+
129
+ # 🚀 Quickstart (Bare PyTorch)
130
+
131
+ ```python
132
+ controller = HotController(
133
+ config_path="runs/exp1/hotcb.yaml",
134
+ commands_path="runs/exp1/hotcb.commands.jsonl",
135
+ )
136
+
137
+ for step, batch in enumerate(loader):
138
+ # training logic...
139
+ controller.apply(
140
+ env={
141
+ "step": step,
142
+ "phase": "train",
143
+ "model": model,
144
+ "log": print,
145
+ },
146
+ events=["train_step_end"],
147
+ )
148
+ ```
149
+
150
+ # 🧭 CLI Control (Live, No Restart)
151
+
152
+ `hotcb` includes a lightweight CLI to control callbacks while training is running.
153
+
154
+ First, initialize a run directory:
155
+
156
+ ```bash
157
+ hotcb --dir runs/exp1 init
158
+ ```
159
+
160
+ This creates:
161
+
162
+ ```
163
+ runs/exp1/
164
+ hotcb.yaml
165
+ hotcb.commands.jsonl
166
+ ```
167
+
168
+ ---
169
+
170
+ ### 🔥 Load a callback from a new file
171
+
172
+ ```bash
173
+ hotcb --dir runs/exp1 load feat_viz \
174
+ --file /tmp/feat_viz.py \
175
+ --symbol FeatureVizCallback \
176
+ --enabled \
177
+ --init every=100 out_dir=debug/features
178
+ ```
179
+
180
+ It starts running immediately (at the next safe step).
181
+
182
+ ---
183
+
184
+ ### ⚡ Enable / Disable instantly
185
+
186
+ ```bash
187
+ hotcb --dir runs/exp1 enable feat_viz
188
+ hotcb --dir runs/exp1 disable feat_viz
189
+ ```
190
+
191
+ Disable = soft remove (no restart required).
192
+
193
+ ---
194
+
195
+ ### 🎛 Adjust parameters live
196
+
197
+ ```bash
198
+ hotcb --dir runs/exp1 set feat_viz every=25
199
+ hotcb --dir runs/exp1 set feat_viz threshold=30.5 prefix=[debug]
200
+ ```
201
+
202
+ Changes are applied at the next safe point.
203
+
204
+ ---
205
+
206
+ ### 🧹 Unload completely (optional)
207
+
208
+ ```bash
209
+ hotcb --dir runs/exp1 unload feat_viz
210
+ ```
211
+
212
+ This disables and drops the instance.
213
+
214
+ ---
215
+
216
+ ### 💡 Typical Workflow
217
+
218
+ 1. Start training once.
219
+ 2. Notice something odd.
220
+ 3. Drop a new `.py` diagnostic file.
221
+ 4. `hotcb load ...`
222
+ 5. Inspect.
223
+ 6. `hotcb disable ...`
224
+ 7. Continue training uninterrupted.
225
+
226
+ No restarts. No trainer hacks. No killing long runs.
227
+
228
+
229
+ ---
230
+
231
+ # 🧠 Writing a Hot Callback
232
+
233
+ Minimal contract:
234
+
235
+ ```python
236
+ class MyCallback:
237
+ def __init__(self, id: str, every: int = 50):
238
+ self.id = id
239
+ self.every = every
240
+
241
+ def set_params(self, **kwargs):
242
+ if "every" in kwargs:
243
+ self.every = int(kwargs["every"])
244
+
245
+ def handle(self, event: str, env: dict):
246
+ step = env.get("step", 0)
247
+ if step % self.every == 0:
248
+ env.get("log", print)(f"[{self.id}] step={step}")
249
+ ```
250
+
251
+ That’s it.
252
+
253
+ ---
254
+
255
+ ### 📚 See Real Examples
256
+
257
+ For more complete examples (including file-based hot loading and artifact writing), check:
258
+
259
+ - `examples/callbacks/print_metrics.py` — minimal logging callback
260
+ - `examples/callbacks/feat_viz.py` — writes step-based artifacts to disk
261
+ - `examples/lightning_train.py` — Lightning integration example
262
+ - `examples/hf_train.py` — HuggingFace Trainer integration example
263
+
264
+ These examples are fully runnable and demonstrate live parameter updates via the CLI.
265
+
266
+ ---
267
+
268
+ ---
269
+
270
+ # 🧰 Included Diagnostic Callbacks
271
+
272
+ `hotcb` includes a lightweight built-in diagnostics pack so you can start instrumenting runs immediately:
273
+
274
+ - **HeartbeatCallback** — periodic “I’m alive” signal for long runs
275
+ - **TimingCallback** — step timing & throughput tracking
276
+ - **SystemStatsCallback** — CPU / RAM / (optional) GPU utilization
277
+ - **TensorStatsCallback** — tensor mean/std/min/max tracking
278
+ - **GradStatsCallback** — gradient norm & stability diagnostics
279
+ - **AnomalyGuardCallback** — basic NaN / Inf detection & auto-disable protection
280
+ - **JSONLLoggerCallback** — structured append-only JSONL event logging
281
+
282
+ These are intentionally minimal, composable, and safe to enable/disable at runtime.
283
+
284
+ Example:
285
+
286
+ ```bash
287
+ hotcb --dir runs/exp1 load heartbeat \
288
+ --module hotcb.callbacks.heartbeat \
289
+ --symbol HeartbeatCallback \
290
+ --enabled \
291
+ --init every=100
292
+ ```
293
+
294
+ Or enable a gradient monitor mid-training:
295
+
296
+ ```bash
297
+ hotcb --dir runs/exp1 load grad_stats \
298
+ --module hotcb.callbacks.grad_stats \
299
+ --symbol GradStatsCallback \
300
+ --enabled \
301
+ --init every=50
302
+ ```
303
+
304
+ All included callbacks support live parameter updates:
305
+
306
+ ```bash
307
+ hotcb --dir runs/exp1 set grad_stats every=10
308
+ ```
309
+
310
+ No restart required.
311
+
312
+ ---
313
+
314
+ # 🔎 Intelligent Logging Resolvers
315
+
316
+ hotcb callbacks run inside the same Python process as your training loop.
317
+ That means they can often discover and reuse the logging infrastructure already configured by your framework (Lightning, HuggingFace Trainer, or custom code).
318
+
319
+ To support this cleanly and safely, hotcb provides logging resolvers — utilities that attempt to discover common logging backends from the runtime env passed to callbacks.
320
+
321
+ ### Individual Resolvers — For Scalars, Images, Histograms, etc.
322
+
323
+ Resolvers allow a callback to “plug into” existing logging backends automatically.
324
+
325
+ - Discover logger candidates (no strict contract required) - Resolvers inspect the env dictionary and attempt to extract logger-like objects from common locations (No adapter-level constraints are imposed. This is purely best-effort introspection.)
326
+
327
+ - Or, resolve specific backends (official + heuristic detection) — resolvers find known official classes (when installed) plus safe attribute-based heuristics
328
+
329
+ |Supported backends| resolver function | Returns |
330
+ |------------------|-----------------------------------|----------------------|
331
+ |Tensorboard| `resolve_tensorboard_writer(env)` | `writer` |
332
+ |MLFlow| `resolve_mlflow(env)` | `experiment, run_id` |
333
+ |Comet| `resolve_comet_experiment(env)` | `experiment` |
334
+
335
+ Typical sources:
336
+
337
+ - Lightning TensorBoardLogger, MLFlowLogger or CometLogger
338
+ - HF TensorBoardCallback, MLflowCallback or CometCallback (best effort)
339
+ - Direct SummaryWriter or (client, run_id) tuple passed in `env["mlflow"]` or object passed in `env["comet_experiment"]`
340
+
341
+ ### Holistic Logging Convenience Helper — For Scalars Only
342
+
343
+ A unified helper is also provided:
344
+ ```
345
+ log_scalar(env, key, value, step=None)
346
+ ```
347
+ Behavior:
348
+
349
+ - Try TensorBoard (add_scalar)
350
+ - Try MLflow (log_metric)
351
+ - Try Comet (log_metric)
352
+
353
+ Returns `True` if logging succeeded to at least one backend.
354
+
355
+ Failures are swallowed — logging will never crash your training loop.
356
+
357
+ Use framework-native logging for training-critical metrics.
358
+ Use hotcb resolvers for live instrumentation, debugging, and temporary analytics.
359
+
360
+ ---
361
+
362
+ # 🔌 Import Scope in Hot-Loaded Callbacks
363
+
364
+ Callbacks run in the same interpreter as your training job.
365
+
366
+ You can import from your training repo if:
367
+ - You run from repo root, or
368
+ - The project is installed (editable or normal), or
369
+ - PYTHONPATH is configured.
370
+
371
+ Prefer absolute imports in hot-loaded `.py` files.
372
+
373
+ ---
374
+
375
+
376
+ # 🧬 A Unified Callback Model
377
+
378
+ `hotcb` is a thin portability layer: it lets you write one callback once, then run it across
379
+ PyTorch Lightning, HuggingFace Trainer, or bare PyTorch by mapping framework hook arguments into a
380
+ small, normalized `env` dictionary.
381
+
382
+ `env` is intentionally small and predictable. Adapters fill it from the native framework objects:
383
+
384
+ | `env` key | Lightning (source) | HF Trainer (source) | Bare PyTorch (source) |
385
+ |---|---|---|---|
386
+ | `env["step"]` | `trainer.global_step` | `state.global_step` | loop `step` |
387
+ | `env["epoch"]` | `trainer.current_epoch` | `state.epoch` | loop `epoch` |
388
+ | `env["phase"]` | adapter sets `"train"/"val"` | adapter sets `"train"/"eval"` | you set it |
389
+ | `env["model"]` | `pl_module` | *(adapter-provided)* | your model |
390
+ | `env["batch"]` | `batch` | *(adapter-provided)* | your batch |
391
+ | `env["outputs"]` | `outputs` | *(optional)* | your outputs |
392
+ | `env["log"]` | adapter wraps `trainer.print` | adapter wraps `print` | your logger |
393
+
394
+ > `env` is a portability contract. If you want extra fields, you can always include them in `env`
395
+ > from your own loop, or extend adapters later — but the minimal set above keeps callbacks simple
396
+ > and avoids accidental retention of large tensors.
397
+
398
+
399
+ ---
400
+
401
+ # 🛠 Making existing callbacks Hot-Adjustable
402
+
403
+ All four variants below do the same thing:
404
+ - print once every `every` steps
405
+ - easy to tune `every`
406
+ - the hotcb version supports runtime updates via the CLI (`hotcb set ...`)
407
+
408
+ ### 1) PyTorch Lightning callback
409
+
410
+ ```python
411
+ import lightning.pytorch as pl
412
+
413
+ class PrintEveryN_Lightning(pl.Callback):
414
+ def __init__(self, every: int = 50, prefix: str = "[metrics]"):
415
+ self.every = int(every)
416
+ self.prefix = str(prefix)
417
+
418
+ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
419
+ step = int(trainer.global_step)
420
+ if self.every > 0 and (step % self.every) == 0:
421
+ trainer.print(f"{self.prefix} step={step} batch_idx={batch_idx}")
422
+ ```
423
+
424
+ ### 2) HuggingFace Trainer callback
425
+
426
+ ```python
427
+ from transformers import TrainerCallback
428
+
429
+ class PrintEveryN_HF(TrainerCallback):
430
+ def __init__(self, every: int = 50, prefix: str = "[metrics]"):
431
+ self.every = int(every)
432
+ self.prefix = str(prefix)
433
+
434
+ def on_step_end(self, args, state, control, **kwargs):
435
+ step = int(state.global_step)
436
+ if self.every > 0 and (step % self.every) == 0:
437
+ print(f"{self.prefix} step={step}")
438
+ return control
439
+ ```
440
+
441
+ ### 3) Bare PyTorch “hook style”
442
+
443
+ ```python
444
+ class PrintEveryN_TorchHook:
445
+ def __init__(self, every: int = 50, prefix: str = "[metrics]"):
446
+ self.every = int(every)
447
+ self.prefix = str(prefix)
448
+
449
+ def on_step_end(self, step: int, batch_idx: int):
450
+ if self.every > 0 and (step % self.every) == 0:
451
+ print(f"{self.prefix} step={step} batch_idx={batch_idx}")
452
+ ```
453
+
454
+ Usage:
455
+
456
+ ```python
457
+ hook = PrintEveryN_TorchHook(every=50)
458
+
459
+ for step, batch in enumerate(loader):
460
+ # forward/backward/step...
461
+ hook.on_step_end(step=step, batch_idx=step)
462
+ ```
463
+
464
+ ### 4) hotcb callback (portable + hot-adjustable)
465
+
466
+ ```python
467
+ class PrintEveryN_HotCB:
468
+ def __init__(self, id: str, every: int = 50, prefix: str = "[metrics]"):
469
+ self.id = id
470
+ self.every = int(every)
471
+ self.prefix = str(prefix)
472
+
473
+ def set_params(self, **kwargs):
474
+ if "every" in kwargs:
475
+ self.every = int(kwargs["every"])
476
+ if "prefix" in kwargs:
477
+ self.prefix = str(kwargs["prefix"])
478
+
479
+ def handle(self, event: str, env: dict):
480
+ step = int(env.get("step", 0))
481
+ batch_idx = env.get("batch_idx", None)
482
+ log = env.get("log", print)
483
+
484
+ if self.every > 0 and (step % self.every) == 0:
485
+ log(f"{self.prefix} id={self.id} step={step} event={event} batch_idx={batch_idx}")
486
+ ```
487
+
488
+ Runtime tuning (no restart):
489
+
490
+ ```bash
491
+ hotcb --dir runs/exp1 set print_metrics every=5
492
+ ```
493
+
494
+ ---
495
+
496
+ # 📡 How It Works
497
+
498
+ Two control layers:
499
+
500
+ 1. `hotcb.yaml` — desired state (optional)
501
+ 2. `hotcb.commands.jsonl` — append-only command stream
502
+
503
+ Changes are applied at safe adapter-defined boundaries:
504
+ - Lightning → end of batch
505
+ - HF → end of step / eval
506
+ - Bare torch → wherever you call `apply()`
507
+
508
+ ---
509
+
510
+ # ❓ Why this exists
511
+
512
+ Hot-swappable callbacks are often used for:
513
+
514
+ - Temporary diagnostics
515
+ - Feature visualization
516
+ - Gradient/statistics inspection
517
+ - Mid-run debugging
518
+ - Experiment instrumentation
519
+
520
+ You don’t want to:
521
+
522
+ - Modify your Trainer code
523
+ - Restart a long training job
524
+
525
+ The in-built hot-reloaders and logging resolvers allow a callback to “plug into” an existing training run and its logging backends automatically.
526
+
527
+ Safely — without impacting your run, even when a callback fails.
528
+
529
+ ---
530
+
531
+ # 🛡 Safety
532
+
533
+ - No training loop mutation
534
+ - No framework internals modified
535
+ - Fail-safe: crashing callbacks can auto-disable
536
+ - “Remove” = disable (optional unload supported)
537
+
538
+ ---
539
+
540
+ # 🌱 Philosophy
541
+
542
+ Training shouldn’t require restarts for diagnostics.
543
+
544
+ `hotcb` treats debugging and visualization as live instrumentation — not static configuration.
545
+
546
+ ---
547
+
548
+ # 📄 License
549
+
550
+ MIT License (see LICENSE file).
551
+