torchlit-lib 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchlit_lib-0.2.0/LICENSE +21 -0
- torchlit_lib-0.2.0/PKG-INFO +108 -0
- torchlit_lib-0.2.0/README.md +91 -0
- torchlit_lib-0.2.0/pyproject.toml +31 -0
- torchlit_lib-0.2.0/setup.cfg +4 -0
- torchlit_lib-0.2.0/torchlit/__init__.py +3 -0
- torchlit_lib-0.2.0/torchlit/backend/__init__.py +1 -0
- torchlit_lib-0.2.0/torchlit/backend/main.py +203 -0
- torchlit_lib-0.2.0/torchlit/bin/__init__.py +0 -0
- torchlit_lib-0.2.0/torchlit/bin/torchlit-progress-darwin-arm64 +0 -0
- torchlit_lib-0.2.0/torchlit/bin/torchlit-progress-darwin-x86_64 +0 -0
- torchlit_lib-0.2.0/torchlit/bin/torchlit-progress-linux-x86_64 +0 -0
- torchlit_lib-0.2.0/torchlit/bin/torchlit-progress-windows-x86_64.exe +0 -0
- torchlit_lib-0.2.0/torchlit/frontend/__init__.py +0 -0
- torchlit_lib-0.2.0/torchlit/monitor.py +377 -0
- torchlit_lib-0.2.0/torchlit_lib.egg-info/PKG-INFO +108 -0
- torchlit_lib-0.2.0/torchlit_lib.egg-info/SOURCES.txt +18 -0
- torchlit_lib-0.2.0/torchlit_lib.egg-info/dependency_links.txt +1 -0
- torchlit_lib-0.2.0/torchlit_lib.egg-info/requires.txt +6 -0
- torchlit_lib-0.2.0/torchlit_lib.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Chan Lee
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: torchlit-lib
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: A lightweight real-time ML training monitor.
|
|
5
|
+
Author: ChanLumerico
|
|
6
|
+
Project-URL: Homepage, https://github.com/ChanLumerico/torchlit
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: requests
|
|
11
|
+
Requires-Dist: psutil
|
|
12
|
+
Requires-Dist: fastapi>=0.100.0
|
|
13
|
+
Requires-Dist: uvicorn>=0.20.0
|
|
14
|
+
Requires-Dist: websockets>=10.4
|
|
15
|
+
Requires-Dist: pydantic>=2.0.0
|
|
16
|
+
Dynamic: license-file
|
|
17
|
+
|
|
18
|
+
# torchlit 🔥
|
|
19
|
+
|
|
20
|
+
<p align="center">
|
|
21
|
+
<b>A lightweight, beautiful, and interactive real-time PyTorch training dashboard.</b>
|
|
22
|
+
</p>
|
|
23
|
+
|
|
24
|
+
<p align="center">
|
|
25
|
+
<img src="assets/dashboard.png" alt="torchlit Dashboard" width="80%" />
|
|
26
|
+
</p>
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
`torchlit` is a zero-setup desktop GUI that hooks directly into your PyTorch training loops to monitor metrics, system stats, and model architecture in real time. Stop relying on cluttered TQDM bars or heavy logging frameworks.
|
|
31
|
+
|
|
32
|
+
## ✨ Features
|
|
33
|
+
|
|
34
|
+
- **Zero Configuration** — `with torchlit.Monitor():` and your dashboard spins up instantly
|
|
35
|
+
- **Real-Time Streaming** — FastAPI + WebSockets push metrics to the browser immediately
|
|
36
|
+
- **Model Architecture Explorer** — Interactive, color-coded layer tree with parameter counts
|
|
37
|
+
- **Multi-Session Comparison** — Overlay and compare multiple experiments side-by-side
|
|
38
|
+
- **Rust CLI Progress Display** — Beautiful terminal TUI (powered by `ratatui`) while training runs
|
|
39
|
+
- **System Resource Sparklines** — Live CPU, RAM, and VRAM usage tracking
|
|
40
|
+
- **CSV Export** — Download all aggregated metrics at any time
|
|
41
|
+
- **Auto-Shutdown** — Background server cleans up automatically when the browser is closed
|
|
42
|
+
|
|
43
|
+
## 🚀 Quick Start
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install torchlit
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
import torch
|
|
51
|
+
import torchlit
|
|
52
|
+
|
|
53
|
+
model = torch.nn.Sequential(
|
|
54
|
+
torch.nn.Linear(10, 50),
|
|
55
|
+
torch.nn.ReLU(),
|
|
56
|
+
torch.nn.Linear(50, 2)
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
with torchlit.Monitor(exp_name="my_experiment", model=model, total_steps=100) as logger:
|
|
60
|
+
for step in range(1, 101):
|
|
61
|
+
loss = max(0, 1.0 - step * 0.01)
|
|
62
|
+
logger.log({"loss": loss, "accuracy": step / 100.0}, step=step)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
*Your browser opens automatically at `http://localhost:8000`.*
|
|
66
|
+
*A live Rust-powered TUI also appears directly in your terminal.*
|
|
67
|
+
|
|
68
|
+
## 📖 Full Example
|
|
69
|
+
|
|
70
|
+
See [`examples/example.py`](examples/example.py) for a complete CIFAR-10 + ResNet-50 training loop demonstrating all features.
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
python examples/example.py
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## ⚙️ Monitor Options
|
|
77
|
+
|
|
78
|
+
| Parameter | Type | Default | Description |
|
|
79
|
+
|---|---|---|---|
|
|
80
|
+
| `exp_name` | `str` | `"default_experiment"` | Name for this training run |
|
|
81
|
+
| `model` | `nn.Module` | `None` | PyTorch model (for architecture extraction) |
|
|
82
|
+
| `total_steps` | `int` | `None` | Total steps (enables ETA in CLI display) |
|
|
83
|
+
| `server_url` | `str` | `http://localhost:8000` | Dashboard server URL |
|
|
84
|
+
| `flush_interval` | `float` | `1.0` | Seconds between network flushes |
|
|
85
|
+
|
|
86
|
+
## 🏗️ Architecture
|
|
87
|
+
|
|
88
|
+
```
|
|
89
|
+
torchlit.Monitor ──► FastAPI Backend ──► React Dashboard (browser)
|
|
90
|
+
│ │
|
|
91
|
+
└──► Rust CLI (torchlit-progress) └──► WebSocket streaming
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## 📦 Platform Support
|
|
95
|
+
|
|
96
|
+
`pip install torchlit` includes pre-compiled CLI binaries for all major platforms:
|
|
97
|
+
|
|
98
|
+
| Platform | Binary |
|
|
99
|
+
|---|---|
|
|
100
|
+
| macOS ARM (M1/M2/M3) | ✅ Included |
|
|
101
|
+
| macOS Intel | ✅ Included |
|
|
102
|
+
| Linux x86_64 | ✅ Included |
|
|
103
|
+
| Windows x64 | ✅ Included |
|
|
104
|
+
|
|
105
|
+
## 🔗 Links
|
|
106
|
+
|
|
107
|
+
- [GitHub](https://github.com/ChanLumerico/torchlit)
|
|
108
|
+
- [PyPI](https://pypi.org/project/torchlit/)
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# torchlit 🔥
|
|
2
|
+
|
|
3
|
+
<p align="center">
|
|
4
|
+
<b>A lightweight, beautiful, and interactive real-time PyTorch training dashboard.</b>
|
|
5
|
+
</p>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<img src="assets/dashboard.png" alt="torchlit Dashboard" width="80%" />
|
|
9
|
+
</p>
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
`torchlit` is a zero-setup desktop GUI that hooks directly into your PyTorch training loops to monitor metrics, system stats, and model architecture in real time. Stop relying on cluttered TQDM bars or heavy logging frameworks.
|
|
14
|
+
|
|
15
|
+
## ✨ Features
|
|
16
|
+
|
|
17
|
+
- **Zero Configuration** — `with torchlit.Monitor():` and your dashboard spins up instantly
|
|
18
|
+
- **Real-Time Streaming** — FastAPI + WebSockets push metrics to the browser immediately
|
|
19
|
+
- **Model Architecture Explorer** — Interactive, color-coded layer tree with parameter counts
|
|
20
|
+
- **Multi-Session Comparison** — Overlay and compare multiple experiments side-by-side
|
|
21
|
+
- **Rust CLI Progress Display** — Beautiful terminal TUI (powered by `ratatui`) while training runs
|
|
22
|
+
- **System Resource Sparklines** — Live CPU, RAM, and VRAM usage tracking
|
|
23
|
+
- **CSV Export** — Download all aggregated metrics at any time
|
|
24
|
+
- **Auto-Shutdown** — Background server cleans up automatically when the browser is closed
|
|
25
|
+
|
|
26
|
+
## 🚀 Quick Start
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install torchlit
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
import torch
|
|
34
|
+
import torchlit
|
|
35
|
+
|
|
36
|
+
model = torch.nn.Sequential(
|
|
37
|
+
torch.nn.Linear(10, 50),
|
|
38
|
+
torch.nn.ReLU(),
|
|
39
|
+
torch.nn.Linear(50, 2)
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
with torchlit.Monitor(exp_name="my_experiment", model=model, total_steps=100) as logger:
|
|
43
|
+
for step in range(1, 101):
|
|
44
|
+
loss = max(0, 1.0 - step * 0.01)
|
|
45
|
+
logger.log({"loss": loss, "accuracy": step / 100.0}, step=step)
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
*Your browser opens automatically at `http://localhost:8000`.*
|
|
49
|
+
*A live Rust-powered TUI also appears directly in your terminal.*
|
|
50
|
+
|
|
51
|
+
## 📖 Full Example
|
|
52
|
+
|
|
53
|
+
See [`examples/example.py`](examples/example.py) for a complete CIFAR-10 + ResNet-50 training loop demonstrating all features.
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
python examples/example.py
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## ⚙️ Monitor Options
|
|
60
|
+
|
|
61
|
+
| Parameter | Type | Default | Description |
|
|
62
|
+
|---|---|---|---|
|
|
63
|
+
| `exp_name` | `str` | `"default_experiment"` | Name for this training run |
|
|
64
|
+
| `model` | `nn.Module` | `None` | PyTorch model (for architecture extraction) |
|
|
65
|
+
| `total_steps` | `int` | `None` | Total steps (enables ETA in CLI display) |
|
|
66
|
+
| `server_url` | `str` | `http://localhost:8000` | Dashboard server URL |
|
|
67
|
+
| `flush_interval` | `float` | `1.0` | Seconds between network flushes |
|
|
68
|
+
|
|
69
|
+
## 🏗️ Architecture
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
torchlit.Monitor ──► FastAPI Backend ──► React Dashboard (browser)
|
|
73
|
+
│ │
|
|
74
|
+
└──► Rust CLI (torchlit-progress) └──► WebSocket streaming
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## 📦 Platform Support
|
|
78
|
+
|
|
79
|
+
`pip install torchlit` includes pre-compiled CLI binaries for all major platforms:
|
|
80
|
+
|
|
81
|
+
| Platform | Binary |
|
|
82
|
+
|---|---|
|
|
83
|
+
| macOS ARM (M1/M2/M3) | ✅ Included |
|
|
84
|
+
| macOS Intel | ✅ Included |
|
|
85
|
+
| Linux x86_64 | ✅ Included |
|
|
86
|
+
| Windows x64 | ✅ Included |
|
|
87
|
+
|
|
88
|
+
## 🔗 Links
|
|
89
|
+
|
|
90
|
+
- [GitHub](https://github.com/ChanLumerico/torchlit)
|
|
91
|
+
- [PyPI](https://pypi.org/project/torchlit/)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "torchlit-lib"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "A lightweight real-time ML training monitor."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
authors = [
|
|
11
|
+
{ name="ChanLumerico" }
|
|
12
|
+
]
|
|
13
|
+
requires-python = ">=3.8"
|
|
14
|
+
dependencies = [
|
|
15
|
+
"requests",
|
|
16
|
+
"psutil",
|
|
17
|
+
"fastapi>=0.100.0",
|
|
18
|
+
"uvicorn>=0.20.0",
|
|
19
|
+
"websockets>=10.4",
|
|
20
|
+
"pydantic>=2.0.0",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[tool.setuptools]
|
|
24
|
+
packages = ["torchlit", "torchlit.backend", "torchlit.frontend", "torchlit.bin"]
|
|
25
|
+
|
|
26
|
+
[tool.setuptools.package-data]
|
|
27
|
+
"torchlit.frontend" = ["dist/**/*"]
|
|
28
|
+
"torchlit.bin" = ["torchlit-progress-*"]
|
|
29
|
+
|
|
30
|
+
[project.urls]
|
|
31
|
+
Homepage = "https://github.com/ChanLumerico/torchlit"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Empty init to make it a package
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException
|
|
2
|
+
from fastapi.middleware.cors import CORSMiddleware
|
|
3
|
+
from fastapi.responses import FileResponse
|
|
4
|
+
from fastapi.staticfiles import StaticFiles
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
|
+
import asyncio
|
|
8
|
+
import os
|
|
9
|
+
import signal
|
|
10
|
+
from collections import defaultdict, deque
|
|
11
|
+
|
|
12
|
+
app = FastAPI(title="torchlit broker")

# Allow the Vite dev server (npm run dev on port 5173) to call the API
# cross-origin during frontend development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:5173"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# In-memory storage for metrics. structure:
# {
#     "experiment_name": deque([metric_dict, metric_dict, ...], maxlen=1000)
# }
# Bounded history: once an experiment exceeds 1000 points the oldest are evicted.
experiment_metrics: Dict[str, deque] = defaultdict(lambda: deque(maxlen=1000))

# {
#     "experiment_name": [websocket1, websocket2, ...]
# }
# Live frontend sockets per experiment; entries are appended on connect and
# pruned on disconnect or failed broadcast.
active_connections: Dict[str, List[WebSocket]] = defaultdict(list)

# Auto-shutdown state.
# Set True by POST /api/status {"status": "finished"}; combined with zero open
# websockets this triggers delayed_shutdown().
training_finished = False
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
async def delayed_shutdown(delay: int = 2):
    """Sleep for *delay* seconds, then hard-exit if shutdown is still warranted."""
    await asyncio.sleep(delay)
    # Conditions may have changed while we slept — re-check before exiting.
    open_sockets = sum(len(ws_list) for ws_list in active_connections.values())
    if not training_finished or open_sockets:
        return
    print(
        "\n⚡ torchlit dashboard auto-shutting down because there are no active connections."
    )
    # Force terminate from a separate thread to bypass uvicorn's signal interception
    import threading

    threading.Thread(target=lambda: os._exit(0), daemon=True).start()
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class MetricLog(BaseModel):
    """One telemetry payload posted by the torchlit Python client to /api/log."""

    exp_name: str  # experiment this data point belongs to
    step: int  # training step index
    metrics: Dict[str, Any]  # user-logged scalars (loss, accuracy, ...)
    sys_stats: Dict[str, Any]  # CPU/RAM/VRAM snapshot from the client
    # Fix: the field is nullable, so it must be annotated Optional — with a
    # bare `Dict[str, Any]` annotation pydantic v2 rejects an explicit None.
    model_info: Optional[Dict[str, Any]] = None
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class StatusLog(BaseModel):
    """Lifecycle notification posted to /api/status (e.g. status="finished")."""

    status: str
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@app.post("/api/status")
async def update_status(status_log: StatusLog):
    """Receive status updates. Auto-shutdown if training finished and no one is watching."""
    global training_finished
    if status_log.status != "finished":
        return {"status": "ok"}

    training_finished = True
    # No open websockets at all? (every per-experiment list is empty)
    if not any(active_connections.values()):
        # Give user 10 seconds to open browser if they haven't yet
        asyncio.create_task(delayed_shutdown(delay=10))
    return {"status": "ok"}
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@app.post("/api/log")
async def log_metrics(log_data: MetricLog):
    """
    Receive metrics from the torchlit python client and broadcast to connected frontends.
    """
    exp_name = log_data.exp_name
    # Fix: .dict() is deprecated in pydantic v2 (the version this package
    # pins: pydantic>=2.0.0) — use model_dump() instead.
    data_point = log_data.model_dump()

    # Store in memory cache (bounded deque; oldest points are evicted)
    experiment_metrics[exp_name].append(data_point)

    # Broadcast to connected clients for this experiment
    if exp_name in active_connections:
        dead_connections = []
        for connection in active_connections[exp_name]:
            try:
                await connection.send_json(data_point)
            except Exception:
                dead_connections.append(connection)

        # Cleanup dead connections. Guard the removal: the disconnect handler
        # may have pruned the socket concurrently while we awaited sends.
        for dead in dead_connections:
            if dead in active_connections[exp_name]:
                active_connections[exp_name].remove(dead)

    return {"status": "ok"}
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@app.websocket("/ws/stream/{exp_name}")
async def websocket_endpoint(websocket: WebSocket, exp_name: str):
    """
    WebSocket endpoint for frontend to receive live real-time metrics for a specific experiment.
    On connection, rehydrate with the last N cached metrics.
    """
    await websocket.accept()
    active_connections[exp_name].append(websocket)

    try:
        # Rehydrate existing data
        if exp_name in experiment_metrics and len(experiment_metrics[exp_name]) > 0:
            # Send all historical metrics (copy the deque: it may grow while
            # we await sends)
            for data_point in list(experiment_metrics[exp_name]):
                await websocket.send_json(data_point)

        # Keep connection alive
        while True:
            # Wait for any messages from client (e.g. ping)
            await websocket.receive_text()

    except WebSocketDisconnect:
        # Fix: this socket may already have been pruned (dead-connection
        # cleanup in /api/log, or /api/experiments/clear) — a blind remove()
        # would raise ValueError here, so check membership first.
        if websocket in active_connections[exp_name]:
            active_connections[exp_name].remove(websocket)
        if not active_connections[exp_name]:
            # Drop the (possibly defaultdict-created) empty list.
            del active_connections[exp_name]

        # Trigger auto-shutdown check if training is done
        total_connections = sum(len(conns) for conns in active_connections.values())
        if training_finished and total_connections == 0:
            asyncio.create_task(delayed_shutdown(delay=2))
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
@app.get("/api/experiments")
async def list_experiments():
    """List all active experiments"""
    # Iterating the dict yields its keys (experiment names).
    names = [name for name in experiment_metrics]
    return {"experiments": names}
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
@app.post("/api/experiments/clear")
async def clear_all_experiments():
    """Clear all experiment data and drop connections."""
    try:
        # Snapshot every open websocket first (the registry mutates while we
        # await closes), then send each a clean close frame.
        sockets = [
            ws
            for conns in list(active_connections.values())
            for ws in list(conns)
        ]
        for ws in sockets:
            try:
                await ws.close()
            except Exception:
                pass

        # Drop the in-memory datastores.
        experiment_metrics.clear()
        active_connections.clear()

        return {"status": "success"}
    except Exception as e:
        import traceback

        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
@app.delete("/api/experiments/{exp_name}")
async def delete_experiment(exp_name: str):
    """Delete all data for a specific experiment and close its connections"""
    # Drop cached metrics, if any.
    if exp_name in experiment_metrics:
        del experiment_metrics[exp_name]

    # Close each client socket (best effort), then forget the experiment.
    if exp_name in active_connections:
        for ws in list(active_connections[exp_name]):
            try:
                await ws.close()
            except Exception:
                pass
        del active_connections[exp_name]

    return {"status": "ok", "deleted": exp_name}
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
# --- Serve Frontend SPA ---
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Built React bundle ships next to the backend package: torchlit/frontend/dist
FRONTEND_DIST = os.path.join(BASE_DIR, "..", "frontend", "dist")
FRONTEND_ASSETS = os.path.join(FRONTEND_DIST, "assets")

# Mount /assets only when a frontend build exists (absent in dev checkouts).
if os.path.exists(FRONTEND_ASSETS):
    app.mount("/assets", StaticFiles(directory=FRONTEND_ASSETS), name="assets")
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
@app.get("/{full_path:path}")
async def serve_frontend(full_path: str):
    """Fallback route to serve the React SPA index.html for all non-API paths."""
    index_path = os.path.join(FRONTEND_DIST, "index.html")
    # No built frontend available — report how to produce one.
    if not os.path.exists(index_path):
        return {
            "error": "Frontend build not found. Run 'npm run build' inside torchlit/frontend"
        }
    # Always serve index.html; React performs the client-side routing.
    return FileResponse(index_path)
|
|
File without changes
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
File without changes
|
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
import contextlib
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import platform
|
|
5
|
+
import subprocess
|
|
6
|
+
import threading
|
|
7
|
+
import time
|
|
8
|
+
import requests
|
|
9
|
+
import queue
|
|
10
|
+
import psutil
|
|
11
|
+
import socket
|
|
12
|
+
import sys
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Dict, Any, Optional
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _get_bin_path() -> Path:
|
|
18
|
+
"""Return the path to the platform-specific torchlit-progress binary."""
|
|
19
|
+
system = platform.system() # Darwin, Linux, Windows
|
|
20
|
+
machine = platform.machine() # arm64, x86_64, AMD64
|
|
21
|
+
|
|
22
|
+
if system == "Darwin":
|
|
23
|
+
suffix = f"darwin-{machine}" # darwin-arm64 | darwin-x86_64
|
|
24
|
+
elif system == "Linux":
|
|
25
|
+
suffix = f"linux-{machine}" # linux-x86_64 | linux-aarch64
|
|
26
|
+
elif system == "Windows":
|
|
27
|
+
suffix = f"windows-x86_64.exe" # windows-x86_64.exe
|
|
28
|
+
else:
|
|
29
|
+
suffix = None
|
|
30
|
+
|
|
31
|
+
if suffix is None:
|
|
32
|
+
return Path() # Empty path — will not exist, falls back gracefully
|
|
33
|
+
|
|
34
|
+
return Path(__file__).parent / "bin" / f"torchlit-progress-{suffix}"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Resolved once at import time; may point at a nonexistent file (or be an
# empty Path on unsupported platforms) — callers must check .exists().
_BIN_PATH = _get_bin_path()
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class Monitor(contextlib.ContextDecorator):
    """
    Context manager and decorator for monitoring PyTorch training loops.

    Sends real-time telemetry to the local torchlit visualization server.
    Launches a Rust-based CLI display (torchlit-progress) for terminal progress.
    Falls back to plain text if the binary is not available.
    """
|
|
47
|
+
|
|
48
|
+
def __init__(
|
|
49
|
+
self,
|
|
50
|
+
exp_name: str = "default_experiment",
|
|
51
|
+
server_url: str | None = None,
|
|
52
|
+
flush_interval: float = 1.0,
|
|
53
|
+
model_info: Dict[str, Any] = None,
|
|
54
|
+
model: Optional[Any] = None,
|
|
55
|
+
optimizer: Optional[Any] = None,
|
|
56
|
+
start_server: bool = True,
|
|
57
|
+
total_steps: Optional[int] = None,
|
|
58
|
+
):
|
|
59
|
+
self.exp_name = exp_name
|
|
60
|
+
self.server_url = (
|
|
61
|
+
server_url.rstrip("/") if server_url else "http://localhost:8000"
|
|
62
|
+
)
|
|
63
|
+
self.flush_interval = flush_interval
|
|
64
|
+
self.model_info = model_info or {}
|
|
65
|
+
self.model = model
|
|
66
|
+
self.optimizer = optimizer
|
|
67
|
+
self.start_server = start_server
|
|
68
|
+
self.total_steps = total_steps
|
|
69
|
+
|
|
70
|
+
self.queue = queue.Queue()
|
|
71
|
+
self.is_running = False
|
|
72
|
+
self.worker_thread: Optional[threading.Thread] = None
|
|
73
|
+
|
|
74
|
+
# Hardware Detection (Cache once)
|
|
75
|
+
self.device_type = "cpu"
|
|
76
|
+
self.device_name = "CPU"
|
|
77
|
+
self._torch = None
|
|
78
|
+
|
|
79
|
+
try:
|
|
80
|
+
import torch
|
|
81
|
+
|
|
82
|
+
self._torch = torch
|
|
83
|
+
if torch.cuda.is_available():
|
|
84
|
+
self.device_type = "cuda"
|
|
85
|
+
self.device_name = torch.cuda.get_device_name(0)
|
|
86
|
+
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
|
|
87
|
+
self.device_type = "mps"
|
|
88
|
+
self.device_name = "Apple Silicon (MPS)"
|
|
89
|
+
except Exception:
|
|
90
|
+
pass
|
|
91
|
+
|
|
92
|
+
# Auto-extract model info if provided
|
|
93
|
+
if self.model is not None:
|
|
94
|
+
self._extract_model_info()
|
|
95
|
+
|
|
96
|
+
# Rust CLI display state
|
|
97
|
+
self._cli_proc: Optional[subprocess.Popen] = None
|
|
98
|
+
self._start_time: Optional[float] = None
|
|
99
|
+
|
|
100
|
+
def _format_num(self, num: int) -> str:
|
|
101
|
+
if num >= 1e9:
|
|
102
|
+
return f"{num / 1e9:.1f} B"
|
|
103
|
+
elif num >= 1e6:
|
|
104
|
+
return f"{num / 1e6:.1f} M"
|
|
105
|
+
elif num >= 1e3:
|
|
106
|
+
return f"{num / 1e3:.1f} K"
|
|
107
|
+
return str(num)
|
|
108
|
+
|
|
109
|
+
    def _extract_model_info(self):
        """Populate ``self.model_info`` from the attached PyTorch model.

        Best-effort: any failure (e.g. ``self.model`` is not an nn.Module)
        is swallowed so monitoring never breaks the training loop.
        """
        try:
            # Caller-supplied model_info keys take precedence: dict.get keeps
            # existing values and only fills in missing ones.
            self.model_info["name"] = self.model_info.get(
                "name", self.model.__class__.__name__
            )

            # Count parameters
            total_params = sum(p.numel() for p in self.model.parameters())
            trainable_params = sum(
                p.numel() for p in self.model.parameters() if p.requires_grad
            )

            self.model_info["total_params"] = self.model_info.get(
                "total_params", self._format_num(total_params)
            )
            self.model_info["trainable_params"] = self.model_info.get(
                "trainable_params", self._format_num(trainable_params)
            )

            # Try to infer device from first parameter
            first_param = next(self.model.parameters(), None)
            if first_param is not None and hasattr(first_param, "device"):
                dev_type = first_param.device.type
                if dev_type == "cuda":
                    self.device_type = "cuda"
                    if self._torch and self._torch.cuda.is_available():
                        # device.index is None for the default device → GPU 0.
                        self.device_name = self._torch.cuda.get_device_name(
                            first_param.device.index or 0
                        )
                elif dev_type == "mps":
                    self.device_type = "mps"
                    self.device_name = "Apple Silicon (MPS)"
                elif dev_type == "cpu":
                    self.device_type = "cpu"
                    self.device_name = "CPU"

            # Extract architecture tree
            def _get_module_tree(module, name="Root"):
                # Recursively build a JSON-serializable tree of submodules:
                # "params" counts the module's own parameters, "total_params"
                # includes all descendants.
                children = list(module.named_children())
                node_params = sum(p.numel() for p in module.parameters(recurse=False))
                total_node_params = sum(p.numel() for p in module.parameters())

                node = {
                    "name": name,
                    "class_name": module.__class__.__name__,
                    "params": node_params,
                    "total_params": total_node_params,
                    "children": [],
                }

                for child_name, child_module in children:
                    node["children"].append(_get_module_tree(child_module, child_name))

                return node

            self.model_info["architecture"] = _get_module_tree(self.model)

        except Exception:
            # Model introspection is optional; never fail the Monitor for it.
            pass
|
|
168
|
+
|
|
169
|
+
    def _start_server_if_needed(self):
        """Checks if port 8000 is open. If not, spawns the FastAPI server as a detached daemon."""
        try:
            from urllib.parse import urlparse

            # Fall back to localhost:8000 when the URL omits host or port.
            port = urlparse(self.server_url).port or 8000
            host = urlparse(self.server_url).hostname or "localhost"

            # Probe the port: connect_ex returns 0 when something is listening.
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.settimeout(0.1)
                if s.connect_ex((host, port)) != 0:
                    print(
                        f"⚡ torchlit catching up! Spawning dashboard background server at {self.server_url}..."
                    )
                    # Detached process (start_new_session=True) so the server
                    # outlives this training script; its output is discarded.
                    subprocess.Popen(
                        [
                            sys.executable,
                            "-m",
                            "uvicorn",
                            "torchlit.backend.main:app",
                            "--port",
                            str(port),
                            "--log-level",
                            "error",
                            "--no-access-log",
                        ],
                        stdout=subprocess.DEVNULL,
                        stderr=subprocess.DEVNULL,
                        start_new_session=True,
                    )
                    # Give uvicorn a moment to bind before the first request.
                    time.sleep(1.5)
        except Exception as e:
            # Best-effort: report but never crash the training loop.
            print(f"⚠️ torchlit could not start background server: {e}")
|
|
202
|
+
|
|
203
|
+
# ─────────────────────────────────────────────────────────────────────────
|
|
204
|
+
# Rust CLI Display
|
|
205
|
+
# ─────────────────────────────────────────────────────────────────────────
|
|
206
|
+
|
|
207
|
+
def _write_cli(self, msg: dict) -> None:
|
|
208
|
+
"""Write a JSON message line to the Rust CLI process stdin."""
|
|
209
|
+
if self._cli_proc is not None and self._cli_proc.poll() is None:
|
|
210
|
+
try:
|
|
211
|
+
line = json.dumps(msg) + "\n"
|
|
212
|
+
self._cli_proc.stdin.write(line.encode())
|
|
213
|
+
self._cli_proc.stdin.flush()
|
|
214
|
+
except (BrokenPipeError, OSError):
|
|
215
|
+
self._cli_proc = None
|
|
216
|
+
|
|
217
|
+
    def _start_cli(self) -> None:
        """Spawn the Rust CLI binary as a subprocess."""
        if not _BIN_PATH.exists():
            return  # Binary not compiled yet — skip silently

        try:
            self._cli_proc = subprocess.Popen(
                [str(_BIN_PATH)],
                stdin=subprocess.PIPE,  # we stream JSON lines to the TUI
                stdout=None,  # inherit terminal so the TUI can draw
                stderr=subprocess.DEVNULL,
            )
            # Give Rust process a moment to initialize before writing
            time.sleep(0.15)
            # Send init message; model_info values may be None when no model
            # was attached to the Monitor.
            self._write_cli(
                {
                    "type": "init",
                    "exp_name": self.exp_name,
                    "model_name": self.model_info.get("name"),
                    "total_params": self.model_info.get("total_params"),
                    "trainable_params": self.model_info.get("trainable_params"),
                    "device": self.device_name,
                    "total_steps": self.total_steps,
                }
            )
        except Exception:
            # CLI display is best-effort; training continues without it.
            self._cli_proc = None
|
|
245
|
+
|
|
246
|
+
    def _stop_cli(self, final_step: int = 0) -> None:
        """Send done message and wait for the Rust CLI to exit cleanly."""
        if self._cli_proc is None:
            return
        try:
            self._write_cli({"type": "done", "step": final_step})
            # Closing stdin signals EOF to the CLI so it can exit on its own.
            self._cli_proc.stdin.close()
            self._cli_proc.wait(timeout=5)
        except Exception:
            # Graceful shutdown failed (timeout, dead pipe) — force terminate.
            try:
                self._cli_proc.terminate()
            except Exception:
                pass
        self._cli_proc = None
|
|
260
|
+
|
|
261
|
+
# ─────────────────────────────────────────────────────────────────────────
|
|
262
|
+
|
|
263
|
+
    def __enter__(self):
        """Start the dashboard server (optionally), flusher thread, and CLI display."""
        if self.start_server:
            self._start_server_if_needed()

        self._start_time = time.time()
        self.is_running = True
        # Daemon thread: cannot block interpreter exit if training crashes.
        self.worker_thread = threading.Thread(target=self._worker_loop, daemon=True)
        self.worker_thread.start()

        self._start_cli()
        return self
|
|
274
|
+
|
|
275
|
+
    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop the worker, flush pending metrics, shut down the CLI, notify the server."""
        self.is_running = False
        if self.worker_thread is not None:
            self.worker_thread.join(timeout=2.0)

        # Flush remaining queued items
        self._flush_queue()

        self._stop_cli(final_step=self._last_step)

        if self.start_server:
            try:
                # Tell the dashboard the run is over; it auto-shuts-down once
                # the last browser tab closes.
                requests.post(
                    f"{self.server_url}/api/status",
                    json={"status": "finished"},
                    timeout=1.0,
                )
                print(
                    f"⚡ torchlit training complete! Dashboard stays active at {self.server_url}"
                )
                print(
                    "    (It will automatically shut down when you close the browser window)"
                )
            except requests.RequestException:
                # Server already gone — nothing to notify.
                pass

        # Returning False propagates any exception raised in the with-block.
        return False
|
|
302
|
+
|
|
303
|
+
_last_step: int = 0
|
|
304
|
+
|
|
305
|
+
def log(self, metrics: Dict[str, Any], step: int):
    """Record *metrics* for *step*: enqueue them for the HTTP flusher and
    stream a progress update to the Rust TUI."""
    self._last_step = step

    if self._start_time:
        elapsed = time.time() - self._start_time
    else:
        elapsed = 0.0

    # Hand off to the background worker for delivery to the server.
    self.queue.put({"step": step, "metrics": metrics})

    # Mirror the same update on the terminal display.
    payload = {
        "type": "step",
        "step": step,
        "metrics": metrics,
        "elapsed": elapsed,
    }
    self._write_cli(payload)
|
|
321
|
+
|
|
322
|
+
def _get_system_stats(self) -> Dict[str, Any]:
    """Snapshot current system utilisation (CPU, RAM, and device memory).

    VRAM is reported as a percentage only when torch is available and the
    active device is CUDA or MPS; otherwise it stays None.
    """
    stats: Dict[str, Any] = {
        "cpu_percent": psutil.cpu_percent(interval=None),
        "ram_percent": psutil.virtual_memory().percent,
        "device_type": self.device_type,
        "device_name": self.device_name,
        "vram_percent": None,
    }

    torch_mod = self._torch
    if torch_mod is None:
        return stats

    try:
        if self.device_type == "cuda":
            used = torch_mod.cuda.memory_allocated(0)
            capacity = torch_mod.cuda.get_device_properties(0).total_memory
            if capacity > 0:
                stats["vram_percent"] = (used / capacity) * 100
        elif self.device_type == "mps":
            # MPS uses unified memory, so compare against system RAM.
            used = torch_mod.mps.current_allocated_memory()
            stats["vram_percent"] = (used / psutil.virtual_memory().total) * 100
    except Exception:
        # Stats are best-effort; never let telemetry break training.
        pass

    return stats
|
|
347
|
+
|
|
348
|
+
def _worker_loop(self):
|
|
349
|
+
"""Background thread loop to send data"""
|
|
350
|
+
while self.is_running:
|
|
351
|
+
self._flush_queue()
|
|
352
|
+
time.sleep(self.flush_interval)
|
|
353
|
+
|
|
354
|
+
def _flush_queue(self):
|
|
355
|
+
"""Send all items currently in the queue"""
|
|
356
|
+
while not self.queue.empty():
|
|
357
|
+
try:
|
|
358
|
+
item = self.queue.get_nowait()
|
|
359
|
+
self._send_data(int(item["step"]), dict(item["metrics"]))
|
|
360
|
+
self.queue.task_done()
|
|
361
|
+
except queue.Empty:
|
|
362
|
+
break
|
|
363
|
+
|
|
364
|
+
def _send_data(self, step: int, metrics: Dict[str, Any]):
    """POST one batch of metrics (plus system stats) to the dashboard.

    Network failures are swallowed: telemetry must never interrupt the
    training loop.
    """
    body = {
        "exp_name": self.exp_name,
        "step": step,
        "metrics": metrics,
        "sys_stats": self._get_system_stats(),
        # The model summary is only attached on the very first step.
        "model_info": self.model_info if step == 1 else {},
    }

    try:
        requests.post(f"{self.server_url}/api/log", json=body, timeout=1.0)
    except requests.RequestException:
        pass
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: torchlit-lib
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: A lightweight real-time ML training monitor.
|
|
5
|
+
Author: ChanLumerico
|
|
6
|
+
Project-URL: Homepage, https://github.com/ChanLumerico/torchlit
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: requests
|
|
11
|
+
Requires-Dist: psutil
|
|
12
|
+
Requires-Dist: fastapi>=0.100.0
|
|
13
|
+
Requires-Dist: uvicorn>=0.20.0
|
|
14
|
+
Requires-Dist: websockets>=10.4
|
|
15
|
+
Requires-Dist: pydantic>=2.0.0
|
|
16
|
+
Dynamic: license-file
|
|
17
|
+
|
|
18
|
+
# torchlit 🔥
|
|
19
|
+
|
|
20
|
+
<p align="center">
|
|
21
|
+
<b>A lightweight, beautiful, and interactive real-time PyTorch training dashboard.</b>
|
|
22
|
+
</p>
|
|
23
|
+
|
|
24
|
+
<p align="center">
|
|
25
|
+
<img src="assets/dashboard.png" alt="torchlit Dashboard" width="80%" />
|
|
26
|
+
</p>
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
`torchlit` is a zero-setup desktop GUI that hooks directly into your PyTorch training loops to monitor metrics, system stats, and model architecture in real time. Stop relying on cluttered TQDM bars or heavy logging frameworks.
|
|
31
|
+
|
|
32
|
+
## ✨ Features
|
|
33
|
+
|
|
34
|
+
- **Zero Configuration** — `with torchlit.Monitor():` and your dashboard spins up instantly
|
|
35
|
+
- **Real-Time Streaming** — FastAPI + WebSockets push metrics to the browser immediately
|
|
36
|
+
- **Model Architecture Explorer** — Interactive, color-coded layer tree with parameter counts
|
|
37
|
+
- **Multi-Session Comparison** — Overlay and compare multiple experiments side-by-side
|
|
38
|
+
- **Rust CLI Progress Display** — Beautiful terminal TUI (powered by `ratatui`) while training runs
|
|
39
|
+
- **System Resource Sparklines** — Live CPU, RAM, and VRAM usage tracking
|
|
40
|
+
- **CSV Export** — Download all aggregated metrics at any time
|
|
41
|
+
- **Auto-Shutdown** — Background server cleans up automatically when the browser is closed
|
|
42
|
+
|
|
43
|
+
## 🚀 Quick Start
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install torchlit
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
import torch
|
|
51
|
+
import torchlit
|
|
52
|
+
|
|
53
|
+
model = torch.nn.Sequential(
|
|
54
|
+
torch.nn.Linear(10, 50),
|
|
55
|
+
torch.nn.ReLU(),
|
|
56
|
+
torch.nn.Linear(50, 2)
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
with torchlit.Monitor(exp_name="my_experiment", model=model, total_steps=100) as logger:
|
|
60
|
+
for step in range(1, 101):
|
|
61
|
+
loss = max(0, 1.0 - step * 0.01)
|
|
62
|
+
logger.log({"loss": loss, "accuracy": step / 100.0}, step=step)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
*Your browser opens automatically at `http://localhost:8000`.*
|
|
66
|
+
*A live Rust-powered TUI also appears directly in your terminal.*
|
|
67
|
+
|
|
68
|
+
## 📖 Full Example
|
|
69
|
+
|
|
70
|
+
See [`examples/example.py`](examples/example.py) for a complete CIFAR-10 + ResNet-50 training loop demonstrating all features.
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
python examples/example.py
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## ⚙️ Monitor Options
|
|
77
|
+
|
|
78
|
+
| Parameter | Type | Default | Description |
|
|
79
|
+
|---|---|---|---|
|
|
80
|
+
| `exp_name` | `str` | `"default_experiment"` | Name for this training run |
|
|
81
|
+
| `model` | `nn.Module` | `None` | PyTorch model (for architecture extraction) |
|
|
82
|
+
| `total_steps` | `int` | `None` | Total steps (enables ETA in CLI display) |
|
|
83
|
+
| `server_url` | `str` | `http://localhost:8000` | Dashboard server URL |
|
|
84
|
+
| `flush_interval` | `float` | `1.0` | Seconds between network flushes |
|
|
85
|
+
|
|
86
|
+
## 🏗️ Architecture
|
|
87
|
+
|
|
88
|
+
```
|
|
89
|
+
torchlit.Monitor ──► FastAPI Backend ──► React Dashboard (browser)
|
|
90
|
+
│ │
|
|
91
|
+
└──► Rust CLI (torchlit-progress) └──► WebSocket streaming
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## 📦 Platform Support
|
|
95
|
+
|
|
96
|
+
`pip install torchlit` includes pre-compiled CLI binaries for all major platforms:
|
|
97
|
+
|
|
98
|
+
| Platform | Binary |
|
|
99
|
+
|---|---|
|
|
100
|
+
| macOS ARM (M1/M2/M3) | ✅ Included |
|
|
101
|
+
| macOS Intel | ✅ Included |
|
|
102
|
+
| Linux x86_64 | ✅ Included |
|
|
103
|
+
| Windows x64 | ✅ Included |
|
|
104
|
+
|
|
105
|
+
## 🔗 Links
|
|
106
|
+
|
|
107
|
+
- [GitHub](https://github.com/ChanLumerico/torchlit)
|
|
108
|
+
- [PyPI](https://pypi.org/project/torchlit/)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
torchlit/__init__.py
|
|
5
|
+
torchlit/monitor.py
|
|
6
|
+
torchlit/backend/__init__.py
|
|
7
|
+
torchlit/backend/main.py
|
|
8
|
+
torchlit/bin/__init__.py
|
|
9
|
+
torchlit/bin/torchlit-progress-darwin-arm64
|
|
10
|
+
torchlit/bin/torchlit-progress-darwin-x86_64
|
|
11
|
+
torchlit/bin/torchlit-progress-linux-x86_64
|
|
12
|
+
torchlit/bin/torchlit-progress-windows-x86_64.exe
|
|
13
|
+
torchlit/frontend/__init__.py
|
|
14
|
+
torchlit_lib.egg-info/PKG-INFO
|
|
15
|
+
torchlit_lib.egg-info/SOURCES.txt
|
|
16
|
+
torchlit_lib.egg-info/dependency_links.txt
|
|
17
|
+
torchlit_lib.egg-info/requires.txt
|
|
18
|
+
torchlit_lib.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
torchlit
|