runmonitor 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- runmonitor/__init__.py +125 -0
- runmonitor/__main__.py +54 -0
- runmonitor/server.py +163 -0
- runmonitor/static/style.css +398 -0
- runmonitor/storage.py +235 -0
- runmonitor/templates/dashboard.html +615 -0
- runmonitor-0.2.0.dist-info/METADATA +148 -0
- runmonitor-0.2.0.dist-info/RECORD +12 -0
- runmonitor-0.2.0.dist-info/WHEEL +5 -0
- runmonitor-0.2.0.dist-info/entry_points.txt +2 -0
- runmonitor-0.2.0.dist-info/licenses/LICENSE +21 -0
- runmonitor-0.2.0.dist-info/top_level.txt +1 -0
runmonitor/__init__.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""
|
|
2
|
+
runmonitor — a lean, local experiment tracker with a live web dashboard.
|
|
3
|
+
Import and go. The dashboard auto-starts on port 8080 by default (override with $RUNMONITOR_PORT; the next free port is used if that one is taken).
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import json
|
|
7
|
+
import uuid
|
|
8
|
+
import atexit
|
|
9
|
+
import threading
|
|
10
|
+
import time
|
|
11
|
+
from . import server # noqa: F401 — starts the daemon
|
|
12
|
+
from . import storage
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Run:
|
|
16
|
+
"""A single experiment run. Created via rm.init()."""
|
|
17
|
+
|
|
18
|
+
def __init__(self, run_id: str, project_id: int, name: str | None,
|
|
19
|
+
config: dict, total_steps: int | None):
|
|
20
|
+
self._id = run_id
|
|
21
|
+
self._project_id = project_id
|
|
22
|
+
self._name = name
|
|
23
|
+
self._config = config
|
|
24
|
+
self._total_steps = total_steps
|
|
25
|
+
self._finished = False
|
|
26
|
+
self._sysmon_stop = threading.Event()
|
|
27
|
+
self._sysmon_thread = None
|
|
28
|
+
self._start_sysmon()
|
|
29
|
+
atexit.register(self._on_exit)
|
|
30
|
+
|
|
31
|
+
@property
|
|
32
|
+
def id(self) -> str:
|
|
33
|
+
return self._id
|
|
34
|
+
|
|
35
|
+
@property
|
|
36
|
+
def name(self) -> str | None:
|
|
37
|
+
return self._name
|
|
38
|
+
|
|
39
|
+
def log(self, metrics: dict, step: int) -> None:
|
|
40
|
+
"""Log a dictionary of metric-name → float at a given step."""
|
|
41
|
+
if self._finished:
|
|
42
|
+
raise RuntimeError("Cannot log to a finished run.")
|
|
43
|
+
if not isinstance(metrics, dict):
|
|
44
|
+
raise TypeError("metrics must be a dict")
|
|
45
|
+
storage.log_metrics(self._id, metrics, step)
|
|
46
|
+
|
|
47
|
+
def save(self, filepath: str) -> dict:
|
|
48
|
+
"""Save an artifact file alongside this run. Returns artifact info dict."""
|
|
49
|
+
return storage.save_artifact(self._id, filepath)
|
|
50
|
+
|
|
51
|
+
def finish(self) -> None:
|
|
52
|
+
"""Mark the run as successfully finished."""
|
|
53
|
+
if self._finished:
|
|
54
|
+
return
|
|
55
|
+
self._stop_sysmon()
|
|
56
|
+
storage.finish_run(self._id, "finished")
|
|
57
|
+
self._finished = True
|
|
58
|
+
|
|
59
|
+
def fail(self) -> None:
|
|
60
|
+
"""Mark the run as crashed."""
|
|
61
|
+
if self._finished:
|
|
62
|
+
return
|
|
63
|
+
self._stop_sysmon()
|
|
64
|
+
storage.finish_run(self._id, "crashed")
|
|
65
|
+
self._finished = True
|
|
66
|
+
|
|
67
|
+
# ── system metrics background thread ───────────────────
|
|
68
|
+
|
|
69
|
+
def _start_sysmon(self):
|
|
70
|
+
"""Start a daemon thread that logs CPU/RAM every 10 seconds."""
|
|
71
|
+
def _collect():
|
|
72
|
+
try:
|
|
73
|
+
import psutil
|
|
74
|
+
has_psutil = True
|
|
75
|
+
except ImportError:
|
|
76
|
+
has_psutil = False
|
|
77
|
+
|
|
78
|
+
last_cpu_sample = None
|
|
79
|
+
step_counter = 0
|
|
80
|
+
while not self._sysmon_stop.wait(timeout=10):
|
|
81
|
+
if not has_psutil:
|
|
82
|
+
return
|
|
83
|
+
step_counter += 1
|
|
84
|
+
try:
|
|
85
|
+
cpu = psutil.cpu_percent(interval=0.1)
|
|
86
|
+
mem = psutil.virtual_memory().percent
|
|
87
|
+
storage.log_system_metrics(self._id, step_counter, cpu, mem)
|
|
88
|
+
except Exception:
|
|
89
|
+
pass
|
|
90
|
+
|
|
91
|
+
self._sysmon_thread = threading.Thread(
|
|
92
|
+
target=_collect, name="rm-sysmon", daemon=True
|
|
93
|
+
)
|
|
94
|
+
self._sysmon_thread.start()
|
|
95
|
+
|
|
96
|
+
def _stop_sysmon(self):
|
|
97
|
+
self._sysmon_stop.set()
|
|
98
|
+
if self._sysmon_thread:
|
|
99
|
+
self._sysmon_thread.join(timeout=2)
|
|
100
|
+
|
|
101
|
+
def _on_exit(self):
|
|
102
|
+
"""Mark the run as crashed if it exits without finish()/fail()."""
|
|
103
|
+
if self._finished:
|
|
104
|
+
return
|
|
105
|
+
self._stop_sysmon()
|
|
106
|
+
storage.finish_run(self._id, "crashed")
|
|
107
|
+
self._finished = True
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def init(project: str, name: str | None = None, config: dict | None = None,
|
|
111
|
+
total_steps: int | None = None) -> Run:
|
|
112
|
+
"""
|
|
113
|
+
Create (or reuse) a project and start a new run.
|
|
114
|
+
|
|
115
|
+
import runmonitor as rm
|
|
116
|
+
run = rm.init("mnist-experiment", config={"lr": 0.001}, total_steps=1000)
|
|
117
|
+
|
|
118
|
+
Opens http://localhost:8080 for the live dashboard.
|
|
119
|
+
"""
|
|
120
|
+
storage.init_db()
|
|
121
|
+
project_id = storage.create_project(project)
|
|
122
|
+
run_id = uuid.uuid4().hex[:12]
|
|
123
|
+
config_json = json.dumps(config or {})
|
|
124
|
+
storage.create_run(run_id, project_id, name, config_json, total_steps)
|
|
125
|
+
return Run(run_id, project_id, name, config or {}, total_steps)
|
runmonitor/__main__.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Run the dashboard standalone — no training script needed.
|
|
2
|
+
|
|
3
|
+
runmonitor # if pip-installed (console script)
|
|
4
|
+
python -m runmonitor # from a checkout / vendored copy
|
|
5
|
+
RUNMONITOR_PORT=9000 runmonitor # choose a port (set before launch)
|
|
6
|
+
|
|
7
|
+
Importing ``runmonitor`` already auto-starts the dashboard daemon, so this
|
|
8
|
+
just makes sure it's up, opens a browser, and keeps the process alive.
|
|
9
|
+
"""
|
|
10
|
+
import argparse
|
|
11
|
+
import os
|
|
12
|
+
import threading
|
|
13
|
+
import webbrowser
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def main() -> None:
    """Start (or reuse) the dashboard server, print its URL and block until Ctrl-C.

    Command-line options:
        --port        preferred port; exported as RUNMONITOR_PORT unless that
                      variable is already set.
        --no-browser  do not open a browser window.
    """
    parser = argparse.ArgumentParser(
        prog="runmonitor",
        description="Live, terminal-styled experiment-tracking dashboard.",
    )
    parser.add_argument(
        "--port", type=int, default=None,
        help="Preferred port (most reliable via RUNMONITOR_PORT before launch).",
    )
    parser.add_argument(
        "--no-browser", action="store_true",
        help="Do not open a browser window.",
    )
    args = parser.parse_args()
    if args.port:
        # setdefault: a RUNMONITOR_PORT already present in the environment wins.
        os.environ.setdefault("RUNMONITOR_PORT", str(args.port))

    # Imported here, after the environment has been updated.
    # NOTE(review): the package __init__ also imports `server`, so it may
    # already be loaded (and started) by the time this line runs — confirm.
    from .storage import init_db
    from . import server

    init_db()
    server._start_server()  # idempotent — reuses the daemon if already running
    url = f"http://localhost:{server._port}"
    print(f" runmonitor dashboard → {url}")
    if args.port and args.port != server._port:
        print(f" (port {args.port} was unavailable; bound {server._port} instead)")
    if not args.no_browser:
        try:
            webbrowser.open(url)
        except Exception:
            # Opening a browser is a convenience only; the URL was printed above.
            pass

    never_set = threading.Event()
    try:
        # Block forever; Ctrl-C to quit. Waiting in short slices (rather than
        # one untimed wait) keeps Ctrl-C responsive on Windows, where an
        # untimed lock wait is not interrupted by signals.
        while not never_set.wait(timeout=1.0):
            pass
    except KeyboardInterrupt:
        print("\n bye.")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Script entry point: `python -m runmonitor` lands here.
if __name__ == "__main__":
    main()
|
runmonitor/server.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Flask server that auto-starts in a daemon thread on first import.
|
|
3
|
+
Serves the dashboard + JSON API.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import threading
|
|
7
|
+
import os
|
|
8
|
+
import time
|
|
9
|
+
import socket
|
|
10
|
+
from flask import Flask, jsonify, request, send_from_directory
|
|
11
|
+
|
|
12
|
+
from . import storage
|
|
13
|
+
|
|
14
|
+
# Templates and static assets live next to this module inside the package.
_PKG_DIR = os.path.dirname(__file__)

app = Flask(
    __name__,
    template_folder=os.path.join(_PKG_DIR, "templates"),
    static_folder=os.path.join(_PKG_DIR, "static"),
)
app.config["JSONIFY_PRETTYPRINT_REGULAR"] = False

# Module state managed by _start_server().
_started = False  # True once the daemon thread has been launched
_port = 8080      # port the dashboard is served on (updated at start-up)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _find_port(start=None):
|
|
24
|
+
"""Find the first port we can actually bind, starting from `start`.
|
|
25
|
+
|
|
26
|
+
Defaults to ``$RUNMONITOR_PORT`` (set before launch) or 8080.
|
|
27
|
+
"""
|
|
28
|
+
if start is None:
|
|
29
|
+
try:
|
|
30
|
+
start = int(os.environ.get("RUNMONITOR_PORT", 8080))
|
|
31
|
+
except ValueError:
|
|
32
|
+
start = 8080
|
|
33
|
+
for p in range(start, start + 100):
|
|
34
|
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
35
|
+
try:
|
|
36
|
+
s.bind(("127.0.0.1", p))
|
|
37
|
+
return p
|
|
38
|
+
except OSError:
|
|
39
|
+
continue
|
|
40
|
+
return start
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ── API routes ──────────────────────────────────────────────────
|
|
44
|
+
|
|
45
|
+
@app.route("/api/projects")
def api_projects():
    """Return the result of ``storage.get_projects()`` as a JSON response."""
    projects = storage.get_projects()
    return jsonify(projects)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@app.route("/api/runs")
def api_runs():
    """Return runs as JSON; the optional ``project`` query parameter is passed to storage."""
    project_filter = request.args.get("project")
    runs = storage.get_runs(project_filter)
    return jsonify(runs)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@app.route("/api/runs/<run_id>/config")
def api_run_config(run_id):
    """Return the stored config for ``run_id`` as JSON, or a 404 error body if there is none."""
    config_row = storage.get_run_config(run_id)
    if config_row is not None:
        return jsonify(config_row)
    return jsonify({"error": "not found"}), 404
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@app.route("/api/runs/<run_id>/metrics")
def api_run_metrics(run_id):
    """Return metrics for ``run_id`` as JSON.

    Query parameters:
        key:   optional metric name, passed to storage.
        limit: optional integer, passed to storage. A missing or
               non-integer value is treated as "no limit" rather than
               failing the request with a server error.
    """
    key = request.args.get("key")
    # type=int makes the lookup return None when the value is absent or
    # cannot be converted, matching how the live endpoint parses `since`.
    limit = request.args.get("limit", type=int)
    return jsonify(storage.get_metrics(run_id, key, limit))
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@app.route("/api/runs/<run_id>/metrics/live")
def api_run_metrics_live(run_id):
    """Return only metrics with step > `since` for efficient polling."""
    # `since` defaults to 0 when absent or not an integer.
    since_step = request.args.get("since", default=0, type=int)
    fresh_points = storage.get_metrics(run_id, since=since_step)
    return jsonify(fresh_points)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@app.route("/api/runs/<run_id>/artifacts")
def api_run_artifacts(run_id):
    """Return the result of ``storage.get_artifacts(run_id)`` as JSON."""
    artifacts = storage.get_artifacts(run_id)
    return jsonify(artifacts)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@app.route("/api/runs/<run_id>/system")
def api_run_system(run_id):
    """Return the result of ``storage.get_system_metrics(run_id)`` as JSON."""
    samples = storage.get_system_metrics(run_id)
    return jsonify(samples)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@app.route("/api/runs/<run_id>/export")
def api_run_export(run_id):
    """Export a run as JSON (default) or CSV (``?format=csv``).

    The CSV export contains one row per logged metric with the columns
    ``step, key, value, timestamp`` and is sent as an attachment. The JSON
    export bundles config, metrics, artifacts and system metrics.

    Returns a 404 JSON error when no config is stored for ``run_id``, the
    same "not found" signal the config endpoint uses.
    """
    config = storage.get_run_config(run_id)
    if config is None:
        return jsonify({"error": "not found"}), 404

    metrics = storage.get_metrics(run_id)
    fmt = request.args.get("format", "json")

    if fmt == "csv":
        import csv
        import io
        from flask import Response

        buffer = io.StringIO()
        writer = csv.writer(buffer)
        writer.writerow(["step", "key", "value", "timestamp"])
        writer.writerows(
            [m["step"], m["key"], m["value"], m["timestamp"]] for m in metrics
        )
        return Response(
            buffer.getvalue(),
            mimetype="text/csv",
            headers={"Content-Disposition": f"attachment; filename=run_{run_id}.csv"},
        )

    # Any other format value falls back to the JSON export.
    return jsonify({
        "run_id": run_id,
        "config": config,
        "metrics": metrics,
        "artifacts": storage.get_artifacts(run_id),
        "system_metrics": storage.get_system_metrics(run_id),
    })
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
@app.route("/api/runs/<run_id>/compare")
def api_run_compare(run_id):
    """Return metrics for two runs keyed by the same metric name.

    Requires the ``other`` (second run id) and ``key`` (metric name) query
    parameters; responds with a 400 error body when either is missing or empty.
    """
    other_id = request.args.get("other")
    key = request.args.get("key")
    if not (other_id and key):
        return jsonify({"error": "need `other` and `key` params"}), 400

    comparison = {
        "run_a": {"id": run_id, "metrics": storage.get_metrics(run_id, key=key)},
        "run_b": {"id": other_id, "metrics": storage.get_metrics(other_id, key=key)},
    }
    return jsonify(comparison)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
# ── Dashboard ───────────────────────────────────────────────────
|
|
137
|
+
|
|
138
|
+
@app.route("/")
def dashboard():
    """Render the dashboard page, passing the current server port to the template."""
    from flask import render_template

    page = render_template("dashboard.html", port=_port)
    return page
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# ── Daemon start ────────────────────────────────────────────────
|
|
145
|
+
|
|
146
|
+
def _start_server():
    """Launch the Flask app in a daemon thread; calls after the first return immediately.

    Picks the port via ``_find_port()`` and stores it in the module-level
    ``_port`` before the thread starts.
    """
    global _started, _port
    if _started:
        return
    _started = True
    _port = _find_port()

    server_thread = threading.Thread(
        # The lambda reads the module-level _port when the thread runs.
        target=lambda: app.run(
            host="127.0.0.1", port=_port, debug=False, use_reloader=False
        ),
        name="runmonitor-server",
        daemon=True,
    )
    server_thread.start()
    time.sleep(0.3)  # give it a moment to bind
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
import os as _os

# Import-time side effect: start the dashboard daemon unless the
# RUNMONITOR_STANDALONE environment variable is set to a non-empty value.
if not _os.environ.get("RUNMONITOR_STANDALONE"):
    _start_server()
|