gnvitop 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gnvitop-0.1.0/LICENSE +21 -0
- gnvitop-0.1.0/PKG-INFO +119 -0
- gnvitop-0.1.0/README.md +94 -0
- gnvitop-0.1.0/gnvitop/__init__.py +3 -0
- gnvitop-0.1.0/gnvitop/__main__.py +4 -0
- gnvitop-0.1.0/gnvitop/cli.py +76 -0
- gnvitop-0.1.0/gnvitop/dashboard.py +477 -0
- gnvitop-0.1.0/gnvitop/server.py +197 -0
- gnvitop-0.1.0/gnvitop.egg-info/PKG-INFO +119 -0
- gnvitop-0.1.0/gnvitop.egg-info/SOURCES.txt +14 -0
- gnvitop-0.1.0/gnvitop.egg-info/dependency_links.txt +1 -0
- gnvitop-0.1.0/gnvitop.egg-info/entry_points.txt +2 -0
- gnvitop-0.1.0/gnvitop.egg-info/requires.txt +2 -0
- gnvitop-0.1.0/gnvitop.egg-info/top_level.txt +1 -0
- gnvitop-0.1.0/pyproject.toml +37 -0
- gnvitop-0.1.0/setup.cfg +4 -0
gnvitop-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Linwei94
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
gnvitop-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gnvitop
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Global nvitop: web-based GPU monitoring dashboard for all your remote servers via SSH
|
|
5
|
+
Author: Linwei94
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Linwei94/gnvitop
|
|
8
|
+
Project-URL: Repository, https://github.com/Linwei94/gnvitop
|
|
9
|
+
Project-URL: Issues, https://github.com/Linwei94/gnvitop/issues
|
|
10
|
+
Keywords: gpu,monitor,nvidia,ssh,nvitop,dashboard,nvidia-smi
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Environment :: Web Environment
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: Intended Audience :: System Administrators
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Topic :: System :: Monitoring
|
|
19
|
+
Requires-Python: >=3.7
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: flask>=2.0
|
|
23
|
+
Requires-Dist: paramiko>=2.0
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
|
|
26
|
+
# gnvitop
|
|
27
|
+
|
|
28
|
+
**Global nvitop** -- a web-based GPU monitoring dashboard that monitors **all** your remote GPU servers from a single page.
|
|
29
|
+
|
|
30
|
+
Like [nvitop](https://github.com/XuehaiPan/nvitop), but for **all your servers at once**, displayed as a beautiful web dashboard.
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
pip install gnvitop
|
|
34
|
+
gnvitop
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## How It Works
|
|
38
|
+
|
|
39
|
+
1. Reads your `~/.ssh/config` automatically
|
|
40
|
+
2. Connects to each server over SSH and runs `nvidia-smi`
|
|
41
|
+
3. Displays everything in a real-time web dashboard
|
|
42
|
+
4. Auto-refreshes every 30 seconds
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
┌──> Server A (nvidia-smi) ──> 4x A100
|
|
46
|
+
gnvitop ──> Browser ──> ├──> Server B (nvidia-smi) ──> 8x V100
|
|
47
|
+
├──> Server C (nvidia-smi) ──> 2x RTX 4090
|
|
48
|
+
└──> Server D ──> offline
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install gnvitop
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Usage
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
gnvitop # start and auto-open browser
|
|
61
|
+
gnvitop -p 8080 # custom port
|
|
62
|
+
gnvitop --host 0.0.0.0 # expose to LAN
|
|
63
|
+
gnvitop --no-browser # don't auto-open browser
|
|
64
|
+
gnvitop --ssh-config /path/to/config # custom SSH config
|
|
65
|
+
gnvitop -v # show version
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Or run as a module:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
python -m gnvitop
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Prerequisites
|
|
75
|
+
|
|
76
|
+
1. **SSH config** -- your `~/.ssh/config` should have server entries:
|
|
77
|
+
|
|
78
|
+
```
|
|
79
|
+
Host gpu-server-01
|
|
80
|
+
HostName 192.168.1.101
|
|
81
|
+
User alice
|
|
82
|
+
IdentityFile ~/.ssh/id_rsa
|
|
83
|
+
|
|
84
|
+
Host gpu-server-02
|
|
85
|
+
HostName 192.168.1.102
|
|
86
|
+
User bob
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
2. **SSH key auth** -- password-less login should be set up
|
|
90
|
+
3. **nvidia-smi** -- must be installed on the remote servers
|
|
91
|
+
|
|
92
|
+
## Features
|
|
93
|
+
|
|
94
|
+
- **Zero config** -- reads `~/.ssh/config` automatically, no setup needed
|
|
95
|
+
- **One command** -- `pip install gnvitop && gnvitop`, that's it
|
|
96
|
+
- **Auto browser** -- opens dashboard in your browser on start
|
|
97
|
+
- **Real-time** -- 30s auto-refresh with manual refresh button
|
|
98
|
+
- **Concurrent** -- queries all servers in parallel (10 workers)
|
|
99
|
+
- **Cached** -- 30s cache to avoid hammering your servers
|
|
100
|
+
- **Dark UI** -- clean, responsive dark-themed dashboard
|
|
101
|
+
- **At a glance** -- summary bar shows online hosts, total GPUs, idle GPUs, free memory
|
|
102
|
+
- **Color coded** -- green (online), yellow (no GPU), red (offline)
|
|
103
|
+
- **GPU details** -- utilization bars, memory bars, temperature with color alerts
|
|
104
|
+
|
|
105
|
+
## Comparison with nvitop
|
|
106
|
+
|
|
107
|
+
| Feature | nvitop | gnvitop |
|
|
108
|
+
|---------|--------|---------|
|
|
109
|
+
| Monitor local GPU | Yes | No |
|
|
110
|
+
| Monitor remote GPUs | No | Yes |
|
|
111
|
+
| Multiple servers | No | Yes |
|
|
112
|
+
| Interface | Terminal | Web browser |
|
|
113
|
+
| Setup | Run on each server | Run once, reads SSH config |
|
|
114
|
+
|
|
115
|
+
**gnvitop** is not a replacement for nvitop -- it's a complement. Use nvitop for detailed local GPU monitoring, use gnvitop to get an overview of all your GPU servers from one place.
|
|
116
|
+
|
|
117
|
+
## License
|
|
118
|
+
|
|
119
|
+
MIT
|
gnvitop-0.1.0/README.md
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# gnvitop
|
|
2
|
+
|
|
3
|
+
**Global nvitop** -- a web-based GPU monitoring dashboard that monitors **all** your remote GPU servers from a single page.
|
|
4
|
+
|
|
5
|
+
Like [nvitop](https://github.com/XuehaiPan/nvitop), but for **all your servers at once**, displayed as a beautiful web dashboard.
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
pip install gnvitop
|
|
9
|
+
gnvitop
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## How It Works
|
|
13
|
+
|
|
14
|
+
1. Reads your `~/.ssh/config` automatically
|
|
15
|
+
2. Connects to each server over SSH and runs `nvidia-smi`
|
|
16
|
+
3. Displays everything in a real-time web dashboard
|
|
17
|
+
4. Auto-refreshes every 30 seconds
|
|
18
|
+
|
|
19
|
+
```
|
|
20
|
+
┌──> Server A (nvidia-smi) ──> 4x A100
|
|
21
|
+
gnvitop ──> Browser ──> ├──> Server B (nvidia-smi) ──> 8x V100
|
|
22
|
+
├──> Server C (nvidia-smi) ──> 2x RTX 4090
|
|
23
|
+
└──> Server D ──> offline
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install gnvitop
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Usage
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
gnvitop # start and auto-open browser
|
|
36
|
+
gnvitop -p 8080 # custom port
|
|
37
|
+
gnvitop --host 0.0.0.0 # expose to LAN
|
|
38
|
+
gnvitop --no-browser # don't auto-open browser
|
|
39
|
+
gnvitop --ssh-config /path/to/config # custom SSH config
|
|
40
|
+
gnvitop -v # show version
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Or run as a module:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
python -m gnvitop
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Prerequisites
|
|
50
|
+
|
|
51
|
+
1. **SSH config** -- your `~/.ssh/config` should have server entries:
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
Host gpu-server-01
|
|
55
|
+
HostName 192.168.1.101
|
|
56
|
+
User alice
|
|
57
|
+
IdentityFile ~/.ssh/id_rsa
|
|
58
|
+
|
|
59
|
+
Host gpu-server-02
|
|
60
|
+
HostName 192.168.1.102
|
|
61
|
+
User bob
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
2. **SSH key auth** -- password-less login should be set up
|
|
65
|
+
3. **nvidia-smi** -- must be installed on the remote servers
|
|
66
|
+
|
|
67
|
+
## Features
|
|
68
|
+
|
|
69
|
+
- **Zero config** -- reads `~/.ssh/config` automatically, no setup needed
|
|
70
|
+
- **One command** -- `pip install gnvitop && gnvitop`, that's it
|
|
71
|
+
- **Auto browser** -- opens dashboard in your browser on start
|
|
72
|
+
- **Real-time** -- 30s auto-refresh with manual refresh button
|
|
73
|
+
- **Concurrent** -- queries all servers in parallel (10 workers)
|
|
74
|
+
- **Cached** -- 30s cache to avoid hammering your servers
|
|
75
|
+
- **Dark UI** -- clean, responsive dark-themed dashboard
|
|
76
|
+
- **At a glance** -- summary bar shows online hosts, total GPUs, idle GPUs, free memory
|
|
77
|
+
- **Color coded** -- green (online), yellow (no GPU), red (offline)
|
|
78
|
+
- **GPU details** -- utilization bars, memory bars, temperature with color alerts
|
|
79
|
+
|
|
80
|
+
## Comparison with nvitop
|
|
81
|
+
|
|
82
|
+
| Feature | nvitop | gnvitop |
|
|
83
|
+
|---------|--------|---------|
|
|
84
|
+
| Monitor local GPU | Yes | No |
|
|
85
|
+
| Monitor remote GPUs | No | Yes |
|
|
86
|
+
| Multiple servers | No | Yes |
|
|
87
|
+
| Interface | Terminal | Web browser |
|
|
88
|
+
| Setup | Run on each server | Run once, reads SSH config |
|
|
89
|
+
|
|
90
|
+
**gnvitop** is not a replacement for nvitop -- it's a complement. Use nvitop for detailed local GPU monitoring, use gnvitop to get an overview of all your GPU servers from one place.
|
|
91
|
+
|
|
92
|
+
## License
|
|
93
|
+
|
|
94
|
+
MIT
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""CLI entry point for gnvitop."""
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import os
|
|
6
|
+
import webbrowser
|
|
7
|
+
import threading
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def main():
    """Parse command-line options and launch the gnvitop dashboard server.

    Recognised options are ``-p/--port``, ``--host``, ``--no-browser``,
    ``--ssh-config`` and ``-v/--version``.  With ``--version`` the version
    string is printed and the function returns without starting anything.
    Otherwise the Flask ``app`` from ``.server`` is run in the foreground and,
    unless ``--no-browser`` was given, a browser tab is opened roughly one
    second after start-up.
    """
    parser = argparse.ArgumentParser(
        prog="gnvitop",
        description="Global nvitop: web-based GPU monitoring dashboard for remote servers via SSH.",
    )
    parser.add_argument(
        "-p", "--port",
        type=int,
        default=5050,
        help="Port to run the server on (default: 5050)",
    )
    parser.add_argument(
        "--host",
        default="127.0.0.1",
        help="Host to bind to (default: 127.0.0.1)",
    )
    parser.add_argument(
        "--no-browser",
        action="store_true",
        help="Do not open browser automatically",
    )
    parser.add_argument(
        "--ssh-config",
        default=None,
        help="Path to SSH config file (default: ~/.ssh/config)",
    )
    parser.add_argument(
        "-v", "--version",
        action="store_true",
        help="Show version and exit",
    )

    args = parser.parse_args()

    from . import __version__

    if args.version:
        print(f"gnvitop {__version__}")
        return

    # Expand "~" in a user-supplied path as well: the shell does not expand it
    # in forms such as ``--ssh-config=~/cfg`` or when the argument is quoted.
    ssh_config = os.path.expanduser(args.ssh_config or "~/.ssh/config")
    if not os.path.exists(ssh_config):
        print(f"Warning: SSH config not found at {ssh_config}")
        print("gnvitop will start but no hosts will be queried.")
        print("Create ~/.ssh/config or use --ssh-config to specify a path.\n")

    # Override the server module's default path only when one was requested.
    if args.ssh_config:
        from . import server
        server.SSH_CONFIG_PATH = ssh_config

    # Imported after the --version check so that printing the version does not
    # require loading the server module (which imports flask and paramiko).
    from .server import app

    url = f"http://{args.host}:{args.port}"
    print(f"gnvitop v{__version__} starting on {url}")
    print(f"Reading SSH config from: {ssh_config}")
    print("Press Ctrl+C to stop.\n")

    if not args.no_browser:
        # A wildcard bind address is not a navigable destination on every
        # platform, so send the browser to the local machine instead.
        browse_host = "localhost" if args.host in ("0.0.0.0", "::", "") else args.host
        if ":" in browse_host:
            # Literal IPv6 addresses must be bracketed inside a URL.
            browse_host = f"[{browse_host}]"
        browser_url = f"http://{browse_host}:{args.port}"

        opener = threading.Timer(1.0, webbrowser.open, args=(browser_url,))
        # Daemon thread: if the server fails to start (e.g. port in use) the
        # process exits immediately instead of still popping up a browser tab.
        opener.daemon = True
        opener.start()

    app.run(host=args.host, port=args.port, debug=False)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
if __name__ == "__main__":
|
|
76
|
+
main()
|
|
@@ -0,0 +1,477 @@
|
|
|
1
|
+
"""Embedded dashboard HTML."""
|
|
2
|
+
|
|
3
|
+
DASHBOARD_HTML = r"""<!DOCTYPE html>
|
|
4
|
+
<html lang="zh-CN">
|
|
5
|
+
<head>
|
|
6
|
+
<meta charset="UTF-8">
|
|
7
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
8
|
+
<title>GPU Monitor</title>
|
|
9
|
+
<style>
|
|
10
|
+
* { margin: 0; padding: 0; box-sizing: border-box; }
|
|
11
|
+
|
|
12
|
+
body {
|
|
13
|
+
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
|
14
|
+
background: #0f172a;
|
|
15
|
+
color: #e2e8f0;
|
|
16
|
+
min-height: 100vh;
|
|
17
|
+
padding: 24px;
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
.header {
|
|
21
|
+
display: flex;
|
|
22
|
+
justify-content: space-between;
|
|
23
|
+
align-items: center;
|
|
24
|
+
margin-bottom: 28px;
|
|
25
|
+
flex-wrap: wrap;
|
|
26
|
+
gap: 12px;
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
.header h1 {
|
|
30
|
+
font-size: 28px;
|
|
31
|
+
font-weight: 700;
|
|
32
|
+
color: #f1f5f9;
|
|
33
|
+
letter-spacing: -0.5px;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
.header-right {
|
|
37
|
+
display: flex;
|
|
38
|
+
align-items: center;
|
|
39
|
+
gap: 16px;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
.status-text {
|
|
43
|
+
font-size: 13px;
|
|
44
|
+
color: #94a3b8;
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
.btn-refresh {
|
|
48
|
+
padding: 8px 20px;
|
|
49
|
+
border: 1px solid #334155;
|
|
50
|
+
background: #1e293b;
|
|
51
|
+
color: #e2e8f0;
|
|
52
|
+
border-radius: 8px;
|
|
53
|
+
cursor: pointer;
|
|
54
|
+
font-size: 14px;
|
|
55
|
+
transition: all 0.2s;
|
|
56
|
+
}
|
|
57
|
+
.btn-refresh:hover { background: #334155; border-color: #475569; }
|
|
58
|
+
.btn-refresh:disabled { opacity: 0.5; cursor: not-allowed; }
|
|
59
|
+
|
|
60
|
+
.summary-bar {
|
|
61
|
+
display: flex;
|
|
62
|
+
gap: 16px;
|
|
63
|
+
margin-bottom: 24px;
|
|
64
|
+
flex-wrap: wrap;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
.summary-card {
|
|
68
|
+
background: #1e293b;
|
|
69
|
+
border: 1px solid #334155;
|
|
70
|
+
border-radius: 10px;
|
|
71
|
+
padding: 16px 24px;
|
|
72
|
+
min-width: 160px;
|
|
73
|
+
flex: 1;
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
.summary-card .label {
|
|
77
|
+
font-size: 12px;
|
|
78
|
+
color: #94a3b8;
|
|
79
|
+
text-transform: uppercase;
|
|
80
|
+
letter-spacing: 0.5px;
|
|
81
|
+
margin-bottom: 4px;
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
.summary-card .value {
|
|
85
|
+
font-size: 28px;
|
|
86
|
+
font-weight: 700;
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
.host-grid {
|
|
90
|
+
display: grid;
|
|
91
|
+
grid-template-columns: repeat(auto-fill, minmax(420px, 1fr));
|
|
92
|
+
gap: 16px;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
.host-card {
|
|
96
|
+
background: #1e293b;
|
|
97
|
+
border: 1px solid #334155;
|
|
98
|
+
border-radius: 12px;
|
|
99
|
+
overflow: hidden;
|
|
100
|
+
transition: border-color 0.2s;
|
|
101
|
+
}
|
|
102
|
+
.host-card:hover { border-color: #475569; }
|
|
103
|
+
|
|
104
|
+
.host-card.status-ok { border-left: 3px solid #22c55e; }
|
|
105
|
+
.host-card.status-no_gpu { border-left: 3px solid #eab308; }
|
|
106
|
+
.host-card.status-error { border-left: 3px solid #ef4444; }
|
|
107
|
+
|
|
108
|
+
.host-header {
|
|
109
|
+
padding: 16px 20px;
|
|
110
|
+
display: flex;
|
|
111
|
+
justify-content: space-between;
|
|
112
|
+
align-items: center;
|
|
113
|
+
border-bottom: 1px solid #334155;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
.host-name {
|
|
117
|
+
font-size: 16px;
|
|
118
|
+
font-weight: 600;
|
|
119
|
+
color: #f1f5f9;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
.host-info {
|
|
123
|
+
font-size: 12px;
|
|
124
|
+
color: #64748b;
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
.status-badge {
|
|
128
|
+
font-size: 11px;
|
|
129
|
+
padding: 3px 10px;
|
|
130
|
+
border-radius: 12px;
|
|
131
|
+
font-weight: 600;
|
|
132
|
+
text-transform: uppercase;
|
|
133
|
+
letter-spacing: 0.3px;
|
|
134
|
+
}
|
|
135
|
+
.badge-ok { background: #052e16; color: #4ade80; }
|
|
136
|
+
.badge-no_gpu { background: #422006; color: #facc15; }
|
|
137
|
+
.badge-error { background: #450a0a; color: #f87171; }
|
|
138
|
+
|
|
139
|
+
.host-body { padding: 16px 20px; }
|
|
140
|
+
|
|
141
|
+
.error-msg {
|
|
142
|
+
color: #f87171;
|
|
143
|
+
font-size: 13px;
|
|
144
|
+
padding: 8px 0;
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
.no-gpu-msg {
|
|
148
|
+
color: #facc15;
|
|
149
|
+
font-size: 13px;
|
|
150
|
+
padding: 8px 0;
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
.gpu-item {
|
|
154
|
+
padding: 12px 0;
|
|
155
|
+
}
|
|
156
|
+
.gpu-item + .gpu-item { border-top: 1px solid #1e293b; }
|
|
157
|
+
|
|
158
|
+
.gpu-title {
|
|
159
|
+
display: flex;
|
|
160
|
+
justify-content: space-between;
|
|
161
|
+
align-items: center;
|
|
162
|
+
margin-bottom: 10px;
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
.gpu-name {
|
|
166
|
+
font-size: 14px;
|
|
167
|
+
font-weight: 600;
|
|
168
|
+
color: #cbd5e1;
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
.gpu-temp {
|
|
172
|
+
font-size: 12px;
|
|
173
|
+
padding: 2px 8px;
|
|
174
|
+
border-radius: 6px;
|
|
175
|
+
font-weight: 600;
|
|
176
|
+
}
|
|
177
|
+
.temp-cool { background: #052e16; color: #4ade80; }
|
|
178
|
+
.temp-warm { background: #422006; color: #facc15; }
|
|
179
|
+
.temp-hot { background: #450a0a; color: #f87171; }
|
|
180
|
+
|
|
181
|
+
.bar-container {
|
|
182
|
+
margin-bottom: 8px;
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
.bar-label {
|
|
186
|
+
display: flex;
|
|
187
|
+
justify-content: space-between;
|
|
188
|
+
font-size: 12px;
|
|
189
|
+
color: #94a3b8;
|
|
190
|
+
margin-bottom: 4px;
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
.bar-track {
|
|
194
|
+
height: 8px;
|
|
195
|
+
background: #0f172a;
|
|
196
|
+
border-radius: 4px;
|
|
197
|
+
overflow: hidden;
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
.bar-fill {
|
|
201
|
+
height: 100%;
|
|
202
|
+
border-radius: 4px;
|
|
203
|
+
transition: width 0.5s ease;
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
.bar-fill.usage-low { background: linear-gradient(90deg, #22c55e, #4ade80); }
|
|
207
|
+
.bar-fill.usage-mid { background: linear-gradient(90deg, #eab308, #facc15); }
|
|
208
|
+
.bar-fill.usage-high { background: linear-gradient(90deg, #ef4444, #f87171); }
|
|
209
|
+
|
|
210
|
+
.gpu-stats {
|
|
211
|
+
display: grid;
|
|
212
|
+
grid-template-columns: repeat(3, 1fr);
|
|
213
|
+
gap: 8px;
|
|
214
|
+
margin-top: 8px;
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
.stat {
|
|
218
|
+
text-align: center;
|
|
219
|
+
background: #0f172a;
|
|
220
|
+
border-radius: 6px;
|
|
221
|
+
padding: 8px;
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
.stat .stat-value {
|
|
225
|
+
font-size: 16px;
|
|
226
|
+
font-weight: 700;
|
|
227
|
+
color: #f1f5f9;
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
.stat .stat-label {
|
|
231
|
+
font-size: 10px;
|
|
232
|
+
color: #64748b;
|
|
233
|
+
margin-top: 2px;
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
.loading {
|
|
237
|
+
text-align: center;
|
|
238
|
+
padding: 80px 20px;
|
|
239
|
+
color: #94a3b8;
|
|
240
|
+
font-size: 16px;
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
.spinner {
|
|
244
|
+
display: inline-block;
|
|
245
|
+
width: 28px;
|
|
246
|
+
height: 28px;
|
|
247
|
+
border: 3px solid #334155;
|
|
248
|
+
border-top-color: #60a5fa;
|
|
249
|
+
border-radius: 50%;
|
|
250
|
+
animation: spin 0.8s linear infinite;
|
|
251
|
+
margin-bottom: 12px;
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
@keyframes spin { to { transform: rotate(360deg); } }
|
|
255
|
+
|
|
256
|
+
.auto-refresh-toggle {
|
|
257
|
+
display: flex;
|
|
258
|
+
align-items: center;
|
|
259
|
+
gap: 6px;
|
|
260
|
+
font-size: 13px;
|
|
261
|
+
color: #94a3b8;
|
|
262
|
+
cursor: pointer;
|
|
263
|
+
user-select: none;
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
.auto-refresh-toggle input { cursor: pointer; }
|
|
267
|
+
</style>
|
|
268
|
+
</head>
|
|
269
|
+
<body>
|
|
270
|
+
|
|
271
|
+
<div class="header">
|
|
272
|
+
<h1>GPU Monitor</h1>
|
|
273
|
+
<div class="header-right">
|
|
274
|
+
<span class="status-text" id="update-time"></span>
|
|
275
|
+
<label class="auto-refresh-toggle">
|
|
276
|
+
<input type="checkbox" id="auto-refresh" checked>
|
|
277
|
+
Auto (30s)
|
|
278
|
+
</label>
|
|
279
|
+
<button class="btn-refresh" id="btn-refresh" onclick="refresh()">Refresh</button>
|
|
280
|
+
</div>
|
|
281
|
+
</div>
|
|
282
|
+
|
|
283
|
+
<div class="summary-bar" id="summary-bar"></div>
|
|
284
|
+
<div id="content">
|
|
285
|
+
<div class="loading"><div class="spinner"></div><br>Connecting to hosts...</div>
|
|
286
|
+
</div>
|
|
287
|
+
|
|
288
|
+
<script>
|
|
289
|
+
let autoRefreshTimer = null;
|
|
290
|
+
|
|
291
|
+
function usageClass(pct) {
  // Map a usage percentage to the CSS class that colours a bar:
  // below 50 -> low, below 80 -> mid, anything else -> high.
  return pct < 50 ? 'usage-low'
    : pct < 80 ? 'usage-mid'
    : 'usage-high';
}
|
|
296
|
+
|
|
297
|
+
function tempClass(t) {
  // Map a temperature reading to its badge class:
  // below 50 -> cool, below 75 -> warm, anything else -> hot.
  return t < 50 ? 'temp-cool'
    : t < 75 ? 'temp-warm'
    : 'temp-hot';
}
|
|
302
|
+
|
|
303
|
+
function formatMB(mb) {
  // Format a size given in MB: values of 1024 and above are shown in GB with
  // one decimal place, smaller values as whole MB.
  return mb >= 1024
    ? `${(mb / 1024).toFixed(1)} GB`
    : `${mb.toFixed(0)} MB`;
}
|
|
307
|
+
|
|
308
|
+
function renderSummary(hosts) {
  // Fill the #summary-bar element with four cards: online hosts out of all
  // hosts, total GPU count, GPUs under 10% utilization, and summed free memory.
  // Only hosts whose status is 'ok' contribute to the GPU figures.
  const online = hosts.filter(h => h.status === 'ok');
  const totalGPUs = online.reduce((s, h) => s + h.gpus.length, 0);
  const totalFree = online.reduce((s, h) => s + h.gpus.reduce((gs, g) => gs + g.memory_free_mb, 0), 0);
  const idleGPUs = online.reduce((s, h) => s + h.gpus.filter(g => g.gpu_utilization_pct < 10).length, 0);

  // All interpolated values are numbers computed above (or formatMB output),
  // so no escaping is needed; the literal "<" in the label is written as
  // &lt; because this string is parsed as HTML.
  document.getElementById('summary-bar').innerHTML = `
<div class="summary-card">
<div class="label">Online Hosts</div>
<div class="value" style="color:#4ade80">${online.length}<span style="color:#64748b;font-size:16px"> / ${hosts.length}</span></div>
</div>
<div class="summary-card">
<div class="label">Total GPUs</div>
<div class="value" style="color:#60a5fa">${totalGPUs}</div>
</div>
<div class="summary-card">
<div class="label">Idle GPUs (&lt; 10%)</div>
<div class="value" style="color:#4ade80">${idleGPUs}</div>
</div>
<div class="summary-card">
<div class="label">Total Free Memory</div>
<div class="value" style="color:#a78bfa">${formatMB(totalFree)}</div>
</div>
`;
}
|
|
333
|
+
|
|
334
|
+
function renderGPU(gpu) {
  // Build the HTML fragment for one GPU: a name/temperature header, a
  // utilization bar, a memory bar and a three-cell stats row.
  //
  // The returned string is later assigned to innerHTML, and gpu.name is text
  // taken from the remote host's nvidia-smi output, so it is HTML-escaped here.
  const esc = (v) => String(v).replace(/[&<>"']/g, (c) => ({
    '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;',
  }[c]));

  const memPct = gpu.memory_usage_pct;
  const gpuPct = gpu.gpu_utilization_pct;
  return `
<div class="gpu-item">
<div class="gpu-title">
<span class="gpu-name">GPU ${esc(gpu.index)}: ${esc(gpu.name)}</span>
<span class="gpu-temp ${tempClass(gpu.temperature_c)}">${esc(gpu.temperature_c)}°C</span>
</div>
<div class="bar-container">
<div class="bar-label">
<span>GPU Utilization</span>
<span>${esc(gpuPct)}%</span>
</div>
<div class="bar-track">
<div class="bar-fill ${usageClass(gpuPct)}" style="width:${esc(gpuPct)}%"></div>
</div>
</div>
<div class="bar-container">
<div class="bar-label">
<span>Memory</span>
<span>${formatMB(gpu.memory_used_mb)} / ${formatMB(gpu.memory_total_mb)}</span>
</div>
<div class="bar-track">
<div class="bar-fill ${usageClass(memPct)}" style="width:${esc(memPct)}%"></div>
</div>
</div>
<div class="gpu-stats">
<div class="stat">
<div class="stat-value" style="color:${gpuPct < 10 ? '#4ade80' : gpuPct < 50 ? '#facc15' : '#f87171'}">${esc(gpuPct)}%</div>
<div class="stat-label">Utilization</div>
</div>
<div class="stat">
<div class="stat-value">${formatMB(gpu.memory_free_mb)}</div>
<div class="stat-label">Free Memory</div>
</div>
<div class="stat">
<div class="stat-value">${esc(gpu.temperature_c)}°C</div>
<div class="stat-label">Temperature</div>
</div>
</div>
</div>
`;
}
|
|
378
|
+
|
|
379
|
+
function renderHosts(hosts) {
  // Render one card per host into #content: alias, user@hostname:port, a
  // status badge, and either the host's GPU list or a message.
  //
  // Host fields (alias, user, hostname, port, error) are free-form text from
  // the API response and are inserted via innerHTML, so each one is
  // HTML-escaped to keep markup in those values from being interpreted.
  const esc = (v) => String(v).replace(/[&<>"']/g, (c) => ({
    '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;',
  }[c]));

  const container = document.getElementById('content');
  if (!hosts.length) {
    container.innerHTML = '<div class="loading">No hosts found in SSH config.</div>';
    return;
  }

  const cards = hosts.map(host => {
    // Anything other than the two known non-error states is shown as an
    // error, which keeps the card class in step with the badge and avoids
    // placing an arbitrary status string inside a class attribute.
    const status = (host.status === 'ok' || host.status === 'no_gpu') ? host.status : 'error';

    let body;
    if (status === 'ok') {
      body = host.gpus.map(renderGPU).join('');
    } else if (status === 'no_gpu') {
      body = `<div class="no-gpu-msg">${esc(host.error || 'No NVIDIA GPU detected')}</div>`;
    } else {
      body = `<div class="error-msg">${esc(host.error || 'Unknown error')}</div>`;
    }

    const badgeClass = status === 'ok' ? 'badge-ok' : status === 'no_gpu' ? 'badge-no_gpu' : 'badge-error';
    const badgeText = status === 'ok' ? 'Online' : status === 'no_gpu' ? 'No GPU' : 'Offline';

    return `
<div class="host-card status-${status}">
<div class="host-header">
<div>
<div class="host-name">${esc(host.alias)}</div>
<div class="host-info">${esc(host.user)}@${esc(host.hostname)}:${esc(host.port)}</div>
</div>
<span class="status-badge ${badgeClass}">${badgeText}</span>
</div>
<div class="host-body">${body}</div>
</div>
`;
  });

  container.innerHTML = '<div class="host-grid">' + cards.join('') + '</div>';
}
|
|
413
|
+
|
|
414
|
+
async function fetchData(force) {
  // Fetch the host/GPU payload as parsed JSON.
  // force truthy -> GET /api/refresh, otherwise GET /api/gpus
  // (presumably /api/refresh bypasses the server-side cache; confirm in server.py).
  // Rejects on network failure, on a non-2xx response, or on invalid JSON.
  const url = force ? '/api/refresh' : '/api/gpus';
  const resp = await fetch(url);
  if (!resp.ok) {
    // Without this check an HTTP error page would be parsed as if it were data.
    throw new Error(`Request to ${url} failed: HTTP ${resp.status}`);
  }
  return await resp.json();
}
|
|
419
|
+
|
|
420
|
+
async function refresh() {
  // Handler for the "Refresh" button: disables the button while a forced
  // fetch (fetchData(true)) is in flight, re-renders the page from the
  // result, and always restores the button afterwards.
  const btn = document.getElementById('btn-refresh');
  btn.disabled = true;
  btn.textContent = 'Refreshing...';
  try {
    const data = await fetchData(true);
    renderSummary(data.hosts);
    renderHosts(data.hosts);
    updateTime(data.updated_at);
  } catch (e) {
    console.error(e);
    // Tell the user as well; previously a failure was visible only in the console.
    document.getElementById('update-time').textContent = 'Refresh failed';
  } finally {
    btn.disabled = false;
    btn.textContent = 'Refresh';
  }
}
|
|
436
|
+
|
|
437
|
+
function updateTime(ts) {
  // Show "Updated: <local time>" in the #update-time element.
  // ts is scaled by 1000 before being handed to Date, i.e. it is treated as
  // seconds since the epoch.
  const stamp = new Date(ts * 1000);
  const label = document.getElementById('update-time');
  label.textContent = 'Updated: ' + stamp.toLocaleTimeString();
}
|
|
441
|
+
|
|
442
|
+
async function init() {
  // Initial page load: fetch without forcing a refresh and render the result.
  // Any failure (fetch or rendering) replaces the content area with an error
  // message.
  try {
    const payload = await fetchData(false);
    const hostList = payload.hosts;
    renderSummary(hostList);
    renderHosts(hostList);
    updateTime(payload.updated_at);
  } catch (err) {
    const content = document.getElementById('content');
    content.innerHTML = '<div class="loading" style="color:#f87171">Failed to connect to server.</div>';
  }
}
|
|
453
|
+
|
|
454
|
+
function setupAutoRefresh() {
  // Start a 30-second polling timer and wire the #auto-refresh checkbox so
  // that unticking it stops the timer and ticking it starts a new one.
  // The active timer id is kept in the outer autoRefreshTimer variable.
  const toggle = document.getElementById('auto-refresh');

  const poll = async () => {
    try {
      const data = await fetchData(false);
      renderSummary(data.hosts);
      renderHosts(data.hosts);
      updateTime(data.updated_at);
    } catch (_) {
      // Best effort: a failed background poll leaves the last render in place.
    }
  };

  const startTimer = () => {
    autoRefreshTimer = setInterval(poll, 30000);
  };

  toggle.addEventListener('change', () => {
    if (!toggle.checked) {
      clearInterval(autoRefreshTimer);
      return;
    }
    startTimer();
  });

  startTimer();
}
|
|
472
|
+
|
|
473
|
+
init();
|
|
474
|
+
setupAutoRefresh();
|
|
475
|
+
</script>
|
|
476
|
+
</body>
|
|
477
|
+
</html>"""
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""GPU Monitor - Flask server that reads SSH config and queries remote GPUs."""
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
import time
|
|
7
|
+
import threading
|
|
8
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
9
|
+
|
|
10
|
+
from flask import Flask, jsonify, Response
|
|
11
|
+
import paramiko
|
|
12
|
+
|
|
13
|
+
from . import __version__
|
|
14
|
+
from .dashboard import DASHBOARD_HTML
|
|
15
|
+
|
|
16
|
+
app = Flask(__name__)
|
|
17
|
+
|
|
18
|
+
SSH_CONFIG_PATH = os.path.expanduser("~/.ssh/config")
|
|
19
|
+
SSH_TIMEOUT = 8
|
|
20
|
+
GPU_QUERY_CMD = (
|
|
21
|
+
"nvidia-smi --query-gpu=index,name,memory.total,memory.used,memory.free,"
|
|
22
|
+
"utilization.gpu,temperature.gpu --format=csv,noheader,nounits 2>/dev/null"
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
cache = {"data": [], "last_update": 0}
|
|
26
|
+
cache_lock = threading.Lock()
|
|
27
|
+
CACHE_TTL = 30
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def parse_ssh_config(path):
    """Parse an OpenSSH client config file and return its concrete hosts.

    Args:
        path: Filesystem path of the config file (e.g. ``~/.ssh/config``
            already expanded).

    Returns:
        A list of dicts, one per concrete host alias, in file order.  Each
        dict has the keys ``alias``, ``hostname``, ``user``, ``port`` and
        ``identity_file``; ``port`` defaults to 22 and the others to ``None``
        when the corresponding option is absent.  An empty list is returned
        when the file is missing or cannot be opened.

    Notes:
        * A ``Host`` line may list several patterns; every pattern without
          wildcards (``*``, ``?``) or negation (``!``) gets its own entry, and
          the options that follow apply to all of them.
        * Both ``Key Value`` and ``Key=Value`` spellings are accepted.
        * Options following a ``Match`` line are ignored rather than being
          attributed to the preceding ``Host``.
        * A ``Port`` value that is not an integer is ignored (22 is kept).
    """
    hosts = []
    # Entries created by the most recent ``Host`` line; empty while inside a
    # wildcard-only ``Host`` block, a ``Match`` block, or before any ``Host``.
    current = []

    try:
        f = open(path, "r", encoding="utf-8", errors="replace")
    except OSError:
        return hosts

    with f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            # Keyword and argument are separated by whitespace or by "=".
            key_match = re.match(r"^(\w+)(?:\s*=\s*|\s+)(.+)$", line)
            if not key_match:
                continue

            key = key_match.group(1).lower()
            value = key_match.group(2).strip()

            if key == "host":
                current = []
                for pattern in value.split():
                    pattern = pattern.strip('"')
                    if (
                        not pattern
                        or pattern.startswith("!")
                        or "*" in pattern
                        or "?" in pattern
                    ):
                        continue
                    entry = {
                        "alias": pattern,
                        "hostname": None,
                        "user": None,
                        "port": 22,
                        "identity_file": None,
                    }
                    current.append(entry)
                    hosts.append(entry)
                continue

            if key == "match":
                current = []
                continue

            if not current:
                continue

            # Arguments may be wrapped in double quotes (e.g. paths with spaces).
            if len(value) >= 2 and value[0] == '"' and value[-1] == '"':
                value = value[1:-1]

            if key == "hostname":
                for entry in current:
                    entry["hostname"] = value
            elif key == "user":
                for entry in current:
                    entry["user"] = value
            elif key == "port":
                try:
                    port = int(value)
                except ValueError:
                    # Malformed port: keep the default instead of aborting the
                    # whole parse.
                    continue
                for entry in current:
                    entry["port"] = port
            elif key == "identityfile":
                identity = os.path.expanduser(value)
                for entry in current:
                    entry["identity_file"] = identity

    return hosts
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def query_gpu(host_info):
    """SSH into one host, run ``nvidia-smi`` and return its GPU status.

    Args:
        host_info: Host entry as produced by ``parse_ssh_config`` (keys
            ``alias``, ``hostname``, ``user``, ``port`` and optionally
            ``identity_file``).

    Returns:
        A dict with ``alias``, ``hostname``, ``user``, ``port``, ``status``
        (``"ok"``, ``"no_gpu"`` or ``"error"``), ``error`` (message or
        ``None``) and ``gpus`` (list of per-GPU dicts).  Failures are
        reported through ``status``/``error``; exceptions derived from
        ``Exception`` are not propagated.
    """
    # Local import: before Python 3.10 socket.timeout is not a TimeoutError.
    import socket

    alias = host_info["alias"]
    hostname = host_info["hostname"] or alias
    user = host_info["user"]
    port = host_info["port"]

    result = {
        "alias": alias,
        "hostname": hostname,
        "user": user or "unknown",
        "port": port,
        "status": "error",
        "error": None,
        "gpus": [],
    }

    client = None
    try:
        client = paramiko.SSHClient()
        # NOTE(security): AutoAddPolicy accepts unknown host keys without
        # verification, which leaves the connection open to man-in-the-middle
        # attacks.  Kept as-is to preserve behavior; consider loading the
        # system known_hosts and rejecting unknown keys.
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

        connect_kwargs = {
            "hostname": hostname,
            "port": port,
            "username": user,
            "timeout": SSH_TIMEOUT,
            "banner_timeout": SSH_TIMEOUT,
            "auth_timeout": SSH_TIMEOUT,
            "allow_agent": True,
            "look_for_keys": True,
        }
        if host_info.get("identity_file"):
            connect_kwargs["key_filename"] = host_info["identity_file"]

        client.connect(**connect_kwargs)

        _stdin, stdout, _stderr = client.exec_command(
            GPU_QUERY_CMD, timeout=SSH_TIMEOUT
        )
        # errors="replace": a stray non-UTF-8 byte must not fail the host.
        output = stdout.read().decode("utf-8", errors="replace").strip()

        if not output:
            result["status"] = "no_gpu"
            result["error"] = "No NVIDIA GPU found or nvidia-smi not available"
        else:
            # A non-numeric field raises ValueError here and is reported by
            # the generic handler below.
            gpus = _parse_gpu_output(output)
            result["gpus"] = gpus
            if gpus:
                result["status"] = "ok"
            else:
                result["status"] = "no_gpu"
                result["error"] = "nvidia-smi returned no valid GPU data"

    except paramiko.AuthenticationException:
        result["error"] = "Authentication failed"
    except paramiko.SSHException as e:
        result["error"] = f"SSH error: {e}"
    except (TimeoutError, socket.timeout):
        result["error"] = "Connection timed out"
    except OSError as e:
        result["error"] = f"Connection failed: {e}"
    except Exception as e:
        result["error"] = f"{type(e).__name__}: {e}"
    finally:
        # Close on every path so failed queries do not leak SSH connections.
        if client is not None:
            client.close()

    return result


def _parse_gpu_output(output):
    """Turn ``nvidia-smi`` CSV rows into per-GPU dicts, skipping short rows."""
    gpus = []
    for line in output.splitlines():
        parts = [p.strip() for p in line.split(",")]
        if len(parts) < 7:
            continue
        mem_total = float(parts[2])
        mem_used = float(parts[3])
        mem_free = float(parts[4])
        utilization = float(parts[5])
        gpus.append({
            "index": int(parts[0]),
            "name": parts[1],
            "memory_total_mb": mem_total,
            "memory_used_mb": mem_used,
            "memory_free_mb": mem_free,
            # Guard against a reported total of 0 to avoid dividing by zero.
            "memory_usage_pct": (
                round(mem_used / mem_total * 100, 1) if mem_total > 0 else 0
            ),
            "gpu_utilization_pct": utilization,
            "temperature_c": float(parts[6]),
        })
    return gpus
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def fetch_all_gpu_info():
    """Query every configured host in parallel and return the ordered results.

    Hosts come from ``parse_ssh_config(SSH_CONFIG_PATH)`` and each one is
    handed to ``query_gpu`` on a pool of up to 10 worker threads.

    Returns:
        The list of ``query_gpu`` result dicts, ordered by status
        (``ok``, then ``no_gpu``, then ``error``, then anything else) and by
        alias within a status.  Empty when no hosts are configured.
    """
    host_entries = parse_ssh_config(SSH_CONFIG_PATH)
    if not host_entries:
        return []

    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = [pool.submit(query_gpu, entry) for entry in host_entries]
        # Collect in completion order; the sort below fixes the final order.
        gathered = [done.result() for done in as_completed(pending)]

    status_rank = {"ok": 0, "no_gpu": 1, "error": 2}

    def sort_key(item):
        return status_rank.get(item["status"], 3), item["alias"]

    gathered.sort(key=sort_key)
    return gathered
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
@app.route("/")
def index():
    """Serve the dashboard page (``DASHBOARD_HTML``) as ``text/html``."""
    page = Response(DASHBOARD_HTML, mimetype="text/html")
    return page
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
@app.route("/api/gpus")
def api_gpus():
    """Return the GPU data of all hosts as JSON, refreshing a stale cache.

    The hosts are re-queried when the cached data is older than
    ``CACHE_TTL`` seconds, measured from the moment this request arrived;
    that arrival time is also what gets stored as the new update stamp.

    Returns:
        A JSON response with ``hosts`` (cached results) and ``updated_at``.
    """
    requested_at = time.time()
    with cache_lock:
        age = requested_at - cache["last_update"]
        if age > CACHE_TTL:
            cache["data"] = fetch_all_gpu_info()
            cache["last_update"] = requested_at
        payload = {"hosts": cache["data"], "updated_at": cache["last_update"]}
    return jsonify(payload)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
@app.route("/api/refresh")
def api_refresh():
    """Re-query all hosts unconditionally, update the cache and return it.

    Returns:
        A JSON response with ``hosts`` (fresh results) and ``updated_at``
        (the time recorded after the query finished).
    """
    with cache_lock:
        fresh = fetch_all_gpu_info()
        cache["data"] = fresh
        cache["last_update"] = time.time()
        payload = {"hosts": cache["data"], "updated_at": cache["last_update"]}
    return jsonify(payload)
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gnvitop
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Global nvitop: web-based GPU monitoring dashboard for all your remote servers via SSH
|
|
5
|
+
Author: Linwei94
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Linwei94/gnvitop
|
|
8
|
+
Project-URL: Repository, https://github.com/Linwei94/gnvitop
|
|
9
|
+
Project-URL: Issues, https://github.com/Linwei94/gnvitop/issues
|
|
10
|
+
Keywords: gpu,monitor,nvidia,ssh,nvitop,dashboard,nvidia-smi
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Environment :: Web Environment
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: Intended Audience :: System Administrators
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Topic :: System :: Monitoring
|
|
19
|
+
Requires-Python: >=3.7
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: flask>=2.0
|
|
23
|
+
Requires-Dist: paramiko>=2.0
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
|
|
26
|
+
# gnvitop
|
|
27
|
+
|
|
28
|
+
**Global nvitop** -- a web-based GPU monitoring dashboard that monitors **all** your remote GPU servers from a single page.
|
|
29
|
+
|
|
30
|
+
Like [nvitop](https://github.com/XuehaiPan/nvitop), but for **all your servers at once**, displayed as a beautiful web dashboard.
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
pip install gnvitop
|
|
34
|
+
gnvitop
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## How It Works
|
|
38
|
+
|
|
39
|
+
1. Reads your `~/.ssh/config` automatically
|
|
40
|
+
2. Connects to each server over SSH and runs `nvidia-smi`
|
|
41
|
+
3. Displays everything in a real-time web dashboard
|
|
42
|
+
4. Auto-refreshes every 30 seconds
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
┌──> Server A (nvidia-smi) ──> 4x A100
|
|
46
|
+
gnvitop ──> Browser ──> ├──> Server B (nvidia-smi) ──> 8x V100
|
|
47
|
+
├──> Server C (nvidia-smi) ──> 2x RTX 4090
|
|
48
|
+
└──> Server D ──> offline
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install gnvitop
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Usage
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
gnvitop # start and auto-open browser
|
|
61
|
+
gnvitop -p 8080 # custom port
|
|
62
|
+
gnvitop --host 0.0.0.0 # expose to LAN
|
|
63
|
+
gnvitop --no-browser # don't auto-open browser
|
|
64
|
+
gnvitop --ssh-config /path/to/config # custom SSH config
|
|
65
|
+
gnvitop -v # show version
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Or run as a module:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
python -m gnvitop
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Prerequisites
|
|
75
|
+
|
|
76
|
+
1. **SSH config** -- your `~/.ssh/config` should have server entries:
|
|
77
|
+
|
|
78
|
+
```
|
|
79
|
+
Host gpu-server-01
|
|
80
|
+
HostName 192.168.1.101
|
|
81
|
+
User alice
|
|
82
|
+
IdentityFile ~/.ssh/id_rsa
|
|
83
|
+
|
|
84
|
+
Host gpu-server-02
|
|
85
|
+
HostName 192.168.1.102
|
|
86
|
+
User bob
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
2. **SSH key auth** -- password-less login should be set up
|
|
90
|
+
3. **nvidia-smi** -- must be installed on the remote servers
|
|
91
|
+
|
|
92
|
+
## Features
|
|
93
|
+
|
|
94
|
+
- **Zero config** -- reads `~/.ssh/config` automatically, no setup needed
|
|
95
|
+
- **One command** -- `pip install gnvitop && gnvitop`, that's it
|
|
96
|
+
- **Auto browser** -- opens dashboard in your browser on start
|
|
97
|
+
- **Real-time** -- 30s auto-refresh with manual refresh button
|
|
98
|
+
- **Concurrent** -- queries all servers in parallel (10 workers)
|
|
99
|
+
- **Cached** -- 30s cache to avoid hammering your servers
|
|
100
|
+
- **Dark UI** -- clean, responsive dark-themed dashboard
|
|
101
|
+
- **At a glance** -- summary bar shows online hosts, total GPUs, idle GPUs, free memory
|
|
102
|
+
- **Color coded** -- green (online), yellow (no GPU), red (offline)
|
|
103
|
+
- **GPU details** -- utilization bars, memory bars, temperature with color alerts
|
|
104
|
+
|
|
105
|
+
## Comparison with nvitop
|
|
106
|
+
|
|
107
|
+
| Feature | nvitop | gnvitop |
|
|
108
|
+
|---------|--------|---------|
|
|
109
|
+
| Monitor local GPU | Yes | No |
|
|
110
|
+
| Monitor remote GPUs | No | Yes |
|
|
111
|
+
| Multiple servers | No | Yes |
|
|
112
|
+
| Interface | Terminal | Web browser |
|
|
113
|
+
| Setup | Run on each server | Run once, reads SSH config |
|
|
114
|
+
|
|
115
|
+
**gnvitop** is not a replacement for nvitop -- it's a complement. Use nvitop for detailed local GPU monitoring, and use gnvitop to get an overview of all your GPU servers from one place.
|
|
116
|
+
|
|
117
|
+
## License
|
|
118
|
+
|
|
119
|
+
MIT
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
gnvitop/__init__.py
|
|
5
|
+
gnvitop/__main__.py
|
|
6
|
+
gnvitop/cli.py
|
|
7
|
+
gnvitop/dashboard.py
|
|
8
|
+
gnvitop/server.py
|
|
9
|
+
gnvitop.egg-info/PKG-INFO
|
|
10
|
+
gnvitop.egg-info/SOURCES.txt
|
|
11
|
+
gnvitop.egg-info/dependency_links.txt
|
|
12
|
+
gnvitop.egg-info/entry_points.txt
|
|
13
|
+
gnvitop.egg-info/requires.txt
|
|
14
|
+
gnvitop.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
gnvitop
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "gnvitop"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Global nvitop: web-based GPU monitoring dashboard for all your remote servers via SSH"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.7"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Linwei94"},
|
|
14
|
+
]
|
|
15
|
+
keywords = ["gpu", "monitor", "nvidia", "ssh", "nvitop", "dashboard", "nvidia-smi"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Environment :: Console",
|
|
19
|
+
"Environment :: Web Environment",
|
|
20
|
+
"Intended Audience :: Developers",
|
|
21
|
+
"Intended Audience :: Science/Research",
|
|
22
|
+
"Intended Audience :: System Administrators",
|
|
23
|
+
"Programming Language :: Python :: 3",
|
|
24
|
+
"Topic :: System :: Monitoring",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"flask>=2.0",
|
|
28
|
+
"paramiko>=2.0",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.scripts]
|
|
32
|
+
gnvitop = "gnvitop.cli:main"
|
|
33
|
+
|
|
34
|
+
[project.urls]
|
|
35
|
+
Homepage = "https://github.com/Linwei94/gnvitop"
|
|
36
|
+
Repository = "https://github.com/Linwei94/gnvitop"
|
|
37
|
+
Issues = "https://github.com/Linwei94/gnvitop/issues"
|
gnvitop-0.1.0/setup.cfg
ADDED