scopos 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scopos/__init__.py +10 -0
- scopos/app.py +185 -0
- scopos/cli.py +50 -0
- scopos/monitor.py +393 -0
- scopos/widgets.py +308 -0
- scopos-2.0.0.dist-info/METADATA +106 -0
- scopos-2.0.0.dist-info/RECORD +11 -0
- scopos-2.0.0.dist-info/WHEEL +5 -0
- scopos-2.0.0.dist-info/entry_points.txt +2 -0
- scopos-2.0.0.dist-info/licenses/LICENSE +21 -0
- scopos-2.0.0.dist-info/top_level.txt +1 -0
scopos/__init__.py
ADDED
scopos/app.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""The Scopos Textual application."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
import time
|
|
6
|
+
from rich.text import Text
|
|
7
|
+
from textual.app import (App, ComposeResult)
|
|
8
|
+
from textual.containers import (Container, Horizontal, VerticalScroll)
|
|
9
|
+
from textual.widgets import (Footer, Static)
|
|
10
|
+
from typing import (Dict, List)
|
|
11
|
+
|
|
12
|
+
from . import __version__
|
|
13
|
+
from .monitor import GPUInfo, Monitor
|
|
14
|
+
from .widgets import GpuCard, Logo, SysMeter
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Clock(Static):
    """Live date-and-time readout for the top bar, redrawn once per second."""

    def on_mount(self):
        # Paint once immediately so the widget is never blank, then tick.
        self.update_clock()
        self.set_interval(1.0, self.update_clock)

    def update_clock(self):
        """Show the current local date (bold) followed by the time (bold cyan)."""
        moment = time.localtime()
        stamp = Text(justify="left")
        for pattern, look in (("%Y-%m-%d ", "bold"), ("%H:%M:%S", "bold cyan")):
            stamp.append(time.strftime(pattern, moment), style=look)
        self.update(stamp)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ScoposApp(App):
    """Textual application showing GPU memory usage, grouped by user."""

    TITLE = "SCOPOS"

    # Approximate minimum width at which a card is still readable; it drives
    # the column count. The full COMMAND column needs room, so cards stay wide
    # and only tile side by side on genuinely wide terminals.
    CARD_MIN_WIDTH = 100

    CSS = """
    Screen {
        layout: vertical;
    }
    #topbar {
        height: 5;
        padding: 0 1;
        background: $panel;
    }
    #topbar Logo {
        width: auto;
        height: 5;
        content-align: left top;
    }
    #topbar Clock {
        width: auto;
        height: 4;
        padding-bottom: 0;
        content-align: center bottom;
    }
    #topbar #spacer1 {
        width: 1fr;
    }
    #topbar #spacer2 {
        width: 1fr;
    }
    #topbar SysMeter {
        width: auto;
        height: 5;
        padding-right: 4;
        padding-bottom: 1;
        content-align: right bottom;
    }
    #grid {
        layout: grid;
        grid-size: 1;
        grid-rows: auto;
        grid-gutter: 1 2;
        height: auto;
        padding: 1 2;
    }
    #body {
        height: 1fr;
    }
    #status {
        height: 1;
        padding: 0 2;
        color: $text-muted;
    }
    """

    BINDINGS = [
        ("q", "quit", "Quit"),
        ("r", "refresh", "Refresh now"),
        ("d", "toggle_dark", "Light/Dark"),
    ]

    def __init__(self, watch_user: str = "", interval: int = 5, demo: bool = False):
        """Create the app.

        ``interval`` is the refresh period in seconds and is clamped to at
        least 1. ``watch_user`` and ``demo`` are handed to the Monitor; the
        detail column is enabled only when the monitor ends up with a
        non-empty watched user.
        """
        super().__init__()
        self.interval = max(1, interval)
        self.monitor = Monitor(watch_user=watch_user, demo=demo)
        self.show_detail = bool(self.monitor.watch_user)
        # GPU index -> the card currently mounted for that GPU.
        self._cards: Dict[int, GpuCard] = {}

    def compose(self) -> ComposeResult:
        """Lay out the top bar, the scrollable card grid, the status line and footer."""
        with Horizontal(id="topbar"):
            yield Logo()
            yield Static(id="spacer1")
            yield Clock()
            yield Static(id="spacer2")
            yield SysMeter(self.monitor)
        with VerticalScroll(id="body"):
            yield Container(id="grid")
        yield Static(id="status")
        yield Footer()

    def on_mount(self):
        # First snapshot right away, then one every ``interval`` seconds.
        self.refresh_data()
        self.set_interval(self.interval, self.refresh_data)

    def on_resize(self):
        self._relayout_columns()

    # -- layout ------------------------------------------------------------
    def _relayout_columns(self):
        """Pick the grid column count from the app width and the number of cards."""
        if not self._cards:
            return
        fitting = max(1, self.size.width // self.CARD_MIN_WIDTH)
        # Never use more columns than there are cards to fill them.
        columns = min(fitting, len(self._cards))
        self.query_one("#grid").styles.grid_size_columns = columns

    # -- data --------------------------------------------------------------
    def action_refresh(self):
        """Handler for the ``refresh`` key binding."""
        self.refresh_data()

    def refresh_data(self):
        """Collect a snapshot and push it into the cards and the status line."""
        try:
            snapshot = self.monitor.collect()
        except Exception as exc:  # keep the UI alive on transient NVML errors
            failure = Text(f"collection error: {exc}", style="red")
            self.query_one("#status", Static).update(failure)
            return
        self._sync_cards(snapshot)
        for gpu in snapshot:
            self._cards[gpu.index].update(gpu)
        self._update_status(snapshot)

    def _sync_cards(self, gpus: List[GPUInfo]):
        """Rebuild the grid when the set of GPU indices differs from the mounted cards."""
        if set(self._cards) == {g.index for g in gpus}:
            return
        # GPU set changed (first run, or hot-plug): start from an empty grid.
        grid = self.query_one("#grid")
        grid.remove_children()
        self._cards.clear()
        for gpu in gpus:
            fresh = GpuCard(self.monitor, self.show_detail)
            self._cards[gpu.index] = fresh
            grid.mount(fresh)
        self.call_after_refresh(self._relayout_columns)

    def _update_status(self, gpus: List[GPUInfo]):
        """Write the one-line summary (counts, refresh period, mode) to ``#status``."""
        proc_total = sum(len(g.procs) for g in gpus)
        distinct_users = {p.user for g in gpus for p in g.procs}
        mode = "demo" if self.monitor.demo else "live"
        if self.monitor.watch_user:
            watch = f" · watching [{self.monitor.watch_user}]"
        else:
            watch = ""
        summary = Text()
        summary.append(
            f"{len(gpus)} GPU · {proc_total} proc · {len(distinct_users)} users"
        )
        summary.append(
            f" · refresh {self.interval}s · {mode}{watch}"
            " · click a column header to sort",
            style="dim",
        )
        self.query_one("#status", Static).update(summary)

    def on_unmount(self):
        self.monitor.stop()
|
scopos/cli.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""Command line entry point: ``python -m scopos``."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
import argparse
|
|
6
|
+
|
|
7
|
+
from . import __version__
|
|
8
|
+
from .app import ScoposApp
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the ``scopos`` command line.

    ``argv`` is forwarded unchanged to ``ArgumentParser.parse_args``.
    The resulting namespace carries ``user``, ``interval`` and ``demo``.
    """
    cli = argparse.ArgumentParser(
        prog="scopos",
        description="Monitor GPU memory usage, grouped by user (Textual TUI).",
    )
    cli.add_argument(
        "-u", "--user", default="",
        help="Highlight this user and show their shell-script task details.",
    )
    cli.add_argument(
        "-i", "--interval", type=int, default=5,
        help="Refresh interval in seconds (default: 5).",
    )
    cli.add_argument(
        "--demo", action="store_true",
        help="Run with synthetic GPU data (no NVIDIA driver required).",
    )
    cli.add_argument(
        "-v", "--version", action="version",
        version=f"scopos {__version__}",
    )
    return cli.parse_args(argv)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def main(argv=None):
    """Entry point: parse the options, build a ``ScoposApp`` from them and run it."""
    opts = parse_args(argv)
    ScoposApp(
        watch_user=opts.user,
        interval=opts.interval,
        demo=opts.demo,
    ).run()
|
scopos/monitor.py
ADDED
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""Data collection layer for Scopos.
|
|
3
|
+
|
|
4
|
+
This module is intentionally free of any UI code so it can be reused or
|
|
5
|
+
tested on its own. :class:`Monitor` keeps a small amount of state between
|
|
6
|
+
refreshes so that a given user always keeps the same colour and the same
|
|
7
|
+
process numbering, exactly like the original CLI did.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
import re
|
|
12
|
+
import time
|
|
13
|
+
import random
|
|
14
|
+
from dataclasses import (dataclass, field)
|
|
15
|
+
from typing import (Dict, List, Optional)
|
|
16
|
+
|
|
17
|
+
try: # pynvml is only available on machines with an NVIDIA driver.
|
|
18
|
+
import pynvml as pn
|
|
19
|
+
except Exception: # pragma: no cover - exercised only without a driver.
|
|
20
|
+
pn = None
|
|
21
|
+
|
|
22
|
+
try:
|
|
23
|
+
import psutil
|
|
24
|
+
except Exception: # pragma: no cover
|
|
25
|
+
psutil = None
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Visually distinct colours handed out to users in order of first appearance.
# Every entry is a Rich/Textual colour name, so tables, bars and legends all
# render a given user identically. "bright_red" is intentionally left out:
# Monitor assigns it to the watched user.
USER_PALETTE: List[str] = [
    "bright_green", "bright_yellow", "bright_blue",
    "bright_magenta", "bright_cyan",
    "orange1", "spring_green2", "deep_pink2", "gold1",
    "dodger_blue1", "medium_purple1", "chartreuse2", "hot_pink",
]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class ProcInfo:
    """One compute process observed on a GPU."""

    pid: int
    name: str
    user: str
    # Bytes of GPU memory used.
    mem: int
    # Parent-process creation time, already formatted for display.
    started: str
    # How long this process has been running, already formatted.
    runtime: str
    # Per-user "parentNo-childNo" label; filled in by Monitor.
    number: str
    # Script/task detail; only filled for the watched user.
    detail: str
    # Full command line, including arguments.
    cmd: str = ""
    # Parent pid, used for the per-user numbering.
    ppid: int = 0
    # Raw parent start time, for sorting.
    started_ts: float = 0.0
    # Raw runtime in seconds, for sorting.
    runtime_sec: int = 0
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass
class GPUInfo:
    """Snapshot of a single GPU together with the processes running on it."""

    index: int
    name: str
    mem_used: int
    mem_total: int
    mem_free: int
    # Core utilisation in percent; -1 when unknown.
    util: int
    # Temperature in degrees C; -1 when unknown.
    temperature: int
    procs: List[ProcInfo] = field(default_factory=list)
    # user name -> summed memory of that user's processes on this GPU.
    user_mems: Dict[str, int] = field(default_factory=dict)

    @property
    def idle_rate(self) -> float:
        """Free memory as a fraction of the total (0.0 when the total is zero)."""
        if not self.mem_total:
            return 0.0
        return self.mem_free / self.mem_total

    @property
    def used_rate(self) -> float:
        """Used memory as a fraction of the total (0.0 when the total is zero)."""
        if not self.mem_total:
            return 0.0
        return self.mem_used / self.mem_total

    def mvp(self) -> Optional[str]:
        """Name the user holding the most memory on this GPU, or None if nobody is."""
        if self.user_mems:
            return max(self.user_mems, key=self.user_mems.get)
        return None
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def fmt_gb(num_bytes: float) -> str:
    """Render a byte count as GiB (1024**3 bytes) with two decimals, no unit suffix."""
    gib = num_bytes / (1024 ** 3)
    return f"{gib:.2f}"
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def fmt_duration(seconds: int) -> str:
    """Format a time span using its two largest units.

    Examples: 2 days 3 hours -> ``2d 03h``; 3 hours 20 minutes -> ``3h 20m``;
    45 seconds -> ``45s``. Negative input is treated as zero.
    """
    total = max(0, int(seconds))
    whole_minutes, s = divmod(total, 60)
    whole_hours, m = divmod(whole_minutes, 60)
    d, h = divmod(whole_hours, 24)
    if d:
        return f"{d}d {h:02d}h"
    if h:
        return f"{h}h {m:02d}m"
    return f"{m}m {s:02d}s" if m else f"{s}s"
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class Monitor:
    """Collects GPU snapshots, keeping per-user state stable across refreshes.

    In demo mode the snapshots are synthetic; otherwise they are read through
    pynvml (GPU side) and psutil (process side). User colours are remembered
    for the lifetime of the instance.
    """

    def __init__(self, watch_user: str = "", demo: bool = False):
        """Create a monitor.

        Args:
            watch_user: User to highlight; surrounding whitespace is stripped.
                An empty string means nobody is watched.
            demo: When true, ``collect`` returns synthetic data.
        """
        self.watch_user = watch_user.strip()
        self.demo = demo
        # username -> colour, assigned on first sight and kept forever.
        self._user_colors: Dict[str, str] = {}
        self._next_color = 0
        self._initialised = False
        if self.watch_user:
            # Make sure the watched user always gets `bright_red`.
            self._user_colors.setdefault(self.watch_user, "bright_red")

    # -- colours -----------------------------------------------------------
    def color_for(self, user: str) -> str:
        """Return the colour for ``user``, assigning the next palette entry on first sight."""
        if user not in self._user_colors:
            # The palette is cycled, so colours repeat once it is exhausted.
            color = USER_PALETTE[self._next_color % len(USER_PALETTE)]
            self._user_colors[user] = color
            self._next_color += 1
        return self._user_colors[user]

    # -- lifecycle ---------------------------------------------------------
    def start(self):
        """Initialise NVML once (no-op in demo mode or when pynvml is missing)."""
        if not self.demo and not self._initialised and pn is not None:
            pn.nvmlInit()
            self._initialised = True

    def stop(self):
        """Shut NVML down if this monitor initialised it."""
        if not self.demo and self._initialised and pn is not None:
            try:
                pn.nvmlShutdown()
            except Exception:
                # Best effort: a failing shutdown must not break app teardown.
                pass
            self._initialised = False

    # -- system memory -----------------------------------------------------
    def system_stats(self) -> Dict[str, tuple]:
        """Return host RAM/swap usage as {"mem": (used, total), "swap": (...)}.

        Falls back to plausible synthetic values when psutil is unavailable
        (e.g. demo mode on a machine without it installed).
        """
        if psutil is not None:
            vm = psutil.virtual_memory()
            sm = psutil.swap_memory()
            return {"mem": (vm.used, vm.total), "swap": (sm.used, sm.total)}
        rng = random.Random()
        mem_total = 32 * 1024 ** 3
        swap_total = 8 * 1024 ** 3
        return {
            "mem": (int(mem_total * rng.uniform(0.2, 0.8)), mem_total),
            "swap": (int(swap_total * rng.uniform(0.0, 0.4)), swap_total),
        }

    # -- collection --------------------------------------------------------
    def collect(self) -> List[GPUInfo]:
        """Return one ``GPUInfo`` per GPU, from synthetic or real data depending on ``demo``."""
        if self.demo:
            return self._collect_demo()
        return self._collect_real()

    # -- real collection ---------------------------------------------------
    def _collect_real(self) -> List[GPUInfo]:
        """Query NVML for every GPU and its compute processes.

        Raises:
            RuntimeError: pynvml could not be imported, so there is nothing
                to query. Other NVML errors propagate unchanged.
        """
        if pn is None:
            # Without this guard the first NVML call fails with an opaque
            # "'NoneType' object has no attribute ..." message.
            raise RuntimeError(
                "pynvml could not be imported; live mode needs pynvml and an "
                "NVIDIA driver (or run with --demo)"
            )
        self.start()
        gpus: List[GPUInfo] = []

        for gpu_id in range(pn.nvmlDeviceGetCount()):
            handle = pn.nvmlDeviceGetHandleByIndex(gpu_id)
            name = _decode(pn.nvmlDeviceGetName(handle))
            mem = pn.nvmlDeviceGetMemoryInfo(handle)
            used, free = int(mem.used), int(mem.free)
            total = used + free
            # Utilisation and temperature are optional; -1 marks "unknown".
            util, temp = -1, -1
            try:
                util = pn.nvmlDeviceGetUtilizationRates(handle).gpu
            except Exception:
                pass
            try:
                temp = pn.nvmlDeviceGetTemperature(handle, pn.NVML_TEMPERATURE_GPU)
            except Exception:
                pass

            gpu = GPUInfo(gpu_id, name, used, total, free, util, temp)

            try:
                processes = pn.nvmlDeviceGetComputeRunningProcesses_v2(handle)
            except Exception:
                processes = []

            for process in processes:
                info = self._build_proc(process)
                if info is None:
                    continue
                gpu.procs.append(info)
                gpu.user_mems[info.user] = (
                    gpu.user_mems.get(info.user, 0) + info.mem
                )
            gpus.append(gpu)
        self._assign_numbers(gpus)
        return gpus

    def _assign_numbers(self, gpus: List[GPUInfo]) -> None:
        """Fill in each process's "parentNo-childNo" label.

        Numbering is per user and spans every GPU: a user's parent processes
        are numbered in the order they are first seen across all GPUs, and the
        child counter increments for every process sharing that parent.
        """
        # user -> {ppid -> 1-based number in order of first appearance}
        parent_no: Dict[str, Dict[int, int]] = {}
        child_count: Dict[tuple, int] = {}
        for gpu in gpus:
            for proc in gpu.procs:
                seen = parent_no.setdefault(proc.user, {})
                # len(seen) is evaluated before insertion, so a new parent
                # receives the next free number.
                pp_no = seen.setdefault(proc.ppid, len(seen) + 1)
                key = (proc.user, proc.ppid)
                child_count[key] = child_count.get(key, 0) + 1
                proc.number = f"{pp_no:02d}-{child_count[key]:02d}"

    def _build_proc(self, process) -> Optional[ProcInfo]:
        """Turn one NVML process record into a ``ProcInfo``.

        Returns None when the process cannot be inspected, so that a single
        unreadable process never aborts the whole snapshot.
        """
        try:
            pid = int(process.pid)
            p = psutil.Process(pid)
        except Exception:
            # Also covers psutil being None (attribute lookup on None fails).
            return None
        started_ts = 0.0
        try:
            ppid = p.ppid()
            pp = psutil.Process(ppid)
            started_ts = pp.create_time()
            started = time.strftime("%y-%m-%d %H:%M:%S", time.localtime(started_ts))
        except Exception:
            ppid = 0
            pp = None
            started = "?"

        try:
            proc_name = p.name()
            runtime_sec = int(time.time() - p.create_time())
        except Exception:
            # Typically the process exited between the NVML query and now;
            # skip it instead of failing the entire collection.
            return None
        runtime = fmt_duration(runtime_sec)
        try:
            user = p.username()
        except Exception:
            user = "?"
        # Register the user now so colours follow first-appearance order.
        self.color_for(user)

        try:
            cmd = " ".join(p.cmdline()).strip()
        except Exception:
            cmd = ""
        if not cmd:
            cmd = proc_name

        mem = int(process.usedGpuMemory or 0)

        detail = "-"
        if user == self.watch_user and pp is not None:
            detail = self._script_detail(p, pp)

        # number is assigned later, once every GPU has been collected.
        return ProcInfo(
            pid, proc_name, user, mem, started, runtime, "", detail,
            cmd=cmd, ppid=ppid, started_ts=started_ts, runtime_sec=runtime_sec,
        )

    def _script_detail(self, p, pp) -> str:
        """Best-effort reconstruction of which task in a shell script is running.

        The parent's first open file is read as the script. Lines starting
        with the child's process name count as tasks; ``KEY=value`` lines are
        remembered and substituted for ``$KEY`` / ``${KEY}`` in later lines.
        The task whose substituted text equals the child's command line is
        reported as ``"<script> [<current>/<total>]"`` (current is -1 when no
        line matched).

        Ported from the original tool; wrapped so any failure simply shows "?".
        """
        try:
            pp_file_path = pp.open_files()[0].path
            pp_file_name = pp_file_path.rsplit("/", maxsplit=1)[-1]
            cur_cmd = " ".join(p.cmdline())
            proc_name = p.name()  # loop-invariant; query it once
            total_task = 0
            cur_task = -1
            bash_args: Dict[str, str] = {}

            def replace_bash_args(cmd: str) -> str:
                for arg, val in bash_args.items():
                    # The key is raw script text (it can be e.g. 'if [ "$X" '),
                    # so escape it; otherwise one odd line breaks compilation
                    # and with it the whole detail lookup. "(?!_)" keeps $FOO
                    # from matching the start of $FOO_BAR.
                    key = re.escape(arg)
                    rx = re.compile(r"\$(\{" + key + r"\}|" + key + r"(?!_))")
                    # Function replacement inserts val literally; a plain
                    # string would have its backslashes read as group refs.
                    cmd = rx.sub(lambda _match, val=val: val, cmd)
                return cmd.replace('"', "")

            with open(pp_file_path, "r", newline=None) as fh:
                for cmd in fh:
                    if cmd.startswith("#"):
                        continue
                    cmd = cmd.strip("\n")
                    if cmd.startswith(proc_name):
                        total_task += 1
                        if replace_bash_args(cmd) == cur_cmd:
                            cur_task = total_task
                    elif "=" in cmd:
                        key, raw = cmd.split("=", maxsplit=1)
                        if "$" in raw:
                            val = replace_bash_args(raw)
                            if "$" in val:
                                # A reference we cannot resolve: give up ("?").
                                raise NotImplementedError
                        else:
                            val = raw
                        bash_args[key] = val.strip('"')
            return f"{pp_file_name} [{cur_task}/{total_task}]"
        except Exception:
            return "?"

    # -- demo collection ---------------------------------------------------
    def _collect_demo(self) -> List[GPUInfo]:
        """Produce four synthetic GPUs with random processes for demo mode."""
        rng = random.Random()  # fresh randomness each tick for a "live" feel
        names = [
            "NVIDIA GeForce RTX 4090",
            "NVIDIA A100-SXM4-80GB",
            "NVIDIA H100 80GB HBM3",
        ]
        users_pool = ["alice", "bob", "carol", "dave", "erin", self.watch_user or "frank"]
        # Give each user a small pool of "parent" pids so the same parent can
        # show up on several GPUs - that is what makes the per-user numbering
        # (parentNo-childNo) interesting to look at.
        parent_pids = {u: [rng.randint(1000, 9999) for _ in range(2)] for u in users_pool}
        gpus: List[GPUInfo] = []
        n_gpu = 4
        for gpu_id in range(n_gpu):
            total = rng.choice([24, 40, 80]) * 1024 ** 3
            gpu = GPUInfo(
                gpu_id,
                names[gpu_id % len(names)],
                0,
                total,
                total,
                rng.randint(0, 100),
                rng.randint(35, 85),
            )
            used = 0
            n_proc = rng.randint(0, 5)
            for _ in range(n_proc):
                user = rng.choice(users_pool)
                self.color_for(user)
                mem = rng.randint(1, 12) * 1024 ** 3
                if used + mem > total:
                    # Never let the synthetic processes exceed the GPU's memory.
                    break
                used += mem
                detail = "-"
                if user == self.watch_user:
                    detail = f"train_{rng.randint(1,9)}.sh [{rng.randint(1,4)}/4]"
                runtime_sec = rng.randint(0, 400000)
                started_ts = time.time() - runtime_sec
                script = rng.choice(["train.py", "finetune.py", "eval.py", "main.py"])
                cmd = (
                    f"python {script} --lr {rng.choice(['1e-3', '5e-4', '3e-5'])}"
                    f" --batch-size {rng.choice([16, 32, 64])}"
                    f" --epochs {rng.randint(10, 200)} --fp16"
                )
                gpu.procs.append(
                    ProcInfo(
                        pid=rng.randint(10000, 99999),
                        name=rng.choice(["python", "python3", "train", "pt_main"]),
                        user=user,
                        mem=mem,
                        started=time.strftime("%y-%m-%d %H:%M:%S", time.localtime(started_ts)),
                        runtime=fmt_duration(runtime_sec),
                        number="",
                        detail=detail,
                        cmd=cmd,
                        ppid=rng.choice(parent_pids[user]),
                        started_ts=started_ts,
                        runtime_sec=runtime_sec,
                    )
                )
                gpu.user_mems[user] = gpu.user_mems.get(user, 0) + mem
            gpu.mem_used = used
            gpu.mem_free = total - used
            gpus.append(gpu)
        self._assign_numbers(gpus)
        return gpus
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def _decode(value) -> str:
|
|
391
|
+
if isinstance(value, bytes):
|
|
392
|
+
return value.decode("utf-8", "replace")
|
|
393
|
+
return str(value)
|
scopos/widgets.py
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""Reusable Textual widgets for Scopos."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
from rich.text import Text
|
|
6
|
+
from textual.containers import Vertical
|
|
7
|
+
from textual.widget import Widget
|
|
8
|
+
from textual.widgets import (DataTable, Static)
|
|
9
|
+
from typing import (Callable, List, Optional, Tuple)
|
|
10
|
+
|
|
11
|
+
from . import __version__
|
|
12
|
+
from .monitor import (GPUInfo, Monitor, fmt_gb)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
LOGO = r""" ___ ___ _____ ____ _____ ___
|
|
16
|
+
/ __) / __)( _ )( _ \( _ )/ __)
|
|
17
|
+
\__ \( (__ )(_)( )___/ )(_)( \__ \
|
|
18
|
+
(___/ \___)(_____)(__) (_____)(___/"""
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Logo(Static):
    """Static widget showing the SCOPOS ASCII banner followed by the package version."""

    def __init__(self):
        # The banner carries the base style; the version suffix is appended
        # with its own style on top of it.
        banner = Text(LOGO, style="bold cyan")
        version_tag = f" v{__version__}"
        banner.append(version_tag, style="dim white")
        super().__init__(banner)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class SysMeter(Static):
    """Two compact usage bars for host RAM and swap, refreshed every two seconds."""

    DEFAULT_CSS = """
    SysMeter {
        width: auto;
        height: auto;
    }
    """

    # Number of character cells between the bar's end caps.
    BAR_WIDTH = 26

    def __init__(self, monitor: Monitor) -> None:
        super().__init__()
        self.monitor = monitor

    def on_mount(self) -> None:
        # Draw once immediately, then keep polling.
        self.refresh_stats()
        self.set_interval(2.0, self.refresh_stats)

    def refresh_stats(self) -> None:
        """Re-read the monitor's system stats and redraw the "Mem" and "Swp" lines."""
        stats = self.monitor.system_stats()
        readout = Text(justify="right")
        readout.append(self._line("Mem", *stats["mem"]))
        readout.append("\n")
        readout.append(self._line("Swp", *stats["swap"]))
        self.update(readout)

    def _line(self, label: str, used: float, total: float) -> Text:
        """Build one bar line: label, filled/empty cells, "used / total GB" and percent.

        The colour is red from 85% usage, yellow from 60%, green below that.
        A zero ``total`` is replaced by 1 to avoid dividing by zero.
        """
        total = total or 1
        frac = max(0.0, min(1.0, used / total))
        color = "red" if frac >= 0.85 else "yellow" if frac >= 0.6 else "green"
        filled = round(frac * self.BAR_WIDTH)
        empty = self.BAR_WIDTH - filled
        gb = 1024 ** 3
        pieces = (
            (f"{label} ", "bold"),
            ("▕", "grey50"),
            ("█" * filled, color),
            ("░" * empty, "grey35"),
            ("▏", "grey50"),
            (f" {used / gb:5.1f} / {total / gb:5.1f} GB", "dim"),
            (f" {frac * 100:3.0f}%", color),
        )
        line = Text()
        for chunk, look in pieces:
            line.append(chunk, style=look)
        return line
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class MemoryBar(Widget):
    """One-line bar whose coloured segments show each user's share of the total.

    The bar always spans the widget's own width, so it stretches and shrinks
    with the terminal, which keeps the proportions readable at a glance.
    """

    DEFAULT_CSS = """
    MemoryBar {
        height: 1;
        width: 1fr;
    }
    """

    def __init__(self):
        super().__init__()
        self._segments: List[Tuple[str, float]] = []
        self._total: float = 1.0

    def set_data(self, segments: List[Tuple[str, float]], total: float):
        """Store new data and redraw.

        ``segments`` is a list of (colour, weight) pairs and ``total`` is the
        weight of the full bar; a zero ``total`` is replaced by 1.0.
        """
        self._segments = segments
        self._total = total or 1.0
        self.refresh()

    def render(self) -> Text:
        """Draw the segments left to right and pad the remainder with a grey track."""
        width = self.size.width or 1
        bar = Text(no_wrap=True, overflow="crop")
        drawn = 0
        for color, weight in self._segments:
            if weight <= 0:
                continue
            # A share that rounds to zero still gets one cell, so
            # tiny-but-present users stay visible.
            wanted = round(weight / self._total * width) or 1
            cells = min(wanted, width - drawn)
            if cells <= 0:
                break  # no room left
            bar.append("█" * cells, style=color)
            drawn += cells
        if drawn < width:
            bar.append("░" * (width - drawn), style="grey35")
        return bar
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class GpuCard(Vertical):
|
|
126
|
+
"""One GPU: header, stats line, proportion bar, legend and process table."""
|
|
127
|
+
|
|
128
|
+
DEFAULT_CSS = """
|
|
129
|
+
GpuCard {
|
|
130
|
+
height: auto;
|
|
131
|
+
border: round $primary;
|
|
132
|
+
border-title-color: $text;
|
|
133
|
+
border-title-style: bold;
|
|
134
|
+
padding: 0 1;
|
|
135
|
+
margin: 0;
|
|
136
|
+
}
|
|
137
|
+
GpuCard .stats { height: 1; }
|
|
138
|
+
GpuCard .legend { height: auto; color: $text-muted; }
|
|
139
|
+
GpuCard DataTable {
|
|
140
|
+
height: auto;
|
|
141
|
+
max-height: 30;
|
|
142
|
+
margin-top: 1;
|
|
143
|
+
}
|
|
144
|
+
"""
|
|
145
|
+
|
|
146
|
+
# Header labels and, for each, how to sort the rows by that column.
|
|
147
|
+
# ``None`` means the column is not sortable. The trailing DETAIL column is
|
|
148
|
+
# only shown when a user is being watched (see ``_headers``).
|
|
149
|
+
COLUMNS: List[Tuple[str, Optional[Callable]]] = [
|
|
150
|
+
("PID", lambda p: p.pid),
|
|
151
|
+
("USER", lambda p: p.user.lower()),
|
|
152
|
+
("NO.", lambda p: (p.user.lower(), p.number)),
|
|
153
|
+
("MEM/GB", lambda p: p.mem),
|
|
154
|
+
("STARTED", lambda p: p.started_ts),
|
|
155
|
+
("RUNTIME", lambda p: p.runtime_sec),
|
|
156
|
+
("DETAIL", lambda p: (p.detail or "").lower()),
|
|
157
|
+
("COMMAND", lambda p: p.cmd.lower()),
|
|
158
|
+
]
|
|
159
|
+
# Columns that read most naturally largest-first on the initial click:
|
|
160
|
+
# PID, MEM/GB, STARTED, RUNTIME.
|
|
161
|
+
DESC_FIRST = {0, 3, 4, 5}
|
|
162
|
+
|
|
163
|
+
def __init__(self, monitor: Monitor, show_detail: bool) -> None:
|
|
164
|
+
super().__init__()
|
|
165
|
+
self.monitor = monitor
|
|
166
|
+
self.stats = Static(classes="stats")
|
|
167
|
+
self.bar = MemoryBar()
|
|
168
|
+
self.legend = Static(classes="legend")
|
|
169
|
+
self.table = DataTable(zebra_stripes=True, cursor_type="row")
|
|
170
|
+
self._pending: Optional[GPUInfo] = None
|
|
171
|
+
self._gpu: Optional[GPUInfo] = None
|
|
172
|
+
self._sort_index: Optional[int] = None
|
|
173
|
+
self._sort_reverse: bool = False
|
|
174
|
+
# detail
|
|
175
|
+
self.show_detail = show_detail
|
|
176
|
+
if not show_detail:
|
|
177
|
+
self._header = self.COLUMNS[:-2] + self.COLUMNS[-1:]
|
|
178
|
+
else:
|
|
179
|
+
self._header = self.COLUMNS
|
|
180
|
+
|
|
181
|
+
def compose(self):
|
|
182
|
+
yield self.stats
|
|
183
|
+
yield self.bar
|
|
184
|
+
yield self.legend
|
|
185
|
+
yield self.table
|
|
186
|
+
|
|
187
|
+
def on_mount(self) -> None:
|
|
188
|
+
if self._pending is not None:
|
|
189
|
+
self._apply(self._pending)
|
|
190
|
+
|
|
191
|
+
# -- sorting -----------------------------------------------------------
|
|
192
|
+
def on_data_table_header_selected(
|
|
193
|
+
self, event: DataTable.HeaderSelected
|
|
194
|
+
) -> None:
|
|
195
|
+
event.stop()
|
|
196
|
+
idx = event.column_index
|
|
197
|
+
if idx >= len(self._header) or self._header[idx][1] is None:
|
|
198
|
+
return
|
|
199
|
+
if self._sort_index == idx:
|
|
200
|
+
self._sort_reverse = not self._sort_reverse
|
|
201
|
+
else:
|
|
202
|
+
self._sort_index = idx
|
|
203
|
+
self._sort_reverse = idx in self.DESC_FIRST
|
|
204
|
+
if self._gpu is not None:
|
|
205
|
+
self._update_table(self._gpu)
|
|
206
|
+
|
|
207
|
+
# -- updating ----------------------------------------------------------
|
|
208
|
+
def update(self, gpu: GPUInfo):
    """Show ``gpu`` on the card, or park it until the card is mounted."""
    # A card can be updated in the same frame it is mounted, before its
    # columns exist; the parked snapshot is applied from on_mount.
    if self.is_mounted:
        self._apply(gpu)
    else:
        self._pending = gpu
|
|
215
|
+
|
|
216
|
+
def _apply(self, gpu: GPUInfo):
|
|
217
|
+
self._pending = None
|
|
218
|
+
self._gpu = gpu
|
|
219
|
+
self.border_title = f" #{gpu.index} {gpu.name} "
|
|
220
|
+
self._update_stats(gpu)
|
|
221
|
+
self._update_bar(gpu)
|
|
222
|
+
self._update_legend(gpu)
|
|
223
|
+
self._update_table(gpu)
|
|
224
|
+
|
|
225
|
+
def _update_stats(self, gpu: GPUInfo):
    """Render the single-line used/free/utilisation/temperature summary.

    The FREE segment is coloured by ``gpu.idle_rate``: red up to 0.15,
    yellow up to 0.5, green above that.
    """
    idle = gpu.idle_rate
    free_style = "bold green"
    for limit, style in ((0.15, "bold red"), (0.5, "bold yellow")):
        if idle <= limit:
            free_style = style
            break

    used = fmt_gb(gpu.mem_used)
    total = fmt_gb(gpu.mem_total)
    free = fmt_gb(gpu.mem_free)
    used_pct = gpu.used_rate * 100

    summary = Text(no_wrap=True, overflow="ellipsis")
    summary.append("USED ", style="bold")
    summary.append(f"{used}", style="bold")
    summary.append(f" / {total} GB", style="dim")
    summary.append(f" ({used_pct:.0f}%) ")
    summary.append(f" FREE {free} GB ", style=free_style)

    # Negative readings are skipped (presumably "not available" markers
    # set by the monitor; confirm against GPUInfo).
    if gpu.util >= 0:
        summary.append(f" ⚡ {gpu.util}%", style="cyan")
    temperature = gpu.temperature
    if temperature >= 0:
        heat_style = "red" if temperature >= 80 else "cyan"
        summary.append(f" 🌡 {temperature}°C", style=heat_style)

    self.stats.update(summary)
|
|
252
|
+
|
|
253
|
+
def _update_bar(self, gpu: GPUInfo):
|
|
254
|
+
ordered = sorted(gpu.user_mems.items(), key=lambda kv: kv[1], reverse=True)
|
|
255
|
+
segments = [(self.monitor.color_for(u), float(m)) for u, m in ordered]
|
|
256
|
+
self.bar.set_data(segments, float(gpu.mem_total))
|
|
257
|
+
|
|
258
|
+
def _update_legend(self, gpu: GPUInfo):
    """Render the per-user legend, largest consumer first.

    Shows a dim ``idle`` when ``gpu.user_mems`` is empty. The first user in
    the ranking gets a trophy prefix; each entry shows the user's memory
    and its share of ``gpu.mem_total`` (0 when the total is falsy).
    """
    ranking = list(gpu.user_mems.items())
    ranking.sort(key=lambda item: item[1], reverse=True)
    text = Text(no_wrap=True, overflow="ellipsis")

    if ranking:
        top_user = ranking[0][0]
        for user, mem in ranking:
            dot_style = self.monitor.color_for(user)
            share = mem / gpu.mem_total * 100 if gpu.mem_total else 0
            text.append("● ", style=dot_style)
            prefix = "🏆 " if user == top_user else ""
            text.append(f"{prefix}{user} {fmt_gb(mem)}G {share:.0f}% ")
    else:
        text.append("idle", style="dim")

    self.legend.update(text)
|
|
273
|
+
|
|
274
|
+
def _update_table(self, gpu: GPUInfo) -> None:
    """Rebuild the process table from ``gpu.procs`` using the current sort.

    With no processes, a single dim placeholder row is added instead.
    """
    table = self.table
    headers = self._header
    sort_index = self._sort_index

    # Columns are recreated on every call so the sort arrow can move
    # between headers.
    table.clear(columns=True)
    arrow = "▼" if self._sort_reverse else "▲"
    table.add_columns(
        *(
            f"{name} {arrow}" if position == sort_index else name
            for position, (name, _) in enumerate(headers)
        )
    )

    procs = list(gpu.procs)
    sort_key = None if sort_index is None else headers[sort_index][1]
    if sort_key is not None:
        procs.sort(key=sort_key, reverse=self._sort_reverse)

    if not procs:
        placeholder = [""] * len(headers)
        placeholder[1] = "— no compute processes —"
        table.add_row(*[Text(cell, style="dim") for cell in placeholder])
        return

    for proc in procs:
        user_style = self.monitor.color_for(proc.user)
        cells = [
            str(proc.pid),
            Text(f"● {proc.user}", style=user_style),
            proc.number,
            fmt_gb(proc.mem),
            proc.started,
            proc.runtime,
        ]
        # The detail cell, when shown, sits just before the command.
        if self.show_detail:
            cells.append(proc.detail)
        cells.append(proc.cmd)
        table.add_row(*cells)
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: scopos
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: A Textual TUI for monitoring GPU memory usage, grouped by user.
|
|
5
|
+
Author-email: Zhen Tian <zhen.tian.cs@gmail.com>
|
|
6
|
+
Project-URL: Homepage, https://github.com/tinchen777/scopos-cli.git
|
|
7
|
+
Project-URL: Repository, https://github.com/tinchen777/scopos-cli.git
|
|
8
|
+
Project-URL: Issues, https://github.com/tinchen777/scopos-cli.git/issues
|
|
9
|
+
Requires-Python: >=3.8
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: textual>=0.60
|
|
13
|
+
Requires-Dist: psutil>=5.9
|
|
14
|
+
Requires-Dist: nvidia-ml-py>=12.0
|
|
15
|
+
Dynamic: license-file
|
|
16
|
+
|
|
17
|
+
<div align="center">
|
|
18
|
+
|
|
19
|
+
<h2 id="title">
|
|
20
|
+
🐱👓 SCOPOS 🐱👓<br>
|
|
21
|
+
<sub>NVIDIA GPU Monitor</sub>
|
|
22
|
+
</h2>
|
|
23
|
+
|
|
24
|
+
[scopos on PyPI](https://pypi.org/project/scopos/)
|
|
25
|
+

|
|
26
|
+

|
|
27
|
+
|
|
28
|
+

|
|
29
|
+
|
|
30
|
+
</div>
|
|
31
|
+
|
|
32
|
+
```text
|
|
33
|
+
 ___   ___  _____  ____  _____  ___
|
|
34
|
+
/ __) / __)(  _  )(  _ \(  _  )/ __)
|
|
35
|
+
\__ \( (__  )(_)(  )___/ )(_)( \__ \
|
|
36
|
+
(___/ \___)(_____)(__)  (_____)(___/
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## About
|
|
40
|
+
|
|
41
|
+
Monitor NVIDIA GPU memory usage from the terminal, **grouped by user**. SCOPOS
|
|
42
|
+
is built with [Textual](https://textual.textualize.io/): the layout adapts to
|
|
43
|
+
your terminal size, and every GPU shows an at-a-glance bar of how its memory is
|
|
44
|
+
split between users.
|
|
45
|
+
|
|
46
|
+
- Python: 3.8+
|
|
47
|
+
|
|
48
|
+
## Installation
|
|
49
|
+
|
|
50
|
+
### Install with pipx
|
|
51
|
+
|
|
52
|
+
`pipx` installs the application in an isolated environment while making
|
|
53
|
+
the command globally available.
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install pipx
|
|
57
|
+
pipx ensurepath
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
pipx install scopos
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Quick Start
|
|
65
|
+
|
|
66
|
+
### Monitor all GPUs
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
scopos
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Highlight user "alice" and show their task details
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
scopos -u alice
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Refresh every 2 seconds
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
scopos -i 2
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Use synthetic data (no NVIDIA driver needed)
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
scopos --demo
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
---
|
|
91
|
+
|
|
92
|
+
## Requirements
|
|
93
|
+
|
|
94
|
+
- Python >= 3.8
|
|
95
|
+
- `textual` >= 0.60
|
|
96
|
+
- `psutil` >= 5.9
|
|
97
|
+
- `nvidia-ml-py` >= 12.0
|
|
98
|
+
|
|
99
|
+
## License
|
|
100
|
+
|
|
101
|
+
See LICENSE in the repository.
|
|
102
|
+
|
|
103
|
+
## Links
|
|
104
|
+
|
|
105
|
+
- [Homepage/Repo](https://github.com/tinchen777/scopos.git)
|
|
106
|
+
- [Issues](https://github.com/tinchen777/scopos.git/issues)
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
scopos/__init__.py,sha256=c7oK_575F5nfQE0HMLr24FNhpmusnLUp14oOo8J26QE,152
|
|
2
|
+
scopos/app.py,sha256=x-u_-km_dpoN7oKyb5SqeuY3l4_uwGIiZeSSis9Dud4,5388
|
|
3
|
+
scopos/cli.py,sha256=pQK8LHWZm3rzsMaS8aeBhLMdqrUGoNuzH3zDzAzXB88,1202
|
|
4
|
+
scopos/monitor.py,sha256=b5m0LHBm-Jd6wm5XR248o96ILwEilcbhfNAknDdlNTA,14208
|
|
5
|
+
scopos/widgets.py,sha256=YK4lgTzAfWNNyJy2ubbJlbudoS4FiUljYgkNHvrTZBE,10558
|
|
6
|
+
scopos-2.0.0.dist-info/licenses/LICENSE,sha256=UB1vOlHpp1w6bfzyNq42Ou5m31wc-fPCX4YHR_pSxIw,1066
|
|
7
|
+
scopos-2.0.0.dist-info/METADATA,sha256=Fx8QXWvine8NR95oHAzdmBvz0Nz1U008XbhK5M0_GZk,2235
|
|
8
|
+
scopos-2.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
9
|
+
scopos-2.0.0.dist-info/entry_points.txt,sha256=SBfzJsiT9E9aBr8sgpirMJysEvk4q-kDoRYrhYC5sz4,43
|
|
10
|
+
scopos-2.0.0.dist-info/top_level.txt,sha256=U6ZHTdL7oshtVFYHEjzx13othz6fdrW6_B0E0H_-3po,7
|
|
11
|
+
scopos-2.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Cathie Li
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
scopos
|