hpc-stats-scripts 1.2 (tar.gz)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hpc_stats_scripts-1.2/LICENSE +21 -0
- hpc_stats_scripts-1.2/PKG-INFO +278 -0
- hpc_stats_scripts-1.2/README.md +236 -0
- hpc_stats_scripts-1.2/pyproject.toml +38 -0
- hpc_stats_scripts-1.2/setup.cfg +4 -0
- hpc_stats_scripts-1.2/setup.py +34 -0
- hpc_stats_scripts-1.2/src/hpc_scripts/__init__.py +8 -0
- hpc_stats_scripts-1.2/src/hpc_scripts/pbs_bulk_user_stats.py +442 -0
- hpc_stats_scripts-1.2/src/hpc_scripts/psutil_monitor.py +479 -0
- hpc_stats_scripts-1.2/src/hpc_scripts/slurm_bulk_user_stats.py +499 -0
- hpc_stats_scripts-1.2/src/hpc_stats_scripts.egg-info/PKG-INFO +278 -0
- hpc_stats_scripts-1.2/src/hpc_stats_scripts.egg-info/SOURCES.txt +14 -0
- hpc_stats_scripts-1.2/src/hpc_stats_scripts.egg-info/dependency_links.txt +1 -0
- hpc_stats_scripts-1.2/src/hpc_stats_scripts.egg-info/entry_points.txt +4 -0
- hpc_stats_scripts-1.2/src/hpc_stats_scripts.egg-info/requires.txt +13 -0
- hpc_stats_scripts-1.2/src/hpc_stats_scripts.egg-info/top_level.txt +1 -0

hpc_stats_scripts-1.2/LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

hpc_stats_scripts-1.2/PKG-INFO
@@ -0,0 +1,278 @@
Metadata-Version: 2.1
Name: hpc-stats-scripts
Version: 1.2
Summary: Utilities for HPC clusters including PBS/Slurm job statistics and a psutil-based resource monitor.
Author: hpc-stats-scripts contributors
License: MIT License

Copyright (c) 2025

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Keywords: hpc,pbs,slurm,monitoring,cluster,psutil
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: psutil
Provides-Extra: plot
Requires-Dist: matplotlib; extra == "plot"
Requires-Dist: numpy; extra == "plot"
Provides-Extra: gpu
Requires-Dist: nvidia-ml-py3; extra == "gpu"
Provides-Extra: all
Requires-Dist: matplotlib; extra == "all"
Requires-Dist: numpy; extra == "all"
Requires-Dist: nvidia-ml-py3; extra == "all"

# hpc-stats-scripts

Utilities for working with high-performance computing (HPC) environments. The scripts
help inspect PBS/Slurm job efficiency and monitor CPU and memory usage on a
running system or process tree.

Made with Codex :)

## Dependencies

Install the required Python packages with pip:

| Feature | Packages | Install command |
| ------- | -------- | --------------- |
| Core utilities | psutil | `pip install psutil` |
| Plotting for `psutil-monitor` | matplotlib, numpy | `pip install matplotlib numpy` |
| GPU monitoring for `psutil-monitor --gpu` | nvidia-ml-py3 (pynvml) | `pip install nvidia-ml-py3` |
| Plot + GPU combo | psutil, matplotlib, numpy, nvidia-ml-py3 | `pip install psutil matplotlib numpy nvidia-ml-py3` |
| All extras | plot + GPU | `pip install .[all]` |

The `pbs-bulk-user-stats` command also expects the PBS `qstat` utility to be
available in your environment, and `slurm-bulk-user-stats` expects Slurm's
`sacct` utility to be available.
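
Whether those scheduler commands are actually reachable can be checked up front. A minimal sketch (not part of the package) using only the standard library:

```python
import shutil

# The PBS tool needs qstat on PATH, the Slurm tool needs sacct.
for tool, cmd in [("pbs-bulk-user-stats", "qstat"),
                  ("slurm-bulk-user-stats", "sacct")]:
    path = shutil.which(cmd)
    print(f"{tool}: requires {cmd} -> {path if path else 'NOT FOUND'}")
```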

## Installation

Clone the repository and install with pip:

```bash
# Core only
pip install .

# Core + plotting support
pip install .[plot]

# Core + GPU support
pip install .[gpu]

# Everything (plot + GPU)
pip install .[all]
```

Install directly from GitHub:
```bash
pip install "hpc-stats-scripts @ git+https://github.com/avnikonenko/hpc-stats-scripts.git"
pip install "hpc-stats-scripts[plot] @ git+https://github.com/avnikonenko/hpc-stats-scripts.git"
pip install "hpc-stats-scripts[gpu] @ git+https://github.com/avnikonenko/hpc-stats-scripts.git"
pip install "hpc-stats-scripts[all] @ git+https://github.com/avnikonenko/hpc-stats-scripts.git"
```

The base installation depends on [psutil](https://pypi.org/project/psutil/).
The `plot` extra pulls in `matplotlib` and `numpy` for the `--plot` feature of `psutil-monitor`.
The `gpu` extra installs `nvidia-ml-py3` to enable `--gpu`.
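
Both extras are optional at runtime. A minimal sketch of how a monitor can degrade gracefully when they are missing; this is an illustrative pattern and not necessarily the exact code in `psutil_monitor.py`:

```python
# Illustrative optional-dependency guards (assumption: the real module may differ).
try:
    import matplotlib
    matplotlib.use("Agg")          # render PNGs without a display
    import matplotlib.pyplot as plt
    HAVE_PLOT = True
except ImportError:
    HAVE_PLOT = False              # --plot would then be unavailable

try:
    import pynvml                  # provided by the nvidia-ml-py3 package
    pynvml.nvmlInit()
    HAVE_GPU = True
except Exception:                  # missing package or no NVIDIA driver
    HAVE_GPU = False
```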

## CLI tools

### `pbs-bulk-user-stats`

Summarize CPU and memory usage for PBS jobs and show which nodes the jobs are
allocated to. The command relies on `qstat` being available in your `PATH`.
The table now includes `NGPUS` (requested GPUs) when present.
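
The numbers come from the full job listing that `qstat -f` prints. A minimal sketch of pulling the relevant attributes; the parsing in `pbs_bulk_user_stats.py` may differ in detail:

```python
import re
import subprocess

def qstat_attrs(jobid: str) -> dict:
    """Return the 'attribute = value' pairs from `qstat -f <jobid>`."""
    out = subprocess.run(["qstat", "-f", jobid], capture_output=True,
                         text=True, check=True).stdout
    # qstat -f wraps long values; join continuation lines before parsing
    out = re.sub(r"\n\t", "", out)
    attrs = {}
    for line in out.splitlines():
        if " = " in line:
            key, _, value = line.strip().partition(" = ")
            attrs[key] = value
    return attrs

attrs = qstat_attrs("12345")
print(attrs.get("resources_used.cput"),   # consumed CPU time
      attrs.get("resources_used.mem"),    # memory in use
      attrs.get("Resource_List.ncpus"),   # requested CPUs
      attrs.get("exec_host"))             # node(s) running the job
```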

Examples:

```bash
# Summarize a specific job and write CSV output
pbs-bulk-user-stats --job 12345 --csv stats.csv

# Summarize all jobs (including finished ones) for the current user
pbs-bulk-user-stats --include-finished

# Summarize all jobs for a specific user
pbs-bulk-user-stats --user myuser --include-finished
```

When invoked with no `--user` or `--job` options:
- On a login node (no `$PBS_JOBID` present), it summarizes all jobs for the current user.
- Inside a running PBS job (where `$PBS_JOBID` is set), it automatically summarizes that specific job.

```
pbs-bulk-user-stats
```

State codes (PBS):
- `R` running, `Q` queued/waiting, `X` finished (requires `--include-finished`); any other code is counted under “other” in the summary.

**Expected output (CPU/RAM only):**
```
$ pbs-bulk-user-stats

JOBID  STATE  NAME   NODES  NCPUS  WALL(h)  CPUT(h)  avgCPU  CPUeff   memUsed     memReq       memEff
------------------------------------------------------------------------------------------------------
0001   R      run1   pbs-1    176    38.55  3632.12   163.6  93.53%   207.4 GiB   256.00 GiB   81.10%
0002   R      run2   pbs-2    176    38.59  3589.72   93.13  52.91%   50.02 GiB   256.00 GiB   19.54%
...
Summary:
jobs: 5
unique nodes: 3
states: R=4 Q=1 X=0 other=0
mean CPUeff: 75.20%
mean avgCPU: 132.35
mean memEff: 82.50%
max memUsed: 230.16 GiB

```
or, when run inside a running PBS job:
```
JOBID  STATE  NAME   NODES  NCPUS  WALL(h)  CPUT(h)  avgCPU  CPUeff   memUsed     memReq       memEff
------------------------------------------------------------------------------------------------------
0001   R      STDIN  pbs-5    100     0.03     0.01    0.22   0.22%   666.58 MiB  30.00 GiB     2.17%

Summary:
jobs: 1
mean CPUeff: 0.22%
mean avgCPU: 0.22
mean memEff: 2.17%
max memUsed: 666.58 MiB

```

After the table, a summary reports the job count, mean CPU efficiency,
mean average CPU usage, mean memory efficiency, and the peak memory used
across all listed jobs.
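
The summary block is a plain aggregation over the per-job rows: means for the efficiency columns and a maximum for memory used. A tiny sketch of that arithmetic, with hypothetical field names rather than the script's real internals:

```python
from statistics import mean

# Hypothetical parsed rows; the real script builds these from qstat output.
jobs = [
    {"cpueff": 93.53, "avgcpu": 163.6, "memeff": 81.10, "mem_used_gib": 207.4},
    {"cpueff": 52.91, "avgcpu": 93.13, "memeff": 19.54, "mem_used_gib": 50.02},
]

print(f"mean CPUeff: {mean(j['cpueff'] for j in jobs):.2f}%")
print(f"mean avgCPU: {mean(j['avgcpu'] for j in jobs):.2f}")
print(f"mean memEff: {mean(j['memeff'] for j in jobs):.2f}%")
print(f"max memUsed: {max(j['mem_used_gib'] for j in jobs):.2f} GiB")
```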

### `psutil-monitor`

Real-time CPU and memory monitor for the system or a process tree.
Use `--gpu` to also report aggregate GPU utilization and memory via NVML (requires `nvidia-ml-py3`).
When `--csv`/`--plot` are used, metrics still stream live to the terminal during the run; the CSV/PNG files are written when the monitor exits (Ctrl+C, duration reached, or the process tree ends).
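
Each sample follows directly from psutil: a CPU percentage over the sampling interval is converted into "busy CPUs" against the CPU basis, alongside system memory usage. A minimal sketch of one system-mode sample (illustrative only, not the module's exact code):

```python
import datetime
import psutil

ncpus = psutil.cpu_count(logical=True) or 1      # CPU basis for %
total_mem = psutil.virtual_memory().total        # memory basis for %

cpu_pct = psutil.cpu_percent(interval=2.0)       # blocks for one 2 s sample
vm = psutil.virtual_memory()
busy_cpus = cpu_pct / 100.0 * ncpus              # e.g. 79.67% of 384 -> ~306

print(f"{datetime.datetime.now():%Y-%m-%dT%H:%M:%S} "
      f"CPU {cpu_pct:.2f}% busyCPUs {busy_cpus:.2f} (provided {ncpus}) "
      f"MEM {vm.percent:.2f}% used {vm.used / 2**30:.2f} GiB / "
      f"total {total_mem / 2**30:.2f} GiB")
```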

Example output files (generated with `--plot` and `--csv`):

- Plot (CPU + GPU stacked):

![psutil-monitor example plot](docs/psutil-monitor-example.png)

- CSV: `docs/psutil-monitor-example.csv`

GPU output fields (when `--gpu` is used; see the sketch after this list for how they can be derived):
- **GPU util**: Average utilization across visible GPUs.
- **busyGPUs**: Sum of utilization fractions (e.g., two GPUs at 50% each → 1.0).
- **GPU mem %**: Aggregate GPU memory usage percentage.
- **Per-GPU** (CSV `gpu_pergpu`): `index:util%/used/total` for each device.
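
A minimal sketch of how these aggregates can be computed with NVML through `pynvml` (the module shipped by `nvidia-ml-py3`); the real monitor may differ in details:

```python
import pynvml

pynvml.nvmlInit()
try:
    n = pynvml.nvmlDeviceGetCount()
    utils, used, total, per_gpu = [], 0, 0, []
    for i in range(n):
        h = pynvml.nvmlDeviceGetHandleByIndex(i)
        u = pynvml.nvmlDeviceGetUtilizationRates(h).gpu     # percent, 0-100
        m = pynvml.nvmlDeviceGetMemoryInfo(h)                # bytes
        utils.append(u)
        used += m.used
        total += m.total
        per_gpu.append(f"{i}:{u}%/{m.used // 2**20}MiB/{m.total // 2**20}MiB")

    gpu_util = sum(utils) / max(n, 1)       # "GPU util": average across devices
    busy_gpus = sum(utils) / 100.0          # "busyGPUs": sum of utilization fractions
    gpu_mem_pct = 100.0 * used / max(total, 1)   # "GPU mem %": aggregate usage
    print(f"GPU util {gpu_util:.1f}% busyGPUs {busy_gpus:.2f} mem {gpu_mem_pct:.1f}%")
    print("gpu_pergpu:", " ".join(per_gpu))
finally:
    pynvml.nvmlShutdown()
```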

Examples:

```bash
# System-wide monitoring (the default) with console output only
psutil-monitor

# System-wide monitoring with CSV and PNG output
psutil-monitor --mode system --csv node.csv --plot node.png

# Monitor the current process tree (useful inside a PBS job)
psutil-monitor --mode proc --pid $$ --include-children --csv job.csv

# Monitor the resources used by script.py:
python script.py &   # launch the workload
target=$!            # PID of script.py
echo $target
# psutil-monitor watches that PID and exits when the process tree is gone
psutil-monitor --mode proc --pid "$target" --include-children --csv stat.csv --plot plot.png

```
**Expected output:**
```
$ psutil-monitor

CPUs available (affinity): 384
Total memory available: 754.76 GiB
CPU basis for %: 384
Memory basis for %: 754.76 GiB
2025-08-14T15:20:14 CPU 79.67% busyCPUs 305.93 (provided 384) MEM 9.93% used 74.96 GiB / total 754.76 GiB
2025-08-14T15:20:16 CPU 69.30% busyCPUs 266.13 (provided 384) MEM 9.95% used 75.12 GiB / total 754.76 GiB
2025-08-14T15:20:18 CPU 61.34% busyCPUs 235.53 (provided 384) MEM 10.05% used 75.82 GiB / total 754.76 GiB
2025-08-14T15:20:20 CPU 61.32% busyCPUs 235.47 (provided 384) MEM 10.09% used 76.15 GiB / total 754.76 GiB
2025-08-14T15:20:22 CPU 74.57% busyCPUs 286.33 (provided 384) MEM 9.94% used 74.99 GiB / total 754.76 GiB
2025-08-14T15:20:24 CPU 85.94% busyCPUs 330.01 (provided 384) MEM 9.86% used 74.44 GiB / total 754.76 GiB
Average busy CPUs over run: 276.570
Peak memory (system): 76.15 GiB

```
With GPUs (`--gpu` and NVIDIA GPUs present):
```
$ psutil-monitor --gpu

CPUs available (affinity): 96
Total memory available: 503.70 GiB
CPU basis for %: 96
Memory basis for %: 503.70 GiB
GPUs detected (NVML): 4
2026-02-03T10:00:14 CPU 45.12% busyCPUs 43.32 (provided 96) MEM 8.10% used 40.80 GiB / total 503.70 GiB GPU util 57.5% busyGPUs 2.30 mem 42.0%
2026-02-03T10:00:16 CPU 48.33% busyCPUs 46.39 (provided 96) MEM 8.20% used 41.30 GiB / total 503.70 GiB GPU util 63.0% busyGPUs 2.52 mem 44.1%
2026-02-03T10:00:18 CPU 52.10% busyCPUs 49.99 (provided 96) MEM 8.25% used 41.60 GiB / total 503.70 GiB GPU util 68.7% busyGPUs 2.75 mem 45.3%
Average busy CPUs over run: 46.567
Average busy GPUs over run: 2.523
Peak memory (system): 41.60 GiB

```

Use the `--help` option of each command to see all available options.

### `slurm-bulk-user-stats`

Summarize CPU and memory usage for Slurm jobs and show which nodes the jobs are
allocated to. The command relies on `sacct` being available in your `PATH`.
The table includes `NGPUS` based on AllocTRES/AllocGRES when present.
If TRES GPU usage metrics are available, the summary also reports mean GPU utilization and GPU hours (used/requested).
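
The data behind the table comes from `sacct`. A sketch of the kind of query such a tool can issue; the field list and parsing here are illustrative, and the real implementation may use different options:

```python
import getpass
import subprocess

fields = ("JobID,JobName,State,NodeList,AllocCPUS,Elapsed,TotalCPU,"
          "MaxRSS,ReqMem,AllocTRES")
cmd = ["sacct", "-u", getpass.getuser(), "--parsable2", "--noheader",
       f"--format={fields}"]
out = subprocess.run(cmd, capture_output=True, text=True, check=True).stdout

for line in out.splitlines():
    # --parsable2 separates fields with "|"; the allocation row carries the
    # requests (AllocCPUS, ReqMem, AllocTRES with gres/gpu=... when GPUs are
    # allocated), while .batch/.extern step rows carry usage such as MaxRSS.
    row = dict(zip(fields.split(","), line.split("|")))
    print(row["JobID"], row["State"], row["NodeList"], row["AllocTRES"])
```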

State codes (Slurm):
- `R`/`RUNNING`, `PD`/`PENDING`, `CD`/`COMPLETED`; other states (e.g., `F`, `CG`, `S`, `TO`) are grouped under “other” in the summary and listed in the breakdown.

Examples:

```bash
# Summarize a specific job and write CSV output
slurm-bulk-user-stats --job 12345 --csv stats.csv

# Summarize pending/running jobs for the current user (default)
slurm-bulk-user-stats

# Summarize all jobs (including finished) for a specific user
slurm-bulk-user-stats --user myuser --include-finished
```

When invoked with no `--user` or `--job` options:
- On a login node (no `$SLURM_JOB_ID` present), it summarizes pending/running jobs for the current user.
- Inside a running Slurm job (where `$SLURM_JOB_ID` is set), it automatically summarizes that specific job.

```
slurm-bulk-user-stats
```

The output mirrors the PBS version, showing job state, node list, CPU/memory
usage, efficiency metrics, and a summary block with job counts and averages.

hpc_stats_scripts-1.2/pyproject.toml
@@ -0,0 +1,38 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "hpc-stats-scripts"
version = "1.2"
description = "Utilities for HPC clusters including PBS/Slurm job statistics and a psutil-based resource monitor."
readme = "README.md"
license = {file = "LICENSE"}
requires-python = ">=3.9"
authors = [{name = "hpc-stats-scripts contributors"}]
dependencies = [
    "psutil",
]
keywords = ["hpc", "pbs", "slurm", "monitoring", "cluster", "psutil"]

[project.optional-dependencies]
plot = [
    "matplotlib",
    "numpy",
]
gpu = [
    "nvidia-ml-py3",
]
all = [
    "matplotlib",
    "numpy",
    "nvidia-ml-py3",
]

[project.scripts]
pbs-bulk-user-stats = "hpc_scripts.pbs_bulk_user_stats:main"
psutil-monitor = "hpc_scripts.psutil_monitor:main"
slurm-bulk-user-stats = "hpc_scripts.slurm_bulk_user_stats:main"

[tool.setuptools.packages.find]
where = ["src"]

hpc_stats_scripts-1.2/setup.py
@@ -0,0 +1,34 @@
from setuptools import find_packages, setup
from pathlib import Path


def read_readme() -> str:
    readme = Path(__file__).parent / "README.md"
    return readme.read_text(encoding="utf-8") if readme.exists() else ""


setup(
    name="hpc-stats-scripts",
    version="1.2",
    description="Utilities for HPC clusters including PBS/Slurm job statistics and a psutil-based resource monitor.",
    long_description=read_readme(),
    long_description_content_type="text/markdown",
    author="hpc-stats-scripts contributors",
    python_requires=">=3.9",
    license="MIT",
    package_dir={"": "src"},
    packages=find_packages(where="src"),
    install_requires=["psutil"],
    extras_require={
        "plot": ["matplotlib", "numpy"],
        "gpu": ["nvidia-ml-py3"],
        "all": ["matplotlib", "numpy", "nvidia-ml-py3"],
    },
    entry_points={
        "console_scripts": [
            "pbs-bulk-user-stats=hpc_scripts.pbs_bulk_user_stats:main",
            "psutil-monitor=hpc_scripts.psutil_monitor:main",
            "slurm-bulk-user-stats=hpc_scripts.slurm_bulk_user_stats:main",
        ]
    },
)