gpu-usage-audit 1.0.1__tar.gz → 1.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/CHANGELOG.md +14 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/PKG-INFO +23 -11
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/README.md +22 -10
- gpu_usage_audit-1.0.2/projects/bare-metal-1.0/handoff.ko.md +83 -0
- gpu_usage_audit-1.0.2/projects/bare-metal-1.0/status.ko.md +120 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/pyproject.toml +1 -1
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/__main__.py +56 -13
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/nvml.py +13 -1
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/render.py +23 -9
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/report.py +70 -29
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_render.py +25 -11
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_report.py +38 -7
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_smoke.py +79 -1
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/uv.lock +1 -1
- gpu_usage_audit-1.0.1/projects/bare-metal-1.0/handoff.ko.md +0 -84
- gpu_usage_audit-1.0.1/projects/bare-metal-1.0/status.ko.md +0 -96
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/.github/workflows/ci.yml +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/.github/workflows/release.yml +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/.gitignore +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/LICENSE +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/projects/bare-metal-1.0/plan.ko.md +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/scripts/check-tag-version.py +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/scripts/smoke-dist-wheel.sh +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/__init__.py +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/classify.py +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/daemon.py +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/db.py +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/doctor.py +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/identity.py +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/model.py +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/summarize.py +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/tier.py +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/__init__.py +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_classify.py +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_daemon.py +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_db.py +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_doctor.py +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_identity.py +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_nvml.py +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_summarize.py +0 -0
- {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_tier.py +0 -0
|
@@ -1,5 +1,19 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 1.0.2 - 2026-05-15
|
|
4
|
+
|
|
5
|
+
- Hardened `gua status` and `gua stop` so stale PID files do not act on
|
|
6
|
+
unrelated live processes.
|
|
7
|
+
- Clarified report output by explaining sample units, classification rules,
|
|
8
|
+
interval-dependent GPU-hours, and heatmap density.
|
|
9
|
+
- Split §2 from generic "Waste" into idle-held capacity and truly-idle
|
|
10
|
+
capacity. The equivalent-GPU figures now use GPUs present in the report
|
|
11
|
+
window instead of the entire database.
|
|
12
|
+
- Made §4 Top identities aggregate by identity/GPU/tick before converting to
|
|
13
|
+
GPU-hours, so reports may show lower per-user GPU-hours when one user has
|
|
14
|
+
multiple processes on the same GPU at the same tick.
|
|
15
|
+
- Warn when NVML process-list visibility is unavailable for a GPU.
|
|
16
|
+
|
|
3
17
|
## 1.0.1 - 2026-05-15
|
|
4
18
|
|
|
5
19
|
- Made `gua` the documented command surface for daemon, report, demo, and doctor output.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gpu-usage-audit
|
|
3
|
-
Version: 1.0.1
|
|
3
|
+
Version: 1.0.2
|
|
4
4
|
Summary: Single-host daemon that surfaces 'idle-held' NVIDIA GPU memory — the embarrassing category conventional dashboards miss.
|
|
5
5
|
Project-URL: Homepage, https://github.com/AI-Ocean/gpu-usage-audit
|
|
6
6
|
Project-URL: Issues, https://github.com/AI-Ocean/gpu-usage-audit/issues
|
|
@@ -287,8 +287,8 @@ its `gua` / `gpu-usage-audit` commands.
|
|
|
287
287
|
GitHub Release assets are also available for manual download:
|
|
288
288
|
|
|
289
289
|
```sh
|
|
290
|
-
BASE="https://github.com/AI-Ocean/gpu-usage-audit/releases/download/v1.0.1"
|
|
291
|
-
WHEEL="gpu_usage_audit-1.0.1-py3-none-any.whl"
|
|
290
|
+
BASE="https://github.com/AI-Ocean/gpu-usage-audit/releases/download/v1.0.2"
|
|
291
|
+
WHEEL="gpu_usage_audit-1.0.2-py3-none-any.whl"
|
|
292
292
|
|
|
293
293
|
curl -fsSLO "$BASE/$WHEEL"
|
|
294
294
|
curl -fsSLO "$BASE/SHA256SUMS"
|
|
@@ -304,26 +304,33 @@ $ gua report --since 1h --interval 30s
|
|
|
304
304
|
gua — lab-a100 (bare, driver 560.35.05) Window: 1:00:00
|
|
305
305
|
|
|
306
306
|
§1 Headline
|
|
307
|
+
basis: one sample = one GPU card at one daemon tick
|
|
308
|
+
rules: active >=10% util; idle-held <10% util with >100 MB process memory
|
|
307
309
|
█████████▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒░░░░░░░░░░░░░░░░░░░░░░░░
|
|
308
310
|
active █ 15.7%
|
|
309
311
|
idle-held ▒ 45.1% ← this is the number conventional tools miss
|
|
310
312
|
truly-idle ░ 39.2%
|
|
311
313
|
(51 samples)
|
|
312
314
|
|
|
313
|
-
§2 Waste
|
|
314
|
-
|
|
315
|
+
§2 Idle capacity
|
|
316
|
+
converted from card-ticks to GPU-hours using the report --interval
|
|
317
|
+
idle-held: ~0.31 GPU-hours, ~1.53 GPUs equivalently unavailable
|
|
318
|
+
truly-idle: ~0.12 GPU-hours, ~1.00 GPUs equivalently free
|
|
315
319
|
|
|
316
320
|
§3 Per-GPU
|
|
321
|
+
per-card share of samples in the same three states
|
|
317
322
|
GPU-0 active 47.1% idle-held 35.3% truly-idle 17.6%
|
|
318
323
|
GPU-1 active 0.0% idle-held 100.0% truly-idle 0.0%
|
|
319
324
|
GPU-2 active 0.0% idle-held 0.0% truly-idle 100.0%
|
|
320
325
|
|
|
321
326
|
§4 Top identities
|
|
322
|
-
identity
|
|
323
|
-
|
|
324
|
-
|
|
327
|
+
one identity counts once per GPU/tick after its processes are summed
|
|
328
|
+
identity gpu-hours idle-held samples
|
|
329
|
+
alice 0.42 42.9% 51
|
|
330
|
+
bob 0.28 100.0% 34
|
|
325
331
|
|
|
326
332
|
§5 Time-of-day heatmap (UTC)
|
|
333
|
+
darker means higher active share; blank means no samples
|
|
327
334
|
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3
|
|
328
335
|
Mon .
|
|
329
336
|
```
|
|
@@ -331,7 +338,10 @@ gua — lab-a100 (bare, driver 560.35.05) Window: 1:00:00
|
|
|
331
338
|
The 3-bar collapses every card × every tick over the window into the
|
|
332
339
|
active / idle-held / truly-idle split. **`idle-held` rows are the
|
|
333
340
|
embarrassing category**: a process is holding GPU memory but the SM
|
|
334
|
-
utilization is below 10%.
|
|
341
|
+
utilization is below 10%. §2 converts those card-ticks into GPU-hours
|
|
342
|
+
with `--interval`; §4 groups process rows by identity, GPU, and tick
|
|
343
|
+
before ranking users, so multiple same-user processes on one GPU/tick
|
|
344
|
+
count once.
|
|
335
345
|
|
|
336
346
|
## Demo (no GPU required)
|
|
337
347
|
|
|
@@ -408,7 +418,7 @@ point remains installed for compatibility, but new examples use `gua`.
|
|
|
408
418
|
| -------- | ----------------------------------------------------------- |
|
|
409
419
|
| `daemon` | Starts the collector in the background. Samples real NVML telemetry on every tick and writes to a new database. NVIDIA host required. |
|
|
410
420
|
| `start` | Alias for `gua daemon`. |
|
|
411
|
-
| `status` | Shows whether the background collector PID is still running. |
|
|
421
|
+
| `status` | Shows whether the background collector PID is still running. Also clears a stale PID file when it points to a missing or unrelated process. |
|
|
412
422
|
| `stop` | Stops the background collector with SIGTERM. |
|
|
413
423
|
| `report` | One-shot read against the accumulated database. Safe to run **while the daemon is still writing** — SQLite WAL mode handles the concurrency. |
|
|
414
424
|
| `demo` | Self-contained showcase. Records N fake ticks and immediately prints the report. No GPU, no second shell, no operational meaning — just to see the output shape. |
|
|
@@ -436,6 +446,8 @@ By default, `gua daemon` returns after the collector starts. Each tick is
|
|
|
436
446
|
written to the log file; on shutdown the cumulative row count is written
|
|
437
447
|
there too. `gua daemon --foreground` prints the tick summaries directly
|
|
438
448
|
to the terminal and exits on Ctrl+C, SIGTERM, or `systemctl stop`.
|
|
449
|
+
`gua status` and `gua stop` verify that the PID file points to the
|
|
450
|
+
managed collector before acting on it; stale PID files are cleared.
|
|
439
451
|
|
|
440
452
|
### `report`
|
|
441
453
|
|
|
@@ -450,7 +462,7 @@ gua report [--db PATH] [--since D] [--interval D] [--width N]
|
|
|
450
462
|
of oldest sample), so passing a huge `--since` is the same as "all
|
|
451
463
|
data". Units: `ms`, `s`, `m`, `h`, `d` (no `w`; use `7d`).
|
|
452
464
|
- `--interval D` (default `30s`) — **must match what the daemon used**.
|
|
453
|
-
This is how §2 (
|
|
465
|
+
This is how §2 (Idle capacity) and §4 (Top identities) convert tick counts
|
|
454
466
|
to GPU-hours. Mismatched intervals → wrong GPU-hours.
|
|
455
467
|
- `--width N` (default `60`) — width of the §1 three-bar in characters.
|
|
456
468
|
|
|
@@ -64,8 +64,8 @@ its `gua` / `gpu-usage-audit` commands.
|
|
|
64
64
|
GitHub Release assets are also available for manual download:
|
|
65
65
|
|
|
66
66
|
```sh
|
|
67
|
-
BASE="https://github.com/AI-Ocean/gpu-usage-audit/releases/download/v1.0.1"
|
|
68
|
-
WHEEL="gpu_usage_audit-1.0.1-py3-none-any.whl"
|
|
67
|
+
BASE="https://github.com/AI-Ocean/gpu-usage-audit/releases/download/v1.0.2"
|
|
68
|
+
WHEEL="gpu_usage_audit-1.0.2-py3-none-any.whl"
|
|
69
69
|
|
|
70
70
|
curl -fsSLO "$BASE/$WHEEL"
|
|
71
71
|
curl -fsSLO "$BASE/SHA256SUMS"
|
|
@@ -81,26 +81,33 @@ $ gua report --since 1h --interval 30s
|
|
|
81
81
|
gua — lab-a100 (bare, driver 560.35.05) Window: 1:00:00
|
|
82
82
|
|
|
83
83
|
§1 Headline
|
|
84
|
+
basis: one sample = one GPU card at one daemon tick
|
|
85
|
+
rules: active >=10% util; idle-held <10% util with >100 MB process memory
|
|
84
86
|
█████████▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒░░░░░░░░░░░░░░░░░░░░░░░░
|
|
85
87
|
active █ 15.7%
|
|
86
88
|
idle-held ▒ 45.1% ← this is the number conventional tools miss
|
|
87
89
|
truly-idle ░ 39.2%
|
|
88
90
|
(51 samples)
|
|
89
91
|
|
|
90
|
-
§2 Waste
|
|
91
|
-
|
|
92
|
+
§2 Idle capacity
|
|
93
|
+
converted from card-ticks to GPU-hours using the report --interval
|
|
94
|
+
idle-held: ~0.31 GPU-hours, ~1.53 GPUs equivalently unavailable
|
|
95
|
+
truly-idle: ~0.12 GPU-hours, ~1.00 GPUs equivalently free
|
|
92
96
|
|
|
93
97
|
§3 Per-GPU
|
|
98
|
+
per-card share of samples in the same three states
|
|
94
99
|
GPU-0 active 47.1% idle-held 35.3% truly-idle 17.6%
|
|
95
100
|
GPU-1 active 0.0% idle-held 100.0% truly-idle 0.0%
|
|
96
101
|
GPU-2 active 0.0% idle-held 0.0% truly-idle 100.0%
|
|
97
102
|
|
|
98
103
|
§4 Top identities
|
|
99
|
-
identity
|
|
100
|
-
|
|
101
|
-
|
|
104
|
+
one identity counts once per GPU/tick after its processes are summed
|
|
105
|
+
identity gpu-hours idle-held samples
|
|
106
|
+
alice 0.42 42.9% 51
|
|
107
|
+
bob 0.28 100.0% 34
|
|
102
108
|
|
|
103
109
|
§5 Time-of-day heatmap (UTC)
|
|
110
|
+
darker means higher active share; blank means no samples
|
|
104
111
|
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3
|
|
105
112
|
Mon .
|
|
106
113
|
```
|
|
@@ -108,7 +115,10 @@ gua — lab-a100 (bare, driver 560.35.05) Window: 1:00:00
|
|
|
108
115
|
The 3-bar collapses every card × every tick over the window into the
|
|
109
116
|
active / idle-held / truly-idle split. **`idle-held` rows are the
|
|
110
117
|
embarrassing category**: a process is holding GPU memory but the SM
|
|
111
|
-
utilization is below 10%.
|
|
118
|
+
utilization is below 10%. §2 converts those card-ticks into GPU-hours
|
|
119
|
+
with `--interval`; §4 groups process rows by identity, GPU, and tick
|
|
120
|
+
before ranking users, so multiple same-user processes on one GPU/tick
|
|
121
|
+
count once.
|
|
112
122
|
|
|
113
123
|
## Demo (no GPU required)
|
|
114
124
|
|
|
@@ -185,7 +195,7 @@ point remains installed for compatibility, but new examples use `gua`.
|
|
|
185
195
|
| -------- | ----------------------------------------------------------- |
|
|
186
196
|
| `daemon` | Starts the collector in the background. Samples real NVML telemetry on every tick and writes to a new database. NVIDIA host required. |
|
|
187
197
|
| `start` | Alias for `gua daemon`. |
|
|
188
|
-
| `status` | Shows whether the background collector PID is still running. |
|
|
198
|
+
| `status` | Shows whether the background collector PID is still running. Also clears a stale PID file when it points to a missing or unrelated process. |
|
|
189
199
|
| `stop` | Stops the background collector with SIGTERM. |
|
|
190
200
|
| `report` | One-shot read against the accumulated database. Safe to run **while the daemon is still writing** — SQLite WAL mode handles the concurrency. |
|
|
191
201
|
| `demo` | Self-contained showcase. Records N fake ticks and immediately prints the report. No GPU, no second shell, no operational meaning — just to see the output shape. |
|
|
@@ -213,6 +223,8 @@ By default, `gua daemon` returns after the collector starts. Each tick is
|
|
|
213
223
|
written to the log file; on shutdown the cumulative row count is written
|
|
214
224
|
there too. `gua daemon --foreground` prints the tick summaries directly
|
|
215
225
|
to the terminal and exits on Ctrl+C, SIGTERM, or `systemctl stop`.
|
|
226
|
+
`gua status` and `gua stop` verify that the PID file points to the
|
|
227
|
+
managed collector before acting on it; stale PID files are cleared.
|
|
216
228
|
|
|
217
229
|
### `report`
|
|
218
230
|
|
|
@@ -227,7 +239,7 @@ gua report [--db PATH] [--since D] [--interval D] [--width N]
|
|
|
227
239
|
of oldest sample), so passing a huge `--since` is the same as "all
|
|
228
240
|
data". Units: `ms`, `s`, `m`, `h`, `d` (no `w`; use `7d`).
|
|
229
241
|
- `--interval D` (default `30s`) — **must match what the daemon used**.
|
|
230
|
-
This is how §2 (
|
|
242
|
+
This is how §2 (Idle capacity) and §4 (Top identities) convert tick counts
|
|
231
243
|
to GPU-hours. Mismatched intervals → wrong GPU-hours.
|
|
232
244
|
- `--width N` (default `60`) — width of the §1 three-bar in characters.
|
|
233
245
|
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# Bare Metal 1.0 Handoff
|
|
2
|
+
|
|
3
|
+
갱신일: 2026-05-15
|
|
4
|
+
|
|
5
|
+
## 이어받을 때 먼저 볼 것
|
|
6
|
+
|
|
7
|
+
- `projects/bare-metal-1.0/status.ko.md`: 현재 완료 상태, 1.0.1 검증 결과, 1.0.2 release prep 상태.
|
|
8
|
+
- `README.md`: 실제 사용자 문서와 release/install/runbook/report 표면.
|
|
9
|
+
- `src/gpu_usage_audit/__main__.py`: `gua` CLI, background daemon lifecycle, PID handling.
|
|
10
|
+
- `src/gpu_usage_audit/report.py`: report SQL 집계.
|
|
11
|
+
- `src/gpu_usage_audit/render.py`: report 사람이 읽는 출력.
|
|
12
|
+
- `.github/workflows/release.yml`: tag release, GitHub Release, PyPI publish 경로.
|
|
13
|
+
|
|
14
|
+
## 고정된 결정
|
|
15
|
+
|
|
16
|
+
- 1.0은 단일 로컬 베어메탈 NVIDIA 호스트만 본다.
|
|
17
|
+
- Kubernetes, Slurm, Docker/Podman fallback, remote node, cluster-wide report는 1.0 범위 밖이다.
|
|
18
|
+
- `nvidia-ml-py`는 기본 dependency다.
|
|
19
|
+
- `gpu-usage-audit[nvml]` extra는 compatibility를 위해 빈 alias로 남긴다.
|
|
20
|
+
- DB schema는 v1을 유지한다: `host`, `gpu_sample`, `proc_sample`.
|
|
21
|
+
- 기본 DB는 `/tmp/gua.db`다.
|
|
22
|
+
- `gua daemon`은 기본 백그라운드 실행이다.
|
|
23
|
+
- `gua daemon --foreground`는 systemd/debugging 용도다.
|
|
24
|
+
- `gua start`는 `gua daemon` alias다.
|
|
25
|
+
- `gua status`와 `gua stop`은 pid file 기반 background collector 관리용이다.
|
|
26
|
+
- `daemon`은 기존 DB 파일이 있으면 실패한다.
|
|
27
|
+
- `report`는 DB 파일이 없으면 실패한다.
|
|
28
|
+
- `daemon`과 `demo`는 host row의 `env_kind`를 항상 `"bare"`로 기록한다.
|
|
29
|
+
- auto-runtime proposal/project 문서는 삭제했다. Kubernetes/Slurm/Docker/Podman 확장을 다시
|
|
30
|
+
시작하려면 새 proposal로 시작한다.
|
|
31
|
+
|
|
32
|
+
## 현재 상태
|
|
33
|
+
|
|
34
|
+
- PR A: implemented in PR #9.
|
|
35
|
+
- PR B: implemented in PR #10.
|
|
36
|
+
- Post-1.0 cleanup: completed in PR #11.
|
|
37
|
+
- Bare-metal 1.0 release: completed in PR #12 and tag `v1.0.0`.
|
|
38
|
+
- 1.0.1 command surface/background daemon release: completed in PR #13 and tag `v1.0.1`.
|
|
39
|
+
- GitHub Release `v1.0.1`: published.
|
|
40
|
+
- PyPI `gpu-usage-audit 1.0.1`: published.
|
|
41
|
+
- NVIDIA host acceptance: 사용자가 실제 host에서 수집 정상 동작을 확인했다.
|
|
42
|
+
- 1.0.2 release prep: 진행 중. #14 lifecycle/report cleanup을 patch release로 배포한다.
|
|
43
|
+
package version은 `1.0.2`로 bump했고 local build/wheel smoke는 통과했다.
|
|
44
|
+
|
|
45
|
+
## 마지막 로컬 검증
|
|
46
|
+
|
|
47
|
+
```sh
|
|
48
|
+
uv run ruff check
|
|
49
|
+
uv run ruff format --check
|
|
50
|
+
uv run mypy
|
|
51
|
+
uv run pytest
|
|
52
|
+
uv build --out-dir /tmp/gua-dist-1.0.2-prep
|
|
53
|
+
bash scripts/smoke-dist-wheel.sh /tmp/gua-dist-1.0.2-prep/gpu_usage_audit-1.0.2-py3-none-any.whl
|
|
54
|
+
env GITHUB_REF_NAME=v1.0.2 uv run python scripts/check-tag-version.py
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
결과는 `pytest` 124 passed, `mypy` 25 source files, `ruff format` 26 files 기준이다.
|
|
58
|
+
|
|
59
|
+
## 현재 cleanup PR 방향
|
|
60
|
+
|
|
61
|
+
- `/tmp/gua.pid`가 PID 재사용으로 다른 프로세스를 가리킬 수 있으므로 `status`/`stop` 전에
|
|
62
|
+
해당 PID가 실제 managed `gpu_usage_audit daemon` 프로세스인지 확인한다.
|
|
63
|
+
- report §2는 low-util 전체를 "waste"로 합치지 말고 `idle-held`와 `truly-idle`을 분리한다.
|
|
64
|
+
- report §4는 process row가 아니라 identity/GPU/tick 단위로 먼저 접어서 사용자별 GPU-hours를 계산한다.
|
|
65
|
+
- report 출력 자체에 sample 의미, classification rule, `--interval` 의존성, heatmap 의미를 짧게 노출한다.
|
|
66
|
+
- NVML process list 조회 실패는 idle-held를 과소평가할 수 있으므로 warning으로 남긴다.
|
|
67
|
+
- 1.0.2 release prep에서는 package version, README release asset 예시, CHANGELOG를 `1.0.2`로 맞춘다.
|
|
68
|
+
|
|
69
|
+
## 주의할 점
|
|
70
|
+
|
|
71
|
+
- 현재 로컬 개발 머신은 NVIDIA host가 아니다. `gua doctor`가 unsupported를 내는 것은 정상이다.
|
|
72
|
+
- `/tmp/gua.db`가 이미 존재한다. 기본 경로 daemon 실행이 거부되는 것은 기대 동작이다.
|
|
73
|
+
- `report --interval`은 daemon 수집 interval과 같아야 GPU-hours가 맞다.
|
|
74
|
+
- SQLite WAL sidecar(`*.db-wal`, `*.db-shm`)는 마지막 connection이 닫히면 정리된다.
|
|
75
|
+
- 1.0.2를 자를 경우 `env GITHUB_REF_NAME=v1.0.2 uv run python scripts/check-tag-version.py`가
|
|
76
|
+
통과해야 한다.
|
|
77
|
+
|
|
78
|
+
## 다음 세션 추천 순서
|
|
79
|
+
|
|
80
|
+
1. `git status --short`로 사용자 변경 여부를 먼저 확인한다.
|
|
81
|
+
2. cleanup PR의 CI 결과와 review comments를 확인한다.
|
|
82
|
+
3. 필요하면 report wording을 실제 운영자가 읽기 쉬운 형태로 한 번 더 다듬는다.
|
|
83
|
+
4. merge 후 patch release가 필요하면 version bump와 changelog를 별도 PR로 처리한다.
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# Bare Metal 1.0 Status
|
|
2
|
+
|
|
3
|
+
갱신일: 2026-05-15
|
|
4
|
+
|
|
5
|
+
## 요약
|
|
6
|
+
|
|
7
|
+
Bare Metal 1.0은 단일 NVIDIA 베어메탈 호스트만 대상으로 하는 형태로 1.0.1까지
|
|
8
|
+
릴리스됐고, 현재 1.0.2 release prep을 진행 중이다. `v1.0.1` GitHub Release와
|
|
9
|
+
PyPI publish는 완료됐고, 사용자가 실제 NVIDIA host에서 telemetry 수집이 정상
|
|
10
|
+
동작하는 것도 확인했다.
|
|
11
|
+
|
|
12
|
+
1.0.2 후보는 1.0.1 이후 코드 퀄리티 cleanup을 배포하기 위한 patch release다.
|
|
13
|
+
주요 초점은 background daemon PID 안전성, report 의미 가시성, 내부 문서 정합성이다.
|
|
14
|
+
|
|
15
|
+
## 구현 상태
|
|
16
|
+
|
|
17
|
+
| 영역 | 상태 | 메모 |
|
|
18
|
+
| --- | --- | --- |
|
|
19
|
+
| Scope reset | 완료 | Kubernetes/Slurm/Docker/remote runtime 표면 제거. |
|
|
20
|
+
| `gua doctor` | 완료 | 현재 머신의 `/dev/nvidia*`, `nvidia-smi -L`, NVML, DB path만 진단. |
|
|
21
|
+
| Packaging UX | 완료 | `nvidia-ml-py`가 기본 dependency이고 `nvml` extra는 빈 compatibility alias. |
|
|
22
|
+
| `gua` command surface | 완료 | `doctor`, `daemon`, `start`, `status`, `stop`, `report`, `demo` 제공. |
|
|
23
|
+
| Background daemon UX | 완료 | `gua daemon`은 기본 백그라운드 실행, `--foreground`는 systemd/debug용. |
|
|
24
|
+
| `daemon`/`report` DB UX | 완료 | 기본 DB는 `/tmp/gua.db`; daemon은 기존 DB를 거부하고 report는 없는 DB를 거부. |
|
|
25
|
+
| README bare-metal 문서 | 완료 | install, runbook, systemd 예시, 운영 notes가 1.0.2 기준. |
|
|
26
|
+
| Release | 진행 중 | package version은 `1.0.2`; local build/wheel smoke 완료, release prep PR과 tag publish가 남음. |
|
|
27
|
+
| NVIDIA host acceptance | 완료 | 실제 NVIDIA host에서 수집 정상 동작 확인. |
|
|
28
|
+
|
|
29
|
+
## 마지막 확인 결과
|
|
30
|
+
|
|
31
|
+
2026-05-15 1.0.2 release prep 로컬 검증:
|
|
32
|
+
|
|
33
|
+
```sh
|
|
34
|
+
uv run ruff format --check
|
|
35
|
+
uv run ruff check
|
|
36
|
+
uv run mypy
|
|
37
|
+
uv run pytest
|
|
38
|
+
env GITHUB_REF_NAME=v1.0.2 uv run python scripts/check-tag-version.py
|
|
39
|
+
uv build --out-dir /tmp/gua-dist-1.0.2-prep
|
|
40
|
+
bash scripts/smoke-dist-wheel.sh /tmp/gua-dist-1.0.2-prep/gpu_usage_audit-1.0.2-py3-none-any.whl
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
결과:
|
|
44
|
+
|
|
45
|
+
- `ruff format --check`: 26 files already formatted.
|
|
46
|
+
- `ruff check`: pass.
|
|
47
|
+
- `mypy`: no issues in 25 source files.
|
|
48
|
+
- `pytest`: 124 passed.
|
|
49
|
+
- tag-version check: `v1.0.2`와 `pyproject.toml` version 일치.
|
|
50
|
+
- `uv build`: sdist/wheel build 성공.
|
|
51
|
+
- wheel smoke: 성공.
|
|
52
|
+
|
|
53
|
+
2026-05-15 1.0.1 상태 확인:
|
|
54
|
+
|
|
55
|
+
```sh
|
|
56
|
+
git status --short
|
|
57
|
+
uv run ruff check
|
|
58
|
+
uv run ruff format --check
|
|
59
|
+
uv run mypy
|
|
60
|
+
uv run pytest
|
|
61
|
+
env GITHUB_REF_NAME=v1.0.1 uv run python scripts/check-tag-version.py
|
|
62
|
+
uv build --out-dir /tmp/gua-dist-1.0.1-status
|
|
63
|
+
bash scripts/smoke-dist-wheel.sh /tmp/gua-dist-1.0.1-status/gpu_usage_audit-1.0.1-py3-none-any.whl
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
결과:
|
|
67
|
+
|
|
68
|
+
- 작업트리 clean.
|
|
69
|
+
- `ruff check`: pass.
|
|
70
|
+
- `ruff format --check`: 26 files already formatted.
|
|
71
|
+
- `mypy`: no issues in 25 source files.
|
|
72
|
+
- `pytest`: 114 passed.
|
|
73
|
+
- tag-version check: `v1.0.1`과 `pyproject.toml` version 일치.
|
|
74
|
+
- `uv build`: sdist/wheel build 성공.
|
|
75
|
+
- wheel smoke: 성공.
|
|
76
|
+
- Release workflow: `v1.0.1` success.
|
|
77
|
+
- PyPI latest: `gpu-usage-audit 1.0.1`.
|
|
78
|
+
|
|
79
|
+
## 1.0.1에서 바뀐 점
|
|
80
|
+
|
|
81
|
+
- `gua`를 documented command surface로 정리했다.
|
|
82
|
+
- `gua daemon`은 collector를 백그라운드로 시작한다.
|
|
83
|
+
- `gua daemon --foreground`는 systemd와 debugging 용도로 유지한다.
|
|
84
|
+
- `gua start`, `gua status`, `gua stop`을 추가했다.
|
|
85
|
+
- README의 install/run/report 예시는 `gua` 기준으로 정리됐다.
|
|
86
|
+
|
|
87
|
+
## 현재 cleanup 리뷰 결과
|
|
88
|
+
|
|
89
|
+
- `/tmp/gua.pid` 숫자만 믿고 `gua stop`이 SIGTERM을 보내면 PID 재사용 시 다른
|
|
90
|
+
프로세스를 건드릴 수 있다. pid가 실제 `python -m gpu_usage_audit daemon`
|
|
91
|
+
프로세스인지 확인해야 한다.
|
|
92
|
+
- §2 report가 `idle-held`와 `truly-idle`을 모두 "idle/waste"로 합쳐 보여주면
|
|
93
|
+
제품 메시지가 흐려진다. 사용자가 못 쓰는 용량과 실제 빈 용량을 분리해야 한다.
|
|
94
|
+
- §4 Top identities는 process row를 바로 세면 같은 사용자의 여러 프로세스가
|
|
95
|
+
같은 GPU/tick에서 과대계상될 수 있다. identity/GPU/tick 단위로 먼저 접어야 한다.
|
|
96
|
+
- report는 "sample"의 의미, threshold, `--interval` 의존성을 출력 자체에서 더
|
|
97
|
+
잘 설명해야 한다.
|
|
98
|
+
- NVML process list를 읽지 못하는 경우 low-util GPU가 `truly-idle`처럼 보일 수
|
|
99
|
+
있으므로 최소한 경고가 필요하다.
|
|
100
|
+
|
|
101
|
+
## 로컬 `doctor` 상태
|
|
102
|
+
|
|
103
|
+
현재 개발 머신은 NVIDIA host가 아니므로 `uv run gua doctor`는 `unsupported`가
|
|
104
|
+
정상 결과다.
|
|
105
|
+
|
|
106
|
+
관찰된 blocker:
|
|
107
|
+
|
|
108
|
+
- `/dev/nvidia*` 없음.
|
|
109
|
+
- `nvidia-smi`가 PATH에 없음.
|
|
110
|
+
- NVML init 실패: `libnvidia-ml.so.1` 없음.
|
|
111
|
+
- `/tmp/gua.db`가 이미 있어 daemon은 기본 경로로 시작하지 않음.
|
|
112
|
+
|
|
113
|
+
이 결과는 로컬 환경 한계이며, 제품 regression으로 보지 않는다.
|
|
114
|
+
|
|
115
|
+
## 다음 작업
|
|
116
|
+
|
|
117
|
+
1. 1.0.2 release prep PR에서 version, README release asset 예시, CHANGELOG를 갱신한다.
|
|
118
|
+
2. `uv run ruff check`, `uv run ruff format --check`, `uv run mypy`, `uv run pytest`,
|
|
119
|
+
`uv build`, wheel smoke, tag-version check를 다시 실행한다.
|
|
120
|
+
3. PR merge 후 `v1.0.2` tag를 push해 GitHub Release와 PyPI publish workflow를 실행한다.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "gpu-usage-audit"
|
|
3
|
-
version = "1.0.1"
|
|
3
|
+
version = "1.0.2"
|
|
4
4
|
description = "Single-host daemon that surfaces 'idle-held' NVIDIA GPU memory — the embarrassing category conventional dashboards miss."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = { file = "LICENSE" }
|
|
@@ -48,17 +48,17 @@ from .nvml import NVMLNotAvailableError, NVMLTier
|
|
|
48
48
|
from .render import (
|
|
49
49
|
render_headline,
|
|
50
50
|
render_heatmap,
|
|
51
|
+
render_idle_capacity,
|
|
51
52
|
render_per_gpu,
|
|
52
53
|
render_top_identities,
|
|
53
|
-
render_waste,
|
|
54
54
|
)
|
|
55
55
|
from .report import (
|
|
56
56
|
load_headline,
|
|
57
57
|
load_heatmap,
|
|
58
58
|
load_host,
|
|
59
|
+
load_idle_capacity,
|
|
59
60
|
load_per_gpu,
|
|
60
61
|
load_top_identities,
|
|
61
|
-
load_waste,
|
|
62
62
|
)
|
|
63
63
|
from .tier import FakeTier
|
|
64
64
|
|
|
@@ -137,7 +137,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
137
137
|
"--interval",
|
|
138
138
|
type=_duration,
|
|
139
139
|
default=timedelta(seconds=30),
|
|
140
|
-
help="Daemon tick interval — for §2
|
|
140
|
+
help="Daemon tick interval — for §2 Idle capacity / §4 time conversion [default: 30s]",
|
|
141
141
|
)
|
|
142
142
|
p_report.add_argument(
|
|
143
143
|
"--width",
|
|
@@ -206,7 +206,7 @@ def _add_report_args(parser: argparse.ArgumentParser) -> None:
|
|
|
206
206
|
"--interval",
|
|
207
207
|
type=_duration,
|
|
208
208
|
default=timedelta(seconds=30),
|
|
209
|
-
help="Daemon tick interval — for §2
|
|
209
|
+
help="Daemon tick interval — for §2 Idle capacity / §4 time conversion [default: 30s]",
|
|
210
210
|
)
|
|
211
211
|
parser.add_argument(
|
|
212
212
|
"--width",
|
|
@@ -355,10 +355,15 @@ def _cmd_gua_start(args: argparse.Namespace) -> int:
|
|
|
355
355
|
log_path = Path(args.log_file)
|
|
356
356
|
|
|
357
357
|
existing_pid = _read_pid(pid_path)
|
|
358
|
-
if existing_pid is not None and _pid_alive(existing_pid):
|
|
359
|
-
print(f"gua daemon: already running (pid {existing_pid})")
|
|
360
|
-
return 0
|
|
361
358
|
if existing_pid is not None:
|
|
359
|
+
if _pid_alive(existing_pid) and _pid_is_managed_daemon(existing_pid):
|
|
360
|
+
print(f"gua daemon: already running (pid {existing_pid})")
|
|
361
|
+
return 0
|
|
362
|
+
if _pid_alive(existing_pid):
|
|
363
|
+
print(
|
|
364
|
+
f"gua daemon: pid {existing_pid} belongs to another process; "
|
|
365
|
+
"clearing stale pid file"
|
|
366
|
+
)
|
|
362
367
|
_unlink_if_exists(pid_path)
|
|
363
368
|
|
|
364
369
|
if db_path.exists():
|
|
@@ -418,13 +423,20 @@ def _cmd_gua_status(args: argparse.Namespace) -> int:
|
|
|
418
423
|
if pid is None:
|
|
419
424
|
print("gua daemon: not running")
|
|
420
425
|
return 0
|
|
421
|
-
if _pid_alive(pid):
|
|
426
|
+
if _pid_alive(pid) and _pid_is_managed_daemon(pid):
|
|
422
427
|
print(f"gua daemon: running (pid {pid})")
|
|
423
428
|
print(f" pid file: {pid_path}")
|
|
424
429
|
print(f" log: {log_path}")
|
|
425
430
|
return 0
|
|
426
|
-
|
|
427
|
-
|
|
431
|
+
if _pid_alive(pid):
|
|
432
|
+
_unlink_if_exists(pid_path)
|
|
433
|
+
print(
|
|
434
|
+
f"gua daemon: not running (pid {pid} belongs to another process; "
|
|
435
|
+
"cleared stale pid file)"
|
|
436
|
+
)
|
|
437
|
+
else:
|
|
438
|
+
print(f"gua daemon: not running (stale pid {pid})")
|
|
439
|
+
_unlink_if_exists(pid_path)
|
|
428
440
|
return 0
|
|
429
441
|
|
|
430
442
|
|
|
@@ -438,7 +450,17 @@ def _cmd_gua_stop(args: argparse.Namespace) -> int:
|
|
|
438
450
|
_unlink_if_exists(pid_path)
|
|
439
451
|
print(f"gua daemon: not running (removed stale pid {pid})")
|
|
440
452
|
return 0
|
|
453
|
+
if not _pid_is_managed_daemon(pid):
|
|
454
|
+
_unlink_if_exists(pid_path)
|
|
455
|
+
print(
|
|
456
|
+
f"gua daemon: not running (pid {pid} belongs to another process; "
|
|
457
|
+
"cleared stale pid file)"
|
|
458
|
+
)
|
|
459
|
+
return 0
|
|
441
460
|
|
|
461
|
+
# The identity check above closes the common stale-PID-file case. A tiny
|
|
462
|
+
# check-then-kill race remains if the process exits and the OS reuses the
|
|
463
|
+
# PID before SIGTERM; avoiding that needs a stronger lock model.
|
|
442
464
|
try:
|
|
443
465
|
os.kill(pid, signal.SIGTERM)
|
|
444
466
|
except PermissionError:
|
|
@@ -525,12 +547,12 @@ def _cmd_report(args: argparse.Namespace) -> int:
|
|
|
525
547
|
cutoff = datetime.now(UTC) - args.since
|
|
526
548
|
host = load_host(conn)
|
|
527
549
|
headline = load_headline(conn, cutoff)
|
|
528
|
-
|
|
550
|
+
idle_capacity = load_idle_capacity(conn, cutoff, args.interval)
|
|
529
551
|
per_gpu = load_per_gpu(conn, cutoff)
|
|
530
552
|
top = load_top_identities(conn, cutoff, args.interval)
|
|
531
553
|
heat = load_heatmap(conn, cutoff)
|
|
532
554
|
render_headline(sys.stdout, host, headline, args.since, args.width)
|
|
533
|
-
|
|
555
|
+
render_idle_capacity(sys.stdout, idle_capacity)
|
|
534
556
|
render_per_gpu(sys.stdout, per_gpu)
|
|
535
557
|
render_top_identities(sys.stdout, top)
|
|
536
558
|
render_heatmap(sys.stdout, heat)
|
|
@@ -586,7 +608,7 @@ def _cmd_demo(args: argparse.Namespace) -> int:
|
|
|
586
608
|
cutoff = datetime.now(UTC) - window
|
|
587
609
|
loaded_host = load_host(conn)
|
|
588
610
|
render_headline(sys.stdout, loaded_host, load_headline(conn, cutoff), window, width=60)
|
|
589
|
-
|
|
611
|
+
render_idle_capacity(sys.stdout, load_idle_capacity(conn, cutoff, args.interval))
|
|
590
612
|
render_per_gpu(sys.stdout, load_per_gpu(conn, cutoff))
|
|
591
613
|
render_top_identities(sys.stdout, load_top_identities(conn, cutoff, args.interval))
|
|
592
614
|
render_heatmap(sys.stdout, load_heatmap(conn, cutoff))
|
|
@@ -677,6 +699,27 @@ def _pid_alive(pid: int) -> bool:
|
|
|
677
699
|
return True
|
|
678
700
|
|
|
679
701
|
|
|
702
|
+
def _pid_is_managed_daemon(pid: int) -> bool:
|
|
703
|
+
"""Return True for the subprocess shape created by `_cmd_gua_start`.
|
|
704
|
+
|
|
705
|
+
Keep this in sync with the spawn command in `_cmd_gua_start`; status/stop
|
|
706
|
+
use it to avoid acting on unrelated processes from stale PID files.
|
|
707
|
+
"""
|
|
708
|
+
args = _read_proc_cmdline(pid)
|
|
709
|
+
for i, arg in enumerate(args):
|
|
710
|
+
if arg == "-m" and args[i + 1 : i + 3] == ["gpu_usage_audit", "daemon"]:
|
|
711
|
+
return True
|
|
712
|
+
return False
|
|
713
|
+
|
|
714
|
+
|
|
715
|
+
def _read_proc_cmdline(pid: int) -> list[str]:
|
|
716
|
+
try:
|
|
717
|
+
raw = Path(f"/proc/{pid}/cmdline").read_bytes()
|
|
718
|
+
except OSError:
|
|
719
|
+
return []
|
|
720
|
+
return [part.decode("utf-8", errors="replace") for part in raw.split(b"\0") if part]
|
|
721
|
+
|
|
722
|
+
|
|
680
723
|
def _unlink_if_exists(path: Path) -> None:
|
|
681
724
|
with contextlib.suppress(FileNotFoundError):
|
|
682
725
|
path.unlink()
|
|
@@ -10,11 +10,14 @@ GPU 없는 개발/CI/demo 환경도 계속 동작해야 하므로 import/init
|
|
|
10
10
|
from __future__ import annotations
|
|
11
11
|
|
|
12
12
|
import contextlib
|
|
13
|
+
import logging
|
|
13
14
|
from datetime import datetime
|
|
14
15
|
from typing import Any
|
|
15
16
|
|
|
16
17
|
from .model import GPUSample, ProcSample, Snapshot
|
|
17
18
|
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
18
21
|
|
|
19
22
|
class NVMLNotAvailableError(RuntimeError):
|
|
20
23
|
"""pynvml 미설치 또는 NVML 초기화 실패. 사용자 facing 메시지로도 사용."""
|
|
@@ -59,6 +62,7 @@ class NVMLTier:
|
|
|
59
62
|
def __init__(self) -> None:
|
|
60
63
|
self._nvml: Any | None = None # pynvml ModuleType
|
|
61
64
|
self._initialized = False
|
|
65
|
+
self._process_list_warning_uuids: set[str] = set()
|
|
62
66
|
|
|
63
67
|
def __enter__(self) -> NVMLTier:
|
|
64
68
|
return self
|
|
@@ -97,7 +101,15 @@ class NVMLTier:
|
|
|
97
101
|
# 해당 카드의 process list 만 비우고 진행.
|
|
98
102
|
try:
|
|
99
103
|
running = nvml.nvmlDeviceGetComputeRunningProcesses(h)
|
|
100
|
-
except nvml.NVMLError:
|
|
104
|
+
except nvml.NVMLError as e:
|
|
105
|
+
if uuid not in self._process_list_warning_uuids:
|
|
106
|
+
logger.warning(
|
|
107
|
+
"NVML process list unavailable for %s; idle-held classification "
|
|
108
|
+
"may be understated: %s",
|
|
109
|
+
uuid,
|
|
110
|
+
e,
|
|
111
|
+
)
|
|
112
|
+
self._process_list_warning_uuids.add(uuid)
|
|
101
113
|
running = []
|
|
102
114
|
|
|
103
115
|
for p in running:
|