gpu-usage-audit 1.0.1__tar.gz → 1.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/CHANGELOG.md +14 -0
  2. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/PKG-INFO +23 -11
  3. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/README.md +22 -10
  4. gpu_usage_audit-1.0.2/projects/bare-metal-1.0/handoff.ko.md +83 -0
  5. gpu_usage_audit-1.0.2/projects/bare-metal-1.0/status.ko.md +120 -0
  6. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/pyproject.toml +1 -1
  7. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/__main__.py +56 -13
  8. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/nvml.py +13 -1
  9. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/render.py +23 -9
  10. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/report.py +70 -29
  11. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_render.py +25 -11
  12. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_report.py +38 -7
  13. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_smoke.py +79 -1
  14. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/uv.lock +1 -1
  15. gpu_usage_audit-1.0.1/projects/bare-metal-1.0/handoff.ko.md +0 -84
  16. gpu_usage_audit-1.0.1/projects/bare-metal-1.0/status.ko.md +0 -96
  17. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/.github/workflows/ci.yml +0 -0
  18. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/.github/workflows/release.yml +0 -0
  19. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/.gitignore +0 -0
  20. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/LICENSE +0 -0
  21. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/projects/bare-metal-1.0/plan.ko.md +0 -0
  22. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/scripts/check-tag-version.py +0 -0
  23. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/scripts/smoke-dist-wheel.sh +0 -0
  24. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/__init__.py +0 -0
  25. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/classify.py +0 -0
  26. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/daemon.py +0 -0
  27. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/db.py +0 -0
  28. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/doctor.py +0 -0
  29. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/identity.py +0 -0
  30. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/model.py +0 -0
  31. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/summarize.py +0 -0
  32. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/src/gpu_usage_audit/tier.py +0 -0
  33. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/__init__.py +0 -0
  34. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_classify.py +0 -0
  35. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_daemon.py +0 -0
  36. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_db.py +0 -0
  37. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_doctor.py +0 -0
  38. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_identity.py +0 -0
  39. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_nvml.py +0 -0
  40. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_summarize.py +0 -0
  41. {gpu_usage_audit-1.0.1 → gpu_usage_audit-1.0.2}/tests/test_tier.py +0 -0
@@ -1,5 +1,19 @@
1
1
  # Changelog
2
2
 
3
+ ## 1.0.2 - 2026-05-15
4
+
5
+ - Hardened `gua status` and `gua stop` so a stale PID file can no longer
6
+ cause them to act on an unrelated live process.
7
+ - Clarified report output by explaining sample units, classification rules,
8
+ interval-dependent GPU-hours, and heatmap density.
9
+ - Split §2 from generic "Waste" into idle-held capacity and truly-idle
10
+ capacity. The equivalent-GPU figures now use GPUs present in the report
11
+ window instead of the entire database.
12
+ - Made §4 Top identities aggregate by identity/GPU/tick before converting to
13
+ GPU-hours, so reports may show lower per-user GPU-hours when one user has
14
+ multiple processes on the same GPU at the same tick.
15
+ - Added a warning when the NVML process list cannot be read for a GPU, because idle-held usage may then be understated.
16
+
3
17
  ## 1.0.1 - 2026-05-15
4
18
 
5
19
  - Made `gua` the documented command surface for daemon, report, demo, and doctor output.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-usage-audit
3
- Version: 1.0.1
3
+ Version: 1.0.2
4
4
  Summary: Single-host daemon that surfaces 'idle-held' NVIDIA GPU memory — the embarrassing category conventional dashboards miss.
5
5
  Project-URL: Homepage, https://github.com/AI-Ocean/gpu-usage-audit
6
6
  Project-URL: Issues, https://github.com/AI-Ocean/gpu-usage-audit/issues
@@ -287,8 +287,8 @@ its `gua` / `gpu-usage-audit` commands.
287
287
  GitHub Release assets are also available for manual download:
288
288
 
289
289
  ```sh
290
- BASE="https://github.com/AI-Ocean/gpu-usage-audit/releases/download/v1.0.1"
291
- WHEEL="gpu_usage_audit-1.0.1-py3-none-any.whl"
290
+ BASE="https://github.com/AI-Ocean/gpu-usage-audit/releases/download/v1.0.2"
291
+ WHEEL="gpu_usage_audit-1.0.2-py3-none-any.whl"
292
292
 
293
293
  curl -fsSLO "$BASE/$WHEEL"
294
294
  curl -fsSLO "$BASE/SHA256SUMS"
@@ -304,26 +304,33 @@ $ gua report --since 1h --interval 30s
304
304
  gua — lab-a100 (bare, driver 560.35.05) Window: 1:00:00
305
305
 
306
306
  §1 Headline
307
+ basis: one sample = one GPU card at one daemon tick
308
+ rules: active >=10% util; idle-held <10% util with >100 MB process memory
307
309
  █████████▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒░░░░░░░░░░░░░░░░░░░░░░░░
308
310
  active █ 15.7%
309
311
  idle-held ▒ 45.1% ← this is the number conventional tools miss
310
312
  truly-idle ░ 39.2%
311
313
  (51 samples)
312
314
 
313
- §2 Waste
314
- ~0.43 GPU-hours idle, ~2.53 GPUs equivalently unused
315
+ §2 Idle capacity
316
+ converted from card-ticks to GPU-hours using the report --interval
317
+ idle-held: ~0.31 GPU-hours, ~1.53 GPUs equivalently unavailable
318
+ truly-idle: ~0.12 GPU-hours, ~1.00 GPUs equivalently free
315
319
 
316
320
  §3 Per-GPU
321
+ per-card share of samples in the same three states
317
322
  GPU-0 active 47.1% idle-held 35.3% truly-idle 17.6%
318
323
  GPU-1 active 0.0% idle-held 100.0% truly-idle 0.0%
319
324
  GPU-2 active 0.0% idle-held 0.0% truly-idle 100.0%
320
325
 
321
326
  §4 Top identities
322
- identity gpu-hours idle-held
323
- alice 0.42 42.9%
324
- bob 0.28 100.0%
327
+ one identity counts once per GPU/tick after its processes are summed
328
+ identity gpu-hours idle-held samples
329
+ alice 0.42 42.9% 51
330
+ bob 0.28 100.0% 34
325
331
 
326
332
  §5 Time-of-day heatmap (UTC)
333
+ darker means higher active share; blank means no samples
327
334
  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3
328
335
  Mon .
329
336
  ```
@@ -331,7 +338,10 @@ gua — lab-a100 (bare, driver 560.35.05) Window: 1:00:00
331
338
  The 3-bar collapses every card × every tick over the window into the
332
339
  active / idle-held / truly-idle split. **`idle-held` rows are the
333
340
  embarrassing category**: a process is holding GPU memory but the SM
334
- utilization is below 10%.
341
+ utilization is below 10%. §2 converts those card-ticks into GPU-hours
342
+ with `--interval`; §4 groups process rows by identity, GPU, and tick
343
+ before ranking users, so multiple same-user processes on one GPU/tick
344
+ count once.
335
345
 
336
346
  ## Demo (no GPU required)
337
347
 
@@ -408,7 +418,7 @@ point remains installed for compatibility, but new examples use `gua`.
408
418
  | -------- | ----------------------------------------------------------- |
409
419
  | `daemon` | Starts the collector in the background. Samples real NVML telemetry on every tick and writes to a new database. NVIDIA host required. |
410
420
  | `start` | Alias for `gua daemon`. |
411
- | `status` | Shows whether the background collector PID is still running. |
421
+ | `status` | Shows whether the background collector PID is still running. Also clears a stale PID file when it points to a missing or unrelated process. |
412
422
  | `stop` | Stops the background collector with SIGTERM. |
413
423
  | `report` | One-shot read against the accumulated database. Safe to run **while the daemon is still writing** — SQLite WAL mode handles the concurrency. |
414
424
  | `demo` | Self-contained showcase. Records N fake ticks and immediately prints the report. No GPU, no second shell, no operational meaning — just to see the output shape. |
@@ -436,6 +446,8 @@ By default, `gua daemon` returns after the collector starts. Each tick is
436
446
  written to the log file; on shutdown the cumulative row count is written
437
447
  there too. `gua daemon --foreground` prints the tick summaries directly
438
448
  to the terminal and exits on Ctrl+C, SIGTERM, or `systemctl stop`.
449
+ `gua status` and `gua stop` verify that the PID file points to the
450
+ managed collector before acting on it; stale PID files are cleared.
439
451
 
440
452
  ### `report`
441
453
 
@@ -450,7 +462,7 @@ gua report [--db PATH] [--since D] [--interval D] [--width N]
450
462
  of oldest sample), so passing a huge `--since` is the same as "all
451
463
  data". Units: `ms`, `s`, `m`, `h`, `d` (no `w`; use `7d`).
452
464
  - `--interval D` (default `30s`) — **must match what the daemon used**.
453
- This is how §2 (Waste) and §4 (Top identities) convert tick counts
465
+ This is how §2 (Idle capacity) and §4 (Top identities) convert tick counts
454
466
  to GPU-hours. Mismatched intervals → wrong GPU-hours.
455
467
  - `--width N` (default `60`) — width of the §1 three-bar in characters.
456
468
 
@@ -64,8 +64,8 @@ its `gua` / `gpu-usage-audit` commands.
64
64
  GitHub Release assets are also available for manual download:
65
65
 
66
66
  ```sh
67
- BASE="https://github.com/AI-Ocean/gpu-usage-audit/releases/download/v1.0.1"
68
- WHEEL="gpu_usage_audit-1.0.1-py3-none-any.whl"
67
+ BASE="https://github.com/AI-Ocean/gpu-usage-audit/releases/download/v1.0.2"
68
+ WHEEL="gpu_usage_audit-1.0.2-py3-none-any.whl"
69
69
 
70
70
  curl -fsSLO "$BASE/$WHEEL"
71
71
  curl -fsSLO "$BASE/SHA256SUMS"
@@ -81,26 +81,33 @@ $ gua report --since 1h --interval 30s
81
81
  gua — lab-a100 (bare, driver 560.35.05) Window: 1:00:00
82
82
 
83
83
  §1 Headline
84
+ basis: one sample = one GPU card at one daemon tick
85
+ rules: active >=10% util; idle-held <10% util with >100 MB process memory
84
86
  █████████▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒░░░░░░░░░░░░░░░░░░░░░░░░
85
87
  active █ 15.7%
86
88
  idle-held ▒ 45.1% ← this is the number conventional tools miss
87
89
  truly-idle ░ 39.2%
88
90
  (51 samples)
89
91
 
90
- §2 Waste
91
- ~0.43 GPU-hours idle, ~2.53 GPUs equivalently unused
92
+ §2 Idle capacity
93
+ converted from card-ticks to GPU-hours using the report --interval
94
+ idle-held: ~0.31 GPU-hours, ~1.53 GPUs equivalently unavailable
95
+ truly-idle: ~0.12 GPU-hours, ~1.00 GPUs equivalently free
92
96
 
93
97
  §3 Per-GPU
98
+ per-card share of samples in the same three states
94
99
  GPU-0 active 47.1% idle-held 35.3% truly-idle 17.6%
95
100
  GPU-1 active 0.0% idle-held 100.0% truly-idle 0.0%
96
101
  GPU-2 active 0.0% idle-held 0.0% truly-idle 100.0%
97
102
 
98
103
  §4 Top identities
99
- identity gpu-hours idle-held
100
- alice 0.42 42.9%
101
- bob 0.28 100.0%
104
+ one identity counts once per GPU/tick after its processes are summed
105
+ identity gpu-hours idle-held samples
106
+ alice 0.42 42.9% 51
107
+ bob 0.28 100.0% 34
102
108
 
103
109
  §5 Time-of-day heatmap (UTC)
110
+ darker means higher active share; blank means no samples
104
111
  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3
105
112
  Mon .
106
113
  ```
@@ -108,7 +115,10 @@ gua — lab-a100 (bare, driver 560.35.05) Window: 1:00:00
108
115
  The 3-bar collapses every card × every tick over the window into the
109
116
  active / idle-held / truly-idle split. **`idle-held` rows are the
110
117
  embarrassing category**: a process is holding GPU memory but the SM
111
- utilization is below 10%.
118
+ utilization is below 10%. §2 converts those card-ticks into GPU-hours
119
+ with `--interval`; §4 groups process rows by identity, GPU, and tick
120
+ before ranking users, so multiple same-user processes on one GPU/tick
121
+ count once.
112
122
 
113
123
  ## Demo (no GPU required)
114
124
 
@@ -185,7 +195,7 @@ point remains installed for compatibility, but new examples use `gua`.
185
195
  | -------- | ----------------------------------------------------------- |
186
196
  | `daemon` | Starts the collector in the background. Samples real NVML telemetry on every tick and writes to a new database. NVIDIA host required. |
187
197
  | `start` | Alias for `gua daemon`. |
188
- | `status` | Shows whether the background collector PID is still running. |
198
+ | `status` | Shows whether the background collector PID is still running. Also clears a stale PID file when it points to a missing or unrelated process. |
189
199
  | `stop` | Stops the background collector with SIGTERM. |
190
200
  | `report` | One-shot read against the accumulated database. Safe to run **while the daemon is still writing** — SQLite WAL mode handles the concurrency. |
191
201
  | `demo` | Self-contained showcase. Records N fake ticks and immediately prints the report. No GPU, no second shell, no operational meaning — just to see the output shape. |
@@ -213,6 +223,8 @@ By default, `gua daemon` returns after the collector starts. Each tick is
213
223
  written to the log file; on shutdown the cumulative row count is written
214
224
  there too. `gua daemon --foreground` prints the tick summaries directly
215
225
  to the terminal and exits on Ctrl+C, SIGTERM, or `systemctl stop`.
226
+ `gua status` and `gua stop` verify that the PID file points to the
227
+ managed collector before acting on it; stale PID files are cleared.
216
228
 
217
229
  ### `report`
218
230
 
@@ -227,7 +239,7 @@ gua report [--db PATH] [--since D] [--interval D] [--width N]
227
239
  of oldest sample), so passing a huge `--since` is the same as "all
228
240
  data". Units: `ms`, `s`, `m`, `h`, `d` (no `w`; use `7d`).
229
241
  - `--interval D` (default `30s`) — **must match what the daemon used**.
230
- This is how §2 (Waste) and §4 (Top identities) convert tick counts
242
+ This is how §2 (Idle capacity) and §4 (Top identities) convert tick counts
231
243
  to GPU-hours. Mismatched intervals → wrong GPU-hours.
232
244
  - `--width N` (default `60`) — width of the §1 three-bar in characters.
233
245
 
@@ -0,0 +1,83 @@
1
+ # Bare Metal 1.0 Handoff
2
+
3
+ 갱신일: 2026-05-15
4
+
5
+ ## 이어받을 때 먼저 볼 것
6
+
7
+ - `projects/bare-metal-1.0/status.ko.md`: 현재 완료 상태, 1.0.1 검증 결과, 1.0.2 release prep 상태.
8
+ - `README.md`: 실제 사용자 문서와 release/install/runbook/report 표면.
9
+ - `src/gpu_usage_audit/__main__.py`: `gua` CLI, background daemon lifecycle, PID handling.
10
+ - `src/gpu_usage_audit/report.py`: report SQL 집계.
11
+ - `src/gpu_usage_audit/render.py`: report 사람이 읽는 출력.
12
+ - `.github/workflows/release.yml`: tag release, GitHub Release, PyPI publish 경로.
13
+
14
+ ## 고정된 결정
15
+
16
+ - 1.0은 단일 로컬 베어메탈 NVIDIA 호스트만 본다.
17
+ - Kubernetes, Slurm, Docker/Podman fallback, remote node, cluster-wide report는 1.0 범위 밖이다.
18
+ - `nvidia-ml-py`는 기본 dependency다.
19
+ - `gpu-usage-audit[nvml]` extra는 compatibility를 위해 빈 alias로 남긴다.
20
+ - DB schema는 v1을 유지한다: `host`, `gpu_sample`, `proc_sample`.
21
+ - 기본 DB는 `/tmp/gua.db`다.
22
+ - `gua daemon`은 기본 백그라운드 실행이다.
23
+ - `gua daemon --foreground`는 systemd/debugging 용도다.
24
+ - `gua start`는 `gua daemon` alias다.
25
+ - `gua status`와 `gua stop`은 pid file 기반 background collector 관리용이다.
26
+ - `daemon`은 기존 DB 파일이 있으면 실패한다.
27
+ - `report`는 DB 파일이 없으면 실패한다.
28
+ - `daemon`과 `demo`는 host row의 `env_kind`를 항상 `"bare"`로 기록한다.
29
+ - auto-runtime proposal/project 문서는 삭제했다. Kubernetes/Slurm/Docker/Podman 확장을 다시
30
+ 시작하려면 새 proposal로 시작한다.
31
+
32
+ ## 현재 상태
33
+
34
+ - PR A: implemented in PR #9.
35
+ - PR B: implemented in PR #10.
36
+ - Post-1.0 cleanup: completed in PR #11.
37
+ - Bare-metal 1.0 release: completed in PR #12 and tag `v1.0.0`.
38
+ - 1.0.1 command surface/background daemon release: completed in PR #13 and tag `v1.0.1`.
39
+ - GitHub Release `v1.0.1`: published.
40
+ - PyPI `gpu-usage-audit 1.0.1`: published.
41
+ - NVIDIA host acceptance: 사용자가 실제 host에서 수집 정상 동작을 확인했다.
42
+ - 1.0.2 release prep: 진행 중. #14 lifecycle/report cleanup을 patch release로 배포한다.
43
+ package version은 `1.0.2`로 bump했고 local build/wheel smoke는 통과했다.
44
+
45
+ ## 마지막 로컬 검증
46
+
47
+ ```sh
48
+ uv run ruff check
49
+ uv run ruff format --check
50
+ uv run mypy
51
+ uv run pytest
52
+ uv build --out-dir /tmp/gua-dist-1.0.2-prep
53
+ bash scripts/smoke-dist-wheel.sh /tmp/gua-dist-1.0.2-prep/gpu_usage_audit-1.0.2-py3-none-any.whl
54
+ env GITHUB_REF_NAME=v1.0.2 uv run python scripts/check-tag-version.py
55
+ ```
56
+
57
+ 결과는 `pytest` 124 passed, `mypy` 25 source files, `ruff format` 26 files 기준이다.
58
+
59
+ ## 현재 cleanup PR 방향
60
+
61
+ - `/tmp/gua.pid`가 PID 재사용으로 다른 프로세스를 가리킬 수 있으므로 `status`/`stop` 전에
62
+ 해당 PID가 실제 managed `gpu_usage_audit daemon` 프로세스인지 확인한다.
63
+ - report §2는 low-util 전체를 "waste"로 합치지 말고 `idle-held`와 `truly-idle`을 분리한다.
64
+ - report §4는 process row가 아니라 identity/GPU/tick 단위로 먼저 접어서 사용자별 GPU-hours를 계산한다.
65
+ - report 출력 자체에 sample 의미, classification rule, `--interval` 의존성, heatmap 의미를 짧게 노출한다.
66
+ - NVML process list 조회 실패는 idle-held를 과소평가할 수 있으므로 warning으로 남긴다.
67
+ - 1.0.2 release prep에서는 package version, README release asset 예시, CHANGELOG를 `1.0.2`로 맞춘다.
68
+
69
+ ## 주의할 점
70
+
71
+ - 현재 로컬 개발 머신은 NVIDIA host가 아니다. `gua doctor`가 unsupported를 내는 것은 정상이다.
72
+ - `/tmp/gua.db`가 이미 존재한다. 기본 경로 daemon 실행이 거부되는 것은 기대 동작이다.
73
+ - `report --interval`은 daemon 수집 interval과 같아야 GPU-hours가 맞다.
74
+ - SQLite WAL sidecar(`*.db-wal`, `*.db-shm`)는 마지막 connection이 닫히면 정리된다.
75
+ - 1.0.2를 자를 경우 `env GITHUB_REF_NAME=v1.0.2 uv run python scripts/check-tag-version.py`가
76
+ 통과해야 한다.
77
+
78
+ ## 다음 세션 추천 순서
79
+
80
+ 1. `git status --short`로 사용자 변경 여부를 먼저 확인한다.
81
+ 2. cleanup PR의 CI 결과와 review comments를 확인한다.
82
+ 3. 필요하면 report wording을 실제 운영자가 읽기 쉬운 형태로 한 번 더 다듬는다.
83
+ 4. merge 후 patch release가 필요하면 version bump와 changelog를 별도 PR로 처리한다.
@@ -0,0 +1,120 @@
1
+ # Bare Metal 1.0 Status
2
+
3
+ 갱신일: 2026-05-15
4
+
5
+ ## 요약
6
+
7
+ Bare Metal 1.0은 단일 NVIDIA 베어메탈 호스트만 대상으로 하는 형태로 1.0.1까지
8
+ 릴리스됐고, 현재 1.0.2 release prep을 진행 중이다. `v1.0.1` GitHub Release와
9
+ PyPI publish는 완료됐고, 사용자가 실제 NVIDIA host에서 telemetry 수집이 정상
10
+ 동작하는 것도 확인했다.
11
+
12
+ 1.0.2 후보는 1.0.1 이후 코드 퀄리티 cleanup을 배포하기 위한 patch release다.
13
+ 주요 초점은 background daemon PID 안전성, report 의미 가시성, 내부 문서 정합성이다.
14
+
15
+ ## 구현 상태
16
+
17
+ | 영역 | 상태 | 메모 |
18
+ | --- | --- | --- |
19
+ | Scope reset | 완료 | Kubernetes/Slurm/Docker/remote runtime 표면 제거. |
20
+ | `gua doctor` | 완료 | 현재 머신의 `/dev/nvidia*`, `nvidia-smi -L`, NVML, DB path만 진단. |
21
+ | Packaging UX | 완료 | `nvidia-ml-py`가 기본 dependency이고 `nvml` extra는 빈 compatibility alias. |
22
+ | `gua` command surface | 완료 | `doctor`, `daemon`, `start`, `status`, `stop`, `report`, `demo` 제공. |
23
+ | Background daemon UX | 완료 | `gua daemon`은 기본 백그라운드 실행, `--foreground`는 systemd/debug용. |
24
+ | `daemon`/`report` DB UX | 완료 | 기본 DB는 `/tmp/gua.db`; daemon은 기존 DB를 거부하고 report는 없는 DB를 거부. |
25
+ | README bare-metal 문서 | 완료 | install, runbook, systemd 예시, 운영 notes가 1.0.2 기준. |
26
+ | Release | 진행 중 | package version은 `1.0.2`; local build/wheel smoke 완료, release prep PR과 tag publish가 남음. |
27
+ | NVIDIA host acceptance | 완료 | 실제 NVIDIA host에서 수집 정상 동작 확인. |
28
+
29
+ ## 마지막 확인 결과
30
+
31
+ 2026-05-15 1.0.2 release prep 로컬 검증:
32
+
33
+ ```sh
34
+ uv run ruff format --check
35
+ uv run ruff check
36
+ uv run mypy
37
+ uv run pytest
38
+ env GITHUB_REF_NAME=v1.0.2 uv run python scripts/check-tag-version.py
39
+ uv build --out-dir /tmp/gua-dist-1.0.2-prep
40
+ bash scripts/smoke-dist-wheel.sh /tmp/gua-dist-1.0.2-prep/gpu_usage_audit-1.0.2-py3-none-any.whl
41
+ ```
42
+
43
+ 결과:
44
+
45
+ - `ruff format --check`: 26 files already formatted.
46
+ - `ruff check`: pass.
47
+ - `mypy`: no issues in 25 source files.
48
+ - `pytest`: 124 passed.
49
+ - tag-version check: `v1.0.2`와 `pyproject.toml` version 일치.
50
+ - `uv build`: sdist/wheel build 성공.
51
+ - wheel smoke: 성공.
52
+
53
+ 2026-05-15 1.0.1 상태 확인:
54
+
55
+ ```sh
56
+ git status --short
57
+ uv run ruff check
58
+ uv run ruff format --check
59
+ uv run mypy
60
+ uv run pytest
61
+ env GITHUB_REF_NAME=v1.0.1 uv run python scripts/check-tag-version.py
62
+ uv build --out-dir /tmp/gua-dist-1.0.1-status
63
+ bash scripts/smoke-dist-wheel.sh /tmp/gua-dist-1.0.1-status/gpu_usage_audit-1.0.1-py3-none-any.whl
64
+ ```
65
+
66
+ 결과:
67
+
68
+ - 작업트리 clean.
69
+ - `ruff check`: pass.
70
+ - `ruff format --check`: 26 files already formatted.
71
+ - `mypy`: no issues in 25 source files.
72
+ - `pytest`: 114 passed.
73
+ - tag-version check: `v1.0.1`과 `pyproject.toml` version 일치.
74
+ - `uv build`: sdist/wheel build 성공.
75
+ - wheel smoke: 성공.
76
+ - Release workflow: `v1.0.1` success.
77
+ - PyPI latest: `gpu-usage-audit 1.0.1`.
78
+
79
+ ## 1.0.1에서 바뀐 점
80
+
81
+ - `gua`를 documented command surface로 정리했다.
82
+ - `gua daemon`은 collector를 백그라운드로 시작한다.
83
+ - `gua daemon --foreground`는 systemd와 debugging 용도로 유지한다.
84
+ - `gua start`, `gua status`, `gua stop`을 추가했다.
85
+ - README의 install/run/report 예시는 `gua` 기준으로 정리됐다.
86
+
87
+ ## 현재 cleanup 리뷰 결과
88
+
89
+ - `/tmp/gua.pid` 숫자만 믿고 `gua stop`이 SIGTERM을 보내면 PID 재사용 시 다른
90
+ 프로세스를 건드릴 수 있다. pid가 실제 `python -m gpu_usage_audit daemon`
91
+ 프로세스인지 확인해야 한다.
92
+ - §2 report가 `idle-held`와 `truly-idle`을 모두 "idle/waste"로 합쳐 보여주면
93
+ 제품 메시지가 흐려진다. 사용자가 못 쓰는 용량과 실제 빈 용량을 분리해야 한다.
94
+ - §4 Top identities는 process row를 바로 세면 같은 사용자의 여러 프로세스가
95
+ 같은 GPU/tick에서 과대계상될 수 있다. identity/GPU/tick 단위로 먼저 접어야 한다.
96
+ - report는 "sample"의 의미, threshold, `--interval` 의존성을 출력 자체에서 더
97
+ 잘 설명해야 한다.
98
+ - NVML process list를 읽지 못하는 경우 low-util GPU가 `truly-idle`처럼 보일 수
99
+ 있으므로 최소한 경고가 필요하다.
100
+
101
+ ## 로컬 `doctor` 상태
102
+
103
+ 현재 개발 머신은 NVIDIA host가 아니므로 `uv run gua doctor`는 `unsupported`가
104
+ 정상 결과다.
105
+
106
+ 관찰된 blocker:
107
+
108
+ - `/dev/nvidia*` 없음.
109
+ - `nvidia-smi`가 PATH에 없음.
110
+ - NVML init 실패: `libnvidia-ml.so.1` 없음.
111
+ - `/tmp/gua.db`가 이미 있어 daemon은 기본 경로로 시작하지 않음.
112
+
113
+ 이 결과는 로컬 환경 한계이며, 제품 regression으로 보지 않는다.
114
+
115
+ ## 다음 작업
116
+
117
+ 1. 1.0.2 release prep PR에서 version, README release asset 예시, CHANGELOG를 갱신한다.
118
+ 2. `uv run ruff check`, `uv run ruff format --check`, `uv run mypy`, `uv run pytest`,
119
+ `uv build`, wheel smoke, tag-version check를 다시 실행한다.
120
+ 3. PR merge 후 `v1.0.2` tag를 push해 GitHub Release와 PyPI publish workflow를 실행한다.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "gpu-usage-audit"
3
- version = "1.0.1"
3
+ version = "1.0.2"
4
4
  description = "Single-host daemon that surfaces 'idle-held' NVIDIA GPU memory — the embarrassing category conventional dashboards miss."
5
5
  readme = "README.md"
6
6
  license = { file = "LICENSE" }
@@ -48,17 +48,17 @@ from .nvml import NVMLNotAvailableError, NVMLTier
48
48
  from .render import (
49
49
  render_headline,
50
50
  render_heatmap,
51
+ render_idle_capacity,
51
52
  render_per_gpu,
52
53
  render_top_identities,
53
- render_waste,
54
54
  )
55
55
  from .report import (
56
56
  load_headline,
57
57
  load_heatmap,
58
58
  load_host,
59
+ load_idle_capacity,
59
60
  load_per_gpu,
60
61
  load_top_identities,
61
- load_waste,
62
62
  )
63
63
  from .tier import FakeTier
64
64
 
@@ -137,7 +137,7 @@ def build_parser() -> argparse.ArgumentParser:
137
137
  "--interval",
138
138
  type=_duration,
139
139
  default=timedelta(seconds=30),
140
- help="Daemon tick interval — for §2 Waste / §4 time conversion [default: 30s]",
140
+ help="Daemon tick interval — for §2 Idle capacity / §4 time conversion [default: 30s]",
141
141
  )
142
142
  p_report.add_argument(
143
143
  "--width",
@@ -206,7 +206,7 @@ def _add_report_args(parser: argparse.ArgumentParser) -> None:
206
206
  "--interval",
207
207
  type=_duration,
208
208
  default=timedelta(seconds=30),
209
- help="Daemon tick interval — for §2 Waste / §4 time conversion [default: 30s]",
209
+ help="Daemon tick interval — for §2 Idle capacity / §4 time conversion [default: 30s]",
210
210
  )
211
211
  parser.add_argument(
212
212
  "--width",
@@ -355,10 +355,15 @@ def _cmd_gua_start(args: argparse.Namespace) -> int:
355
355
  log_path = Path(args.log_file)
356
356
 
357
357
  existing_pid = _read_pid(pid_path)
358
- if existing_pid is not None and _pid_alive(existing_pid):
359
- print(f"gua daemon: already running (pid {existing_pid})")
360
- return 0
361
358
  if existing_pid is not None:
359
+ if _pid_alive(existing_pid) and _pid_is_managed_daemon(existing_pid):
360
+ print(f"gua daemon: already running (pid {existing_pid})")
361
+ return 0
362
+ if _pid_alive(existing_pid):
363
+ print(
364
+ f"gua daemon: pid {existing_pid} belongs to another process; "
365
+ "clearing stale pid file"
366
+ )
362
367
  _unlink_if_exists(pid_path)
363
368
 
364
369
  if db_path.exists():
@@ -418,13 +423,20 @@ def _cmd_gua_status(args: argparse.Namespace) -> int:
418
423
  if pid is None:
419
424
  print("gua daemon: not running")
420
425
  return 0
421
- if _pid_alive(pid):
426
+ if _pid_alive(pid) and _pid_is_managed_daemon(pid):
422
427
  print(f"gua daemon: running (pid {pid})")
423
428
  print(f" pid file: {pid_path}")
424
429
  print(f" log: {log_path}")
425
430
  return 0
426
- print(f"gua daemon: not running (stale pid {pid})")
427
- _unlink_if_exists(pid_path)
431
+ if _pid_alive(pid):
432
+ _unlink_if_exists(pid_path)
433
+ print(
434
+ f"gua daemon: not running (pid {pid} belongs to another process; "
435
+ "cleared stale pid file)"
436
+ )
437
+ else:
438
+ print(f"gua daemon: not running (stale pid {pid})")
439
+ _unlink_if_exists(pid_path)
428
440
  return 0
429
441
 
430
442
 
@@ -438,7 +450,17 @@ def _cmd_gua_stop(args: argparse.Namespace) -> int:
438
450
  _unlink_if_exists(pid_path)
439
451
  print(f"gua daemon: not running (removed stale pid {pid})")
440
452
  return 0
453
+ if not _pid_is_managed_daemon(pid):
454
+ _unlink_if_exists(pid_path)
455
+ print(
456
+ f"gua daemon: not running (pid {pid} belongs to another process; "
457
+ "cleared stale pid file)"
458
+ )
459
+ return 0
441
460
 
461
+ # The identity check above closes the common stale-PID-file case. A tiny
462
+ # check-then-kill race remains if the process exits and the OS reuses the
463
+ # PID before SIGTERM; avoiding that needs a stronger lock model.
442
464
  try:
443
465
  os.kill(pid, signal.SIGTERM)
444
466
  except PermissionError:
@@ -525,12 +547,12 @@ def _cmd_report(args: argparse.Namespace) -> int:
525
547
  cutoff = datetime.now(UTC) - args.since
526
548
  host = load_host(conn)
527
549
  headline = load_headline(conn, cutoff)
528
- waste = load_waste(conn, cutoff, args.interval)
550
+ idle_capacity = load_idle_capacity(conn, cutoff, args.interval)
529
551
  per_gpu = load_per_gpu(conn, cutoff)
530
552
  top = load_top_identities(conn, cutoff, args.interval)
531
553
  heat = load_heatmap(conn, cutoff)
532
554
  render_headline(sys.stdout, host, headline, args.since, args.width)
533
- render_waste(sys.stdout, waste)
555
+ render_idle_capacity(sys.stdout, idle_capacity)
534
556
  render_per_gpu(sys.stdout, per_gpu)
535
557
  render_top_identities(sys.stdout, top)
536
558
  render_heatmap(sys.stdout, heat)
@@ -586,7 +608,7 @@ def _cmd_demo(args: argparse.Namespace) -> int:
586
608
  cutoff = datetime.now(UTC) - window
587
609
  loaded_host = load_host(conn)
588
610
  render_headline(sys.stdout, loaded_host, load_headline(conn, cutoff), window, width=60)
589
- render_waste(sys.stdout, load_waste(conn, cutoff, args.interval))
611
+ render_idle_capacity(sys.stdout, load_idle_capacity(conn, cutoff, args.interval))
590
612
  render_per_gpu(sys.stdout, load_per_gpu(conn, cutoff))
591
613
  render_top_identities(sys.stdout, load_top_identities(conn, cutoff, args.interval))
592
614
  render_heatmap(sys.stdout, load_heatmap(conn, cutoff))
@@ -677,6 +699,27 @@ def _pid_alive(pid: int) -> bool:
677
699
  return True
678
700
 
679
701
 
702
+ def _pid_is_managed_daemon(pid: int) -> bool:
703
+ """Return True for the subprocess shape created by `_cmd_gua_start`.
704
+
705
+ Keep this in sync with the spawn command in `_cmd_gua_start`; status/stop
706
+ use it to avoid acting on unrelated processes from stale PID files.
707
+ """
708
+ args = _read_proc_cmdline(pid)
709
+ for i, arg in enumerate(args):
710
+ if arg == "-m" and args[i + 1 : i + 3] == ["gpu_usage_audit", "daemon"]:
711
+ return True
712
+ return False
713
+
714
+
715
+ def _read_proc_cmdline(pid: int) -> list[str]:
716
+ try:
717
+ raw = Path(f"/proc/{pid}/cmdline").read_bytes()
718
+ except OSError:
719
+ return []
720
+ return [part.decode("utf-8", errors="replace") for part in raw.split(b"\0") if part]
721
+
722
+
680
723
  def _unlink_if_exists(path: Path) -> None:
681
724
  with contextlib.suppress(FileNotFoundError):
682
725
  path.unlink()
@@ -10,11 +10,14 @@ GPU 없는 개발/CI/demo 환경도 계속 동작해야 하므로 import/init
10
10
  from __future__ import annotations
11
11
 
12
12
  import contextlib
13
+ import logging
13
14
  from datetime import datetime
14
15
  from typing import Any
15
16
 
16
17
  from .model import GPUSample, ProcSample, Snapshot
17
18
 
19
+ logger = logging.getLogger(__name__)
20
+
18
21
 
19
22
  class NVMLNotAvailableError(RuntimeError):
20
23
  """pynvml 미설치 또는 NVML 초기화 실패. 사용자 facing 메시지로도 사용."""
@@ -59,6 +62,7 @@ class NVMLTier:
59
62
  def __init__(self) -> None:
60
63
  self._nvml: Any | None = None # pynvml ModuleType
61
64
  self._initialized = False
65
+ self._process_list_warning_uuids: set[str] = set()
62
66
 
63
67
  def __enter__(self) -> NVMLTier:
64
68
  return self
@@ -97,7 +101,15 @@ class NVMLTier:
97
101
  # 해당 카드의 process list 만 비우고 진행.
98
102
  try:
99
103
  running = nvml.nvmlDeviceGetComputeRunningProcesses(h)
100
- except nvml.NVMLError:
104
+ except nvml.NVMLError as e:
105
+ if uuid not in self._process_list_warning_uuids:
106
+ logger.warning(
107
+ "NVML process list unavailable for %s; idle-held classification "
108
+ "may be understated: %s",
109
+ uuid,
110
+ e,
111
+ )
112
+ self._process_list_warning_uuids.add(uuid)
101
113
  running = []
102
114
 
103
115
  for p in running: