vramtop 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vramtop-0.1.0/.github/workflows/ci.yml +50 -0
- vramtop-0.1.0/.github/workflows/publish.yml +56 -0
- vramtop-0.1.0/.gitignore +11 -0
- vramtop-0.1.0/CLAUDE.md +283 -0
- vramtop-0.1.0/COMPLIANCE_REPORT.md +409 -0
- vramtop-0.1.0/DESIGN_DOC.md +738 -0
- vramtop-0.1.0/LICENSE +21 -0
- vramtop-0.1.0/PKG-INFO +204 -0
- vramtop-0.1.0/README.md +167 -0
- vramtop-0.1.0/docker/Dockerfile.gpu-workload +18 -0
- vramtop-0.1.0/docker/docker-compose.gpu.yml +50 -0
- vramtop-0.1.0/docker/gpu_workload.py +62 -0
- vramtop-0.1.0/pyproject.toml +80 -0
- vramtop-0.1.0/scripts/train_test.py +226 -0
- vramtop-0.1.0/src/vramtop/__init__.py +20 -0
- vramtop-0.1.0/src/vramtop/__main__.py +5 -0
- vramtop-0.1.0/src/vramtop/analysis/__init__.py +1 -0
- vramtop-0.1.0/src/vramtop/analysis/breakdown.py +63 -0
- vramtop-0.1.0/src/vramtop/analysis/oom_predictor.py +232 -0
- vramtop-0.1.0/src/vramtop/analysis/pelt_detector.py +160 -0
- vramtop-0.1.0/src/vramtop/analysis/phase_detector.py +108 -0
- vramtop-0.1.0/src/vramtop/analysis/segment_labels.py +573 -0
- vramtop-0.1.0/src/vramtop/analysis/survival.py +393 -0
- vramtop-0.1.0/src/vramtop/analysis/trends.py +79 -0
- vramtop-0.1.0/src/vramtop/backends/__init__.py +40 -0
- vramtop-0.1.0/src/vramtop/backends/base.py +111 -0
- vramtop-0.1.0/src/vramtop/backends/nvidia.py +298 -0
- vramtop-0.1.0/src/vramtop/cli.py +160 -0
- vramtop-0.1.0/src/vramtop/config.py +199 -0
- vramtop-0.1.0/src/vramtop/enrichment/__init__.py +232 -0
- vramtop-0.1.0/src/vramtop/enrichment/container.py +162 -0
- vramtop-0.1.0/src/vramtop/enrichment/deep_mode.py +127 -0
- vramtop-0.1.0/src/vramtop/enrichment/detector.py +97 -0
- vramtop-0.1.0/src/vramtop/enrichment/model_files.py +73 -0
- vramtop-0.1.0/src/vramtop/enrichment/mps.py +61 -0
- vramtop-0.1.0/src/vramtop/enrichment/scraper.py +225 -0
- vramtop-0.1.0/src/vramtop/enrichment/scrapers/__init__.py +87 -0
- vramtop-0.1.0/src/vramtop/enrichment/scrapers/llamacpp.py +60 -0
- vramtop-0.1.0/src/vramtop/enrichment/scrapers/ollama.py +69 -0
- vramtop-0.1.0/src/vramtop/enrichment/scrapers/sglang.py +47 -0
- vramtop-0.1.0/src/vramtop/enrichment/scrapers/vllm.py +57 -0
- vramtop-0.1.0/src/vramtop/export/__init__.py +39 -0
- vramtop-0.1.0/src/vramtop/export/csv_logger.py +105 -0
- vramtop-0.1.0/src/vramtop/export/screenshot.py +40 -0
- vramtop-0.1.0/src/vramtop/permissions.py +22 -0
- vramtop-0.1.0/src/vramtop/process_identity.py +22 -0
- vramtop-0.1.0/src/vramtop/reporter/__init__.py +3 -0
- vramtop-0.1.0/src/vramtop/reporter/protocol.py +77 -0
- vramtop-0.1.0/src/vramtop/reporter/pytorch.py +149 -0
- vramtop-0.1.0/src/vramtop/sanitize.py +24 -0
- vramtop-0.1.0/src/vramtop/secrets.py +52 -0
- vramtop-0.1.0/src/vramtop/ui/__init__.py +1 -0
- vramtop-0.1.0/src/vramtop/ui/app.py +844 -0
- vramtop-0.1.0/src/vramtop/ui/styles.tcss +81 -0
- vramtop-0.1.0/src/vramtop/ui/themes/__init__.py +60 -0
- vramtop-0.1.0/src/vramtop/ui/themes/catppuccin.tcss +83 -0
- vramtop-0.1.0/src/vramtop/ui/themes/dark.tcss +85 -0
- vramtop-0.1.0/src/vramtop/ui/themes/dracula.tcss +83 -0
- vramtop-0.1.0/src/vramtop/ui/themes/light.tcss +83 -0
- vramtop-0.1.0/src/vramtop/ui/themes/nord.tcss +83 -0
- vramtop-0.1.0/src/vramtop/ui/themes/solarized.tcss +83 -0
- vramtop-0.1.0/src/vramtop/ui/widgets/__init__.py +1 -0
- vramtop-0.1.0/src/vramtop/ui/widgets/alerts.py +53 -0
- vramtop-0.1.0/src/vramtop/ui/widgets/detail_panel.py +365 -0
- vramtop-0.1.0/src/vramtop/ui/widgets/gpu_card.py +165 -0
- vramtop-0.1.0/src/vramtop/ui/widgets/kill_dialog.py +381 -0
- vramtop-0.1.0/src/vramtop/ui/widgets/memory_bar.py +98 -0
- vramtop-0.1.0/src/vramtop/ui/widgets/memory_chart.py +248 -0
- vramtop-0.1.0/src/vramtop/ui/widgets/phase_badge.py +76 -0
- vramtop-0.1.0/src/vramtop/ui/widgets/process_table.py +138 -0
- vramtop-0.1.0/src/vramtop/ui/widgets/space_bg.py +68 -0
- vramtop-0.1.0/src/vramtop/ui/widgets/timeline.py +78 -0
- vramtop-0.1.0/tests/__init__.py +0 -0
- vramtop-0.1.0/tests/conftest.py +190 -0
- vramtop-0.1.0/tests/fixtures/__init__.py +0 -0
- vramtop-0.1.0/tests/fixtures/nvml_responses.py +138 -0
- vramtop-0.1.0/tests/integration/__init__.py +0 -0
- vramtop-0.1.0/tests/integration/test_cli_smoke.py +158 -0
- vramtop-0.1.0/tests/integration/test_gpu_pytorch.py +409 -0
- vramtop-0.1.0/tests/integration/test_multi_container.py +104 -0
- vramtop-0.1.0/tests/test_analysis_breakdown.py +87 -0
- vramtop-0.1.0/tests/test_config.py +329 -0
- vramtop-0.1.0/tests/test_deep_mode.py +237 -0
- vramtop-0.1.0/tests/test_enrichment_container.py +203 -0
- vramtop-0.1.0/tests/test_enrichment_detector.py +142 -0
- vramtop-0.1.0/tests/test_enrichment_model_files.py +116 -0
- vramtop-0.1.0/tests/test_enrichment_mps.py +99 -0
- vramtop-0.1.0/tests/test_export_csv.py +172 -0
- vramtop-0.1.0/tests/test_export_screenshot.py +108 -0
- vramtop-0.1.0/tests/test_kill_dialog.py +342 -0
- vramtop-0.1.0/tests/test_labels_realworld.py +534 -0
- vramtop-0.1.0/tests/test_memory_chart.py +228 -0
- vramtop-0.1.0/tests/test_multi_container.py +340 -0
- vramtop-0.1.0/tests/test_nvidia_backend.py +292 -0
- vramtop-0.1.0/tests/test_oom_predictor.py +165 -0
- vramtop-0.1.0/tests/test_pelt_detector.py +295 -0
- vramtop-0.1.0/tests/test_permissions.py +48 -0
- vramtop-0.1.0/tests/test_phase_detector.py +96 -0
- vramtop-0.1.0/tests/test_pid_namespace.py +153 -0
- vramtop-0.1.0/tests/test_process_identity.py +65 -0
- vramtop-0.1.0/tests/test_property_oom.py +91 -0
- vramtop-0.1.0/tests/test_property_phase.py +57 -0
- vramtop-0.1.0/tests/test_property_sanitize.py +39 -0
- vramtop-0.1.0/tests/test_reporter_pytorch.py +187 -0
- vramtop-0.1.0/tests/test_sanitize.py +67 -0
- vramtop-0.1.0/tests/test_scraper_security.py +381 -0
- vramtop-0.1.0/tests/test_scrapers_llamacpp.py +73 -0
- vramtop-0.1.0/tests/test_scrapers_ollama.py +96 -0
- vramtop-0.1.0/tests/test_scrapers_sglang.py +67 -0
- vramtop-0.1.0/tests/test_scrapers_vllm.py +85 -0
- vramtop-0.1.0/tests/test_security_boundaries.py +117 -0
- vramtop-0.1.0/tests/test_segment_labels.py +545 -0
- vramtop-0.1.0/tests/test_survival.py +882 -0
- vramtop-0.1.0/tests/test_tui.py +137 -0
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
lint:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
- uses: actions/setup-python@v5
|
|
15
|
+
with:
|
|
16
|
+
python-version: "3.12"
|
|
17
|
+
- run: pip install -e ".[dev]"
|
|
18
|
+
- run: ruff check src/
|
|
19
|
+
- run: mypy --strict src/vramtop/
|
|
20
|
+
|
|
21
|
+
test-unit:
|
|
22
|
+
runs-on: ubuntu-latest
|
|
23
|
+
steps:
|
|
24
|
+
- uses: actions/checkout@v4
|
|
25
|
+
- uses: actions/setup-python@v5
|
|
26
|
+
with:
|
|
27
|
+
python-version: "3.12"
|
|
28
|
+
- run: pip install -e ".[dev]"
|
|
29
|
+
- run: pytest tests/ -m "not gpu and not docker" -v
|
|
30
|
+
|
|
31
|
+
test-property:
|
|
32
|
+
runs-on: ubuntu-latest
|
|
33
|
+
steps:
|
|
34
|
+
- uses: actions/checkout@v4
|
|
35
|
+
- uses: actions/setup-python@v5
|
|
36
|
+
with:
|
|
37
|
+
python-version: "3.12"
|
|
38
|
+
- run: pip install -e ".[dev]"
|
|
39
|
+
- run: pytest tests/test_property_*.py --hypothesis-show-statistics -v
|
|
40
|
+
|
|
41
|
+
security:
|
|
42
|
+
runs-on: ubuntu-latest
|
|
43
|
+
steps:
|
|
44
|
+
- uses: actions/checkout@v4
|
|
45
|
+
- uses: actions/setup-python@v5
|
|
46
|
+
with:
|
|
47
|
+
python-version: "3.12"
|
|
48
|
+
- run: pip install -e ".[dev]"
|
|
49
|
+
- run: bandit -r src/ -ll
|
|
50
|
+
- run: pip-audit
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
lint:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
- uses: actions/setup-python@v5
|
|
14
|
+
with:
|
|
15
|
+
python-version: "3.12"
|
|
16
|
+
- run: pip install -e ".[dev]"
|
|
17
|
+
- run: ruff check src/
|
|
18
|
+
- run: mypy --strict src/vramtop/
|
|
19
|
+
|
|
20
|
+
test:
|
|
21
|
+
runs-on: ubuntu-latest
|
|
22
|
+
steps:
|
|
23
|
+
- uses: actions/checkout@v4
|
|
24
|
+
- uses: actions/setup-python@v5
|
|
25
|
+
with:
|
|
26
|
+
python-version: "3.12"
|
|
27
|
+
- run: pip install -e ".[dev]"
|
|
28
|
+
- run: pytest tests/ -m "not gpu and not docker" -v
|
|
29
|
+
|
|
30
|
+
security:
|
|
31
|
+
runs-on: ubuntu-latest
|
|
32
|
+
steps:
|
|
33
|
+
- uses: actions/checkout@v4
|
|
34
|
+
- uses: actions/setup-python@v5
|
|
35
|
+
with:
|
|
36
|
+
python-version: "3.12"
|
|
37
|
+
- run: pip install -e ".[dev]"
|
|
38
|
+
- run: bandit -r src/ -ll
|
|
39
|
+
- run: pip-audit
|
|
40
|
+
|
|
41
|
+
publish:
|
|
42
|
+
needs: [lint, test, security]
|
|
43
|
+
runs-on: ubuntu-latest
|
|
44
|
+
permissions:
|
|
45
|
+
id-token: write
|
|
46
|
+
steps:
|
|
47
|
+
- uses: actions/checkout@v4
|
|
48
|
+
- uses: actions/setup-python@v5
|
|
49
|
+
with:
|
|
50
|
+
python-version: "3.12"
|
|
51
|
+
- run: pip install build
|
|
52
|
+
- run: python -m build
|
|
53
|
+
- name: Publish to PyPI
|
|
54
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
55
|
+
with:
|
|
56
|
+
password: ${{ secrets.PYPI_API_TOKEN }}
|
vramtop-0.1.0/.gitignore
ADDED
vramtop-0.1.0/CLAUDE.md
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
# vramtop — Implementation Progress
|
|
2
|
+
|
|
3
|
+
## Project Overview
|
|
4
|
+
vramtop is "the htop for GPU memory" — an NVIDIA-first GPU memory monitoring TUI.
|
|
5
|
+
See `DESIGN_DOC.md` for the full architecture and design specification.
|
|
6
|
+
See `COMPLIANCE_REPORT.md` for detailed design-doc compliance audit.
|
|
7
|
+
|
|
8
|
+
## Build/Test Commands
|
|
9
|
+
```bash
|
|
10
|
+
pip install -e ".[dev]" # Install with dev deps
|
|
11
|
+
pytest tests/ -m "not gpu and not docker" -v  # Unit tests (no GPU or Docker needed; same marker filter as CI)
|
|
12
|
+
pytest tests/integration/ -v # GPU integration tests (requires NVIDIA GPU)
|
|
13
|
+
pytest tests/test_property_*.py --hypothesis-show-statistics -v # Property tests
|
|
14
|
+
mypy --strict src/vramtop/ # Type checking
|
|
15
|
+
ruff check src/ # Linting
|
|
16
|
+
bandit -r src/ # Security scan
|
|
17
|
+
python -m vramtop --help # CLI help
|
|
18
|
+
python -m vramtop --version # Version check
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Quality Gate Status (current)
|
|
22
|
+
- **477 unit tests + 20 integration tests = 497 total** (0 failures)
|
|
23
|
+
- **mypy --strict**: 0 errors (51 source files)
|
|
24
|
+
- **ruff check**: All checks passed
|
|
25
|
+
- **bandit**: 0 high/medium
|
|
26
|
+
- **CLI**: `vramtop 0.1.0` working
|
|
27
|
+
|
|
28
|
+
## Implementation Status
|
|
29
|
+
|
|
30
|
+
### Phase 1: Core — COMPLETE
|
|
31
|
+
- [x] Commit 1: Project scaffolding (pyproject.toml, package stubs, CI, LICENSE)
|
|
32
|
+
- [x] Commit 2: Core data types (ProcessIdentity, GPUProcess, GPUDevice, MemorySnapshot), backend ABC, exceptions
|
|
33
|
+
- [x] Commit 3: Utility modules (sanitize.py, permissions.py, process_identity.py, secrets.py) + tests + Hypothesis property tests
|
|
34
|
+
- [x] Commit 4: NVIDIA backend (NVMLClient) with compute+graphics merge, retry logic, fixtures + tests
|
|
35
|
+
- [x] Commit 5: Phase detector (variance-threshold + hysteresis), OOM predictor (range-based), trends + tests + Hypothesis
|
|
36
|
+
- [x] Commit 6: Pydantic config with SIGHUP reload + tests
|
|
37
|
+
- [x] Commit 7: TUI (GPU cards, memory bar, sparkline timeline, process table, phase badges, OOM alerts, CLI) + pilot tests
|
|
38
|
+
|
|
39
|
+
### Phase 2: Intelligence — COMPLETE
|
|
40
|
+
- [x] Commit 8: Enrichment data model (EnrichmentResult, ModelFileInfo) + orchestrator
|
|
41
|
+
- [x] Commit 9: Framework detection (cmdline + maps patterns, 30s TTL cache) + model file scanning
|
|
42
|
+
- [x] Commit 10: Container detection (Docker/Podman/cgroup) + MPS daemon detection
|
|
43
|
+
- [x] Commit 11: Memory breakdown estimation + OOM predictor rate range enhancement
|
|
44
|
+
- [x] Commit 12: Detail panel (slide-in process info) + kill dialog (SIGTERM->SIGKILL, audit log)
|
|
45
|
+
- [x] Commit 13: Wired enrichment into app poll loop + enrichment tests
|
|
46
|
+
- [x] Commit 14: Security boundary tests (UID enforcement, sanitize, audit permissions)
|
|
47
|
+
|
|
48
|
+
### Phase 3: Beauty — COMPLETE
|
|
49
|
+
- [x] Commit 15: Theme system + 6 themes (dark, light, nord, catppuccin, dracula, solarized)
|
|
50
|
+
- [x] Commit 16: Layout modes (FULL/COMPACT/MINI) + terminal size auto-detection
|
|
51
|
+
- [x] Commit 17: Accessibility (NO_COLOR, --accessible, text fallbacks)
|
|
52
|
+
- [x] Commit 18: Visual polish (number formatting, truncation, loading states, footer bar, theme cycling with 't')
|
|
53
|
+
|
|
54
|
+
### Phase 4: Scraping — COMPLETE (MVP DONE)
|
|
55
|
+
- [x] Commit 19: Base HTTP scraper with 5 security rules (localhost-only, port verification, rate limit, no redirects, size limit)
|
|
56
|
+
- [x] Commit 20: Framework scrapers (vLLM, Ollama, SGLang, llama.cpp)
|
|
57
|
+
- [x] Commit 21: Wire scrapers into enrichment pipeline
|
|
58
|
+
- [x] Commit 22: Scraper security tests (25 tests) + per-framework parsing tests (30 tests)
|
|
59
|
+
- [x] Commit 23: Final quality gate — all checks clean
|
|
60
|
+
|
|
61
|
+
### Phase 5: Export & Polish — COMPLETE
|
|
62
|
+
- [x] CSV export (`--export-csv FILE` CLI flag, `export/csv_logger.py`)
|
|
63
|
+
- [x] SVG screenshot (`s` key, saves to `~/.local/share/vramtop/screenshots/`)
|
|
64
|
+
- [x] Memory reporting fix: uses process-sum for used (v1 API), shows reserved as separate bar segment
|
|
65
|
+
- [x] GPU integration tests (14 tests with real PyTorch + NVML on RTX 2000 Ada)
|
|
66
|
+
- [x] CLI smoke tests (6 tests)
|
|
67
|
+
- [x] Design doc compliance audit (see COMPLIANCE_REPORT.md)
|
|
68
|
+
- [x] Survival predictor with KV cache + scrape-data awareness (`analysis/survival.py`)
|
|
69
|
+
- [x] PELT changepoint detection (`analysis/pelt_detector.py`, optional `ruptures` dependency)
|
|
70
|
+
- [x] Deep mode Unix socket IPC (`enrichment/deep_mode.py`, `reporter/protocol.py`, `reporter/pytorch.py`)
|
|
71
|
+
|
|
72
|
+
### Phase 6: Hardening & UI — IN PROGRESS
|
|
73
|
+
- [x] Fix survival predictor blind stable→OK (absolute headroom + spike detection)
|
|
74
|
+
- [x] Fix scraping never invoked (pass `scraping_config` to `enrich_process()`)
|
|
75
|
+
- [x] Live detail panel (auto-refreshes every poll cycle, retains data on process exit with [EXITED] tag)
|
|
76
|
+
- [x] Direct deep mode query in detail panel (bypasses enrichment cache for live PyTorch internals)
|
|
77
|
+
- [x] Space background (trig-based dot pattern via `SpaceScroll.render()`)
|
|
78
|
+
- [x] UI polish: gradient memory bar, color-coded sparkline, safe ASCII GPU header, GitHub-dark theme
|
|
79
|
+
- [x] Enrichment cache TTL tuned to 10s (static /proc data) with live bypass for deep mode
|
|
80
|
+
- [x] Docker PID namespace resolution (`_resolve_pid()` in enrichment, kill dialog, deep mode)
|
|
81
|
+
- [x] PELT wired into app (per-process timeseries accumulation, always-on in detail panel)
|
|
82
|
+
- [x] Labeled PELT segments with memory chart (`analysis/segment_labels.py`, `ui/widgets/memory_chart.py`)
|
|
83
|
+
- [ ] Pre-launch OOM risk score (predict OOM before memory fills, not just after)
|
|
84
|
+
|
|
85
|
+
## Architecture (key files)
|
|
86
|
+
```
|
|
87
|
+
src/vramtop/
|
|
88
|
+
├── backends/base.py # Data types, ABC, exceptions
|
|
89
|
+
├── backends/nvidia.py # NVMLClient (compute+graphics merge, v1/v2 memory handling)
|
|
90
|
+
├── analysis/phase_detector.py # Variance-threshold phase detection
|
|
91
|
+
├── analysis/pelt_detector.py # PELT changepoint detection (optional ruptures dependency)
|
|
92
|
+
├── analysis/segment_labels.py # 18 model-agnostic segment labels, two-pass heuristic labeling
|
|
93
|
+
├── analysis/oom_predictor.py # Range-based OOM prediction (GPU-level)
|
|
94
|
+
├── analysis/survival.py # Per-process survival predictor (OK/TIGHT/OOM verdicts)
|
|
95
|
+
├── analysis/breakdown.py # Weight vs dynamic estimation
|
|
96
|
+
├── analysis/trends.py # EMA allocation rate tracker
|
|
97
|
+
├── enrichment/__init__.py # Enrichment orchestrator (framework, model, container, scraping, deep mode)
|
|
98
|
+
├── enrichment/detector.py # Framework detection from /proc (JAX before PyTorch)
|
|
99
|
+
├── enrichment/model_files.py # Model file scanning from /proc/fd
|
|
100
|
+
├── enrichment/container.py # Docker/Podman detection
|
|
101
|
+
├── enrichment/mps.py # MPS daemon detection
|
|
102
|
+
├── enrichment/deep_mode.py # Unix socket IPC discovery + enrichment
|
|
103
|
+
├── enrichment/scraper.py # Base HTTP scraper (5 security rules)
|
|
104
|
+
├── enrichment/scrapers/ # vLLM, Ollama, SGLang, llama.cpp scrapers
|
|
105
|
+
├── ui/app.py # Main Textual app (layout modes, theme cycling, screenshot, live detail panel)
|
|
106
|
+
├── ui/widgets/memory_bar.py # 3-segment gradient bar: used | reserved | free
|
|
107
|
+
├── ui/widgets/timeline.py # Color-gradient sparkline (green→yellow→red)
|
|
108
|
+
├── ui/widgets/gpu_card.py # GPU card (header, memory bar, timeline, process table, OOM alert)
|
|
109
|
+
├── ui/widgets/process_table.py # Process table with phase badges + survival verdicts
|
|
110
|
+
├── ui/widgets/detail_panel.py # Slide-in panel (live-updating, PELT chart+segments, deep mode, [EXITED] retention)
|
|
111
|
+
├── ui/widgets/memory_chart.py # Sparkline + segment color bar + human-readable summary
|
|
112
|
+
├── ui/widgets/kill_dialog.py # Kill dialog (SIGTERM→SIGKILL, audit logging, Docker PID resolution)
|
|
113
|
+
├── ui/widgets/space_bg.py # SpaceScroll container with trig-based dot background
|
|
114
|
+
├── ui/themes/ # 6 theme TCSS files (dark = GitHub-dark palette)
|
|
115
|
+
├── config.py # Pydantic config + SIGHUP reload
|
|
116
|
+
├── sanitize.py # ANSI/control char stripping (idempotent)
|
|
117
|
+
├── permissions.py # UID checks
|
|
118
|
+
├── process_identity.py # (PID, starttime) from /proc/pid/stat
|
|
119
|
+
├── secrets.py # Env var + 0600 file resolution
|
|
120
|
+
├── export/__init__.py # ExportManager (CSV)
|
|
121
|
+
├── export/csv_logger.py # Thread-safe CSV writer
|
|
122
|
+
├── export/screenshot.py # SVG screenshot via Textual
|
|
123
|
+
├── reporter/__init__.py # Deep mode reporter package
|
|
124
|
+
├── reporter/protocol.py # Wire protocol (HandshakeMsg, MemoryMsg)
|
|
125
|
+
└── reporter/pytorch.py # PyTorch reporter daemon thread
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Key Design Decisions
|
|
129
|
+
- Process identity uses `(PID, starttime)` tuples everywhere to prevent PID recycling issues
|
|
130
|
+
- NVML backend calls BOTH `GetComputeRunningProcesses` AND `GetGraphicsRunningProcesses` and merges
|
|
131
|
+
- **Memory reporting**: v2 API (has `reserved` field) preferred; v1 fallback uses process-sum for `used` and raw `free` for truly allocatable memory. Memory bar shows reserved as dim `╌` segment between used and free.
|
|
132
|
+
- Per-process GPU utilization NOT shown (broken for multi-process, confirmed by NVIDIA)
|
|
133
|
+
- All /proc reads are same-UID only (security boundary)
|
|
134
|
+
- All external strings sanitized (ANSI strip, control char removal, 256-char truncation)
|
|
135
|
+
- Kill flow: SIGTERM first -> 5s wait -> offer SIGKILL, with audit logging
|
|
136
|
+
- OOM predictions are always ranges, never point estimates
|
|
137
|
+
- HTTP scrapers enforce 5 rules: localhost-only, port-owner verification, rate limiting, no redirects, 64KB size limit
|
|
138
|
+
- **Survival predictor**: Three layers of OOM detection: (1) spike detection — if historical peak minus current usage (`peak_used - current_used`) > free memory → OOM, (2) absolute headroom — <2% free → TIGHT regardless of phase, <5% free in stable phase → TIGHT, (3) multiplier heuristic — framework-aware peak estimation for growing/volatile phases. Pre-allocating frameworks (vLLM, SGLang, TGI, JAX) are exempt from headroom checks (they deliberately run at 95%+ utilization).
|
|
139
|
+
- **Detail panel**: Live-updating every poll cycle. Queries deep mode socket directly (bypasses enrichment cache). Retains data with [EXITED] tag when process dies.
|
|
140
|
+
- **Enrichment cache**: 10s TTL for expensive static /proc reads. Deep mode in detail panel bypasses this for 1s freshness. HTTP scrapers have their own 5s rate limiter.
|
|
141
|
+
- **Space background**: `SpaceScroll(VerticalScroll)` overrides `render()` to draw trig-based dot pattern. Child widgets render on top naturally. Pattern cached per (width, height).
|
|
142
|
+
- **Docker PID namespace**: NVML reports host PIDs inside containers. `_resolve_pid()` detects phantom PIDs (not in `/proc/`) and scans for GPU-using processes via `/dev/nvidia*` fd references. Applied in enrichment, kill dialog, and deep mode. Cached 30s.
|
|
143
|
+
- **PELT segment labels**: 18 model-agnostic labels via two-pass heuristic system. Pass 1: single-segment heuristics (phase, position, magnitude, duration, variance). Pass 2: multi-segment refinement (neighboring context for checkpoint saves, cooldowns, cache filling). Labels: Initialization, Pre-allocation, Warmup, Allocation Event, Memory Growth, Memory Leak, Cache Filling, Gradient Steps, Steady State, Saturation, Plateau, Idle, Batch Processing, Fragmentation, Checkpoint Save, Cleanup, Releasing, Cooldown. Saturation detection uses `gpu_total_mb`.
|
|
144
|
+
- **Memory chart**: Compact sparkline (single row, ▁-█ chars) + colored segment bar with numbered phases. Each segment in the summary shows: label, human description, memory delta in GB, and GPU utilization %. Designed to fit cleanly in the 46-char detail panel.
|
|
145
|
+
- **Saved analysis**: PELT analysis keyed by process name survives process exit, enabling post-mortem review.
|
|
146
|
+
|
|
147
|
+
## Test File Map
|
|
148
|
+
```
|
|
149
|
+
tests/
|
|
150
|
+
├── conftest.py # Mock NVML fixtures
|
|
151
|
+
├── fixtures/nvml_responses.py # Fake NVML response data
|
|
152
|
+
├── test_nvidia_backend.py # 21 tests: backend lifecycle, merge, errors
|
|
153
|
+
├── test_phase_detector.py # 9 tests: phases, hysteresis, confidence
|
|
154
|
+
├── test_pelt_detector.py # 24 tests: PELT changepoints, penalties, framework mapping, fallback
|
|
155
|
+
├── test_oom_predictor.py # 8 tests: prediction rules, suppression
|
|
156
|
+
├── test_survival.py # 69 tests: survival predictor, headroom, spike detection, scrape-data
|
|
157
|
+
├── test_sanitize.py # 18 tests: ANSI, control chars, truncation
|
|
158
|
+
├── test_permissions.py # 9 tests: UID checks
|
|
159
|
+
├── test_process_identity.py # 8 tests: /proc/stat parsing
|
|
160
|
+
├── test_config.py # 19 tests: TOML loading, SIGHUP, validation
|
|
161
|
+
├── test_tui.py # 5 tests: Textual pilot tests
|
|
162
|
+
├── test_enrichment_detector.py # 17 tests: framework detection
|
|
163
|
+
├── test_enrichment_model_files.py # 12 tests: model file scanning
|
|
164
|
+
├── test_enrichment_container.py # 9 tests: Docker/Podman
|
|
165
|
+
├── test_enrichment_mps.py # 8 tests: MPS daemon
|
|
166
|
+
├── test_analysis_breakdown.py # 9 tests: memory breakdown
|
|
167
|
+
├── test_segment_labels.py # 35 tests: 18 heuristic labels, two-pass refinement, metadata
|
|
168
|
+
├── test_memory_chart.py # 30 tests: sparkline chart, segment bar, summary, formatting
|
|
169
|
+
├── test_deep_mode.py # 18 tests: deep mode IPC, socket discovery
|
|
170
|
+
├── test_reporter_pytorch.py # 6 tests: PyTorch reporter daemon
|
|
171
|
+
├── test_pid_namespace.py # 8 tests: Docker PID namespace resolution, deep mode fallback
|
|
172
|
+
├── test_kill_dialog.py # 16 tests: kill safety, audit
|
|
173
|
+
├── test_security_boundaries.py # 9 tests: UID enforcement
|
|
174
|
+
├── test_scraper_security.py # 25 tests: 5 scraper security rules
|
|
175
|
+
├── test_scrapers_vllm.py # 9 tests: Prometheus parsing
|
|
176
|
+
├── test_scrapers_ollama.py # 7 tests: JSON /api/ps
|
|
177
|
+
├── test_scrapers_sglang.py # 6 tests: JSON model info
|
|
178
|
+
├── test_scrapers_llamacpp.py # 8 tests: Prometheus parsing
|
|
179
|
+
├── test_export_csv.py # 8 tests: CSV logger
|
|
180
|
+
├── test_export_screenshot.py # 5 tests: SVG screenshot
|
|
181
|
+
├── test_property_sanitize.py # 4 Hypothesis tests
|
|
182
|
+
├── test_property_phase.py # 3 Hypothesis tests
|
|
183
|
+
├── test_property_oom.py # 2 Hypothesis tests
|
|
184
|
+
└── integration/
|
|
185
|
+
├── test_gpu_pytorch.py # 14 tests: real GPU (NVML, process detection, CSV, phases)
|
|
186
|
+
└── test_cli_smoke.py # 6 tests: CLI flags, CSV export with real data
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## Known Issues / Gotchas
|
|
190
|
+
- **NVML v1 memory inflation**: `nvmlDeviceGetMemoryInfo` (v1) lumps driver-reserved memory (~300 MB) into `used`. Fixed by using process-sum for `used_memory_bytes` when v2 API unavailable. `free_memory_bytes` always comes from NVML's `free` (truly allocatable).
|
|
191
|
+
- **Docker PID namespace (FIXED)**: NVML reports host PIDs inside containers, not container PIDs. `_resolve_pid()` scans `/proc/*/fd/` for `/dev/nvidia*` to find the real container PID. Applied in enrichment, kill dialog, and deep mode socket fallback. Integration tests handle this with fallback matching.
|
|
192
|
+
- **No v2 API on driver 550.127.05**: RTX 2000 Ada doesn't expose `nvmlDeviceGetMemoryInfo_v2`. Code falls back gracefully.
|
|
193
|
+
- **starttime=0 cache poisoning**: When `/proc/{pid}/stat` is unreadable, `ProcessIdentity` gets `starttime=0`. The enrichment cache now skips caching for `starttime=0` to prevent PID recycling from aliasing different processes. The framework detection cache in `detector.py` also receives starttime to prevent aliasing.
|
|
194
|
+
- **NVML compute+graphics double-counting (FIXED)**: A process with both CUDA compute and OpenGL graphics contexts appears in both `GetComputeRunningProcesses` and `GetGraphicsRunningProcesses`. NVML reports the SAME allocation in both lists. `nvidia.py` now uses `max()` (not sum) when merging to avoid 2x inflation.
|
|
195
|
+
- **Phase states keyed by ProcessIdentity (FIXED)**: `_phase_states` in `app.py` is now keyed by `(gpu_index, ProcessIdentity)` instead of `(gpu_index, pid)` to prevent PID recycling from inheriting stale phase data.
|
|
196
|
+
- **PyTorch caching allocator**: NVML reports the allocator's reserved pool (2-3x actual usage). Memory breakdown is labeled as "estimate" because we can't distinguish cache from active memory without deep mode data.
|
|
197
|
+
- **JAX detection (FIXED)**: `detector.py` now checks JAX patterns (`libxla_extension`, `libxla`, `libtpu`) BEFORE PyTorch's `libtorch` — JAX environments often also have libtorch installed.
|
|
198
|
+
- **Pre-allocating framework survival (FIXED)**: `estimate_peak()` now uses `process_used * 1.05` for pre-allocating frameworks even when `model_size_bytes` IS known (KV cache pool size depends on GPU, not model).
|
|
199
|
+
- **Cache pruning (FIXED)**: `_prune_dead_processes()` in `app.py` removes entries from `_enrichment_cache`, `_phase_detectors`, `_phase_states`, `_peak_memory` for processes no longer in the snapshot.
|
|
200
|
+
- **Enrichment event loop (FIXED)**: `_enrich_processes()` now runs in `asyncio.to_thread()` to avoid blocking the Textual event loop with /proc reads.
|
|
201
|
+
- **Breakdown confidence (FIXED)**: File-size-based weight estimates capped at 0.5 confidence (down from 0.7). File sizes don't reliably reflect in-memory sizes due to compression and quantization.
|
|
202
|
+
- **PELT VOLATILE classification (FIXED)**: `classify_segments()` now detects VOLATILE segments (mixed-sign deltas with high variance).
|
|
203
|
+
- **Raw NVML exceptions in snapshot (FIXED)**: `nvmlSystemGetDriverVersion`, `nvmlSystemGetNVMLVersion`, and `nvmlDeviceGetMemoryInfo` (v1 fallback) now wrapped in try/except to translate through `_translate_nvml_error` instead of escaping as raw `NVMLError`.
|
|
204
|
+
- **Framework detector starttime=0 caching (FIXED)**: `detector.py` now skips its own `_cache` when `starttime=0`, matching the enrichment cache fix in `app.py`.
|
|
205
|
+
- **SIGKILL audit accuracy (FIXED)**: `kill_dialog.py` SIGKILL handler now records `process_gone` or `permission_denied` instead of always `sent`.
|
|
206
|
+
- **Scraper rate-limit on failure (FIXED)**: Rate-limit timestamp is cleared on network failure so transient errors don't throttle the next retry.
|
|
207
|
+
- **Deep-mode socket read cap (FIXED)**: `read_deep_data()` enforces a 64 KB read cap so that an oversized payload from a same-UID process cannot cause memory pressure.
|
|
208
|
+
- **OOM min_rate_mb_per_sec default (FIXED)**: Changed from 5.0 to 1.0 to match design doc specification. The old value of 5.0 suppressed warnings for gradual memory growth.
|
|
209
|
+
- **Survival predictor blind stable→OK (FIXED)**: Previously `phase == "stable"` returned OK unconditionally — no headroom check. A process using 97.5% of GPU (405 MiB free on 16 GB) got "OK stable". Fixed with three checks: (1) spike detection if `peak - current > free`, (2) critical headroom if `free < 2%`, (3) stable + low if `free < 5%`. Pre-allocating frameworks exempt.
|
|
210
|
+
- **Scraping never invoked (FIXED)**: `enrich_process()` accepted `scraping_config` but `app.py` never passed it. HTTP scrapers were dead code. Now passes `self.config.scraping`.
|
|
211
|
+
- **Detail panel was one-shot (FIXED)**: Pressing `d` rendered once and never updated. Now auto-refreshes every poll cycle while visible. Deep mode queried directly (bypasses enrichment cache). Shows `[EXITED]` when process dies instead of vanishing.
|
|
212
|
+
|
|
213
|
+
## Design Rules (Prevent Logical Errors)
|
|
214
|
+
|
|
215
|
+
These rules MUST be followed when modifying analysis, enrichment, or survival code:
|
|
216
|
+
|
|
217
|
+
### 1. Pre-Allocation Awareness
|
|
218
|
+
**Rule**: Frameworks that pre-allocate memory pools (vLLM, SGLang, TGI, JAX) report `process_used` that already includes the pool. Applying a multiplier > 1x on already-allocated memory double-counts.
|
|
219
|
+
- `_PRE_ALLOCATING_FRAMEWORKS` in `survival.py` tracks these frameworks
|
|
220
|
+
- ALWAYS use `process_used * 1.05` for pre-allocating frameworks, even when `model_size_bytes` IS known
|
|
221
|
+
- KV cache pool size is proportional to remaining GPU memory, NOT model size — `model_size * 1.8x` is wrong
|
|
222
|
+
- When `model_size_bytes` IS known for non-pre-allocating frameworks, use `model_size * multiplier`
|
|
223
|
+
|
|
224
|
+
### 2. Scrape Data Freshness
|
|
225
|
+
**Rule**: Scrape data (HTTP metrics from inference servers) has its own 5s rate limiter. It can be stale. Don't treat it as real-time truth.
|
|
226
|
+
- Survival predictor returns scrape-based verdicts early (most accurate when fresh)
|
|
227
|
+
- Peak tracking and collective pressure operate independently of scrape data
|
|
228
|
+
- Never assume scrape data and NVML data are from the same instant
|
|
229
|
+
|
|
230
|
+
### 3. PID Identity Safety
|
|
231
|
+
**Rule**: `starttime=0` means "identity unknown". Never cache data keyed on `(pid, 0)` because PID recycling will alias different processes.
|
|
232
|
+
- Enrichment cache in `app.py` skips caching when `starttime=0`
|
|
233
|
+
- Kill dialog MUST re-verify identity before sending signals
|
|
234
|
+
- Phase detectors are keyed by full `ProcessIdentity`, not just PID
|
|
235
|
+
|
|
236
|
+
### 4. Framework-Specific Memory Patterns
|
|
237
|
+
**Rule**: Different frameworks have fundamentally different memory patterns. Never apply generic assumptions.
|
|
238
|
+
- **PyTorch**: Caching allocator creates sawtooth noise. `reserved >> allocated >> active`. Phase detector may see "volatile" even when training is stable. Training has bursty allocation (forward/backward spikes).
|
|
239
|
+
- **JAX**: Pre-allocates ~90% on first computation. Goes 0→90% in one sample. OOM prediction is meaningless for JAX startup.
|
|
240
|
+
- **vLLM/SGLang**: Pre-allocate KV cache pool at startup. KV cache usage % from metrics is the real signal, not NVML `process_used`. Exempt from absolute headroom checks.
|
|
241
|
+
- **Ollama**: Model loads fail fast (before the model is fully in VRAM). Scrape data from `/api/ps` only shows already-loaded models, so it can't be used to predict OOM for new loads.
|
|
242
|
+
- **TGI**: Same pre-allocation pattern as vLLM. Uses internal KV cache pool.
|
|
243
|
+
|
|
244
|
+
### 5. Memory Accounting Consistency
|
|
245
|
+
**Rule**: v1 and v2 NVML APIs report different values for `used`. Code MUST be consistent about which one it uses.
|
|
246
|
+
- `device.used_memory_bytes`: v2 = app-allocated, v1 = process-sum (excludes driver overhead)
|
|
247
|
+
- `device.free_memory_bytes`: Always from NVML's raw `free` (truly allocatable)
|
|
248
|
+
- `device.total_memory_bytes`: Always exact
|
|
249
|
+
- Survival predictor uses `gpu_free_bytes` for headroom checks — this is correct regardless of v1/v2
|
|
250
|
+
|
|
251
|
+
### 6. Multi-Process Collective Safety
|
|
252
|
+
**Rule**: Individual process predictions can all be "OK" while the collective memory demand exceeds GPU total.
|
|
253
|
+
- `check_collective_pressure()` sums estimated peaks and compares to GPU total
|
|
254
|
+
- If overcommitted, upgrades OK→TIGHT and TIGHT→OOM
|
|
255
|
+
- This catches scenarios where 3 training jobs each think they have enough headroom
|
|
256
|
+
|
|
257
|
+
### 7. No False Precision
|
|
258
|
+
**Rule**: Never show point estimates for predictions. Always ranges or qualitative verdicts.
|
|
259
|
+
- OOM predictor: always `seconds_low`/`seconds_high` range
|
|
260
|
+
- Survival predictor: qualitative OK/TIGHT/OOM with reason string
|
|
261
|
+
- Memory breakdown: labeled as "estimate", never "exact"
|
|
262
|
+
|
|
263
|
+
### 8. Absolute Headroom Floor
|
|
264
|
+
**Rule**: Never return OK for a non-pre-allocating process when GPU free memory is dangerously low, regardless of phase.
|
|
265
|
+
- Spike detection: if `peak_used - current_used > gpu_free` → OOM (training loops spike during forward/backward)
|
|
266
|
+
- Critical floor: if `free < 2% of total` → TIGHT (any phase)
|
|
267
|
+
- Stable floor: if `free < 5% of total` → TIGHT (even stable phase)
|
|
268
|
+
- Pre-allocating frameworks are EXEMPT (they deliberately run at 95%+ utilization)
|
|
269
|
+
- `gpu_total_bytes` parameter is required for these checks; if 0, checks are skipped for backward compatibility
|
|
270
|
+
|
|
271
|
+
### 9. Enrichment Cache Architecture
|
|
272
|
+
**Rule**: The enrichment cache (10s TTL) protects against expensive /proc reads. Dynamic data sources bypass it.
|
|
273
|
+
- Static data (framework, model files, container) → cached for 10s, changes rarely
|
|
274
|
+
- Deep mode (PyTorch internals) → detail panel queries socket directly every 1s, bypasses cache
|
|
275
|
+
- HTTP scraping → has its own 5s rate limiter in `BaseScraper`, independent of enrichment cache
|
|
276
|
+
- Never increase the enrichment cache TTL above 10s — a longer TTL delays deep mode socket discovery for new processes
|
|
277
|
+
|
|
278
|
+
### 10. UI Emoji Safety
|
|
279
|
+
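**Rule**: Never use emoji or other variable-width characters (🌡, ⊞, etc.) in widget rendering — their cell width differs across terminals, which breaks alignment. Byte length is not the criterion: the approved symbols listed below are also multi-byte in UTF-8.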
|
|
280
|
+
- GPU card header uses ASCII separators (`|`) not Unicode box-drawing or emoji
|
|
281
|
+
- Phase badges use safe Unicode: `●`, `▲`, `▼`, `◆`
|
|
282
|
+
- Verdict badges use safe Unicode: `✓`, `⚠`, `✗`
|
|
283
|
+
- Memory bar uses safe box-drawing: `━`, `╌`, `─`, `├`, `┤`
|