vramtop 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. vramtop-0.1.0/.github/workflows/ci.yml +50 -0
  2. vramtop-0.1.0/.github/workflows/publish.yml +56 -0
  3. vramtop-0.1.0/.gitignore +11 -0
  4. vramtop-0.1.0/CLAUDE.md +283 -0
  5. vramtop-0.1.0/COMPLIANCE_REPORT.md +409 -0
  6. vramtop-0.1.0/DESIGN_DOC.md +738 -0
  7. vramtop-0.1.0/LICENSE +21 -0
  8. vramtop-0.1.0/PKG-INFO +204 -0
  9. vramtop-0.1.0/README.md +167 -0
  10. vramtop-0.1.0/docker/Dockerfile.gpu-workload +18 -0
  11. vramtop-0.1.0/docker/docker-compose.gpu.yml +50 -0
  12. vramtop-0.1.0/docker/gpu_workload.py +62 -0
  13. vramtop-0.1.0/pyproject.toml +80 -0
  14. vramtop-0.1.0/scripts/train_test.py +226 -0
  15. vramtop-0.1.0/src/vramtop/__init__.py +20 -0
  16. vramtop-0.1.0/src/vramtop/__main__.py +5 -0
  17. vramtop-0.1.0/src/vramtop/analysis/__init__.py +1 -0
  18. vramtop-0.1.0/src/vramtop/analysis/breakdown.py +63 -0
  19. vramtop-0.1.0/src/vramtop/analysis/oom_predictor.py +232 -0
  20. vramtop-0.1.0/src/vramtop/analysis/pelt_detector.py +160 -0
  21. vramtop-0.1.0/src/vramtop/analysis/phase_detector.py +108 -0
  22. vramtop-0.1.0/src/vramtop/analysis/segment_labels.py +573 -0
  23. vramtop-0.1.0/src/vramtop/analysis/survival.py +393 -0
  24. vramtop-0.1.0/src/vramtop/analysis/trends.py +79 -0
  25. vramtop-0.1.0/src/vramtop/backends/__init__.py +40 -0
  26. vramtop-0.1.0/src/vramtop/backends/base.py +111 -0
  27. vramtop-0.1.0/src/vramtop/backends/nvidia.py +298 -0
  28. vramtop-0.1.0/src/vramtop/cli.py +160 -0
  29. vramtop-0.1.0/src/vramtop/config.py +199 -0
  30. vramtop-0.1.0/src/vramtop/enrichment/__init__.py +232 -0
  31. vramtop-0.1.0/src/vramtop/enrichment/container.py +162 -0
  32. vramtop-0.1.0/src/vramtop/enrichment/deep_mode.py +127 -0
  33. vramtop-0.1.0/src/vramtop/enrichment/detector.py +97 -0
  34. vramtop-0.1.0/src/vramtop/enrichment/model_files.py +73 -0
  35. vramtop-0.1.0/src/vramtop/enrichment/mps.py +61 -0
  36. vramtop-0.1.0/src/vramtop/enrichment/scraper.py +225 -0
  37. vramtop-0.1.0/src/vramtop/enrichment/scrapers/__init__.py +87 -0
  38. vramtop-0.1.0/src/vramtop/enrichment/scrapers/llamacpp.py +60 -0
  39. vramtop-0.1.0/src/vramtop/enrichment/scrapers/ollama.py +69 -0
  40. vramtop-0.1.0/src/vramtop/enrichment/scrapers/sglang.py +47 -0
  41. vramtop-0.1.0/src/vramtop/enrichment/scrapers/vllm.py +57 -0
  42. vramtop-0.1.0/src/vramtop/export/__init__.py +39 -0
  43. vramtop-0.1.0/src/vramtop/export/csv_logger.py +105 -0
  44. vramtop-0.1.0/src/vramtop/export/screenshot.py +40 -0
  45. vramtop-0.1.0/src/vramtop/permissions.py +22 -0
  46. vramtop-0.1.0/src/vramtop/process_identity.py +22 -0
  47. vramtop-0.1.0/src/vramtop/reporter/__init__.py +3 -0
  48. vramtop-0.1.0/src/vramtop/reporter/protocol.py +77 -0
  49. vramtop-0.1.0/src/vramtop/reporter/pytorch.py +149 -0
  50. vramtop-0.1.0/src/vramtop/sanitize.py +24 -0
  51. vramtop-0.1.0/src/vramtop/secrets.py +52 -0
  52. vramtop-0.1.0/src/vramtop/ui/__init__.py +1 -0
  53. vramtop-0.1.0/src/vramtop/ui/app.py +844 -0
  54. vramtop-0.1.0/src/vramtop/ui/styles.tcss +81 -0
  55. vramtop-0.1.0/src/vramtop/ui/themes/__init__.py +60 -0
  56. vramtop-0.1.0/src/vramtop/ui/themes/catppuccin.tcss +83 -0
  57. vramtop-0.1.0/src/vramtop/ui/themes/dark.tcss +85 -0
  58. vramtop-0.1.0/src/vramtop/ui/themes/dracula.tcss +83 -0
  59. vramtop-0.1.0/src/vramtop/ui/themes/light.tcss +83 -0
  60. vramtop-0.1.0/src/vramtop/ui/themes/nord.tcss +83 -0
  61. vramtop-0.1.0/src/vramtop/ui/themes/solarized.tcss +83 -0
  62. vramtop-0.1.0/src/vramtop/ui/widgets/__init__.py +1 -0
  63. vramtop-0.1.0/src/vramtop/ui/widgets/alerts.py +53 -0
  64. vramtop-0.1.0/src/vramtop/ui/widgets/detail_panel.py +365 -0
  65. vramtop-0.1.0/src/vramtop/ui/widgets/gpu_card.py +165 -0
  66. vramtop-0.1.0/src/vramtop/ui/widgets/kill_dialog.py +381 -0
  67. vramtop-0.1.0/src/vramtop/ui/widgets/memory_bar.py +98 -0
  68. vramtop-0.1.0/src/vramtop/ui/widgets/memory_chart.py +248 -0
  69. vramtop-0.1.0/src/vramtop/ui/widgets/phase_badge.py +76 -0
  70. vramtop-0.1.0/src/vramtop/ui/widgets/process_table.py +138 -0
  71. vramtop-0.1.0/src/vramtop/ui/widgets/space_bg.py +68 -0
  72. vramtop-0.1.0/src/vramtop/ui/widgets/timeline.py +78 -0
  73. vramtop-0.1.0/tests/__init__.py +0 -0
  74. vramtop-0.1.0/tests/conftest.py +190 -0
  75. vramtop-0.1.0/tests/fixtures/__init__.py +0 -0
  76. vramtop-0.1.0/tests/fixtures/nvml_responses.py +138 -0
  77. vramtop-0.1.0/tests/integration/__init__.py +0 -0
  78. vramtop-0.1.0/tests/integration/test_cli_smoke.py +158 -0
  79. vramtop-0.1.0/tests/integration/test_gpu_pytorch.py +409 -0
  80. vramtop-0.1.0/tests/integration/test_multi_container.py +104 -0
  81. vramtop-0.1.0/tests/test_analysis_breakdown.py +87 -0
  82. vramtop-0.1.0/tests/test_config.py +329 -0
  83. vramtop-0.1.0/tests/test_deep_mode.py +237 -0
  84. vramtop-0.1.0/tests/test_enrichment_container.py +203 -0
  85. vramtop-0.1.0/tests/test_enrichment_detector.py +142 -0
  86. vramtop-0.1.0/tests/test_enrichment_model_files.py +116 -0
  87. vramtop-0.1.0/tests/test_enrichment_mps.py +99 -0
  88. vramtop-0.1.0/tests/test_export_csv.py +172 -0
  89. vramtop-0.1.0/tests/test_export_screenshot.py +108 -0
  90. vramtop-0.1.0/tests/test_kill_dialog.py +342 -0
  91. vramtop-0.1.0/tests/test_labels_realworld.py +534 -0
  92. vramtop-0.1.0/tests/test_memory_chart.py +228 -0
  93. vramtop-0.1.0/tests/test_multi_container.py +340 -0
  94. vramtop-0.1.0/tests/test_nvidia_backend.py +292 -0
  95. vramtop-0.1.0/tests/test_oom_predictor.py +165 -0
  96. vramtop-0.1.0/tests/test_pelt_detector.py +295 -0
  97. vramtop-0.1.0/tests/test_permissions.py +48 -0
  98. vramtop-0.1.0/tests/test_phase_detector.py +96 -0
  99. vramtop-0.1.0/tests/test_pid_namespace.py +153 -0
  100. vramtop-0.1.0/tests/test_process_identity.py +65 -0
  101. vramtop-0.1.0/tests/test_property_oom.py +91 -0
  102. vramtop-0.1.0/tests/test_property_phase.py +57 -0
  103. vramtop-0.1.0/tests/test_property_sanitize.py +39 -0
  104. vramtop-0.1.0/tests/test_reporter_pytorch.py +187 -0
  105. vramtop-0.1.0/tests/test_sanitize.py +67 -0
  106. vramtop-0.1.0/tests/test_scraper_security.py +381 -0
  107. vramtop-0.1.0/tests/test_scrapers_llamacpp.py +73 -0
  108. vramtop-0.1.0/tests/test_scrapers_ollama.py +96 -0
  109. vramtop-0.1.0/tests/test_scrapers_sglang.py +67 -0
  110. vramtop-0.1.0/tests/test_scrapers_vllm.py +85 -0
  111. vramtop-0.1.0/tests/test_security_boundaries.py +117 -0
  112. vramtop-0.1.0/tests/test_segment_labels.py +545 -0
  113. vramtop-0.1.0/tests/test_survival.py +882 -0
  114. vramtop-0.1.0/tests/test_tui.py +137 -0
@@ -0,0 +1,50 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ lint:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ - uses: actions/setup-python@v5
15
+ with:
16
+ python-version: "3.12"
17
+ - run: pip install -e ".[dev]"
18
+ - run: ruff check src/
19
+ - run: mypy --strict src/vramtop/
20
+
21
+ test-unit:
22
+ runs-on: ubuntu-latest
23
+ steps:
24
+ - uses: actions/checkout@v4
25
+ - uses: actions/setup-python@v5
26
+ with:
27
+ python-version: "3.12"
28
+ - run: pip install -e ".[dev]"
29
+ - run: pytest tests/ -m "not gpu and not docker" -v
30
+
31
+ test-property:
32
+ runs-on: ubuntu-latest
33
+ steps:
34
+ - uses: actions/checkout@v4
35
+ - uses: actions/setup-python@v5
36
+ with:
37
+ python-version: "3.12"
38
+ - run: pip install -e ".[dev]"
39
+ - run: pytest tests/test_property_*.py --hypothesis-show-statistics -v
40
+
41
+ security:
42
+ runs-on: ubuntu-latest
43
+ steps:
44
+ - uses: actions/checkout@v4
45
+ - uses: actions/setup-python@v5
46
+ with:
47
+ python-version: "3.12"
48
+ - run: pip install -e ".[dev]"
49
+ - run: bandit -r src/ -ll
50
+ - run: pip-audit
@@ -0,0 +1,56 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ jobs:
9
+ lint:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+ - uses: actions/setup-python@v5
14
+ with:
15
+ python-version: "3.12"
16
+ - run: pip install -e ".[dev]"
17
+ - run: ruff check src/
18
+ - run: mypy --strict src/vramtop/
19
+
20
+ test:
21
+ runs-on: ubuntu-latest
22
+ steps:
23
+ - uses: actions/checkout@v4
24
+ - uses: actions/setup-python@v5
25
+ with:
26
+ python-version: "3.12"
27
+ - run: pip install -e ".[dev]"
28
+ - run: pytest tests/ -m "not gpu and not docker" -v
29
+
30
+ security:
31
+ runs-on: ubuntu-latest
32
+ steps:
33
+ - uses: actions/checkout@v4
34
+ - uses: actions/setup-python@v5
35
+ with:
36
+ python-version: "3.12"
37
+ - run: pip install -e ".[dev]"
38
+ - run: bandit -r src/ -ll
39
+ - run: pip-audit
40
+
41
+ publish:
42
+ needs: [lint, test, security]
43
+ runs-on: ubuntu-latest
44
+ permissions:
45
+ id-token: write
46
+ steps:
47
+ - uses: actions/checkout@v4
48
+ - uses: actions/setup-python@v5
49
+ with:
50
+ python-version: "3.12"
51
+ - run: pip install build
52
+ - run: python -m build
53
+ - name: Publish to PyPI
54
+ uses: pypa/gh-action-pypi-publish@release/v1
55
+ with:
56
+ password: ${{ secrets.PYPI_API_TOKEN }}
@@ -0,0 +1,11 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .hypothesis/
8
+ .pytest_cache/
9
+ *.csv
10
+ .mypy_cache/
11
+ .ruff_cache/
@@ -0,0 +1,283 @@
1
+ # vramtop — Implementation Progress
2
+
3
+ ## Project Overview
4
+ vramtop is "the htop for GPU memory" — an NVIDIA-first GPU memory monitoring TUI.
5
+ See `DESIGN_DOC.md` for the full architecture and design specification.
6
+ See `COMPLIANCE_REPORT.md` for detailed design-doc compliance audit.
7
+
8
+ ## Build/Test Commands
9
+ ```bash
10
+ pip install -e ".[dev]" # Install with dev deps
11
+ pytest tests/ -m "not gpu and not docker" -v # Unit tests (no GPU or Docker needed; same marker filter as CI)
12
+ pytest tests/integration/ -v # GPU integration tests (requires NVIDIA GPU)
13
+ pytest tests/test_property_*.py --hypothesis-show-statistics -v # Property tests
14
+ mypy --strict src/vramtop/ # Type checking
15
+ ruff check src/ # Linting
16
+ bandit -r src/ -ll # Security scan (medium/high severity only, as in CI)
17
+ python -m vramtop --help # CLI help
18
+ python -m vramtop --version # Version check
19
+ ```
20
+
21
+ ## Quality Gate Status (current)
22
+ - **477 unit tests + 20 integration tests = 497 total** (0 failures)
23
+ - **mypy --strict**: 0 errors (51 source files)
24
+ - **ruff check**: All checks passed
25
+ - **bandit**: 0 high/medium
26
+ - **CLI**: `vramtop 0.1.0` working
27
+
28
+ ## Implementation Status
29
+
30
+ ### Phase 1: Core — COMPLETE
31
+ - [x] Commit 1: Project scaffolding (pyproject.toml, package stubs, CI, LICENSE)
32
+ - [x] Commit 2: Core data types (ProcessIdentity, GPUProcess, GPUDevice, MemorySnapshot), backend ABC, exceptions
33
+ - [x] Commit 3: Utility modules (sanitize.py, permissions.py, process_identity.py, secrets.py) + tests + Hypothesis property tests
34
+ - [x] Commit 4: NVIDIA backend (NVMLClient) with compute+graphics merge, retry logic, fixtures + tests
35
+ - [x] Commit 5: Phase detector (variance-threshold + hysteresis), OOM predictor (range-based), trends + tests + Hypothesis
36
+ - [x] Commit 6: Pydantic config with SIGHUP reload + tests
37
+ - [x] Commit 7: TUI (GPU cards, memory bar, sparkline timeline, process table, phase badges, OOM alerts, CLI) + pilot tests
38
+
39
+ ### Phase 2: Intelligence — COMPLETE
40
+ - [x] Commit 8: Enrichment data model (EnrichmentResult, ModelFileInfo) + orchestrator
41
+ - [x] Commit 9: Framework detection (cmdline + maps patterns, 30s TTL cache) + model file scanning
42
+ - [x] Commit 10: Container detection (Docker/Podman/cgroup) + MPS daemon detection
43
+ - [x] Commit 11: Memory breakdown estimation + OOM predictor rate range enhancement
44
+ - [x] Commit 12: Detail panel (slide-in process info) + kill dialog (SIGTERM->SIGKILL, audit log)
45
+ - [x] Commit 13: Wired enrichment into app poll loop + enrichment tests
46
+ - [x] Commit 14: Security boundary tests (UID enforcement, sanitize, audit permissions)
47
+
48
+ ### Phase 3: Beauty — COMPLETE
49
+ - [x] Commit 15: Theme system + 6 themes (dark, light, nord, catppuccin, dracula, solarized)
50
+ - [x] Commit 16: Layout modes (FULL/COMPACT/MINI) + terminal size auto-detection
51
+ - [x] Commit 17: Accessibility (NO_COLOR, --accessible, text fallbacks)
52
+ - [x] Commit 18: Visual polish (number formatting, truncation, loading states, footer bar, theme cycling with 't')
53
+
54
+ ### Phase 4: Scraping — COMPLETE (MVP DONE)
55
+ - [x] Commit 19: Base HTTP scraper with 5 security rules (localhost-only, port verification, rate limit, no redirects, size limit)
56
+ - [x] Commit 20: Framework scrapers (vLLM, Ollama, SGLang, llama.cpp)
57
+ - [x] Commit 21: Wire scrapers into enrichment pipeline
58
+ - [x] Commit 22: Scraper security tests (25 tests) + per-framework parsing tests (30 tests)
59
+ - [x] Commit 23: Final quality gate — all checks clean
60
+
61
+ ### Phase 5: Export & Polish — COMPLETE
62
+ - [x] CSV export (`--export-csv FILE` CLI flag, `export/csv_logger.py`)
63
+ - [x] SVG screenshot (`s` key, saves to `~/.local/share/vramtop/screenshots/`)
64
+ - [x] Memory reporting fix: uses process-sum for used (v1 API), shows reserved as separate bar segment
65
+ - [x] GPU integration tests (14 tests with real PyTorch + NVML on RTX 2000 Ada)
66
+ - [x] CLI smoke tests (6 tests)
67
+ - [x] Design doc compliance audit (see COMPLIANCE_REPORT.md)
68
+ - [x] Survival predictor with KV cache + scrape-data awareness (`analysis/survival.py`)
69
+ - [x] PELT changepoint detection (`analysis/pelt_detector.py`, optional `ruptures` dependency)
70
+ - [x] Deep mode Unix socket IPC (`enrichment/deep_mode.py`, `reporter/protocol.py`, `reporter/pytorch.py`)
71
+
72
+ ### Phase 6: Hardening & UI — IN PROGRESS
73
+ - [x] Fix survival predictor blind stable→OK (absolute headroom + spike detection)
74
+ - [x] Fix scraping never invoked (pass `scraping_config` to `enrich_process()`)
75
+ - [x] Live detail panel (auto-refreshes every poll cycle, retains data on process exit with [EXITED] tag)
76
+ - [x] Direct deep mode query in detail panel (bypasses enrichment cache for live PyTorch internals)
77
+ - [x] Space background (trig-based dot pattern via `SpaceScroll.render()`)
78
+ - [x] UI polish: gradient memory bar, color-coded sparkline, safe ASCII GPU header, GitHub-dark theme
79
+ - [x] Enrichment cache TTL tuned to 10s (static /proc data) with live bypass for deep mode
80
+ - [x] Docker PID namespace resolution (`_resolve_pid()` in enrichment, kill dialog, deep mode)
81
+ - [x] PELT wired into app (per-process timeseries accumulation, always-on in detail panel)
82
+ - [x] Labeled PELT segments with memory chart (`analysis/segment_labels.py`, `ui/widgets/memory_chart.py`)
83
+ - [ ] Pre-launch OOM risk score (predict OOM before memory fills, not just after)
84
+
85
+ ## Architecture (key files)
86
+ ```
87
+ src/vramtop/
88
+ ├── backends/base.py # Data types, ABC, exceptions
89
+ ├── backends/nvidia.py # NVMLClient (compute+graphics merge, v1/v2 memory handling)
90
+ ├── analysis/phase_detector.py # Variance-threshold phase detection
91
+ ├── analysis/pelt_detector.py # PELT changepoint detection (optional ruptures dependency)
92
+ ├── analysis/segment_labels.py # 18 model-agnostic segment labels, two-pass heuristic labeling
93
+ ├── analysis/oom_predictor.py # Range-based OOM prediction (GPU-level)
94
+ ├── analysis/survival.py # Per-process survival predictor (OK/TIGHT/OOM verdicts)
95
+ ├── analysis/breakdown.py # Weight vs dynamic estimation
96
+ ├── analysis/trends.py # EMA allocation rate tracker
97
+ ├── enrichment/__init__.py # Enrichment orchestrator (framework, model, container, scraping, deep mode)
98
+ ├── enrichment/detector.py # Framework detection from /proc (JAX before PyTorch)
99
+ ├── enrichment/model_files.py # Model file scanning from /proc/fd
100
+ ├── enrichment/container.py # Docker/Podman detection
101
+ ├── enrichment/mps.py # MPS daemon detection
102
+ ├── enrichment/deep_mode.py # Unix socket IPC discovery + enrichment
103
+ ├── enrichment/scraper.py # Base HTTP scraper (5 security rules)
104
+ ├── enrichment/scrapers/ # vLLM, Ollama, SGLang, llama.cpp scrapers
105
+ ├── ui/app.py # Main Textual app (layout modes, theme cycling, screenshot, live detail panel)
106
+ ├── ui/widgets/memory_bar.py # 3-segment gradient bar: used | reserved | free
107
+ ├── ui/widgets/timeline.py # Color-gradient sparkline (green→yellow→red)
108
+ ├── ui/widgets/gpu_card.py # GPU card (header, memory bar, timeline, process table, OOM alert)
109
+ ├── ui/widgets/process_table.py # Process table with phase badges + survival verdicts
110
+ ├── ui/widgets/detail_panel.py # Slide-in panel (live-updating, PELT chart+segments, deep mode, [EXITED] retention)
111
+ ├── ui/widgets/memory_chart.py # Sparkline + segment color bar + human-readable summary
112
+ ├── ui/widgets/kill_dialog.py # Kill dialog (SIGTERM→SIGKILL, audit logging, Docker PID resolution)
113
+ ├── ui/widgets/space_bg.py # SpaceScroll container with trig-based dot background
114
+ ├── ui/themes/ # 6 theme TCSS files (dark = GitHub-dark palette)
115
+ ├── config.py # Pydantic config + SIGHUP reload
116
+ ├── sanitize.py # ANSI/control char stripping (idempotent)
117
+ ├── permissions.py # UID checks
118
+ ├── process_identity.py # (PID, starttime) from /proc/pid/stat
119
+ ├── secrets.py # Env var + 0600 file resolution
120
+ ├── export/__init__.py # ExportManager (CSV)
121
+ ├── export/csv_logger.py # Thread-safe CSV writer
122
+ ├── export/screenshot.py # SVG screenshot via Textual
123
+ ├── reporter/__init__.py # Deep mode reporter package
124
+ ├── reporter/protocol.py # Wire protocol (HandshakeMsg, MemoryMsg)
125
+ └── reporter/pytorch.py # PyTorch reporter daemon thread
126
+ ```
127
+
128
+ ## Key Design Decisions
129
+ - Process identity uses `(PID, starttime)` tuples everywhere to prevent PID recycling issues
130
+ - NVML backend calls BOTH `GetComputeRunningProcesses` AND `GetGraphicsRunningProcesses` and merges
131
+ - **Memory reporting**: v2 API (has `reserved` field) preferred; v1 fallback uses process-sum for `used` and raw `free` for truly allocatable memory. Memory bar shows reserved as dim `╌` segment between used and free.
132
+ - Per-process GPU utilization NOT shown (broken for multi-process, confirmed by NVIDIA)
133
+ - All /proc reads are same-UID only (security boundary)
134
+ - All external strings sanitized (ANSI strip, control char removal, 256-char truncation)
135
+ - Kill flow: SIGTERM first -> 5s wait -> offer SIGKILL, with audit logging
136
+ - OOM predictions are always ranges, never point estimates
137
+ - HTTP scrapers enforce 5 rules: localhost-only, port-owner verification, rate limiting, no redirects, 64KB size limit
138
+ - **Survival predictor**: Three layers of OOM detection: (1) spike detection — if historical peak-to-trough > free memory → OOM, (2) absolute headroom — <2% free → TIGHT regardless of phase, <5% free in stable → TIGHT, (3) multiplier heuristic — framework-aware peak estimation for growing/volatile phases. Pre-allocating frameworks (vLLM, SGLang, TGI, JAX) are exempt from headroom checks (they deliberately run at 95%+ utilization).
139
+ - **Detail panel**: Live-updating every poll cycle. Queries deep mode socket directly (bypasses enrichment cache). Retains data with [EXITED] tag when process dies.
140
+ - **Enrichment cache**: 10s TTL for expensive static /proc reads. Deep mode in detail panel bypasses this for 1s freshness. HTTP scrapers have their own 5s rate limiter.
141
+ - **Space background**: `SpaceScroll(VerticalScroll)` overrides `render()` to draw trig-based dot pattern. Child widgets render on top naturally. Pattern cached per (width, height).
142
+ - **Docker PID namespace**: NVML reports host PIDs inside containers. `_resolve_pid()` detects phantom PIDs (not in `/proc/`) and scans for GPU-using processes via `/dev/nvidia*` fd references. Applied in enrichment, kill dialog, and deep mode. Cached 30s.
143
+ - **PELT segment labels**: 18 model-agnostic labels via two-pass heuristic system. Pass 1: single-segment heuristics (phase, position, magnitude, duration, variance). Pass 2: multi-segment refinement (neighboring context for checkpoint saves, cooldowns, cache filling). Labels: Initialization, Pre-allocation, Warmup, Allocation Event, Memory Growth, Memory Leak, Cache Filling, Gradient Steps, Steady State, Saturation, Plateau, Idle, Batch Processing, Fragmentation, Checkpoint Save, Cleanup, Releasing, Cooldown. Saturation detection uses `gpu_total_mb`.
144
+ - **Memory chart**: Compact sparkline (single row, ▁-█ chars) + colored segment bar with numbered phases. Each segment in the summary shows: label, human description, memory delta in GB, and GPU utilization %. Designed to fit cleanly in the 46-char detail panel.
145
+ - **Saved analysis**: PELT analysis keyed by process name survives process exit, enabling post-mortem review.
146
+
147
+ ## Test File Map
148
+ ```
149
+ tests/
150
+ ├── conftest.py # Mock NVML fixtures
151
+ ├── fixtures/nvml_responses.py # Fake NVML response data
152
+ ├── test_nvidia_backend.py # 21 tests: backend lifecycle, merge, errors
153
+ ├── test_phase_detector.py # 9 tests: phases, hysteresis, confidence
154
+ ├── test_pelt_detector.py # 24 tests: PELT changepoints, penalties, framework mapping, fallback
155
+ ├── test_oom_predictor.py # 8 tests: prediction rules, suppression
156
+ ├── test_survival.py # 69 tests: survival predictor, headroom, spike detection, scrape-data
157
+ ├── test_sanitize.py # 18 tests: ANSI, control chars, truncation
158
+ ├── test_permissions.py # 9 tests: UID checks
159
+ ├── test_process_identity.py # 8 tests: /proc/stat parsing
160
+ ├── test_config.py # 19 tests: TOML loading, SIGHUP, validation
161
+ ├── test_tui.py # 5 tests: Textual pilot tests
162
+ ├── test_enrichment_detector.py # 17 tests: framework detection
163
+ ├── test_enrichment_model_files.py # 12 tests: model file scanning
164
+ ├── test_enrichment_container.py # 9 tests: Docker/Podman
165
+ ├── test_enrichment_mps.py # 8 tests: MPS daemon
166
+ ├── test_analysis_breakdown.py # 9 tests: memory breakdown
167
+ ├── test_segment_labels.py # 35 tests: 18 heuristic labels, two-pass refinement, metadata
168
+ ├── test_memory_chart.py # 30 tests: sparkline chart, segment bar, summary, formatting
169
+ ├── test_deep_mode.py # 18 tests: deep mode IPC, socket discovery
170
+ ├── test_reporter_pytorch.py # 6 tests: PyTorch reporter daemon
171
+ ├── test_pid_namespace.py # 8 tests: Docker PID namespace resolution, deep mode fallback
172
+ ├── test_kill_dialog.py # 16 tests: kill safety, audit
173
+ ├── test_security_boundaries.py # 9 tests: UID enforcement
174
+ ├── test_scraper_security.py # 25 tests: 5 scraper security rules
175
+ ├── test_scrapers_vllm.py # 9 tests: Prometheus parsing
176
+ ├── test_scrapers_ollama.py # 7 tests: JSON /api/ps
177
+ ├── test_scrapers_sglang.py # 6 tests: JSON model info
178
+ ├── test_scrapers_llamacpp.py # 8 tests: Prometheus parsing
179
+ ├── test_export_csv.py # 8 tests: CSV logger
180
+ ├── test_export_screenshot.py # 5 tests: SVG screenshot
181
+ ├── test_property_sanitize.py # 4 Hypothesis tests
182
+ ├── test_property_phase.py # 3 Hypothesis tests
183
+ ├── test_property_oom.py # 2 Hypothesis tests
184
+ └── integration/
185
+ ├── test_gpu_pytorch.py # 14 tests: real GPU (NVML, process detection, CSV, phases)
186
+ └── test_cli_smoke.py # 6 tests: CLI flags, CSV export with real data
187
+ ```
188
+
189
+ ## Known Issues / Gotchas
190
+ - **NVML v1 memory inflation**: `nvmlDeviceGetMemoryInfo` (v1) lumps driver-reserved memory (~300 MB) into `used`. Fixed by using process-sum for `used_memory_bytes` when v2 API unavailable. `free_memory_bytes` always comes from NVML's `free` (truly allocatable).
191
+ - **Docker PID namespace (FIXED)**: NVML reports host PIDs inside containers, not container PIDs. `_resolve_pid()` scans `/proc/*/fd/` for `/dev/nvidia*` to find the real container PID. Applied in enrichment, kill dialog, and deep mode socket fallback. Integration tests handle this with fallback matching.
192
+ - **No v2 API on driver 550.127.05**: RTX 2000 Ada doesn't expose `nvmlDeviceGetMemoryInfo_v2`. Code falls back gracefully.
193
+ - **starttime=0 cache poisoning**: When `/proc/{pid}/stat` is unreadable, `ProcessIdentity` gets `starttime=0`. The enrichment cache now skips caching for `starttime=0` to prevent PID recycling from aliasing different processes. The framework detection cache in `detector.py` also receives starttime to prevent aliasing.
194
+ - **NVML compute+graphics double-counting (FIXED)**: A process with both CUDA compute and OpenGL graphics contexts appears in both `GetComputeRunningProcesses` and `GetGraphicsRunningProcesses`. NVML reports the SAME allocation in both lists. `nvidia.py` now uses `max()` (not sum) when merging to avoid 2x inflation.
195
+ - **Phase states keyed by ProcessIdentity (FIXED)**: `_phase_states` in `app.py` is now keyed by `(gpu_index, ProcessIdentity)` instead of `(gpu_index, pid)` to prevent PID recycling from inheriting stale phase data.
196
+ - **PyTorch caching allocator**: NVML reports the allocator's reserved pool (2-3x actual usage). Memory breakdown is labeled as "estimate" because we can't distinguish cache from active memory without deep mode data.
197
+ - **JAX detection (FIXED)**: `detector.py` now checks JAX patterns (`libxla_extension`, `libxla`, `libtpu`) BEFORE PyTorch's `libtorch` — JAX environments often also have libtorch installed.
198
+ - **Pre-allocating framework survival (FIXED)**: `estimate_peak()` now uses `process_used * 1.05` for pre-allocating frameworks even when `model_size_bytes` IS known (KV cache pool size depends on GPU, not model).
199
+ - **Cache pruning (FIXED)**: `_prune_dead_processes()` in `app.py` removes entries from `_enrichment_cache`, `_phase_detectors`, `_phase_states`, `_peak_memory` for processes no longer in the snapshot.
200
+ - **Enrichment event loop (FIXED)**: `_enrich_processes()` now runs in `asyncio.to_thread()` to avoid blocking the Textual event loop with /proc reads.
201
+ - **Breakdown confidence (FIXED)**: File-size-based weight estimates capped at 0.5 confidence (down from 0.7). File sizes don't reliably reflect in-memory sizes due to compression and quantization.
202
+ - **PELT VOLATILE classification (FIXED)**: `classify_segments()` now detects VOLATILE segments (mixed-sign deltas with high variance).
203
+ - **Raw NVML exceptions in snapshot (FIXED)**: `nvmlSystemGetDriverVersion`, `nvmlSystemGetNVMLVersion`, and `nvmlDeviceGetMemoryInfo` (v1 fallback) now wrapped in try/except to translate through `_translate_nvml_error` instead of escaping as raw `NVMLError`.
204
+ - **Framework detector starttime=0 caching (FIXED)**: `detector.py` now skips its own `_cache` when `starttime=0`, matching the enrichment cache fix in `app.py`.
205
+ - **SIGKILL audit accuracy (FIXED)**: `kill_dialog.py` SIGKILL handler now records `process_gone` or `permission_denied` instead of always `sent`.
206
+ - **Scraper rate-limit on failure (FIXED)**: Rate-limit timestamp is cleared on network failure so transient errors don't throttle the next retry.
207
+ - **Deep-mode socket read cap (FIXED)**: `read_deep_data()` enforces a 64 KB cap on bytes read to prevent same-UID memory pressure from large payloads.
208
+ - **OOM min_rate_mb_per_sec default (FIXED)**: Changed from 5.0 to 1.0 to match design doc specification. The old value of 5.0 suppressed warnings for gradual memory growth.
209
+ - **Survival predictor blind stable→OK (FIXED)**: Previously `phase == "stable"` returned OK unconditionally — no headroom check. A process using 97.5% of GPU (405 MiB free on 16 GB) got "OK stable". Fixed with three checks: (1) spike detection if `peak - current > free`, (2) critical headroom if `free < 2%`, (3) stable + low if `free < 5%`. Pre-allocating frameworks exempt.
210
+ - **Scraping never invoked (FIXED)**: `enrich_process()` accepted `scraping_config` but `app.py` never passed it. HTTP scrapers were dead code. Now passes `self.config.scraping`.
211
+ - **Detail panel was one-shot (FIXED)**: Pressing `d` rendered once and never updated. Now auto-refreshes every poll cycle while visible. Deep mode queried directly (bypasses enrichment cache). Shows `[EXITED]` when process dies instead of vanishing.
212
+
213
+ ## Design Rules (Prevent Logical Errors)
214
+
215
+ These rules MUST be followed when modifying analysis, enrichment, or survival code:
216
+
217
+ ### 1. Pre-Allocation Awareness
218
+ **Rule**: Frameworks that pre-allocate memory pools (vLLM, SGLang, TGI, JAX) report `process_used` that already includes the pool. Applying a multiplier > 1x on already-allocated memory double-counts.
219
+ - `_PRE_ALLOCATING_FRAMEWORKS` in `survival.py` tracks these frameworks
220
+ - ALWAYS use `process_used * 1.05` for pre-allocating frameworks, even when `model_size_bytes` IS known
221
+ - KV cache pool size is proportional to remaining GPU memory, NOT model size — `model_size * 1.8x` is wrong
222
+ - When `model_size_bytes` IS known for non-pre-allocating frameworks, use `model_size * multiplier`
223
+
224
+ ### 2. Scrape Data Freshness
225
+ **Rule**: Scrape data (HTTP metrics from inference servers) has its own 5s rate limiter. It can be stale. Don't treat it as real-time truth.
226
+ - Survival predictor returns scrape-based verdicts early (most accurate when fresh)
227
+ - Peak tracking and collective pressure operate independently of scrape data
228
+ - Never assume scrape data and NVML data are from the same instant
229
+
230
+ ### 3. PID Identity Safety
231
+ **Rule**: `starttime=0` means "identity unknown". Never cache data keyed on `(pid, 0)` because PID recycling will alias different processes.
232
+ - Enrichment cache in `app.py` skips caching when `starttime=0`
233
+ - Kill dialog MUST re-verify identity before sending signals
234
+ - Phase detectors are keyed by full `ProcessIdentity`, not just PID
235
+
236
+ ### 4. Framework-Specific Memory Patterns
237
+ **Rule**: Different frameworks have fundamentally different memory patterns. Never apply generic assumptions.
238
+ - **PyTorch**: Caching allocator creates sawtooth noise. `reserved >> allocated >> active`. Phase detector may see "volatile" even when training is stable. Training has bursty allocation (forward/backward spikes).
239
+ - **JAX**: Pre-allocates ~90% on first computation. Goes 0→90% in one sample. OOM prediction is meaningless for JAX startup.
240
+ - **vLLM/SGLang**: Pre-allocate KV cache pool at startup. KV cache usage % from metrics is the real signal, not NVML `process_used`. Exempt from absolute headroom checks.
241
+ - **Ollama**: Model loads fail fast (before the model is fully in VRAM). Scrape data from `/api/ps` only shows already-loaded models — it can't predict OOM for new loads.
242
+ - **TGI**: Same pre-allocation pattern as vLLM. Uses internal KV cache pool.
243
+
244
+ ### 5. Memory Accounting Consistency
245
+ **Rule**: v1 and v2 NVML APIs report different values for `used`. Code MUST be consistent about which one it uses.
246
+ - `device.used_memory_bytes`: v2 = app-allocated, v1 = process-sum (excludes driver overhead)
247
+ - `device.free_memory_bytes`: Always from NVML's raw `free` (truly allocatable)
248
+ - `device.total_memory_bytes`: Always exact
249
+ - Survival predictor uses `gpu_free_bytes` for headroom checks — this is correct regardless of v1/v2
250
+
251
+ ### 6. Multi-Process Collective Safety
252
+ **Rule**: Individual process predictions can all be "OK" while the collective memory demand exceeds GPU total.
253
+ - `check_collective_pressure()` sums estimated peaks and compares to GPU total
254
+ - If overcommitted, upgrades OK→TIGHT and TIGHT→OOM
255
+ - This catches scenarios where 3 training jobs each think they have enough headroom
256
+
257
+ ### 7. No False Precision
258
+ **Rule**: Never show point estimates for predictions. Always ranges or qualitative verdicts.
259
+ - OOM predictor: always `seconds_low`/`seconds_high` range
260
+ - Survival predictor: qualitative OK/TIGHT/OOM with reason string
261
+ - Memory breakdown: labeled as "estimate", never "exact"
262
+
263
+ ### 8. Absolute Headroom Floor
264
+ **Rule**: Never return OK for a non-pre-allocating process when GPU free memory is dangerously low, regardless of phase.
265
+ - Spike detection: if `peak_used - current_used > gpu_free` → OOM (training loops spike during forward/backward)
266
+ - Critical floor: if `free < 2% of total` → TIGHT (any phase)
267
+ - Stable floor: if `free < 5% of total` → TIGHT (even stable phase)
268
+ - Pre-allocating frameworks are EXEMPT (they deliberately run at 95%+ utilization)
269
+ - `gpu_total_bytes` parameter is required for these checks; if 0, checks are skipped for backward compatibility
270
+
271
+ ### 9. Enrichment Cache Architecture
272
+ **Rule**: The enrichment cache (10s TTL) protects against expensive /proc reads. Dynamic data sources bypass it.
273
+ - Static data (framework, model files, container) → cached for 10s, changes rarely
274
+ - Deep mode (PyTorch internals) → detail panel queries socket directly every 1s, bypasses cache
275
+ - HTTP scraping → has its own 5s rate limiter in `BaseScraper`, independent of enrichment cache
276
+ - Never increase the enrichment cache TTL above 10s — a longer TTL delays deep mode socket discovery for new processes
277
+
278
+ ### 10. UI Emoji Safety
279
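+ **Rule**: Never use emoji or other variable-width symbols (🌡, ⊞, etc.) in widget rendering — their display width differs across terminals. (Byte length is not the criterion: the safe characters listed below are also multi-byte in UTF-8.)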
280
+ - GPU card header uses ASCII separators (`|`) not Unicode box-drawing or emoji
281
+ - Phase badges use safe Unicode: `●`, `▲`, `▼`, `◆`
282
+ - Verdict badges use safe Unicode: `✓`, `⚠`, `✗`
283
+ - Memory bar uses safe box-drawing: `━`, `╌`, `─`, `├`, `┤`