gpu-container 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. gpu_container-0.1.0/.dockerignore +10 -0
  2. gpu_container-0.1.0/.github/workflows/ci.yml +57 -0
  3. gpu_container-0.1.0/.github/workflows/pages.yml +50 -0
  4. gpu_container-0.1.0/.github/workflows/release.yml +216 -0
  5. gpu_container-0.1.0/.gitignore +58 -0
  6. gpu_container-0.1.0/CHANGELOG.md +26 -0
  7. gpu_container-0.1.0/Dockerfile +30 -0
  8. gpu_container-0.1.0/LICENSE +21 -0
  9. gpu_container-0.1.0/PKG-INFO +100 -0
  10. gpu_container-0.1.0/README.es.md +82 -0
  11. gpu_container-0.1.0/README.fr.md +82 -0
  12. gpu_container-0.1.0/README.hi.md +82 -0
  13. gpu_container-0.1.0/README.it.md +84 -0
  14. gpu_container-0.1.0/README.ja.md +82 -0
  15. gpu_container-0.1.0/README.md +82 -0
  16. gpu_container-0.1.0/README.pt-BR.md +84 -0
  17. gpu_container-0.1.0/README.zh.md +82 -0
  18. gpu_container-0.1.0/RELEASE_ASSESSMENT.md +87 -0
  19. gpu_container-0.1.0/SCORECARD.md +30 -0
  20. gpu_container-0.1.0/SECURITY.md +40 -0
  21. gpu_container-0.1.0/SHIP_GATE.md +81 -0
  22. gpu_container-0.1.0/assets/logo.png +0 -0
  23. gpu_container-0.1.0/docs/architecture.md +109 -0
  24. gpu_container-0.1.0/docs/cli.md +213 -0
  25. gpu_container-0.1.0/docs/constraints.md +64 -0
  26. gpu_container-0.1.0/docs/decisions/0001-per-expert-cache-build-vs-upstream.md +55 -0
  27. gpu_container-0.1.0/docs/derisk-concentration.md +95 -0
  28. gpu_container-0.1.0/docs/feasibility.md +67 -0
  29. gpu_container-0.1.0/docs/features.md +136 -0
  30. gpu_container-0.1.0/docs/moe-lane-architecture.md +98 -0
  31. gpu_container-0.1.0/docs/prior-art.md +103 -0
  32. gpu_container-0.1.0/docs/quickstart.md +111 -0
  33. gpu_container-0.1.0/gpu_container/__init__.py +9 -0
  34. gpu_container-0.1.0/gpu_container/__main__.py +60 -0
  35. gpu_container-0.1.0/gpu_container/errors.py +72 -0
  36. gpu_container-0.1.0/gpu_container/planner/__init__.py +17 -0
  37. gpu_container-0.1.0/gpu_container/planner/activation.py +225 -0
  38. gpu_container-0.1.0/gpu_container/planner/calibration.py +224 -0
  39. gpu_container-0.1.0/gpu_container/planner/calibration_seed.json +44 -0
  40. gpu_container-0.1.0/gpu_container/planner/cli.py +101 -0
  41. gpu_container-0.1.0/gpu_container/planner/concentration_cli.py +120 -0
  42. gpu_container-0.1.0/gpu_container/planner/placement.py +198 -0
  43. gpu_container-0.1.0/gpu_container/planner/receipt.py +155 -0
  44. gpu_container-0.1.0/gpu_container/planner/receipt_cli.py +143 -0
  45. gpu_container-0.1.0/gpu_container/profiler/__init__.py +24 -0
  46. gpu_container-0.1.0/gpu_container/profiler/baseline.py +122 -0
  47. gpu_container-0.1.0/gpu_container/profiler/cli.py +151 -0
  48. gpu_container-0.1.0/gpu_container/profiler/cuda_bench.py +306 -0
  49. gpu_container-0.1.0/gpu_container/profiler/hardware.py +304 -0
  50. gpu_container-0.1.0/gpu_container/profiler/model.py +178 -0
  51. gpu_container-0.1.0/gpu_container/profiler/nvme_bench.py +158 -0
  52. gpu_container-0.1.0/gpu_container/profiler/schema.py +245 -0
  53. gpu_container-0.1.0/gpu_container/watchdog.py +563 -0
  54. gpu_container-0.1.0/npm/LICENSE +21 -0
  55. gpu_container-0.1.0/npm/README.md +16 -0
  56. gpu_container-0.1.0/npm/bin/gpu-container.js +18 -0
  57. gpu_container-0.1.0/npm/package.json +38 -0
  58. gpu_container-0.1.0/pyproject.toml +35 -0
  59. gpu_container-0.1.0/scripts/gen_calibration_seed.py +102 -0
  60. gpu_container-0.1.0/scripts/ingest_sweep.py +111 -0
  61. gpu_container-0.1.0/scripts/verify.py +58 -0
  62. gpu_container-0.1.0/site/astro.config.mjs +30 -0
  63. gpu_container-0.1.0/site/package-lock.json +7926 -0
  64. gpu_container-0.1.0/site/package.json +18 -0
  65. gpu_container-0.1.0/site/src/content/docs/handbook/cli.md +77 -0
  66. gpu_container-0.1.0/site/src/content/docs/handbook/derisk.md +47 -0
  67. gpu_container-0.1.0/site/src/content/docs/handbook/getting-started.md +75 -0
  68. gpu_container-0.1.0/site/src/content/docs/handbook/index.md +48 -0
  69. gpu_container-0.1.0/site/src/content/docs/handbook/moe-lane.md +44 -0
  70. gpu_container-0.1.0/site/src/content/docs/handbook/reference.md +65 -0
  71. gpu_container-0.1.0/site/src/content/docs/handbook/safety.md +56 -0
  72. gpu_container-0.1.0/site/src/content.config.ts +7 -0
  73. gpu_container-0.1.0/site/src/pages/index.astro +33 -0
  74. gpu_container-0.1.0/site/src/site-config.ts +86 -0
  75. gpu_container-0.1.0/site/src/styles/global.css +3 -0
  76. gpu_container-0.1.0/site/src/styles/starlight-custom.css +5 -0
  77. gpu_container-0.1.0/site/tsconfig.json +5 -0
  78. gpu_container-0.1.0/tests/test_activation.py +101 -0
  79. gpu_container-0.1.0/tests/test_calibration.py +223 -0
  80. gpu_container-0.1.0/tests/test_concentration_cli.py +45 -0
  81. gpu_container-0.1.0/tests/test_dispatch.py +19 -0
  82. gpu_container-0.1.0/tests/test_errors.py +57 -0
  83. gpu_container-0.1.0/tests/test_measure.py +145 -0
  84. gpu_container-0.1.0/tests/test_planner.py +132 -0
  85. gpu_container-0.1.0/tests/test_profiler.py +59 -0
  86. gpu_container-0.1.0/tests/test_receipt_trace.py +73 -0
  87. gpu_container-0.1.0/tests/test_watchdog.py +306 -0
  88. gpu_container-0.1.0/watchdog.example.json +9 -0
@@ -0,0 +1,10 @@
1
+ .git
2
+ .gitignore
3
+ .pytest_cache
4
+ **/__pycache__
5
+ *.pyc
6
+ docs
7
+ tests
8
+ KICKOFF-*.md
9
+ *.egg-info
10
+ profile*.json
@@ -0,0 +1,57 @@
1
+ name: CI
2
+
3
+ # Paths-gated per the org GitHub Actions rules: only run when code, tests, packaging,
4
+ # the verify script, or the workflow itself change. workflow_dispatch is the manual fallback.
5
+ on:
6
+ push:
7
+ paths:
8
+ - 'pyproject.toml'
9
+ - 'gpu_container/**'
10
+ - 'tests/**'
11
+ - 'scripts/**'
12
+ - '.github/workflows/**'
13
+ pull_request:
14
+ paths:
15
+ - 'pyproject.toml'
16
+ - 'gpu_container/**'
17
+ - 'tests/**'
18
+ - 'scripts/**'
19
+ - '.github/workflows/**'
20
+ workflow_dispatch:
21
+
22
+ concurrency:
23
+ group: ${{ github.workflow }}-${{ github.ref }}
24
+ cancel-in-progress: true
25
+
26
+ permissions:
27
+ contents: read
28
+
29
+ jobs:
30
+ test:
31
+ runs-on: ubuntu-latest
32
+ strategy:
33
+ fail-fast: false
34
+ matrix:
35
+ python-version: ['3.11', '3.12']
36
+ steps:
37
+ - uses: actions/checkout@v4
38
+ - uses: actions/setup-python@v5
39
+ with:
40
+ python-version: ${{ matrix.python-version }}
41
+ - run: python -m pip install --upgrade pip
42
+ - run: pip install -e ".[dev,host]"
43
+ # The verify script runs the test suite + a CLI smoke of all five commands in one command (gate D1).
44
+ - run: python scripts/verify.py
45
+
46
+ deps:
47
+ # Dependency scanning (shipcheck gate D3). The core package has zero required deps;
48
+ # this scans the optional [host] extras (psutil, numpy) + the build toolchain.
49
+ runs-on: ubuntu-latest
50
+ steps:
51
+ - uses: actions/checkout@v4
52
+ - uses: actions/setup-python@v5
53
+ with:
54
+ python-version: '3.12'
55
+ - run: python -m pip install --upgrade pip
56
+ - run: pip install -e ".[host]" pip-audit
57
+ - run: pip-audit
@@ -0,0 +1,50 @@
1
+ name: Deploy site to GitHub Pages
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ paths:
7
+ - 'site/**'
8
+ - '.github/workflows/pages.yml'
9
+ workflow_dispatch:
10
+
11
+ concurrency:
12
+ group: ${{ github.workflow }}-${{ github.ref }}
13
+ cancel-in-progress: true
14
+
15
+ permissions:
16
+ contents: read
17
+ pages: write
18
+ id-token: write
19
+
20
+ jobs:
21
+ build:
22
+ runs-on: ubuntu-latest
23
+ steps:
24
+ - uses: actions/checkout@v4
25
+
26
+ - uses: actions/setup-node@v4
27
+ with:
28
+ node-version: 22
29
+
30
+ - name: Install site dependencies
31
+ working-directory: site
32
+ run: npm ci
33
+
34
+ - name: Build site
35
+ working-directory: site
36
+ run: npm run build
37
+
38
+ - uses: actions/upload-pages-artifact@v3
39
+ with:
40
+ path: site/dist
41
+
42
+ deploy:
43
+ needs: build
44
+ runs-on: ubuntu-latest
45
+ environment:
46
+ name: github-pages
47
+ url: ${{ steps.deployment.outputs.page_url }}
48
+ steps:
49
+ - id: deployment
50
+ uses: actions/deploy-pages@v4
@@ -0,0 +1,216 @@
1
+ name: Release
2
+
3
+ # On a published GitHub Release this:
4
+ # 1. publishes the Python package to PyPI via Trusted Publishing (OIDC, no token),
5
+ # 2. builds PyInstaller binaries for each platform + uploads them (+ checksums) to the release,
6
+ # 3. publishes the `gpu-container` npm launcher via Trusted Publishing,
7
+ # 4. builds + pushes the profiler Docker image to ghcr.io.
8
+ #
9
+ # Org rule: publish workflows fire on `release: published` only. The repo has ci.yml + this
10
+ # release.yml (the org 2-workflow CI/publish pair) + pages.yml (the GitHub-Pages-deploy exception,
11
+ # paths-gated to site/**). The cross-OS binary matrix (macos/windows) is the explicit-request
12
+ # exception to the "1 OS / no macos" rule — the npm launcher distributes platform binaries.
13
+ #
14
+ # First PyPI publish needs a PyPI *pending publisher* (project gpu-container, workflow release.yml,
15
+ # environment "(Any)" — these jobs declare no GH environment). First npm publish needs a placeholder
16
+ # publish + a Trusted Publisher on npmjs.com for `gpu-container` (workflow release.yml).
17
+
18
+ on:
19
+ release:
20
+ types: [published]
21
+ workflow_dispatch:
22
+
23
+ permissions:
24
+ contents: read
25
+
26
+ concurrency:
27
+ group: ${{ github.workflow }}-${{ github.ref }}
28
+ cancel-in-progress: false # never cancel an in-flight publish
29
+
30
+ jobs:
31
+ pypi:
32
+ name: Publish to PyPI (Trusted Publishing)
33
+ runs-on: ubuntu-latest
34
+ permissions:
35
+ id-token: write # OIDC handshake for PyPI Trusted Publishing — the only auth needed
36
+ timeout-minutes: 15
37
+ steps:
38
+ - uses: actions/checkout@v5
39
+
40
+ - name: Verify the tag matches pyproject version
41
+ run: |
42
+ TAG="${GITHUB_REF_NAME#v}"
43
+ PKG=$(grep -m1 '^version = ' pyproject.toml | sed -E 's/version = "(.*)"/\1/')
44
+ echo "tag=${TAG} pyproject=${PKG}"
45
+ if [ "${TAG}" != "${PKG}" ]; then
46
+ echo "::error::release tag ${TAG} does not match pyproject version ${PKG}"
47
+ exit 1
48
+ fi
49
+
50
+ - uses: astral-sh/setup-uv@v6
51
+
52
+ - name: Build sdist + wheel
53
+ run: uv build
54
+
55
+ - name: Check distribution metadata
56
+ run: uvx twine check dist/*
57
+
58
+ - name: Publish to PyPI
59
+ # Trusted Publishing: no token. PEP 740 attestations are on by default (action >= v1.11.0).
60
+ uses: pypa/gh-action-pypi-publish@release/v1
61
+
62
+ build-binaries:
63
+ name: Build binary (${{ matrix.target }})
64
+ strategy:
65
+ fail-fast: false # one OS's failure must not cancel the others
66
+ matrix:
67
+ include:
68
+ - os: ubuntu-latest
69
+ target: linux-x64
70
+ ext: ""
71
+ - os: macos-latest
72
+ target: darwin-arm64
73
+ ext: ""
74
+ - os: windows-latest
75
+ target: win-x64
76
+ ext: ".exe"
77
+ runs-on: ${{ matrix.os }}
78
+ timeout-minutes: 20
79
+ steps:
80
+ - uses: actions/checkout@v5
81
+ - uses: astral-sh/setup-uv@v6
82
+ - run: uv python install 3.12
83
+ - run: uv venv
84
+
85
+ - name: Install gpu-container + PyInstaller
86
+ # [host] = psutil + numpy (system RAM, CPU-bandwidth probe, the watchdog). pynvml is optional
87
+ # (the profiler falls back to nvidia-smi); the binary still profiles + plans + watches.
88
+ run: uv pip install ".[host]" "pyinstaller>=6.9.0"
89
+
90
+ - name: Build binary
91
+ shell: bash
92
+ run: |
93
+ VERSION=${GITHUB_REF_NAME#v}
94
+ uv run pyinstaller --onefile --name gpu-container --console \
95
+ --collect-submodules gpu_container \
96
+ --copy-metadata gpu-container \
97
+ gpu_container/__main__.py
98
+ OUTNAME="gpu-container-${VERSION}-${{ matrix.target }}${{ matrix.ext }}"
99
+ mv "dist/gpu-container${{ matrix.ext }}" "dist/${OUTNAME}"
100
+ echo "ASSET_NAME=${OUTNAME}" >> "$GITHUB_ENV"
101
+
102
+ - name: Smoke-test the binary
103
+ shell: bash
104
+ run: dist/${{ env.ASSET_NAME }} --version
105
+
106
+ - uses: actions/upload-artifact@v4
107
+ with:
108
+ name: binary-${{ matrix.target }}
109
+ path: dist/${{ env.ASSET_NAME }}
110
+
111
+ release-binaries:
112
+ name: Upload binaries + checksums to the release
113
+ needs: build-binaries
114
+ runs-on: ubuntu-latest
115
+ permissions:
116
+ contents: write # upload assets to the release
117
+ steps:
118
+ - uses: actions/download-artifact@v4
119
+ with:
120
+ path: artifacts
121
+ merge-multiple: true
122
+
123
+ - name: Generate checksums
124
+ shell: bash
125
+ run: |
126
+ VERSION=${GITHUB_REF_NAME#v}
127
+ cd artifacts
128
+ sha256sum * > "checksums-${VERSION}.txt"
129
+ cat "checksums-${VERSION}.txt"
130
+
131
+ - uses: softprops/action-gh-release@v2
132
+ with:
133
+ files: artifacts/*
134
+
135
+ npm:
136
+ name: Publish npm launcher (Trusted Publishing)
137
+ needs: release-binaries # the launcher is only useful once the binaries are on the release
138
+ runs-on: ubuntu-latest
139
+ permissions:
140
+ id-token: write # npm provenance via Sigstore OIDC
141
+ timeout-minutes: 15
142
+ steps:
143
+ - uses: actions/checkout@v5
144
+
145
+ - name: Verify the npm launcher version matches the tag
146
+ run: |
147
+ TAG="${GITHUB_REF_NAME#v}"
148
+ PKG=$(node -p "require('./npm/package.json').version")
149
+ echo "tag=${TAG} npm=${PKG}"
150
+ if [ "${TAG}" != "${PKG}" ]; then
151
+ echo "::error::release tag ${TAG} does not match npm/package.json version ${PKG}"
152
+ exit 1
153
+ fi
154
+
155
+ - uses: actions/setup-node@v4
156
+ with:
157
+ node-version: "22"
158
+ registry-url: "https://registry.npmjs.org"
159
+
160
+ - name: Install npm >= 11.5 for OIDC trusted-publishing auth
161
+ run: |
162
+ # Node 22's bundled npm 10.9 races on an in-place `npm install -g npm@latest`
163
+ # (MODULE_NOT_FOUND: promise-retry). Install npm@latest into a sandbox and shadow it.
164
+ SANDBOX="$HOME/.npm-cli-sandbox"
165
+ mkdir -p "$SANDBOX"
166
+ pushd "$SANDBOX" >/dev/null
167
+ echo '{"name":"npm-cli-sandbox","version":"0.0.0","private":true}' > package.json
168
+ npm install --no-save --no-audit --no-fund --silent npm@latest
169
+ popd >/dev/null
170
+ echo "$SANDBOX/node_modules/.bin" >> "$GITHUB_PATH"
171
+ "$SANDBOX/node_modules/.bin/npm" --version
172
+
173
+ - name: Publish launcher with provenance (OIDC trusted publisher)
174
+ working-directory: npm
175
+ run: |
176
+ npm install --no-save --no-audit --no-fund
177
+ npm publish --provenance --access public
178
+
179
+ docker:
180
+ name: Build + push Docker image (ghcr)
181
+ runs-on: ubuntu-latest
182
+ permissions:
183
+ contents: read
184
+ packages: write # push to ghcr.io
185
+ timeout-minutes: 30
186
+ steps:
187
+ - uses: actions/checkout@v5
188
+
189
+ - name: Compute image tags
190
+ id: tags
191
+ run: |
192
+ VERSION=${GITHUB_REF_NAME#v}
193
+ IMG="ghcr.io/mcp-tool-shop-org/gpu-container"
194
+ # Always tag the exact version; only move :latest for a non-prerelease (stable) release.
195
+ {
196
+ echo "tags<<EOF"
197
+ echo "${IMG}:${VERSION}"
198
+ if [ "${{ github.event.release.prerelease }}" != "true" ]; then
199
+ echo "${IMG}:latest"
200
+ fi
201
+ echo "EOF"
202
+ } >> "$GITHUB_OUTPUT"
203
+
204
+ - name: Log in to ghcr
205
+ uses: docker/login-action@v3
206
+ with:
207
+ registry: ghcr.io
208
+ username: ${{ github.actor }}
209
+ password: ${{ secrets.GITHUB_TOKEN }}
210
+
211
+ - name: Build + push
212
+ uses: docker/build-push-action@v6
213
+ with:
214
+ context: .
215
+ push: true
216
+ tags: ${{ steps.tags.outputs.tags }}
@@ -0,0 +1,58 @@
1
+ # --- Python ---
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ .venv/
7
+ venv/
8
+ .pytest_cache/
9
+ .mypy_cache/
10
+ .ruff_cache/
11
+
12
+ # --- Node ---
13
+ node_modules/
14
+ npm-debug.log*
15
+ pnpm-debug.log*
16
+ *.tsbuildinfo
17
+
18
+ # --- Build / dist ---
19
+ dist/
20
+ build/
21
+ out/
22
+
23
+ # --- Model weights / large artifacts (NEVER commit) ---
24
+ *.gguf
25
+ *.safetensors
26
+ *.bin
27
+ *.pt
28
+ *.pth
29
+ *.onnx
30
+ models/
31
+ weights/
32
+ .cache/
33
+
34
+ # --- Runtime outputs (profiles + receipts are generated, not committed) ---
35
+ receipts/
36
+ profiles/out/
37
+ *.receipt.json
38
+ profile*.json
39
+ plan*.json
40
+ bench*.json
41
+ receipt*.json
42
+ prof_dry.json
43
+ *.config.json
44
+ *-err.txt
45
+
46
+ # --- Secrets / env ---
47
+ .env
48
+ .env.*
49
+ !.env.example
50
+
51
+ # --- OS / editor ---
52
+ .DS_Store
53
+ Thumbs.db
54
+ .idea/
55
+ .vscode/
56
+ *.log
57
+ site/.astro/
58
+ .polyglot-cache.json
@@ -0,0 +1,26 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/).
7
+
8
+ ## [Unreleased]
9
+
10
+ The full feature set below is built and tested; it becomes `[0.1.0]` at the first release.
11
+
12
+ ### Added
13
+ - **Hardware + model profiler** (`gpu-container-profile`) — measured PCIe H2D/D2H, NVMe sequential + random-QD1, pinnable-RAM ceiling, CPU RAM bandwidth (all measured in-container, `None`-not-guess); closed-form model param-split (expert vs always-resident) and KV growth.
14
+ - **Placement planner** (`gpu-container-plan`) — minimal llama.cpp `--n-cpu-moe N` to fit VRAM, a roofline ceiling **and** a calibrated forecast band, and an honest ship/refuse verdict at the >1 tok/s floor.
15
+ - **Receipt + recalibration loop** (`gpu-container-receipt`) — pairs a `llama-bench` run with the plan's forecast, records realized efficiency / within-band, and writes a calibration point back so the next plan is calibrated. The verifier is a real GPU run, a different mechanism than the planner's math.
16
+ - **Routing de-risk gate** (`gpu-container-concentration`) — scores expert-routing concentration (`hot_frac_for_coverage`, `concentration_score`) from an activation trace or a `llama-imatrix` capture, to decide whether per-expert caching is worth building. Backed by [ADR-0001](docs/decisions/0001-per-expert-cache-build-vs-upstream.md).
17
+ - **Rig-safety watchdog** (`gpu-container-watchdog`) — polls GPU power/temp/VRAM (worst-case across all GPUs) + host memory against thresholds; emits ok/warn/abort (exit 0/5/7). A **supervisor mode** (`run -- <cmd>`) launches a GPU job as a child, polls in parallel, and aborts on a breach via `kill-job` (soft) or `wsl-shutdown` (catastrophic). Peak metrics export to the receipt (`--peaks-out` → `--peaks`) prove a run stayed inside the safe envelope. Shipped `watchdog.example.json`; `mem_source` tags host vs WSL2 VM; `--log` JSONL trajectory.
18
+ - **Docs** — `docs/cli.md` (CLI reference), `docs/quickstart.md` (end-to-end walkthrough), `docs/derisk-concentration.md` (the de-risk methodology), `docs/architecture.md`, `docs/features.md`, `docs/moe-lane-architecture.md`, `docs/feasibility.md`, ADR-0001.
19
+
20
+ ### Fixed
21
+ - Planner emits `-fa on` (current llama.cpp rejects a value-less `-fa`).
22
+ - Receipt: the safety-envelope verdict no longer clobbers the throughput `within_band` verdict (independent fields).
23
+
24
+ ### Notes
25
+ - Runtime support: **llama.cpp** is the integrated backend; the placement math is backend-agnostic and vLLM/Accelerate/ExLlamaV2/TensorRT-LLM are designed targets.
26
+ - Per-expert tiering is gated behind the de-risk gate + the upstream llama.cpp `#20757` mechanism (ADR-0001); the per-layer hot tier ships today.
@@ -0,0 +1,30 @@
1
+ # gpu-container profiler — runs INSIDE the container, the only honest measurement vantage
2
+ # (docker-knowledge wave-2 hw-measurement). Base = CUDA 12.8 *runtime*: it ships libcudart
3
+ # for the ctypes PCIe bench and targets sm_120 (RTX 5090 / Blackwell). No nvcc needed —
4
+ # cudaMemcpy/cudaHostAlloc are copy-engine ops, not compiled device kernels.
5
+ #
6
+ # Build: docker build -t gpu-container .
7
+ # Run: docker run --rm --gpus all -v gpc-bench:/bench gpu-container # full profile
8
+ # docker run --rm --gpus all gpu-container --no-bench # identity only
9
+ # docker run --rm --gpus all -v gpc-bench:/bench -v "$PWD":/out \
10
+ # gpu-container -o /out/profile.json
11
+ FROM nvidia/cuda:12.8.1-runtime-ubuntu24.04
12
+
13
+ # fio = NVMe seq + random-QD1 (pulls libaio); python3/pip = the profiler itself.
14
+ RUN apt-get update && apt-get install -y --no-install-recommends \
15
+ python3 python3-pip fio \
16
+ && rm -rf /var/lib/apt/lists/*
17
+
18
+ WORKDIR /app
19
+ COPY pyproject.toml README.md LICENSE ./
20
+ COPY gpu_container ./gpu_container
21
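+ # [gpu]=nvidia-ml-py (imported as pynvml; NVML v2 VRAM), [host]=psutil + numpy (RAM, CPU-bandwidth probe). No venv here: the image's system site-packages is fine to write into, hence --break-system-packages.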
22
+ RUN pip3 install --no-cache-dir --break-system-packages ".[gpu,host]"
23
+
24
+ # The NVMe bench writes here. Mount an ext4-backed volume (named volumes live on the WSL2
25
+ # ext4 vdisk — fast); NEVER a /mnt/<letter> drvfs bind (9p, ~5-10x slower) or the overlay
26
+ # layer (breaks O_DIRECT). The profiler refuses the wrong filesystem rather than mismeasure.
27
+ ENV GPU_CONTAINER_BENCH_DIR=/bench
28
+ VOLUME ["/bench"]
29
+
30
+ ENTRYPOINT ["python3", "-m", "gpu_container.profiler.cli"]
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 mcp-tool-shop
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,100 @@
1
+ Metadata-Version: 2.4
2
+ Name: gpu-container
3
+ Version: 0.1.0
4
+ Summary: Model-aware inference memory-placement planner for single-GPU rigs — profile, plan, prove.
5
+ Author-email: mcp-tool-shop <64996768+mcp-tool-shop@users.noreply.github.com>
6
+ License-Expression: MIT
7
+ License-File: LICENSE
8
+ Keywords: gpu,inference,llm,moe,offload,placement,profiler,vram
9
+ Requires-Python: >=3.10
10
+ Provides-Extra: dev
11
+ Requires-Dist: pytest>=8.0; extra == 'dev'
12
+ Provides-Extra: gpu
13
+ Requires-Dist: nvidia-ml-py>=12.535.0; extra == 'gpu'
14
+ Provides-Extra: host
15
+ Requires-Dist: numpy>=1.24; extra == 'host'
16
+ Requires-Dist: psutil>=5.9.0; extra == 'host'
17
+ Description-Content-Type: text/markdown
18
+
19
+ <p align="center">
20
+ <a href="README.ja.md">日本語</a> | <a href="README.zh.md">中文</a> | <a href="README.es.md">Español</a> | <a href="README.fr.md">Français</a> | <a href="README.hi.md">हिन्दी</a> | <a href="README.it.md">Italiano</a> | <a href="README.pt-BR.md">Português (BR)</a>
21
+ </p>
22
+
23
+ <div align="center">
24
+
25
+ <img src="https://raw.githubusercontent.com/mcp-tool-shop-org/gpu-container/main/assets/logo.png" width="400" alt="gpu-container" />
26
+
27
+ [![CI](https://github.com/mcp-tool-shop-org/gpu-container/actions/workflows/ci.yml/badge.svg)](https://github.com/mcp-tool-shop-org/gpu-container/actions/workflows/ci.yml)
28
+ [![PyPI](https://img.shields.io/pypi/v/gpu-container)](https://pypi.org/project/gpu-container/)
29
+ [![npm](https://img.shields.io/npm/v/gpu-container)](https://www.npmjs.com/package/gpu-container)
30
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
31
+ [![Handbook](https://img.shields.io/badge/handbook-docs-blue)](https://mcp-tool-shop-org.github.io/gpu-container/)
32
+
33
+ **A GPU-enabled container exposes the device. A model-aware runtime decides what lives in VRAM, pinned RAM, and NVMe.**
34
+
35
+ </div>
36
+
37
+ Run the largest useful local model your machine can honestly support, with explicit placement plans, benchmark receipts, and refusal when the plan would thrash.
38
+
39
+ ## Architecture
40
+
41
+ ```
42
+ Windows / WSL2 / Linux host
43
+ └─ GPU-enabled Docker container
44
+ └─ Inference runtime
45
+ ├─ VRAM: hot weights, active layers, activations, KV working set
46
+ ├─ pinned RAM: CPU-offloaded weights, MoE experts, KV spill/reuse
47
+ └─ NVMe: mmap shards, disk offload, cold experts, cold KV
48
+ ```
49
+
50
+ ## Product Boundary
51
+
52
+ ```
53
+ Docker = packaging + GPU exposure
54
+ CUDA/runtime = compute backend
55
+ Planner = memory law
56
+ Inference engine = execution
57
+ ```
58
+
59
+ ## Core Features
60
+
61
+ 1. **Hardware profiler** — Detect VRAM, RAM, GPU type, WSL/native Linux, NVMe speed, CUDA availability
62
+ 2. **Model profiler** — Detect dense vs MoE, largest layer, total weights, quantization, KV growth by context length
63
+ 3. **Runtime planner** — Generate launch plans for llama.cpp, vLLM, Accelerate, TensorRT-LLM, or DeepSpeed-style offload
64
+ 4. **Placement receipt** — Show what is in VRAM, what is in RAM, what is on disk, expected bottleneck, measured tokens/sec
65
+ 5. **MoE-specialized path** — Keep always-active layers on GPU, route experts to CPU/RAM, NVMe for cold fallback
66
+ 6. **Routing de-risk** — Measure whether a model's MoE routing is skewed enough that a per-expert cache would help, before building for it (`gpu-container-concentration`)
67
+ 7. **Rig-safety watchdog** — Poll GPU power/temperature/VRAM + host memory against configurable thresholds; an AI agent or an autonomous loop aborts a run before it endangers the machine (`gpu-container-watchdog`)
68
+
69
+ ## Key Constraint
70
+
71
+ On Windows/WSL, CUDA Unified Memory oversubscription is **not the path**. CUDA treats Windows/WSL as limited unified-memory support — no fine-grained GPU page-fault migration, no GPU-memory oversubscription beyond physical VRAM. This product is **explicit inference memory placement**, not "Docker VRAM overflow."
72
+
73
+ ## Status
74
+
75
+ Built and working today: `gpu-container-profile`, `gpu-container-plan`, `gpu-container-receipt` (with the recalibration loop), `gpu-container-concentration` (routing de-risk), and `gpu-container-watchdog` (supervise a GPU job safely). llama.cpp is the integrated backend; the placement math is backend-agnostic. Start with the [quickstart](docs/quickstart.md).
76
+
77
+ ## Privacy & safety
78
+
79
+ `gpu-container` is a **local, offline tool** — it makes no network calls and collects **no telemetry**, by default or otherwise. It reads GPU metrics (`nvidia-smi` / NVML) and host memory (`psutil`), the model `config.json` you supply, and the JSON files you point it at; it writes only to the output paths you specify. It does **not** read or transmit model weights, credentials, or tokens. Host-level actions (`wsl --shutdown`, `docker stop`, `kill`) run only when you explicitly opt in via the watchdog's `--on-breach`; the defaults never touch your machine beyond the job they supervise. Full policy: [SECURITY.md](SECURITY.md).
80
+
81
+ ## Documentation
82
+
83
+ - [`docs/quickstart.md`](docs/quickstart.md) — end-to-end walkthrough: profile → plan → launch under the watchdog → receipt → recalibrate
84
+ - [`docs/cli.md`](docs/cli.md) — the five commands: synopsis, flags, exit codes, worked examples
85
+ - [`docs/architecture.md`](docs/architecture.md) — memory-tier model, data flow, MoE expert routing, the recalibration loop
86
+ - [`docs/features.md`](docs/features.md) — the seven core features in depth
87
+ - [`docs/moe-lane-architecture.md`](docs/moe-lane-architecture.md) — the flagship MoE lane in depth
88
+ - [`docs/derisk-concentration.md`](docs/derisk-concentration.md) — the per-expert-cache de-risk gate (routing concentration)
89
+ - [`docs/decisions/0001-per-expert-cache-build-vs-upstream.md`](docs/decisions/0001-per-expert-cache-build-vs-upstream.md) — ADR-0001: consume the cache mechanism, contribute the policy
90
+ - [`docs/constraints.md`](docs/constraints.md) — non-goals + the Windows/WSL CUDA Unified-Memory correction
91
+ - [`docs/prior-art.md`](docs/prior-art.md) — runtimes we orchestrate, and the gap this product fills
92
+ - [`docs/feasibility.md`](docs/feasibility.md) — feasibility assessment, research grounding, and what's confirmed live
93
+
94
+ ---
95
+
96
+ <div align="center">
97
+
98
+ Built by <a href="https://mcp-tool-shop.github.io/">MCP Tool Shop</a> · MIT Licensed
99
+
100
+ </div>
@@ -0,0 +1,82 @@
1
+ <p align="center">
2
+ <a href="README.ja.md">日本語</a> | <a href="README.zh.md">中文</a> | <a href="README.md">English</a> | <a href="README.fr.md">Français</a> | <a href="README.hi.md">हिन्दी</a> | <a href="README.it.md">Italiano</a> | <a href="README.pt-BR.md">Português (BR)</a>
3
+ </p>
4
+
5
+ <div align="center">
6
+
7
+ <img src="https://raw.githubusercontent.com/mcp-tool-shop-org/gpu-container/main/assets/logo.png" width="400" alt="gpu-container" />
8
+
9
+ [![CI](https://github.com/mcp-tool-shop-org/gpu-container/actions/workflows/ci.yml/badge.svg)](https://github.com/mcp-tool-shop-org/gpu-container/actions/workflows/ci.yml)
10
+ [![PyPI](https://img.shields.io/pypi/v/gpu-container)](https://pypi.org/project/gpu-container/)
11
+ [![npm](https://img.shields.io/npm/v/gpu-container)](https://www.npmjs.com/package/gpu-container)
12
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
13
+ [![Handbook](https://img.shields.io/badge/handbook-docs-blue)](https://mcp-tool-shop-org.github.io/gpu-container/)
14
+
15
+ **Un contenedor habilitado para GPU expone el dispositivo. Un entorno de ejecución consciente del modelo decide qué se almacena en la VRAM, la RAM fijada (pinned) y la NVMe.**
16
+
17
+ </div>
18
+
19
+ Ejecute el modelo local más grande y útil que su máquina pueda soportar de manera realista, con planes de ubicación explícitos, informes de pruebas de rendimiento y rechazo cuando el plan provocaría thrashing de memoria.
20
+
21
+ ## Arquitectura
22
+
23
+ ```
24
+ Windows / WSL2 / Linux host
25
+ └─ GPU-enabled Docker container
26
+ └─ Inference runtime
27
+ ├─ VRAM: hot weights, active layers, activations, KV working set
28
+ ├─ pinned RAM: CPU-offloaded weights, MoE experts, KV spill/reuse
29
+ └─ NVMe: mmap shards, disk offload, cold experts, cold KV
30
+ ```
31
+
32
+ ## Límite del producto
33
+
34
+ ```
35
+ Docker = packaging + GPU exposure
36
+ CUDA/runtime = compute backend
37
+ Planner = memory law
38
+ Inference engine = execution
39
+ ```
40
+
41
+ ## Características principales
42
+
43
+ 1. **Perfilador de hardware:** Detecta la VRAM, la RAM, el tipo de GPU, WSL/Linux nativo, la velocidad de NVMe y la disponibilidad de CUDA.
44
+ 2. **Perfilador de modelo:** Detecta si es denso o MoE, la capa más grande, el número total de parámetros, la cuantificación y el crecimiento de KV según la longitud del contexto.
45
+ 3. **Planificador de entorno de ejecución:** Genera planes de inicio para llama.cpp, vLLM, Accelerate, TensorRT-LLM o la descarga al estilo DeepSpeed.
46
+ 4. **Informe de ubicación:** Muestra qué hay en la VRAM, qué hay en la RAM, qué hay en el disco, el cuello de botella esperado y los tokens/segundo medidos.
47
+ 5. **Ruta especializada para MoE:** Mantiene las capas siempre activas en la GPU, dirige los expertos a la CPU/RAM y usa la NVMe como respaldo para los expertos fríos.
48
+ 6. **Reducción de riesgos en el enrutamiento:** Mide si el enrutamiento MoE de un modelo está lo suficientemente sesgado como para que una caché por experto sea útil, antes de construirlo (`gpu-container-concentration`).
49
+ 7. **Vigilante de seguridad del sistema:** Supervisa la potencia/temperatura/VRAM de la GPU y la memoria del host en comparación con los umbrales configurables; un agente de IA o un bucle autónomo aborta una ejecución antes de que ponga en peligro la máquina (`gpu-container-watchdog`).
50
+
51
+ ## Restricción clave
52
+
53
+ En Windows/WSL, la sobreasignación de memoria unificada de CUDA **no es la solución**. CUDA trata Windows/WSL como un soporte limitado para la memoria unificada: no hay migración de páginas de GPU de grano fino, ni sobreasignación de memoria de GPU más allá de la VRAM física. Este producto es la **ubicación explícita de la memoria de inferencia**, no el "desbordamiento de VRAM de Docker".
54
+
55
+ ## Estado
56
+
57
+ Construido y funcionando hoy: `gpu-container-profile`, `gpu-container-plan`, `gpu-container-receipt` (con el bucle de recalibración), `gpu-container-concentration` (reducción de riesgos en el enrutamiento) y `gpu-container-watchdog` (supervisión segura de un trabajo de GPU). llama.cpp es el backend integrado; las matemáticas de ubicación son independientes del backend. Comience con el [inicio rápido](docs/quickstart.md).
58
+
59
+ ## Privacidad y seguridad
60
+
61
+ `gpu-container` es una **herramienta local y sin conexión**: no realiza llamadas de red y no recopila **ninguna telemetría**, ni por defecto ni de otra manera. Lee las métricas de la GPU (`nvidia-smi` / NVML) y la memoria del host (`psutil`), el archivo `config.json` del modelo que proporciona y los archivos JSON a los que lo apunta; solo escribe en las rutas de salida que especifica. **No** lee ni transmite los pesos del modelo, las credenciales ni los tokens. Las acciones a nivel de host (`wsl --shutdown`, `docker stop`, `kill`) solo se ejecutan cuando usted acepta explícitamente a través del parámetro `--on-breach` del vigilante; los valores predeterminados nunca tocan su máquina más allá del trabajo que supervisan. Política completa: [SECURITY.md](SECURITY.md).
62
+
63
+ ## Documentación
64
+
65
+ - [`docs/quickstart.md`](docs/quickstart.md): recorrido completo: perfil → plan → inicio bajo el vigilante → informe → recalibración
66
+ - [`docs/cli.md`](docs/cli.md): los cinco comandos: resumen, opciones, códigos de salida, ejemplos prácticos
67
+ - [`docs/architecture.md`](docs/architecture.md): modelo de niveles de memoria, flujo de datos, enrutamiento de expertos MoE, el bucle de recalibración
68
+ - [`docs/features.md`](docs/features.md): las siete características principales en detalle
69
+ - [`docs/moe-lane-architecture.md`](docs/moe-lane-architecture.md): la ruta MoE insignia en detalle
70
+ - [`docs/derisk-concentration.md`](docs/derisk-concentration.md): la puerta de reducción de riesgos de la caché por experto (concentración del enrutamiento)
71
+ - [`docs/decisions/0001-per-expert-cache-build-vs-upstream.md`](docs/decisions/0001-per-expert-cache-build-vs-upstream.md): ADR-0001: consumir el mecanismo de caché, aportar la política
72
+ - [`docs/constraints.md`](docs/constraints.md): lo que no es objetivo del producto + la corrección sobre la memoria unificada de CUDA en Windows/WSL
73
+ - [`docs/prior-art.md`](docs/prior-art.md): entornos de ejecución que orquestamos y la brecha que llena este producto
74
+ - [`docs/feasibility.md`](docs/feasibility.md): evaluación de viabilidad, base de investigación y lo que se ha confirmado que funciona
75
+
76
+ ---
77
+
78
+ <div align="center">
79
+
80
+ Creado por <a href="https://mcp-tool-shop.github.io/">MCP Tool Shop</a> · Licencia MIT
81
+
82
+ </div>