gpu-container 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpu_container-0.1.0/.dockerignore +10 -0
- gpu_container-0.1.0/.github/workflows/ci.yml +57 -0
- gpu_container-0.1.0/.github/workflows/pages.yml +50 -0
- gpu_container-0.1.0/.github/workflows/release.yml +216 -0
- gpu_container-0.1.0/.gitignore +58 -0
- gpu_container-0.1.0/CHANGELOG.md +26 -0
- gpu_container-0.1.0/Dockerfile +30 -0
- gpu_container-0.1.0/LICENSE +21 -0
- gpu_container-0.1.0/PKG-INFO +100 -0
- gpu_container-0.1.0/README.es.md +82 -0
- gpu_container-0.1.0/README.fr.md +82 -0
- gpu_container-0.1.0/README.hi.md +82 -0
- gpu_container-0.1.0/README.it.md +84 -0
- gpu_container-0.1.0/README.ja.md +82 -0
- gpu_container-0.1.0/README.md +82 -0
- gpu_container-0.1.0/README.pt-BR.md +84 -0
- gpu_container-0.1.0/README.zh.md +82 -0
- gpu_container-0.1.0/RELEASE_ASSESSMENT.md +87 -0
- gpu_container-0.1.0/SCORECARD.md +30 -0
- gpu_container-0.1.0/SECURITY.md +40 -0
- gpu_container-0.1.0/SHIP_GATE.md +81 -0
- gpu_container-0.1.0/assets/logo.png +0 -0
- gpu_container-0.1.0/docs/architecture.md +109 -0
- gpu_container-0.1.0/docs/cli.md +213 -0
- gpu_container-0.1.0/docs/constraints.md +64 -0
- gpu_container-0.1.0/docs/decisions/0001-per-expert-cache-build-vs-upstream.md +55 -0
- gpu_container-0.1.0/docs/derisk-concentration.md +95 -0
- gpu_container-0.1.0/docs/feasibility.md +67 -0
- gpu_container-0.1.0/docs/features.md +136 -0
- gpu_container-0.1.0/docs/moe-lane-architecture.md +98 -0
- gpu_container-0.1.0/docs/prior-art.md +103 -0
- gpu_container-0.1.0/docs/quickstart.md +111 -0
- gpu_container-0.1.0/gpu_container/__init__.py +9 -0
- gpu_container-0.1.0/gpu_container/__main__.py +60 -0
- gpu_container-0.1.0/gpu_container/errors.py +72 -0
- gpu_container-0.1.0/gpu_container/planner/__init__.py +17 -0
- gpu_container-0.1.0/gpu_container/planner/activation.py +225 -0
- gpu_container-0.1.0/gpu_container/planner/calibration.py +224 -0
- gpu_container-0.1.0/gpu_container/planner/calibration_seed.json +44 -0
- gpu_container-0.1.0/gpu_container/planner/cli.py +101 -0
- gpu_container-0.1.0/gpu_container/planner/concentration_cli.py +120 -0
- gpu_container-0.1.0/gpu_container/planner/placement.py +198 -0
- gpu_container-0.1.0/gpu_container/planner/receipt.py +155 -0
- gpu_container-0.1.0/gpu_container/planner/receipt_cli.py +143 -0
- gpu_container-0.1.0/gpu_container/profiler/__init__.py +24 -0
- gpu_container-0.1.0/gpu_container/profiler/baseline.py +122 -0
- gpu_container-0.1.0/gpu_container/profiler/cli.py +151 -0
- gpu_container-0.1.0/gpu_container/profiler/cuda_bench.py +306 -0
- gpu_container-0.1.0/gpu_container/profiler/hardware.py +304 -0
- gpu_container-0.1.0/gpu_container/profiler/model.py +178 -0
- gpu_container-0.1.0/gpu_container/profiler/nvme_bench.py +158 -0
- gpu_container-0.1.0/gpu_container/profiler/schema.py +245 -0
- gpu_container-0.1.0/gpu_container/watchdog.py +563 -0
- gpu_container-0.1.0/npm/LICENSE +21 -0
- gpu_container-0.1.0/npm/README.md +16 -0
- gpu_container-0.1.0/npm/bin/gpu-container.js +18 -0
- gpu_container-0.1.0/npm/package.json +38 -0
- gpu_container-0.1.0/pyproject.toml +35 -0
- gpu_container-0.1.0/scripts/gen_calibration_seed.py +102 -0
- gpu_container-0.1.0/scripts/ingest_sweep.py +111 -0
- gpu_container-0.1.0/scripts/verify.py +58 -0
- gpu_container-0.1.0/site/astro.config.mjs +30 -0
- gpu_container-0.1.0/site/package-lock.json +7926 -0
- gpu_container-0.1.0/site/package.json +18 -0
- gpu_container-0.1.0/site/src/content/docs/handbook/cli.md +77 -0
- gpu_container-0.1.0/site/src/content/docs/handbook/derisk.md +47 -0
- gpu_container-0.1.0/site/src/content/docs/handbook/getting-started.md +75 -0
- gpu_container-0.1.0/site/src/content/docs/handbook/index.md +48 -0
- gpu_container-0.1.0/site/src/content/docs/handbook/moe-lane.md +44 -0
- gpu_container-0.1.0/site/src/content/docs/handbook/reference.md +65 -0
- gpu_container-0.1.0/site/src/content/docs/handbook/safety.md +56 -0
- gpu_container-0.1.0/site/src/content.config.ts +7 -0
- gpu_container-0.1.0/site/src/pages/index.astro +33 -0
- gpu_container-0.1.0/site/src/site-config.ts +86 -0
- gpu_container-0.1.0/site/src/styles/global.css +3 -0
- gpu_container-0.1.0/site/src/styles/starlight-custom.css +5 -0
- gpu_container-0.1.0/site/tsconfig.json +5 -0
- gpu_container-0.1.0/tests/test_activation.py +101 -0
- gpu_container-0.1.0/tests/test_calibration.py +223 -0
- gpu_container-0.1.0/tests/test_concentration_cli.py +45 -0
- gpu_container-0.1.0/tests/test_dispatch.py +19 -0
- gpu_container-0.1.0/tests/test_errors.py +57 -0
- gpu_container-0.1.0/tests/test_measure.py +145 -0
- gpu_container-0.1.0/tests/test_planner.py +132 -0
- gpu_container-0.1.0/tests/test_profiler.py +59 -0
- gpu_container-0.1.0/tests/test_receipt_trace.py +73 -0
- gpu_container-0.1.0/tests/test_watchdog.py +306 -0
- gpu_container-0.1.0/watchdog.example.json +9 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
# Paths-gated per the org GitHub Actions rules: only run when code, tests, packaging,
|
|
4
|
+
# the verify script, or the workflow itself change. workflow_dispatch is the manual fallback.
|
|
5
|
+
on:
|
|
6
|
+
push:
|
|
7
|
+
paths:
|
|
8
|
+
- 'pyproject.toml'
|
|
9
|
+
- 'gpu_container/**'
|
|
10
|
+
- 'tests/**'
|
|
11
|
+
- 'scripts/**'
|
|
12
|
+
- '.github/workflows/**'
|
|
13
|
+
pull_request:
|
|
14
|
+
paths:
|
|
15
|
+
- 'pyproject.toml'
|
|
16
|
+
- 'gpu_container/**'
|
|
17
|
+
- 'tests/**'
|
|
18
|
+
- 'scripts/**'
|
|
19
|
+
- '.github/workflows/**'
|
|
20
|
+
workflow_dispatch:
|
|
21
|
+
|
|
22
|
+
concurrency:
|
|
23
|
+
group: ${{ github.workflow }}-${{ github.ref }}
|
|
24
|
+
cancel-in-progress: true
|
|
25
|
+
|
|
26
|
+
permissions:
|
|
27
|
+
contents: read
|
|
28
|
+
|
|
29
|
+
jobs:
|
|
30
|
+
test:
|
|
31
|
+
runs-on: ubuntu-latest
|
|
32
|
+
strategy:
|
|
33
|
+
fail-fast: false
|
|
34
|
+
matrix:
|
|
35
|
+
python-version: ['3.11', '3.12']
|
|
36
|
+
steps:
|
|
37
|
+
- uses: actions/checkout@v4
|
|
38
|
+
- uses: actions/setup-python@v5
|
|
39
|
+
with:
|
|
40
|
+
python-version: ${{ matrix.python-version }}
|
|
41
|
+
- run: python -m pip install --upgrade pip
|
|
42
|
+
- run: pip install -e ".[dev,host]"
|
|
43
|
+
# The verify script runs the test suite + a CLI smoke of all five commands in one command (gate D1).
|
|
44
|
+
- run: python scripts/verify.py
|
|
45
|
+
|
|
46
|
+
deps:
|
|
47
|
+
# Dependency scanning (shipcheck gate D3). The core package has zero required deps;
|
|
48
|
+
# this scans the optional [host] extras (psutil, numpy) + the build toolchain.
|
|
49
|
+
runs-on: ubuntu-latest
|
|
50
|
+
steps:
|
|
51
|
+
- uses: actions/checkout@v4
|
|
52
|
+
- uses: actions/setup-python@v5
|
|
53
|
+
with:
|
|
54
|
+
python-version: '3.12'
|
|
55
|
+
- run: python -m pip install --upgrade pip
|
|
56
|
+
- run: pip install -e ".[host]" pip-audit
|
|
57
|
+
- run: pip-audit
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
name: Deploy site to GitHub Pages
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
paths:
|
|
7
|
+
- 'site/**'
|
|
8
|
+
- '.github/workflows/pages.yml'
|
|
9
|
+
workflow_dispatch:
|
|
10
|
+
|
|
11
|
+
concurrency:
|
|
12
|
+
group: ${{ github.workflow }}-${{ github.ref }}
|
|
13
|
+
cancel-in-progress: true
|
|
14
|
+
|
|
15
|
+
permissions:
|
|
16
|
+
contents: read
|
|
17
|
+
pages: write
|
|
18
|
+
id-token: write
|
|
19
|
+
|
|
20
|
+
jobs:
|
|
21
|
+
build:
|
|
22
|
+
runs-on: ubuntu-latest
|
|
23
|
+
steps:
|
|
24
|
+
- uses: actions/checkout@v4
|
|
25
|
+
|
|
26
|
+
- uses: actions/setup-node@v4
|
|
27
|
+
with:
|
|
28
|
+
node-version: 22
|
|
29
|
+
|
|
30
|
+
- name: Install site dependencies
|
|
31
|
+
working-directory: site
|
|
32
|
+
run: npm ci
|
|
33
|
+
|
|
34
|
+
- name: Build site
|
|
35
|
+
working-directory: site
|
|
36
|
+
run: npm run build
|
|
37
|
+
|
|
38
|
+
- uses: actions/upload-pages-artifact@v3
|
|
39
|
+
with:
|
|
40
|
+
path: site/dist
|
|
41
|
+
|
|
42
|
+
deploy:
|
|
43
|
+
needs: build
|
|
44
|
+
runs-on: ubuntu-latest
|
|
45
|
+
environment:
|
|
46
|
+
name: github-pages
|
|
47
|
+
url: ${{ steps.deployment.outputs.page_url }}
|
|
48
|
+
steps:
|
|
49
|
+
- id: deployment
|
|
50
|
+
uses: actions/deploy-pages@v4
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
# On a published GitHub Release this:
|
|
4
|
+
# 1. publishes the Python package to PyPI via Trusted Publishing (OIDC, no token),
|
|
5
|
+
# 2. builds PyInstaller binaries for each platform + uploads them (+ checksums) to the release,
|
|
6
|
+
# 3. publishes the `gpu-container` npm launcher via Trusted Publishing,
|
|
7
|
+
# 4. builds + pushes the profiler Docker image to ghcr.io.
|
|
8
|
+
#
|
|
9
|
+
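# Org rule: publish workflows fire on `release: published` only (the `workflow_dispatch` trigger below is a manual fallback; run it from the release tag, since the version checks compare against the ref name). The repo has ci.yml + this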
|
|
10
|
+
# release.yml (the org 2-workflow CI/publish pair) + pages.yml (the GitHub-Pages-deploy exception,
|
|
11
|
+
# paths-gated to site/**). The cross-OS binary matrix (macos/windows) is the explicit-request
|
|
12
|
+
# exception to the "1 OS / no macos" rule — the npm launcher distributes platform binaries.
|
|
13
|
+
#
|
|
14
|
+
# First PyPI publish needs a PyPI *pending publisher* (project gpu-container, workflow release.yml,
|
|
15
|
+
# environment "(Any)" — these jobs declare no GH environment). First npm publish needs a placeholder
|
|
16
|
+
# publish + a Trusted Publisher on npmjs.com for `gpu-container` (workflow release.yml).
|
|
17
|
+
|
|
18
|
+
on:
|
|
19
|
+
release:
|
|
20
|
+
types: [published]
|
|
21
|
+
workflow_dispatch:
|
|
22
|
+
|
|
23
|
+
permissions:
|
|
24
|
+
contents: read
|
|
25
|
+
|
|
26
|
+
concurrency:
|
|
27
|
+
group: ${{ github.workflow }}-${{ github.ref }}
|
|
28
|
+
cancel-in-progress: false # never cancel an in-flight publish
|
|
29
|
+
|
|
30
|
+
jobs:
|
|
31
|
+
pypi:
|
|
32
|
+
name: Publish to PyPI (Trusted Publishing)
|
|
33
|
+
runs-on: ubuntu-latest
|
|
34
|
+
permissions:
|
|
35
|
+
id-token: write # OIDC handshake for PyPI Trusted Publishing — the only auth needed
|
|
36
|
+
timeout-minutes: 15
|
|
37
|
+
steps:
|
|
38
|
+
- uses: actions/checkout@v5
|
|
39
|
+
|
|
40
|
+
- name: Verify the tag matches pyproject version
|
|
41
|
+
run: |
|
|
42
|
+
TAG="${GITHUB_REF_NAME#v}"
|
|
43
|
+
PKG=$(grep -m1 '^version = ' pyproject.toml | sed -E 's/version = "(.*)"/\1/')
|
|
44
|
+
echo "tag=${TAG} pyproject=${PKG}"
|
|
45
|
+
if [ "${TAG}" != "${PKG}" ]; then
|
|
46
|
+
echo "::error::release tag ${TAG} does not match pyproject version ${PKG}"
|
|
47
|
+
exit 1
|
|
48
|
+
fi
|
|
49
|
+
|
|
50
|
+
- uses: astral-sh/setup-uv@v6
|
|
51
|
+
|
|
52
|
+
- name: Build sdist + wheel
|
|
53
|
+
run: uv build
|
|
54
|
+
|
|
55
|
+
- name: Check distribution metadata
|
|
56
|
+
run: uvx twine check dist/*
|
|
57
|
+
|
|
58
|
+
- name: Publish to PyPI
|
|
59
|
+
# Trusted Publishing: no token. PEP 740 attestations are on by default (action >= v1.11.0).
|
|
60
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
61
|
+
|
|
62
|
+
build-binaries:
|
|
63
|
+
name: Build binary (${{ matrix.target }})
|
|
64
|
+
strategy:
|
|
65
|
+
fail-fast: false # one OS's failure must not cancel the others
|
|
66
|
+
matrix:
|
|
67
|
+
include:
|
|
68
|
+
- os: ubuntu-latest
|
|
69
|
+
target: linux-x64
|
|
70
|
+
ext: ""
|
|
71
|
+
- os: macos-latest
|
|
72
|
+
target: darwin-arm64
|
|
73
|
+
ext: ""
|
|
74
|
+
- os: windows-latest
|
|
75
|
+
target: win-x64
|
|
76
|
+
ext: ".exe"
|
|
77
|
+
runs-on: ${{ matrix.os }}
|
|
78
|
+
timeout-minutes: 20
|
|
79
|
+
steps:
|
|
80
|
+
- uses: actions/checkout@v5
|
|
81
|
+
- uses: astral-sh/setup-uv@v6
|
|
82
|
+
- run: uv python install 3.12
|
|
83
|
+
- run: uv venv
|
|
84
|
+
|
|
85
|
+
- name: Install gpu-container + PyInstaller
|
|
86
|
+
# [host] = psutil + numpy (system RAM, CPU-bandwidth probe, the watchdog). pynvml is optional
|
|
87
|
+
# (the profiler falls back to nvidia-smi); the binary still profiles + plans + watches.
|
|
88
|
+
run: uv pip install ".[host]" "pyinstaller>=6.9.0"
|
|
89
|
+
|
|
90
|
+
- name: Build binary
|
|
91
|
+
shell: bash
|
|
92
|
+
run: |
|
|
93
|
+
VERSION=${GITHUB_REF_NAME#v}
|
|
94
|
+
uv run pyinstaller --onefile --name gpu-container --console \
|
|
95
|
+
--collect-submodules gpu_container \
|
|
96
|
+
--copy-metadata gpu-container \
|
|
97
|
+
gpu_container/__main__.py
|
|
98
|
+
OUTNAME="gpu-container-${VERSION}-${{ matrix.target }}${{ matrix.ext }}"
|
|
99
|
+
mv "dist/gpu-container${{ matrix.ext }}" "dist/${OUTNAME}"
|
|
100
|
+
echo "ASSET_NAME=${OUTNAME}" >> "$GITHUB_ENV"
|
|
101
|
+
|
|
102
|
+
- name: Smoke-test the binary
|
|
103
|
+
shell: bash
|
|
104
|
+
run: dist/${{ env.ASSET_NAME }} --version
|
|
105
|
+
|
|
106
|
+
- uses: actions/upload-artifact@v4
|
|
107
|
+
with:
|
|
108
|
+
name: binary-${{ matrix.target }}
|
|
109
|
+
path: dist/${{ env.ASSET_NAME }}
|
|
110
|
+
|
|
111
|
+
release-binaries:
|
|
112
|
+
name: Upload binaries + checksums to the release
|
|
113
|
+
needs: build-binaries
|
|
114
|
+
runs-on: ubuntu-latest
|
|
115
|
+
permissions:
|
|
116
|
+
contents: write # upload assets to the release
|
|
117
|
+
steps:
|
|
118
|
+
- uses: actions/download-artifact@v4
|
|
119
|
+
with:
|
|
120
|
+
path: artifacts
|
|
121
|
+
merge-multiple: true
|
|
122
|
+
|
|
123
|
+
- name: Generate checksums
|
|
124
|
+
shell: bash
|
|
125
|
+
run: |
|
|
126
|
+
VERSION=${GITHUB_REF_NAME#v}
|
|
127
|
+
cd artifacts
|
|
128
|
+
sha256sum * > "checksums-${VERSION}.txt"
|
|
129
|
+
cat "checksums-${VERSION}.txt"
|
|
130
|
+
|
|
131
|
+
- uses: softprops/action-gh-release@v2
|
|
132
|
+
with:
|
|
133
|
+
files: artifacts/*
|
|
134
|
+
|
|
135
|
+
npm:
|
|
136
|
+
name: Publish npm launcher (Trusted Publishing)
|
|
137
|
+
needs: release-binaries # the launcher is only useful once the binaries are on the release
|
|
138
|
+
runs-on: ubuntu-latest
|
|
139
|
+
permissions:
|
|
140
|
+
id-token: write # npm provenance via Sigstore OIDC
|
|
141
|
+
timeout-minutes: 15
|
|
142
|
+
steps:
|
|
143
|
+
- uses: actions/checkout@v5
|
|
144
|
+
|
|
145
|
+
- name: Verify the npm launcher version matches the tag
|
|
146
|
+
run: |
|
|
147
|
+
TAG="${GITHUB_REF_NAME#v}"
|
|
148
|
+
PKG=$(node -p "require('./npm/package.json').version")
|
|
149
|
+
echo "tag=${TAG} npm=${PKG}"
|
|
150
|
+
if [ "${TAG}" != "${PKG}" ]; then
|
|
151
|
+
echo "::error::release tag ${TAG} does not match npm/package.json version ${PKG}"
|
|
152
|
+
exit 1
|
|
153
|
+
fi
|
|
154
|
+
|
|
155
|
+
- uses: actions/setup-node@v4
|
|
156
|
+
with:
|
|
157
|
+
node-version: "22"
|
|
158
|
+
registry-url: "https://registry.npmjs.org"
|
|
159
|
+
|
|
160
|
+
- name: Install npm >= 11.5 for OIDC trusted-publishing auth
|
|
161
|
+
run: |
|
|
162
|
+
# Node 22's bundled npm 10.9 races on an in-place `npm install -g npm@latest`
|
|
163
|
+
# (MODULE_NOT_FOUND: promise-retry). Install npm@latest into a sandbox and shadow it.
|
|
164
|
+
SANDBOX="$HOME/.npm-cli-sandbox"
|
|
165
|
+
mkdir -p "$SANDBOX"
|
|
166
|
+
pushd "$SANDBOX" >/dev/null
|
|
167
|
+
echo '{"name":"npm-cli-sandbox","version":"0.0.0","private":true}' > package.json
|
|
168
|
+
npm install --no-save --no-audit --no-fund --silent npm@latest
|
|
169
|
+
popd >/dev/null
|
|
170
|
+
echo "$SANDBOX/node_modules/.bin" >> "$GITHUB_PATH"
|
|
171
|
+
"$SANDBOX/node_modules/.bin/npm" --version
|
|
172
|
+
|
|
173
|
+
- name: Publish launcher with provenance (OIDC trusted publisher)
|
|
174
|
+
working-directory: npm
|
|
175
|
+
run: |
|
|
176
|
+
npm install --no-save --no-audit --no-fund
|
|
177
|
+
npm publish --provenance --access public
|
|
178
|
+
|
|
179
|
+
docker:
|
|
180
|
+
name: Build + push Docker image (ghcr)
|
|
181
|
+
runs-on: ubuntu-latest
|
|
182
|
+
permissions:
|
|
183
|
+
contents: read
|
|
184
|
+
packages: write # push to ghcr.io
|
|
185
|
+
timeout-minutes: 30
|
|
186
|
+
steps:
|
|
187
|
+
- uses: actions/checkout@v5
|
|
188
|
+
|
|
189
|
+
- name: Compute image tags
|
|
190
|
+
id: tags
|
|
191
|
+
run: |
|
|
192
|
+
VERSION=${GITHUB_REF_NAME#v}
|
|
193
|
+
IMG="ghcr.io/mcp-tool-shop-org/gpu-container"
|
|
194
|
+
# Always tag the exact version; only move :latest for a non-prerelease (stable) release.
|
|
195
|
+
{
|
|
196
|
+
echo "tags<<EOF"
|
|
197
|
+
echo "${IMG}:${VERSION}"
|
|
198
|
+
if [ "${{ github.event.release.prerelease }}" != "true" ]; then
|
|
199
|
+
echo "${IMG}:latest"
|
|
200
|
+
fi
|
|
201
|
+
echo "EOF"
|
|
202
|
+
} >> "$GITHUB_OUTPUT"
|
|
203
|
+
|
|
204
|
+
- name: Log in to ghcr
|
|
205
|
+
uses: docker/login-action@v3
|
|
206
|
+
with:
|
|
207
|
+
registry: ghcr.io
|
|
208
|
+
username: ${{ github.actor }}
|
|
209
|
+
password: ${{ secrets.GITHUB_TOKEN }}
|
|
210
|
+
|
|
211
|
+
- name: Build + push
|
|
212
|
+
uses: docker/build-push-action@v6
|
|
213
|
+
with:
|
|
214
|
+
context: .
|
|
215
|
+
push: true
|
|
216
|
+
tags: ${{ steps.tags.outputs.tags }}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# --- Python ---
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.eggs/
|
|
6
|
+
.venv/
|
|
7
|
+
venv/
|
|
8
|
+
.pytest_cache/
|
|
9
|
+
.mypy_cache/
|
|
10
|
+
.ruff_cache/
|
|
11
|
+
|
|
12
|
+
# --- Node ---
|
|
13
|
+
node_modules/
|
|
14
|
+
npm-debug.log*
|
|
15
|
+
pnpm-debug.log*
|
|
16
|
+
*.tsbuildinfo
|
|
17
|
+
|
|
18
|
+
# --- Build / dist ---
|
|
19
|
+
dist/
|
|
20
|
+
build/
|
|
21
|
+
out/
|
|
22
|
+
|
|
23
|
+
# --- Model weights / large artifacts (NEVER commit) ---
|
|
24
|
+
*.gguf
|
|
25
|
+
*.safetensors
|
|
26
|
+
*.bin
|
|
27
|
+
*.pt
|
|
28
|
+
*.pth
|
|
29
|
+
*.onnx
|
|
30
|
+
models/
|
|
31
|
+
weights/
|
|
32
|
+
.cache/
|
|
33
|
+
|
|
34
|
+
# --- Runtime outputs (profiles + receipts are generated, not committed) ---
|
|
35
|
+
receipts/
|
|
36
|
+
profiles/out/
|
|
37
|
+
*.receipt.json
|
|
38
|
+
profile*.json
|
|
39
|
+
plan*.json
|
|
40
|
+
bench*.json
|
|
41
|
+
receipt*.json
|
|
42
|
+
prof_dry.json
|
|
43
|
+
*.config.json
|
|
44
|
+
*-err.txt
|
|
45
|
+
|
|
46
|
+
# --- Secrets / env ---
|
|
47
|
+
.env
|
|
48
|
+
.env.*
|
|
49
|
+
!.env.example
|
|
50
|
+
|
|
51
|
+
# --- OS / editor ---
|
|
52
|
+
.DS_Store
|
|
53
|
+
Thumbs.db
|
|
54
|
+
.idea/
|
|
55
|
+
.vscode/
|
|
56
|
+
*.log
|
|
57
|
+
site/.astro/
|
|
58
|
+
.polyglot-cache.json
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
The full feature set below is built and tested; it becomes `[0.1.0]` at the first release.
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- **Hardware + model profiler** (`gpu-container-profile`) — measured PCIe H2D/D2H, NVMe sequential + random-QD1, pinnable-RAM ceiling, CPU RAM bandwidth (all measured in-container, `None`-not-guess); closed-form model param-split (expert vs always-resident) and KV growth.
|
|
14
|
+
- **Placement planner** (`gpu-container-plan`) — minimal llama.cpp `--n-cpu-moe N` to fit VRAM, a roofline ceiling **and** a calibrated forecast band, and an honest ship/refuse verdict at the >1 tok/s floor.
|
|
15
|
+
- **Receipt + recalibration loop** (`gpu-container-receipt`) — pairs a `llama-bench` run with the plan's forecast, records realized efficiency / within-band, and writes a calibration point back so the next plan is calibrated. The verifier is a real GPU run, a different mechanism than the planner's math.
|
|
16
|
+
- **Routing de-risk gate** (`gpu-container-concentration`) — scores expert-routing concentration (`hot_frac_for_coverage`, `concentration_score`) from an activation trace or a `llama-imatrix` capture, to decide whether per-expert caching is worth building. Backed by [ADR-0001](docs/decisions/0001-per-expert-cache-build-vs-upstream.md).
|
|
17
|
+
- **Rig-safety watchdog** (`gpu-container-watchdog`) — polls GPU power/temp/VRAM (worst-case across all GPUs) + host memory against thresholds; emits ok/warn/abort (exit 0/5/7). A **supervisor mode** (`run -- <cmd>`) launches a GPU job as a child, polls in parallel, and aborts on a breach via `kill-job` (soft) or `wsl-shutdown` (catastrophic). Peak metrics export to the receipt (`--peaks-out` → `--peaks`) prove a run stayed inside the safe envelope. Shipped `watchdog.example.json`; `mem_source` tags host vs WSL2 VM; `--log` JSONL trajectory.
|
|
18
|
+
- **Docs** — `docs/cli.md` (CLI reference), `docs/quickstart.md` (end-to-end walkthrough), `docs/derisk-concentration.md` (the de-risk methodology), `docs/architecture.md`, `docs/features.md`, `docs/moe-lane-architecture.md`, `docs/feasibility.md`, ADR-0001.
|
|
19
|
+
|
|
20
|
+
### Fixed
|
|
21
|
+
- Planner emits `-fa on` (current llama.cpp rejects a value-less `-fa`).
|
|
22
|
+
- Receipt: the safety-envelope verdict no longer clobbers the throughput `within_band` verdict (independent fields).
|
|
23
|
+
|
|
24
|
+
### Notes
|
|
25
|
+
- Runtime support: **llama.cpp** is the integrated backend; the placement math is backend-agnostic and vLLM/Accelerate/ExLlamaV2/TensorRT-LLM are designed targets.
|
|
26
|
+
- Per-expert tiering is gated behind the de-risk gate + the upstream llama.cpp `#20757` mechanism (ADR-0001); the per-layer hot tier ships today.
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# gpu-container profiler — runs INSIDE the container, the only honest measurement vantage
|
|
2
|
+
# (docker-knowledge wave-2 hw-measurement). Base = CUDA 12.8 *runtime*: it ships libcudart
|
|
3
|
+
# for the ctypes PCIe bench and targets sm_120 (RTX 5090 / Blackwell). No nvcc needed —
|
|
4
|
+
# cudaMemcpy/cudaHostAlloc are copy-engine ops, not compiled device kernels.
|
|
5
|
+
#
|
|
6
|
+
# Build: docker build -t gpu-container .
|
|
7
|
+
# Run: docker run --rm --gpus all -v gpc-bench:/bench gpu-container # full profile
|
|
8
|
+
# docker run --rm --gpus all gpu-container --no-bench # identity only
|
|
9
|
+
# docker run --rm --gpus all -v gpc-bench:/bench -v "$PWD":/out \
|
|
10
|
+
# gpu-container -o /out/profile.json
|
|
11
|
+
FROM nvidia/cuda:12.8.1-runtime-ubuntu24.04
|
|
12
|
+
|
|
13
|
+
# fio = NVMe seq + random-QD1 (pulls libaio); python3/pip = the profiler itself.
|
|
14
|
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
15
|
+
python3 python3-pip fio \
|
|
16
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
17
|
+
|
|
18
|
+
WORKDIR /app
|
|
19
|
+
COPY pyproject.toml README.md LICENSE ./
|
|
20
|
+
COPY gpu_container ./gpu_container
|
|
21
|
+
# [gpu]=nvidia-ml-py (the pynvml module; NVML v2 VRAM), [host]=psutil + numpy (RAM, CPU-bandwidth probe). No venv: the image's system Python is disposable, hence --break-system-packages.
|
|
22
|
+
RUN pip3 install --no-cache-dir --break-system-packages ".[gpu,host]"
|
|
23
|
+
|
|
24
|
+
# The NVMe bench writes here. Mount an ext4-backed volume (named volumes live on the WSL2
|
|
25
|
+
# ext4 vdisk — fast); NEVER a /mnt/<letter> drvfs bind (9p, ~5-10x slower) or the overlay
|
|
26
|
+
# layer (breaks O_DIRECT). The profiler refuses the wrong filesystem rather than mismeasure.
|
|
27
|
+
ENV GPU_CONTAINER_BENCH_DIR=/bench
|
|
28
|
+
VOLUME ["/bench"]
|
|
29
|
+
|
|
30
|
+
ENTRYPOINT ["python3", "-m", "gpu_container.profiler.cli"]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 mcp-tool-shop
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gpu-container
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Model-aware inference memory-placement planner for single-GPU rigs — profile, plan, prove.
|
|
5
|
+
Author-email: mcp-tool-shop <64996768+mcp-tool-shop@users.noreply.github.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Keywords: gpu,inference,llm,moe,offload,placement,profiler,vram
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Provides-Extra: dev
|
|
11
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
12
|
+
Provides-Extra: gpu
|
|
13
|
+
Requires-Dist: nvidia-ml-py>=12.535.0; extra == 'gpu'
|
|
14
|
+
Provides-Extra: host
|
|
15
|
+
Requires-Dist: numpy>=1.24; extra == 'host'
|
|
16
|
+
Requires-Dist: psutil>=5.9.0; extra == 'host'
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
<p align="center">
|
|
20
|
+
<a href="README.ja.md">日本語</a> | <a href="README.zh.md">中文</a> | <a href="README.es.md">Español</a> | <a href="README.fr.md">Français</a> | <a href="README.hi.md">हिन्दी</a> | <a href="README.it.md">Italiano</a> | <a href="README.pt-BR.md">Português (BR)</a>
|
|
21
|
+
</p>
|
|
22
|
+
|
|
23
|
+
<div align="center">
|
|
24
|
+
|
|
25
|
+
<img src="https://raw.githubusercontent.com/mcp-tool-shop-org/gpu-container/main/assets/logo.png" width="400" alt="gpu-container" />
|
|
26
|
+
|
|
27
|
+
[![CI](https://github.com/mcp-tool-shop-org/gpu-container/actions/workflows/ci.yml/badge.svg)](https://github.com/mcp-tool-shop-org/gpu-container/actions/workflows/ci.yml)
|
|
28
|
+
[](https://pypi.org/project/gpu-container/)
|
|
29
|
+
[](https://www.npmjs.com/package/gpu-container)
|
|
30
|
+
[](LICENSE)
|
|
31
|
+
[](https://mcp-tool-shop-org.github.io/gpu-container/)
|
|
32
|
+
|
|
33
|
+
**A GPU-enabled container exposes the device. A model-aware runtime decides what lives in VRAM, pinned RAM, and NVMe.**
|
|
34
|
+
|
|
35
|
+
</div>
|
|
36
|
+
|
|
37
|
+
Run the largest useful local model your machine can honestly support, with explicit placement plans, benchmark receipts, and refusal when the plan would thrash.
|
|
38
|
+
|
|
39
|
+
## Architecture
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
Windows / WSL2 / Linux host
|
|
43
|
+
└─ GPU-enabled Docker container
|
|
44
|
+
└─ Inference runtime
|
|
45
|
+
├─ VRAM: hot weights, active layers, activations, KV working set
|
|
46
|
+
├─ pinned RAM: CPU-offloaded weights, MoE experts, KV spill/reuse
|
|
47
|
+
└─ NVMe: mmap shards, disk offload, cold experts, cold KV
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Product Boundary
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
Docker = packaging + GPU exposure
|
|
54
|
+
CUDA/runtime = compute backend
|
|
55
|
+
Planner = memory law
|
|
56
|
+
Inference engine = execution
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Core Features
|
|
60
|
+
|
|
61
|
+
1. **Hardware profiler** — Detect VRAM, RAM, GPU type, WSL/native Linux, NVMe speed, CUDA availability
|
|
62
|
+
2. **Model profiler** — Detect dense vs MoE, largest layer, total weights, quantization, KV growth by context length
|
|
63
|
+
3. **Runtime planner** — Generate launch plans for llama.cpp, vLLM, Accelerate, TensorRT-LLM, or DeepSpeed-style offload
|
|
64
|
+
4. **Placement receipt** — Show what is in VRAM, what is in RAM, what is on disk, expected bottleneck, measured tokens/sec
|
|
65
|
+
5. **MoE-specialized path** — Keep always-active layers on GPU, route experts to CPU/RAM, NVMe for cold fallback
|
|
66
|
+
6. **Routing de-risk** — Measure whether a model's MoE routing is skewed enough that a per-expert cache would help, before building for it (`gpu-container-concentration`)
|
|
67
|
+
7. **Rig-safety watchdog** — Poll GPU power/temperature/VRAM + host memory against configurable thresholds; an AI agent or an autonomous loop aborts a run before it endangers the machine (`gpu-container-watchdog`)
|
|
68
|
+
|
|
69
|
+
## Key Constraint
|
|
70
|
+
|
|
71
|
+
On Windows/WSL, CUDA Unified Memory oversubscription is **not the path**. CUDA provides only limited unified-memory support on Windows/WSL — no fine-grained GPU page-fault migration, no GPU-memory oversubscription beyond physical VRAM. This product is **explicit inference memory placement**, not "Docker VRAM overflow."
|
|
72
|
+
|
|
73
|
+
## Status
|
|
74
|
+
|
|
75
|
+
Built and working today: `gpu-container-profile`, `gpu-container-plan`, `gpu-container-receipt` (with the recalibration loop), `gpu-container-concentration` (routing de-risk), and `gpu-container-watchdog` (supervise a GPU job safely). llama.cpp is the integrated backend; the placement math is backend-agnostic. Start with the [quickstart](docs/quickstart.md).
|
|
76
|
+
|
|
77
|
+
## Privacy & safety
|
|
78
|
+
|
|
79
|
+
`gpu-container` is a **local, offline tool** — it makes no network calls and collects **no telemetry**, by default or otherwise. It reads GPU metrics (`nvidia-smi` / NVML) and host memory (`psutil`), the model `config.json` you supply, and the JSON files you point it at; it writes only to the output paths you specify. It does **not** read or transmit model weights, credentials, or tokens. Host-level actions (`wsl --shutdown`, `docker stop`, `kill`) run only when you explicitly opt in via the watchdog's `--on-breach`; the defaults never touch your machine beyond the job they supervise. Full policy: [SECURITY.md](SECURITY.md).
|
|
80
|
+
|
|
81
|
+
## Documentation
|
|
82
|
+
|
|
83
|
+
- [`docs/quickstart.md`](docs/quickstart.md) — end-to-end walkthrough: profile → plan → launch under the watchdog → receipt → recalibrate
|
|
84
|
+
- [`docs/cli.md`](docs/cli.md) — the five commands: synopsis, flags, exit codes, worked examples
|
|
85
|
+
- [`docs/architecture.md`](docs/architecture.md) — memory-tier model, data flow, MoE expert routing, the recalibration loop
|
|
86
|
+
- [`docs/features.md`](docs/features.md) — the seven core features in depth
|
|
87
|
+
- [`docs/moe-lane-architecture.md`](docs/moe-lane-architecture.md) — the flagship MoE lane in depth
|
|
88
|
+
- [`docs/derisk-concentration.md`](docs/derisk-concentration.md) — the per-expert-cache de-risk gate (routing concentration)
|
|
89
|
+
- [`docs/decisions/0001-per-expert-cache-build-vs-upstream.md`](docs/decisions/0001-per-expert-cache-build-vs-upstream.md) — ADR-0001: consume the cache mechanism, contribute the policy
|
|
90
|
+
- [`docs/constraints.md`](docs/constraints.md) — non-goals + the Windows/WSL CUDA Unified-Memory correction
|
|
91
|
+
- [`docs/prior-art.md`](docs/prior-art.md) — runtimes we orchestrate, and the gap this product fills
|
|
92
|
+
- [`docs/feasibility.md`](docs/feasibility.md) — feasibility assessment, research grounding, and what's confirmed live
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
<div align="center">
|
|
97
|
+
|
|
98
|
+
Built by <a href="https://mcp-tool-shop.github.io/">MCP Tool Shop</a> · MIT Licensed
|
|
99
|
+
|
|
100
|
+
</div>
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<a href="README.ja.md">日本語</a> | <a href="README.zh.md">中文</a> | <a href="README.md">English</a> | <a href="README.fr.md">Français</a> | <a href="README.hi.md">हिन्दी</a> | <a href="README.it.md">Italiano</a> | <a href="README.pt-BR.md">Português (BR)</a>
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<div align="center">
|
|
6
|
+
|
|
7
|
+
<img src="https://raw.githubusercontent.com/mcp-tool-shop-org/gpu-container/main/assets/logo.png" width="400" alt="gpu-container" />
|
|
8
|
+
|
|
9
|
+
[CI](https://github.com/mcp-tool-shop-org/gpu-container/actions/workflows/ci.yml)
|
|
10
|
+
[PyPI](https://pypi.org/project/gpu-container/)
|
|
11
|
+
[npm](https://www.npmjs.com/package/gpu-container)
|
|
12
|
+
[Licencia MIT](LICENSE)
|
|
13
|
+
[Sitio web](https://mcp-tool-shop-org.github.io/gpu-container/)
|
|
14
|
+
|
|
15
|
+
**Un contenedor habilitado para GPU expone el dispositivo. Un entorno de ejecución consciente del modelo decide qué se almacena en la VRAM, la RAM fijada (pinned) y la NVMe.**
|
|
16
|
+
|
|
17
|
+
</div>
|
|
18
|
+
|
|
19
|
+
Ejecute el modelo local más grande y útil que su máquina pueda soportar de manera realista, con planes de ubicación explícitos, informes de pruebas de rendimiento y rechazo cuando el plan cause problemas.
|
|
20
|
+
|
|
21
|
+
## Arquitectura
|
|
22
|
+
|
|
23
|
+
```
|
|
24
|
+
Windows / WSL2 / Linux host
|
|
25
|
+
└─ GPU-enabled Docker container
|
|
26
|
+
└─ Inference runtime
|
|
27
|
+
├─ VRAM: hot weights, active layers, activations, KV working set
|
|
28
|
+
├─ pinned RAM: CPU-offloaded weights, MoE experts, KV spill/reuse
|
|
29
|
+
└─ NVMe: mmap shards, disk offload, cold experts, cold KV
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Límite del producto
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
Docker = packaging + GPU exposure
|
|
36
|
+
CUDA/runtime = compute backend
|
|
37
|
+
Planner = memory law
|
|
38
|
+
Inference engine = execution
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Características principales
|
|
42
|
+
|
|
43
|
+
1. **Perfilador de hardware:** Detecta la VRAM, la RAM, el tipo de GPU, WSL/Linux nativo, la velocidad de NVMe y la disponibilidad de CUDA.
|
|
44
|
+
2. **Perfilador de modelo:** Detecta si es denso o MoE, la capa más grande, el número total de parámetros, la cuantificación y el crecimiento de KV según la longitud del contexto.
|
|
45
|
+
3. **Planificador de entorno de ejecución:** Genera planes de inicio para llama.cpp, vLLM, Accelerate, TensorRT-LLM o la descarga al estilo DeepSpeed.
|
|
46
|
+
4. **Informe de ubicación:** Muestra qué hay en la VRAM, qué hay en la RAM, qué hay en el disco, el cuello de botella esperado y los tokens/segundo medidos.
|
|
47
|
+
5. **Ruta especializada para MoE:** Mantiene las capas siempre activas en la GPU, dirige los expertos a la CPU/RAM y a la NVMe para la recuperación en caso de inactividad.
|
|
48
|
+
6. **Reducción de riesgos en el enrutamiento:** Mide si el enrutamiento MoE de un modelo está lo suficientemente sesgado como para que una caché por experto sea útil, antes de construirla (`gpu-container-concentration`).
|
|
49
|
+
7. **Vigilante de seguridad del sistema:** Supervisa la potencia/temperatura/VRAM de la GPU y la memoria del host en comparación con umbrales configurables, para que un agente de IA o un bucle autónomo aborte una ejecución antes de que ponga en peligro la máquina (`gpu-container-watchdog`).
|
|
50
|
+
|
|
51
|
+
## Restricción clave
|
|
52
|
+
|
|
53
|
+
En Windows/WSL, la sobreasignación de memoria unificada de CUDA **no es la solución**. En Windows/WSL, CUDA solo ofrece un soporte limitado de memoria unificada: no hay migración de páginas de GPU de grano fino ni sobreasignación de memoria de GPU más allá de la VRAM física. Este producto consiste en la **ubicación explícita de la memoria de inferencia**, no en el "desbordamiento de VRAM de Docker".
|
|
54
|
+
|
|
55
|
+
## Estado
|
|
56
|
+
|
|
57
|
+
Construido y funcionando hoy: `gpu-container-profile`, `gpu-container-plan`, `gpu-container-receipt` (con el bucle de recalibración), `gpu-container-concentration` (reducción de riesgos en el enrutamiento) y `gpu-container-watchdog` (supervisión segura de un trabajo de GPU). llama.cpp es el backend integrado; las matemáticas de ubicación son independientes del backend. Comience con el [inicio rápido](docs/quickstart.md).
|
|
58
|
+
|
|
59
|
+
## Privacidad y seguridad
|
|
60
|
+
|
|
61
|
+
`gpu-container` es una **herramienta local y sin conexión**: no realiza llamadas de red y no recopila **ninguna telemetría**, ni por defecto ni de otra manera. Lee las métricas de la GPU (`nvidia-smi` / NVML) y la memoria del host (`psutil`), el archivo `config.json` del modelo que usted proporciona y los archivos JSON que usted le indica; solo escribe en las rutas de salida que usted especifica. **No** lee ni transmite los pesos del modelo, las credenciales ni los tokens. Las acciones a nivel de host (`wsl --shutdown`, `docker stop`, `kill`) solo se ejecutan cuando usted las habilita explícitamente mediante la opción `--on-breach` del vigilante; los valores predeterminados nunca tocan su máquina más allá del trabajo que supervisan. Política completa: [SECURITY.md](SECURITY.md).
|
|
62
|
+
|
|
63
|
+
## Documentación
|
|
64
|
+
|
|
65
|
+
- [`docs/quickstart.md`](docs/quickstart.md): recorrido completo: perfil → plan → inicio bajo el vigilante → informe → recalibración
|
|
66
|
+
- [`docs/cli.md`](docs/cli.md): los cinco comandos: resumen, opciones, códigos de salida, ejemplos prácticos
|
|
67
|
+
- [`docs/architecture.md`](docs/architecture.md): modelo de niveles de memoria, flujo de datos, enrutamiento de expertos MoE, el bucle de recalibración
|
|
68
|
+
- [`docs/features.md`](docs/features.md): las siete características principales en detalle
|
|
69
|
+
- [`docs/moe-lane-architecture.md`](docs/moe-lane-architecture.md): la ruta MoE insignia en detalle
|
|
70
|
+
- [`docs/derisk-concentration.md`](docs/derisk-concentration.md): la puerta de reducción de riesgos de la caché por experto (concentración del enrutamiento)
|
|
71
|
+
- [`docs/decisions/0001-per-expert-cache-build-vs-upstream.md`](docs/decisions/0001-per-expert-cache-build-vs-upstream.md): ADR-0001: consumir el mecanismo de caché, aportar la política
|
|
72
|
+
- [`docs/constraints.md`](docs/constraints.md): no objetivos (lo que queda fuera del alcance) + la corrección sobre la memoria unificada de CUDA en Windows/WSL
|
|
73
|
+
- [`docs/prior-art.md`](docs/prior-art.md): entornos de ejecución que orquestamos y la brecha que llena este producto
|
|
74
|
+
- [`docs/feasibility.md`](docs/feasibility.md): evaluación de viabilidad, base de investigación y lo que se ha confirmado que funciona
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
<div align="center">
|
|
79
|
+
|
|
80
|
+
Creado por <a href="https://mcp-tool-shop.github.io/">MCP Tool Shop</a> · Licencia MIT
|
|
81
|
+
|
|
82
|
+
</div>
|