gpu-gate 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpu_gate-0.2.0/.github/workflows/ci.yml +28 -0
- gpu_gate-0.2.0/.github/workflows/publish.yml +21 -0
- gpu_gate-0.2.0/.gitignore +26 -0
- gpu_gate-0.2.0/.pre-commit-config.yaml +15 -0
- gpu_gate-0.2.0/CHANGELOG.md +30 -0
- gpu_gate-0.2.0/Dockerfile +20 -0
- gpu_gate-0.2.0/LICENSE +21 -0
- gpu_gate-0.2.0/PKG-INFO +153 -0
- gpu_gate-0.2.0/README.md +105 -0
- gpu_gate-0.2.0/pyproject.toml +69 -0
- gpu_gate-0.2.0/src/gpu_gate/__init__.py +15 -0
- gpu_gate-0.2.0/src/gpu_gate/__main__.py +4 -0
- gpu_gate-0.2.0/src/gpu_gate/cli.py +213 -0
- gpu_gate-0.2.0/src/gpu_gate/lock.py +72 -0
- gpu_gate-0.2.0/src/gpu_gate/models.py +61 -0
- gpu_gate-0.2.0/src/gpu_gate/probe.py +92 -0
- gpu_gate-0.2.0/src/gpu_gate/runner.py +109 -0
- gpu_gate-0.2.0/src/gpu_gate/selector.py +61 -0
- gpu_gate-0.2.0/tests/conftest.py +52 -0
- gpu_gate-0.2.0/tests/test_cli.py +81 -0
- gpu_gate-0.2.0/tests/test_lock.py +28 -0
- gpu_gate-0.2.0/tests/test_runner.py +101 -0
- gpu_gate-0.2.0/tests/test_selector.py +70 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
name: ci
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
strategy:
|
|
12
|
+
fail-fast: false
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- name: Install uv
|
|
18
|
+
uses: astral-sh/setup-uv@v3
|
|
19
|
+
with:
|
|
20
|
+
python-version: ${{ matrix.python-version }}
|
|
21
|
+
- name: Sync dependencies
|
|
22
|
+
run: uv sync --all-extras --dev
|
|
23
|
+
- name: Lint
|
|
24
|
+
run: uv run ruff check .
|
|
25
|
+
- name: Format check
|
|
26
|
+
run: uv run ruff format --check .
|
|
27
|
+
- name: Test
|
|
28
|
+
run: uv run pytest --cov --cov-report=term-missing
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
name: publish
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
workflow_dispatch:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
pypi:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
environment: pypi
|
|
12
|
+
permissions:
|
|
13
|
+
id-token: write
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
- name: Install uv
|
|
17
|
+
uses: astral-sh/setup-uv@v3
|
|
18
|
+
- name: Build
|
|
19
|
+
run: uv build
|
|
20
|
+
- name: Publish to PyPI
|
|
21
|
+
run: uv publish
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.eggs/
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
|
|
9
|
+
# Virtual environments
|
|
10
|
+
.venv/
|
|
11
|
+
venv/
|
|
12
|
+
|
|
13
|
+
# uv
|
|
14
|
+
uv.lock
|
|
15
|
+
|
|
16
|
+
# Test and coverage
|
|
17
|
+
.pytest_cache/
|
|
18
|
+
.coverage
|
|
19
|
+
.coverage.*
|
|
20
|
+
htmlcov/
|
|
21
|
+
.ruff_cache/
|
|
22
|
+
|
|
23
|
+
# Editor / OS
|
|
24
|
+
.vscode/
|
|
25
|
+
.idea/
|
|
26
|
+
.DS_Store
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
3
|
+
rev: v0.6.9
|
|
4
|
+
hooks:
|
|
5
|
+
- id: ruff
|
|
6
|
+
args: [--fix]
|
|
7
|
+
- id: ruff-format
|
|
8
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
9
|
+
rev: v4.6.0
|
|
10
|
+
hooks:
|
|
11
|
+
- id: end-of-file-fixer
|
|
12
|
+
- id: trailing-whitespace
|
|
13
|
+
- id: check-yaml
|
|
14
|
+
- id: check-toml
|
|
15
|
+
- id: check-merge-conflict
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented here. The format is based
|
|
4
|
+
on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project
|
|
5
|
+
adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
6
|
+
|
|
7
|
+
## [0.2.0] - 2026-05-10
|
|
8
|
+
|
|
9
|
+
### Added
|
|
10
|
+
- Docker image and a published container entry point.
|
|
11
|
+
- Continuous integration across Python 3.10, 3.11 and 3.12 (lint, format
|
|
12
|
+
check and tests).
|
|
13
|
+
- Expanded documentation and usage examples.
|
|
14
|
+
|
|
15
|
+
### Changed
|
|
16
|
+
- Hardened packaging metadata and pinned the supported Python versions.
|
|
17
|
+
|
|
18
|
+
## [0.1.0] - 2026-05-07
|
|
19
|
+
|
|
20
|
+
### Added
|
|
21
|
+
- `run` command: wait for a free GPU, claim it with a cooperative lock, set
|
|
22
|
+
`CUDA_VISIBLE_DEVICES`, and exec the wrapped command.
|
|
23
|
+
- `wait` command: block until a GPU is free and print the chosen index.
|
|
24
|
+
- `status` command: list visible GPUs with free memory and utilization, with
|
|
25
|
+
optional JSON output.
|
|
26
|
+
- Filters for memory, utilization and explicit index include/exclude.
|
|
27
|
+
- Advisory per-device file locks to avoid two runs grabbing the same card.
|
|
28
|
+
|
|
29
|
+
[0.2.0]: https://github.com/jmweb-org/gpu-gate/releases/tag/v0.2.0
|
|
30
|
+
[0.1.0]: https://github.com/jmweb-org/gpu-gate/releases/tag/v0.1.0
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# gpu-gate runs on the host's NVIDIA driver via NVML. Build a small image and
|
|
2
|
+
# run it with `--gpus all` so the container can see the cards:
|
|
3
|
+
#
|
|
4
|
+
# docker build -t gpu-gate .
|
|
5
|
+
# docker run --rm --gpus all gpu-gate status
|
|
6
|
+
#
|
|
7
|
+
FROM python:3.12-slim
|
|
8
|
+
|
|
9
|
+
LABEL org.opencontainers.image.source="https://github.com/jmweb-org/gpu-gate"
|
|
10
|
+
LABEL org.opencontainers.image.description="Wait for a free GPU, claim it, and run a command on it."
|
|
11
|
+
LABEL org.opencontainers.image.licenses="MIT"
|
|
12
|
+
|
|
13
|
+
WORKDIR /app
|
|
14
|
+
COPY pyproject.toml README.md LICENSE ./
|
|
15
|
+
COPY src ./src
|
|
16
|
+
|
|
17
|
+
RUN pip install --no-cache-dir .
|
|
18
|
+
|
|
19
|
+
ENTRYPOINT ["gpu-gate"]
|
|
20
|
+
CMD ["--help"]
|
gpu_gate-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 José del Río
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
gpu_gate-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gpu-gate
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Wait for a free GPU, claim it, and run a command on it.
|
|
5
|
+
Project-URL: Homepage, https://github.com/jmweb-org/gpu-gate
|
|
6
|
+
Project-URL: Repository, https://github.com/jmweb-org/gpu-gate
|
|
7
|
+
Project-URL: Issues, https://github.com/jmweb-org/gpu-gate/issues
|
|
8
|
+
Author: José del Río
|
|
9
|
+
License: MIT License
|
|
10
|
+
|
|
11
|
+
Copyright (c) 2026 José del Río
|
|
12
|
+
|
|
13
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
14
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
15
|
+
in the Software without restriction, including without limitation the rights
|
|
16
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
17
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
18
|
+
furnished to do so, subject to the following conditions:
|
|
19
|
+
|
|
20
|
+
The above copyright notice and this permission notice shall be included in all
|
|
21
|
+
copies or substantial portions of the Software.
|
|
22
|
+
|
|
23
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
24
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
25
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
26
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
27
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
28
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
29
|
+
SOFTWARE.
|
|
30
|
+
License-File: LICENSE
|
|
31
|
+
Keywords: cli,cuda,gpu,nvidia,nvml,scheduler
|
|
32
|
+
Classifier: Development Status :: 4 - Beta
|
|
33
|
+
Classifier: Environment :: GPU :: NVIDIA CUDA
|
|
34
|
+
Classifier: Intended Audience :: Developers
|
|
35
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
36
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
40
|
+
Classifier: Topic :: System :: Distributed Computing
|
|
41
|
+
Classifier: Topic :: Utilities
|
|
42
|
+
Requires-Python: >=3.10
|
|
43
|
+
Requires-Dist: filelock>=3.12
|
|
44
|
+
Requires-Dist: nvidia-ml-py>=12.535
|
|
45
|
+
Requires-Dist: rich>=13.0
|
|
46
|
+
Requires-Dist: typer>=0.12
|
|
47
|
+
Description-Content-Type: text/markdown
|
|
48
|
+
|
|
49
|
+
# gpu-gate
|
|
50
|
+
|
|
51
|
+
[CI](https://github.com/jmweb-org/gpu-gate/actions/workflows/ci.yml)
|
|
52
|
+
[PyPI](https://pypi.org/project/gpu-gate/)
|
|
53
|
+
[Python](https://www.python.org)
|
|
54
|
+
[License: MIT](LICENSE)
|
|
55
|
+
|
|
56
|
+
Wait for a free GPU, claim it, set `CUDA_VISIBLE_DEVICES`, and run your command.
|
|
57
|
+
|
|
58
|
+
On a shared multi-GPU box without a cluster scheduler, starting a job usually
|
|
59
|
+
means watching `nvidia-smi`, picking a card by hand, exporting the env var, and
|
|
60
|
+
remembering to actually launch. `gpu-gate` is the small wait-pick-export-run
|
|
61
|
+
loop that does this for you, with a cooperative lock so two invocations on the
|
|
62
|
+
same host do not grab the same just-freed card. No daemon, no server, nothing
|
|
63
|
+
to administer.
|
|
64
|
+
|
|
65
|
+
```console
|
|
66
|
+
$ gpu-gate run --min-free-mb 8000 -- python train.py
|
|
67
|
+
gpu-gate: waiting for a free GPU ...
|
|
68
|
+
# ... blocks until a card has >= 8 GB free, then runs train.py with
|
|
69
|
+
# CUDA_VISIBLE_DEVICES set to the chosen index
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Install
|
|
73
|
+
|
|
74
|
+
```console
|
|
75
|
+
$ pip install gpu-gate # from PyPI
|
|
76
|
+
$ pip install git+https://github.com/jmweb-org/gpu-gate # latest, available now
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
It requires an NVIDIA driver at run time. The NVML binding
|
|
80
|
+
(`nvidia-ml-py`) is pulled in automatically; the package still installs and
|
|
81
|
+
imports on machines without a GPU, so it is safe to add to shared requirements.
|
|
82
|
+
|
|
83
|
+
## Usage
|
|
84
|
+
|
|
85
|
+
### Run a command on a free GPU
|
|
86
|
+
|
|
87
|
+
```console
|
|
88
|
+
$ gpu-gate run -n 1 --min-free-mb 8000 -- python train.py --epochs 50
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Everything after `--` is the command. `gpu-gate` blocks until the requirements
|
|
92
|
+
are met, claims the chosen device(s), exports `CUDA_VISIBLE_DEVICES`, and execs
|
|
93
|
+
the command. Its own exit code is the command's exit code, so it drops cleanly
|
|
94
|
+
into scripts and CI.
|
|
95
|
+
|
|
96
|
+
Common options:
|
|
97
|
+
|
|
98
|
+
| Option | Meaning |
|
|
99
|
+
| --- | --- |
|
|
100
|
+
| `-n, --count` | Number of GPUs to claim (default 1) |
|
|
101
|
+
| `--min-free-mb` | Require at least this much free memory |
|
|
102
|
+
| `--max-util` | Skip cards busier than this percent |
|
|
103
|
+
| `--only 0,1` | Restrict the search to these indices |
|
|
104
|
+
| `--exclude 2,3` | Never pick these indices |
|
|
105
|
+
| `--poll` | Seconds between checks (default 5) |
|
|
106
|
+
| `--timeout` | Give up after N seconds (exit 124) |
|
|
107
|
+
|
|
108
|
+
### Just wait, then use the result yourself
|
|
109
|
+
|
|
110
|
+
```console
|
|
111
|
+
$ export CUDA_VISIBLE_DEVICES=$(gpu-gate wait --min-free-mb 8000)
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Inspect the current state
|
|
115
|
+
|
|
116
|
+
```console
|
|
117
|
+
$ gpu-gate status
|
|
118
|
+
idx name free total util
|
|
119
|
+
0 NVIDIA L40S 44211 MiB 46068 MiB 3%
|
|
120
|
+
1 NVIDIA L40S 812 MiB 46068 MiB 97%
|
|
121
|
+
|
|
122
|
+
$ gpu-gate status --json
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## Exit codes
|
|
126
|
+
|
|
127
|
+
| Code | Meaning |
|
|
128
|
+
| --- | --- |
|
|
129
|
+
| 0 | Command ran and exited 0 (a non-zero exit code from the command is forwarded as-is) |
|
|
130
|
+
| 2 | Bad invocation (for example, no command after `--`) |
|
|
131
|
+
| 124 | Timed out waiting for a GPU |
|
|
132
|
+
| 3 | Requirements could never be met |
|
|
133
|
+
| 4 | Could not read GPU state (no driver / NVML error) |
|
|
134
|
+
|
|
135
|
+
## How selection works
|
|
136
|
+
|
|
137
|
+
A GPU is eligible when it has enough free memory, is below the utilization
|
|
138
|
+
ceiling, is not excluded, and is not currently locked by another `gpu-gate`
|
|
139
|
+
caller. Eligible cards are ranked by most free memory, then lowest
|
|
140
|
+
utilization, then index, and the top `--count` are chosen. The ordering is
|
|
141
|
+
fully deterministic.
|
|
142
|
+
|
|
143
|
+
## Locking
|
|
144
|
+
|
|
145
|
+
While a command runs, `gpu-gate` holds an advisory file lock per claimed
|
|
146
|
+
device under `$GPU_GATE_LOCK_DIR` (a per-user directory by default). Other
|
|
147
|
+
`gpu-gate` invocations skip locked devices, which avoids the classic race where
|
|
148
|
+
two jobs both see the same card free at the same instant. The lock is advisory:
|
|
149
|
+
it coordinates `gpu-gate` callers, not arbitrary CUDA programs.
|
|
150
|
+
|
|
151
|
+
## License
|
|
152
|
+
|
|
153
|
+
MIT. See [LICENSE](LICENSE).
|
gpu_gate-0.2.0/README.md
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# gpu-gate
|
|
2
|
+
|
|
3
|
+
[CI](https://github.com/jmweb-org/gpu-gate/actions/workflows/ci.yml)
|
|
4
|
+
[PyPI](https://pypi.org/project/gpu-gate/)
|
|
5
|
+
[Python](https://www.python.org)
|
|
6
|
+
[License: MIT](LICENSE)
|
|
7
|
+
|
|
8
|
+
Wait for a free GPU, claim it, set `CUDA_VISIBLE_DEVICES`, and run your command.
|
|
9
|
+
|
|
10
|
+
On a shared multi-GPU box without a cluster scheduler, starting a job usually
|
|
11
|
+
means watching `nvidia-smi`, picking a card by hand, exporting the env var, and
|
|
12
|
+
remembering to actually launch. `gpu-gate` is the small wait-pick-export-run
|
|
13
|
+
loop that does this for you, with a cooperative lock so two invocations on the
|
|
14
|
+
same host do not grab the same just-freed card. No daemon, no server, nothing
|
|
15
|
+
to administer.
|
|
16
|
+
|
|
17
|
+
```console
|
|
18
|
+
$ gpu-gate run --min-free-mb 8000 -- python train.py
|
|
19
|
+
gpu-gate: waiting for a free GPU ...
|
|
20
|
+
# ... blocks until a card has >= 8 GB free, then runs train.py with
|
|
21
|
+
# CUDA_VISIBLE_DEVICES set to the chosen index
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Install
|
|
25
|
+
|
|
26
|
+
```console
|
|
27
|
+
$ pip install gpu-gate # from PyPI
|
|
28
|
+
$ pip install git+https://github.com/jmweb-org/gpu-gate # latest, available now
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
It requires an NVIDIA driver at run time. The NVML binding
|
|
32
|
+
(`nvidia-ml-py`) is pulled in automatically; the package still installs and
|
|
33
|
+
imports on machines without a GPU, so it is safe to add to shared requirements.
|
|
34
|
+
|
|
35
|
+
## Usage
|
|
36
|
+
|
|
37
|
+
### Run a command on a free GPU
|
|
38
|
+
|
|
39
|
+
```console
|
|
40
|
+
$ gpu-gate run -n 1 --min-free-mb 8000 -- python train.py --epochs 50
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Everything after `--` is the command. `gpu-gate` blocks until the requirements
|
|
44
|
+
are met, claims the chosen device(s), exports `CUDA_VISIBLE_DEVICES`, and execs
|
|
45
|
+
the command. Its own exit code is the command's exit code, so it drops cleanly
|
|
46
|
+
into scripts and CI.
|
|
47
|
+
|
|
48
|
+
Common options:
|
|
49
|
+
|
|
50
|
+
| Option | Meaning |
|
|
51
|
+
| --- | --- |
|
|
52
|
+
| `-n, --count` | Number of GPUs to claim (default 1) |
|
|
53
|
+
| `--min-free-mb` | Require at least this much free memory |
|
|
54
|
+
| `--max-util` | Skip cards busier than this percent |
|
|
55
|
+
| `--only 0,1` | Restrict the search to these indices |
|
|
56
|
+
| `--exclude 2,3` | Never pick these indices |
|
|
57
|
+
| `--poll` | Seconds between checks (default 5) |
|
|
58
|
+
| `--timeout` | Give up after N seconds (exit 124) |
|
|
59
|
+
|
|
60
|
+
### Just wait, then use the result yourself
|
|
61
|
+
|
|
62
|
+
```console
|
|
63
|
+
$ export CUDA_VISIBLE_DEVICES=$(gpu-gate wait --min-free-mb 8000)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Inspect the current state
|
|
67
|
+
|
|
68
|
+
```console
|
|
69
|
+
$ gpu-gate status
|
|
70
|
+
idx name free total util
|
|
71
|
+
0 NVIDIA L40S 44211 MiB 46068 MiB 3%
|
|
72
|
+
1 NVIDIA L40S 812 MiB 46068 MiB 97%
|
|
73
|
+
|
|
74
|
+
$ gpu-gate status --json
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Exit codes
|
|
78
|
+
|
|
79
|
+
| Code | Meaning |
|
|
80
|
+
| --- | --- |
|
|
81
|
+
| 0 | Command ran and exited 0 (a non-zero exit code from the command is forwarded as-is) |
|
|
82
|
+
| 2 | Bad invocation (for example, no command after `--`) |
|
|
83
|
+
| 124 | Timed out waiting for a GPU |
|
|
84
|
+
| 3 | Requirements could never be met |
|
|
85
|
+
| 4 | Could not read GPU state (no driver / NVML error) |
|
|
86
|
+
|
|
87
|
+
## How selection works
|
|
88
|
+
|
|
89
|
+
A GPU is eligible when it has enough free memory, is below the utilization
|
|
90
|
+
ceiling, is not excluded, and is not currently locked by another `gpu-gate`
|
|
91
|
+
caller. Eligible cards are ranked by most free memory, then lowest
|
|
92
|
+
utilization, then index, and the top `--count` are chosen. The ordering is
|
|
93
|
+
fully deterministic.
|
|
94
|
+
|
|
95
|
+
## Locking
|
|
96
|
+
|
|
97
|
+
While a command runs, `gpu-gate` holds an advisory file lock per claimed
|
|
98
|
+
device under `$GPU_GATE_LOCK_DIR` (a per-user directory by default). Other
|
|
99
|
+
`gpu-gate` invocations skip locked devices, which avoids the classic race where
|
|
100
|
+
two jobs both see the same card free at the same instant. The lock is advisory:
|
|
101
|
+
it coordinates `gpu-gate` callers, not arbitrary CUDA programs.
|
|
102
|
+
|
|
103
|
+
## License
|
|
104
|
+
|
|
105
|
+
MIT. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "gpu-gate"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "Wait for a free GPU, claim it, and run a command on it."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { file = "LICENSE" }
|
|
12
|
+
authors = [{ name = "José del Río" }]
|
|
13
|
+
keywords = ["gpu", "cuda", "nvidia", "nvml", "scheduler", "cli"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Environment :: GPU :: NVIDIA CUDA",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Operating System :: POSIX :: Linux",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Topic :: System :: Distributed Computing",
|
|
24
|
+
"Topic :: Utilities",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"typer>=0.12",
|
|
28
|
+
"rich>=13.0",
|
|
29
|
+
"filelock>=3.12",
|
|
30
|
+
"nvidia-ml-py>=12.535",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.urls]
|
|
34
|
+
Homepage = "https://github.com/jmweb-org/gpu-gate"
|
|
35
|
+
Repository = "https://github.com/jmweb-org/gpu-gate"
|
|
36
|
+
Issues = "https://github.com/jmweb-org/gpu-gate/issues"
|
|
37
|
+
|
|
38
|
+
[project.scripts]
|
|
39
|
+
gpu-gate = "gpu_gate.cli:entrypoint"
|
|
40
|
+
|
|
41
|
+
[dependency-groups]
|
|
42
|
+
dev = [
|
|
43
|
+
"pytest>=8.0",
|
|
44
|
+
"pytest-cov>=5.0",
|
|
45
|
+
"ruff>=0.6",
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
[tool.hatch.build.targets.wheel]
|
|
49
|
+
packages = ["src/gpu_gate"]
|
|
50
|
+
|
|
51
|
+
[tool.pytest.ini_options]
|
|
52
|
+
addopts = "-q"
|
|
53
|
+
testpaths = ["tests"]
|
|
54
|
+
pythonpath = ["."]
|
|
55
|
+
|
|
56
|
+
[tool.ruff]
|
|
57
|
+
line-length = 100
|
|
58
|
+
target-version = "py310"
|
|
59
|
+
src = ["src", "tests"]
|
|
60
|
+
|
|
61
|
+
[tool.ruff.lint]
|
|
62
|
+
select = ["E", "F", "I", "UP", "B", "S", "C4", "RUF"]
|
|
63
|
+
|
|
64
|
+
[tool.ruff.lint.per-file-ignores]
|
|
65
|
+
"tests/*" = ["S101"]
|
|
66
|
+
|
|
67
|
+
[tool.coverage.run]
|
|
68
|
+
source = ["gpu_gate"]
|
|
69
|
+
branch = true
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""gpu-gate: wait for a free GPU, claim it, and run a command on it."""
|
|
2
|
+
|
|
3
|
+
from gpu_gate.models import GpuStatus, Requirements, Selection
|
|
4
|
+
from gpu_gate.selector import NotEnoughGPUs, select
|
|
5
|
+
|
|
6
|
+
__version__ = "0.2.0"
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"GpuStatus",
|
|
10
|
+
"NotEnoughGPUs",
|
|
11
|
+
"Requirements",
|
|
12
|
+
"Selection",
|
|
13
|
+
"__version__",
|
|
14
|
+
"select",
|
|
15
|
+
]
|