gpu-gate 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gpu_gate/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ """gpu-gate: wait for a free GPU, claim it, and run a command on it."""
2
+
3
+ from gpu_gate.models import GpuStatus, Requirements, Selection
4
+ from gpu_gate.selector import NotEnoughGPUs, select
5
+
6
# Package version string; the command-line ``--version`` flag prints it.
__version__ = "0.2.0"

# Names re-exported as the package's public API.
__all__ = [
    "GpuStatus",
    "NotEnoughGPUs",
    "Requirements",
    "Selection",
    "__version__",
    "select",
]
gpu_gate/__main__.py ADDED
@@ -0,0 +1,4 @@
1
from gpu_gate.cli import entrypoint

# Lets ``python -m gpu_gate`` start the same entry point as the ``gpu-gate``
# console script.
if __name__ == "__main__":
    entrypoint()
gpu_gate/cli.py ADDED
@@ -0,0 +1,213 @@
1
+ """Command-line interface for gpu-gate."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import sys
7
+
8
+ import typer
9
+ from rich.console import Console
10
+ from rich.table import Table
11
+
12
+ from gpu_gate import __version__
13
+ from gpu_gate.models import GpuStatus, Requirements
14
+ from gpu_gate.probe import NvmlProbe, Probe, ProbeError
15
+ from gpu_gate.runner import (
16
+ WaitConfig,
17
+ WaitTimeout,
18
+ run_with_gpus,
19
+ wait_for_selection,
20
+ )
21
+ from gpu_gate.selector import NotEnoughGPUs
22
+
23
# Root Typer application; the commands below register on it via decorators.
app = typer.Typer(
    add_completion=False,
    no_args_is_help=True,
    help="Wait for a free GPU, claim it, and run a command on it.",
)
# Diagnostics are printed through ``_err`` (stderr); results through ``_out``.
_err = Console(stderr=True)
_out = Console()

# Process exit codes used by the commands.
# NOTE(review): EXIT_OK and EXIT_NO_GPU are not referenced by any command in
# this module.
EXIT_OK = 0
EXIT_TIMEOUT = 124
EXIT_NO_GPU = 3
EXIT_PROBE = 4
35
+
36
+
37
def _requirements(
    count: int,
    min_free_mb: int,
    max_util: int,
    exclude: str | None,
    only: str | None,
) -> Requirements:
    """Build a :class:`Requirements` from raw command-line option values.

    Args:
        count: Number of GPUs requested.
        min_free_mb: Minimum free memory per GPU.
        max_util: Maximum utilization percent per GPU.
        exclude: Comma-separated indices to skip, or ``None``.
        only: Comma-separated indices to restrict the search to, or ``None``.

    Raises:
        typer.BadParameter: If an index list contains a non-integer, or a
            value is rejected by ``Requirements`` validation. Raising a
            usage error instead of letting ``ValueError`` escape gives the
            user a one-line message rather than a traceback.
    """

    try:
        return Requirements(
            count=count,
            min_free_mb=min_free_mb,
            max_utilization_pct=max_util,
            exclude=_parse_indices(exclude),
            # No --only means "no restriction" (None), not an empty allow-list.
            include=_parse_indices(only) if only else None,
        )
    except ValueError as exc:
        raise typer.BadParameter(str(exc)) from exc
51
+
52
+
53
+ def _parse_indices(value: str | None) -> frozenset[int]:
54
+ if not value:
55
+ return frozenset()
56
+ out: set[int] = set()
57
+ for chunk in value.split(","):
58
+ chunk = chunk.strip()
59
+ if chunk:
60
+ out.add(int(chunk))
61
+ return frozenset(out)
62
+
63
+
64
def _make_probe() -> Probe:
    """Return the probe the commands use to read GPU state (an ``NvmlProbe``)."""
    return NvmlProbe()
66
+
67
+
68
def _status_rows(statuses: list[GpuStatus]) -> Table:
    """Render GPU snapshots as a borderless table, one row per GPU.

    Columns are index, name, free memory, total memory and utilization.
    """

    # Header text paired with the keyword arguments for ``add_column``.
    columns = (
        ("idx", {"justify": "right"}),
        ("name", {}),
        ("free", {"justify": "right"}),
        ("total", {"justify": "right"}),
        ("util", {"justify": "right"}),
    )
    table = Table(box=None, pad_edge=False)
    for header, options in columns:
        table.add_column(header, **options)
    for gpu in statuses:
        cells = (
            str(gpu.index),
            gpu.name,
            f"{gpu.memory_free_mb} MiB",
            f"{gpu.memory_total_mb} MiB",
            f"{gpu.utilization_pct}%",
        )
        table.add_row(*cells)
    return table
84
+
85
+
86
+ def _version_callback(value: bool) -> None:
87
+ if value:
88
+ _out.print(f"gpu-gate {__version__}")
89
+ raise typer.Exit()
90
+
91
+
92
@app.callback()
def main(
    _version: bool = typer.Option(
        False,
        "--version",
        callback=_version_callback,
        is_eager=True,
        help="Show the version and exit.",
    ),
) -> None:
    """gpu-gate command-line interface."""
    # Intentionally empty: this callback only attaches the eager --version
    # option to the root command; the work happens in _version_callback.
103
+
104
+
105
@app.command("status")
def status(
    as_json: bool = typer.Option(False, "--json", help="Emit machine-readable JSON."),
) -> None:
    """Show the current state of every visible GPU."""

    # A probe failure is reported on stderr and mapped to EXIT_PROBE.
    try:
        statuses = _make_probe().query()
    except ProbeError as exc:
        _err.print(f"gpu-gate: {exc}")
        raise typer.Exit(EXIT_PROBE) from exc

    if not as_json:
        _out.print(_status_rows(statuses))
        return

    # Keys emitted per GPU, in output order; each is read off the snapshot.
    keys = (
        "index",
        "name",
        "uuid",
        "memory_total_mb",
        "memory_used_mb",
        "memory_free_mb",
        "utilization_pct",
    )
    payload = [{key: getattr(gpu, key) for key in keys} for gpu in statuses]
    _out.print_json(json.dumps(payload))
132
+
133
+
134
@app.command("wait")
def wait(
    count: int = typer.Option(1, "-n", "--count", help="Number of GPUs to wait for."),
    min_free_mb: int = typer.Option(0, "--min-free-mb", help="Minimum free memory."),
    max_util: int = typer.Option(100, "--max-util", help="Maximum utilization percent."),
    exclude: str | None = typer.Option(None, "--exclude", help="Indices to skip, e.g. 0,1."),
    only: str | None = typer.Option(None, "--only", help="Restrict to these indices."),
    poll: float = typer.Option(5.0, "--poll", help="Seconds between polls."),
    timeout: float | None = typer.Option(None, "--timeout", help="Give up after N seconds."),
    as_json: bool = typer.Option(False, "--json", help="Print the chosen indices as JSON."),
) -> None:
    """Block until enough GPUs are free, then print the chosen indices."""

    config = WaitConfig(
        requirements=_requirements(count, min_free_mb, max_util, exclude, only),
        poll_interval=poll,
        timeout=timeout,
    )
    try:
        selection = wait_for_selection(_make_probe(), config)
    except (WaitTimeout, ProbeError) as exc:
        # Both failures are reported the same way; only the exit code differs.
        _err.print(f"gpu-gate: {exc}")
        exit_code = EXIT_TIMEOUT if isinstance(exc, WaitTimeout) else EXIT_PROBE
        raise typer.Exit(exit_code) from exc

    if not as_json:
        # Plain output is the comma-separated form used for CUDA_VISIBLE_DEVICES.
        _out.print(selection.cuda_visible_devices)
        return
    _out.print_json(json.dumps({"indices": list(selection.indices)}))
161
+
162
+
163
@app.command(
    "run",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def run(
    ctx: typer.Context,
    count: int = typer.Option(1, "-n", "--count", help="Number of GPUs to claim."),
    min_free_mb: int = typer.Option(0, "--min-free-mb", help="Minimum free memory."),
    max_util: int = typer.Option(100, "--max-util", help="Maximum utilization percent."),
    exclude: str | None = typer.Option(None, "--exclude", help="Indices to skip."),
    only: str | None = typer.Option(None, "--only", help="Restrict to these indices."),
    poll: float = typer.Option(5.0, "--poll", help="Seconds between polls."),
    timeout: float | None = typer.Option(None, "--timeout", help="Give up after N seconds."),
    quiet: bool = typer.Option(False, "--quiet", help="Do not announce the choice."),
) -> None:
    """Wait for GPUs, set CUDA_VISIBLE_DEVICES, and run COMMAND.

    Put the command after a literal ``--``, for example:

        gpu-gate run -n 1 --min-free-mb 8000 -- python train.py
    """

    # The extra (unparsed) arguments collected by the context are the command.
    command = list(ctx.args)
    if not command:
        _err.print("gpu-gate: no command given; pass it after --")
        raise typer.Exit(2)

    config = WaitConfig(
        requirements=_requirements(count, min_free_mb, max_util, exclude, only),
        poll_interval=poll,
        timeout=timeout,
    )

    def announce(_exc: NotEnoughGPUs) -> None:
        # Passed as the runner's on_wait hook; silent under --quiet.
        if quiet:
            return
        _err.print("gpu-gate: waiting for a free GPU ...")

    try:
        code = run_with_gpus(_make_probe(), config, command, on_wait=announce)
    except (WaitTimeout, ProbeError) as exc:
        _err.print(f"gpu-gate: {exc}")
        exit_code = EXIT_TIMEOUT if isinstance(exc, WaitTimeout) else EXIT_PROBE
        raise typer.Exit(exit_code) from exc
    # Forward the command's own exit code as ours.
    raise typer.Exit(code)
206
+
207
+
208
def entrypoint() -> None:
    """Run the Typer app; on Ctrl-C print a short notice and exit with 130."""
    try:
        app()
    except KeyboardInterrupt:  # pragma: no cover - interactive only
        sys.stderr.write("gpu-gate: interrupted\n")
        raise SystemExit(130) from None
gpu_gate/lock.py ADDED
@@ -0,0 +1,72 @@
1
+ """Cooperative, host-local locks so two gpu-gate runs do not grab the same
2
+ just-freed card.
3
+
4
+ A lock is a small file under a shared directory (``$GPU_GATE_LOCK_DIR`` or a
5
+ per-user default). Holding the lock means "I am about to use this GPU";
6
+ selection skips devices whose lock is currently held by a live process.
7
+ The scheme is advisory: it only coordinates other gpu-gate callers, not
8
+ arbitrary CUDA programs.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import os
14
+ import tempfile
15
+ from contextlib import ExitStack
16
+ from pathlib import Path
17
+
18
+ from filelock import FileLock, Timeout
19
+
20
+
21
def lock_dir() -> Path:
    """Return the directory holding the lock files, creating it if needed.

    ``$GPU_GATE_LOCK_DIR`` wins when set and non-empty. Otherwise the
    directory is ``gpu-gate-locks`` under ``$XDG_RUNTIME_DIR``, or under the
    system temporary directory when that variable is unset or empty.
    """
    configured = os.environ.get("GPU_GATE_LOCK_DIR")
    if configured:
        directory = Path(configured)
    else:
        runtime_root = os.environ.get("XDG_RUNTIME_DIR") or tempfile.gettempdir()
        directory = Path(runtime_root, "gpu-gate-locks")
    directory.mkdir(parents=True, exist_ok=True)
    return directory
30
+
31
+
32
def _lock_path(index: int) -> Path:
    """Return the lock-file path for GPU ``index`` (``gpu<index>.lock`` in ``lock_dir()``)."""
    return lock_dir() / f"gpu{index}.lock"
34
+
35
+
36
def is_locked(index: int) -> bool:
    """Return whether GPU ``index`` is currently claimed by another caller.

    The check works by trying to take the device's lock without waiting:
    failure means someone else holds it. On success the lock is released
    again immediately, so this function never keeps a claim.
    """
    probe_lock = FileLock(str(_lock_path(index)), timeout=0)
    try:
        probe_lock.acquire()
    except Timeout:
        return True
    # We got the lock, so nobody held it; give it straight back.
    probe_lock.release()
    return False
47
+
48
+
49
class GpuClaim:
    """Hold locks for a set of GPU indices for the lifetime of the context.

    Entering acquires one non-blocking file lock per index; leaving releases
    them all. If any lock cannot be taken, the locks already acquired are
    released and the error propagates (``filelock.Timeout`` when a lock is
    held elsewhere).
    """

    def __init__(self, indices: tuple[int, ...]) -> None:
        self.indices = indices
        # Release callbacks for the held locks; None while nothing is held.
        self._stack: ExitStack | None = None

    def __enter__(self) -> GpuClaim:
        # The ``with`` block guarantees partial acquisitions are rolled back
        # on *any* failure (not only ``Timeout``), e.g. an OSError while
        # opening a lock file or a KeyboardInterrupt mid-loop.
        with ExitStack() as stack:
            for index in self.indices:
                lock = FileLock(str(_lock_path(index)), timeout=0)
                lock.acquire()
                stack.callback(lock.release)
            # Every lock was taken: move the callbacks out of ``stack`` so
            # leaving this ``with`` block does not release them.
            self._stack = stack.pop_all()
        return self

    def __exit__(self, *exc: object) -> None:
        if self._stack is not None:
            self._stack.close()
            self._stack = None
gpu_gate/models.py ADDED
@@ -0,0 +1,61 @@
1
+ """Plain data structures shared across gpu-gate.
2
+
3
+ These types carry no behaviour and no NVML dependency, which keeps the
4
+ selection logic in :mod:`gpu_gate.selector` pure and trivially testable.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass, field
10
+
11
+
12
+ @dataclass(frozen=True, slots=True)
13
+ class GpuStatus:
14
+ """A point-in-time snapshot of a single GPU."""
15
+
16
+ index: int
17
+ name: str
18
+ uuid: str
19
+ memory_total_mb: int
20
+ memory_used_mb: int
21
+ utilization_pct: int
22
+
23
+ @property
24
+ def memory_free_mb(self) -> int:
25
+ return max(0, self.memory_total_mb - self.memory_used_mb)
26
+
27
+ @property
28
+ def memory_used_pct(self) -> float:
29
+ if self.memory_total_mb <= 0:
30
+ return 0.0
31
+ return 100.0 * self.memory_used_mb / self.memory_total_mb
32
+
33
+
34
+ @dataclass(frozen=True, slots=True)
35
+ class Requirements:
36
+ """What a caller needs before a GPU is considered usable."""
37
+
38
+ count: int = 1
39
+ min_free_mb: int = 0
40
+ max_utilization_pct: int = 100
41
+ exclude: frozenset[int] = field(default_factory=frozenset)
42
+ include: frozenset[int] | None = None
43
+
44
+ def __post_init__(self) -> None:
45
+ if self.count < 1:
46
+ raise ValueError("count must be >= 1")
47
+ if self.min_free_mb < 0:
48
+ raise ValueError("min_free_mb must be >= 0")
49
+ if not 0 <= self.max_utilization_pct <= 100:
50
+ raise ValueError("max_utilization_pct must be between 0 and 100")
51
+
52
+
53
+ @dataclass(frozen=True, slots=True)
54
+ class Selection:
55
+ """The result of a successful selection."""
56
+
57
+ indices: tuple[int, ...]
58
+
59
+ @property
60
+ def cuda_visible_devices(self) -> str:
61
+ return ",".join(str(i) for i in self.indices)
gpu_gate/probe.py ADDED
@@ -0,0 +1,92 @@
1
+ """Read GPU state from the system.
2
+
3
+ The real implementation talks to NVIDIA's NVML through ``pynvml``. NVML is
4
+ imported lazily so the package installs and imports on machines without a
5
+ driver; the dependency only matters when you actually probe hardware.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Protocol
11
+
12
+ from gpu_gate.models import GpuStatus
13
+
14
+
15
class Probe(Protocol):
    """Anything that can report the current state of the local GPUs."""

    # Structural interface: any object with a matching ``query`` qualifies.
    # Returns one snapshot per GPU.
    def query(self) -> list[GpuStatus]: ...
19
+
20
+
21
class ProbeError(RuntimeError):
    """Raised when GPU state cannot be read.

    ``NvmlProbe`` raises it when ``pynvml`` cannot be imported or NVML
    fails to initialize.
    """
23
+
24
+
25
class NvmlProbe:
    """Query local NVIDIA GPUs via NVML.

    A single instance keeps NVML initialized between calls; use it as a
    context manager (or call :meth:`close`) to release the handle.
    """

    def __init__(self) -> None:
        # The imported ``pynvml`` module once NVML is initialized, else None.
        self._nvml = None

    def _ensure_init(self):
        """Import ``pynvml`` and initialize NVML once; return the module.

        Raises:
            ProbeError: If ``pynvml`` cannot be imported or ``nvmlInit`` fails.
        """
        if self._nvml is not None:
            return self._nvml
        try:
            import pynvml
        except ImportError as exc:  # pragma: no cover - import guard
            raise ProbeError(
                "pynvml is not installed; install gpu-gate with its default "
                "dependencies to read GPU state"
            ) from exc
        try:
            pynvml.nvmlInit()
        except Exception as exc:  # pragma: no cover - needs a driver
            raise ProbeError("could not initialize NVML; is an NVIDIA driver present?") from exc
        self._nvml = pynvml
        return pynvml

    def query(self) -> list[GpuStatus]:
        """Return one :class:`GpuStatus` per device NVML reports.

        Memory figures are converted from bytes to whole MiB.

        Raises:
            ProbeError: If NVML cannot be initialized, or any NVML call
                fails while reading device state. Wrapping the NVML error
                lets callers handle every probe failure through one type.
        """
        pynvml = self._ensure_init()
        out: list[GpuStatus] = []
        try:
            count = pynvml.nvmlDeviceGetCount()
            for index in range(count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(index)
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                out.append(
                    GpuStatus(
                        index=index,
                        name=_as_text(pynvml.nvmlDeviceGetName(handle)),
                        uuid=_as_text(pynvml.nvmlDeviceGetUUID(handle)),
                        memory_total_mb=int(mem.total // (1024 * 1024)),
                        memory_used_mb=int(mem.used // (1024 * 1024)),
                        utilization_pct=int(util.gpu),
                    )
                )
        except pynvml.NVMLError as exc:  # pragma: no cover - needs a driver
            raise ProbeError(f"could not read GPU state from NVML: {exc}") from exc
        return out

    def close(self) -> None:
        """Shut NVML down if it was initialized; safe to call repeatedly."""
        if self._nvml is not None:
            try:
                self._nvml.nvmlShutdown()
            finally:
                # Forget the module even if shutdown raised, so a later
                # query re-initializes instead of reusing a dead session.
                self._nvml = None

    def __enter__(self) -> NvmlProbe:
        self._ensure_init()
        return self

    def __exit__(self, *exc: object) -> None:
        self.close()
85
+
86
+
87
+ def _as_text(value: object) -> str:
88
+ """NVML returns ``bytes`` on some versions and ``str`` on others."""
89
+
90
+ if isinstance(value, bytes):
91
+ return value.decode("utf-8", "replace")
92
+ return str(value)
gpu_gate/runner.py ADDED
@@ -0,0 +1,109 @@
1
+ """Wait for eligible GPUs, claim them, and hand control to the command.
2
+
3
+ The wait loop takes its clock and sleep function as arguments so tests can
4
+ drive timeouts deterministically without real delays.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ import sys
11
+ import time
12
+ from collections.abc import Callable, Sequence
13
+ from dataclasses import dataclass
14
+
15
+ from gpu_gate.lock import GpuClaim, is_locked
16
+ from gpu_gate.models import GpuStatus, Requirements, Selection
17
+ from gpu_gate.probe import Probe
18
+ from gpu_gate.selector import NotEnoughGPUs, select
19
+
20
# Injectable time sources. ``Clock`` returns the current time as a float (the
# default used below is ``time.monotonic``); ``Sleep`` blocks for the given
# number of seconds (default ``time.sleep``).
Clock = Callable[[], float]
Sleep = Callable[[float], None]
22
+
23
+
24
class WaitTimeout(Exception):
    """Signals that the wait deadline passed without an eligible GPU.

    ``seconds`` keeps the timeout value that was exceeded.
    """

    def __init__(self, seconds: float) -> None:
        message = f"timed out after {seconds:g}s waiting for a free GPU"
        super().__init__(message)
        self.seconds = seconds
30
+
31
+
32
@dataclass(frozen=True, slots=True)
class WaitConfig:
    """Immutable settings for one wait: what to look for and how patiently."""

    # Eligibility criteria handed to ``select``.
    requirements: Requirements
    # Seconds slept between polls of the probe.
    poll_interval: float = 5.0
    # Give up after this many seconds; ``None`` disables the timeout.
    timeout: float | None = None
    # When true, devices whose lock is held are dropped before selection.
    respect_locks: bool = True
38
+
39
+
40
def _without_locked(statuses: Sequence[GpuStatus]) -> list[GpuStatus]:
    """Return the snapshots whose device lock is not currently held, in order."""
    unlocked = [gpu for gpu in statuses if not is_locked(gpu.index)]
    return unlocked
42
+
43
+
44
def wait_for_selection(
    probe: Probe,
    config: WaitConfig,
    *,
    clock: Clock = time.monotonic,
    sleep: Sleep = time.sleep,
    on_wait: Callable[[NotEnoughGPUs], None] | None = None,
) -> Selection:
    """Poll ``probe`` until ``config.requirements`` can be met, then return it.

    Args:
        probe: Source of GPU snapshots; queried once per poll.
        config: Requirements, poll interval, timeout and lock policy.
        clock: Time source used to measure the timeout.
        sleep: Function used to pause between polls.
        on_wait: Called with the ``NotEnoughGPUs`` error after every poll
            that finds too few eligible devices.

    Raises :class:`WaitTimeout` if a timeout is set and elapses first.
    """

    start = clock()
    while True:
        statuses = probe.query()
        if config.respect_locks:
            statuses = _without_locked(statuses)
        try:
            return select(statuses, config.requirements)
        except NotEnoughGPUs as exc:
            if on_wait is not None:
                on_wait(exc)
            delay = config.poll_interval
            if config.timeout is not None:
                remaining = config.timeout - (clock() - start)
                if remaining <= 0:
                    raise WaitTimeout(config.timeout) from exc
                # Never sleep past the deadline: with a poll interval longer
                # than the time left, a full sleep would overshoot the
                # requested timeout by up to one interval.
                delay = min(delay, remaining)
        sleep(delay)
70
+
71
+
72
def build_child_env(selection: Selection, base: dict[str, str] | None = None) -> dict[str, str]:
    """Return an environment with ``CUDA_VISIBLE_DEVICES`` set to the choice.

    The result is a fresh dict copied from ``base`` (or from ``os.environ``
    when ``base`` is ``None``); the input mapping is never modified.

    ``CUDA_DEVICE_ORDER`` is set to ``PCI_BUS_ID`` unless the base
    environment already defines it. The selected indices come from NVML,
    and CUDA only numbers devices in that same order under ``PCI_BUS_ID``;
    CUDA's default ordering (fastest first) can otherwise make an index
    refer to a different physical card on mixed-GPU hosts.
    """

    env = dict(os.environ if base is None else base)
    env["CUDA_VISIBLE_DEVICES"] = selection.cuda_visible_devices
    # Respect an explicit caller choice; only fill in the default.
    env.setdefault("CUDA_DEVICE_ORDER", "PCI_BUS_ID")
    return env
78
+
79
+
80
def run_with_gpus(
    probe: Probe,
    config: WaitConfig,
    command: Sequence[str],
    *,
    clock: Clock = time.monotonic,
    sleep: Sleep = time.sleep,
    exec_fn: Callable[[Sequence[str], dict[str, str]], int] | None = None,
    on_wait: Callable[[NotEnoughGPUs], None] | None = None,
) -> int:
    """Wait for GPUs, claim them, and run ``command``. Returns its exit code.

    Selection and claiming are two separate steps, so another caller can
    take a device's lock in between. When that happens the claim is
    abandoned and the wait starts over, still bounded by ``config.timeout``
    measured from the first attempt.

    Args:
        probe: Source of GPU snapshots.
        config: Requirements, poll interval, timeout and lock policy.
        command: Program and arguments to run; must not be empty.
        clock: Time source used to measure the timeout.
        sleep: Function used to pause between attempts.
        exec_fn: Runs the command with the child environment and returns
            its exit code; defaults to spawning a subprocess.
        on_wait: Forwarded to :func:`wait_for_selection`.

    Raises:
        ValueError: If ``command`` is empty.
        WaitTimeout: If the timeout elapses before GPUs could be claimed.
    """

    # Function-scope imports: only the claim/retry logic here needs them.
    from contextlib import ExitStack
    from dataclasses import replace

    from filelock import Timeout as LockTimeout

    if not command:
        raise ValueError("command must not be empty")
    runner = exec_fn or _spawn
    started = clock()
    attempt = config
    while True:
        try:
            selection = wait_for_selection(
                probe, attempt, clock=clock, sleep=sleep, on_wait=on_wait
            )
        except WaitTimeout as exc:
            if attempt is config:
                raise
            # A retry ran with a shortened window; report the caller's budget.
            raise WaitTimeout(config.timeout) from exc

        with ExitStack() as held:
            try:
                held.enter_context(GpuClaim(selection.indices))
            except LockTimeout:
                # Another caller locked one of the devices after we selected
                # it; fall through and try again.
                pass
            else:
                # Locks stay held until the command finishes and the
                # ExitStack unwinds.
                return runner(command, build_child_env(selection))

        delay = config.poll_interval
        if config.timeout is not None:
            remaining = config.timeout - (clock() - started)
            if remaining <= 0:
                raise WaitTimeout(config.timeout)
            delay = min(delay, remaining)
            # The next wait only gets what is left after the back-off sleep.
            attempt = replace(config, timeout=remaining - delay)
        # Back off before retrying so a persistently held lock cannot turn
        # this loop into a busy spin.
        sleep(delay)
99
+
100
+
101
+ def _spawn(command: Sequence[str], env: dict[str, str]) -> int:
102
+ import subprocess
103
+
104
+ try:
105
+ completed = subprocess.run(list(command), env=env, check=False) # noqa: S603
106
+ except FileNotFoundError:
107
+ print(f"gpu-gate: command not found: {command[0]}", file=sys.stderr)
108
+ return 127
109
+ return completed.returncode
gpu_gate/selector.py ADDED
@@ -0,0 +1,61 @@
1
+ """Pure GPU selection logic.
2
+
3
+ Given a list of :class:`~gpu_gate.models.GpuStatus` and a set of
4
+ :class:`~gpu_gate.models.Requirements`, decide which devices to claim.
5
+ No NVML, no I/O, no global state, so it can be exercised exhaustively
6
+ in unit tests without a GPU present.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from collections.abc import Iterable
12
+
13
+ from gpu_gate.models import GpuStatus, Requirements, Selection
14
+
15
+
16
class NotEnoughGPUs(Exception):
    """Signals that fewer GPUs met the requirements than were requested.

    ``requested`` and ``available`` keep the two counts for callers.
    """

    def __init__(self, requested: int, available: int) -> None:
        message = f"requested {requested} GPU(s) but only {available} met the requirements"
        super().__init__(message)
        self.requested = requested
        self.available = available
23
+
24
+
25
def eligible(status: GpuStatus, req: Requirements) -> bool:
    """Return whether a single GPU satisfies the requirements.

    A GPU qualifies when it is on the allow-list (if one is given), is not
    excluded, has at least the required free memory, and is at or below the
    utilization ceiling.
    """
    allowed = req.include is None or status.index in req.include
    return (
        allowed
        and status.index not in req.exclude
        and status.memory_free_mb >= req.min_free_mb
        and status.utilization_pct <= req.max_utilization_pct
    )
37
+
38
+
39
def rank(statuses: Iterable[GpuStatus]) -> list[GpuStatus]:
    """Return the GPUs as a new list sorted best-first.

    Sort keys, in order: most free memory, lowest utilization, lowest index.
    The index key makes the result deterministic when the others tie.
    """

    def preference(gpu: GpuStatus) -> tuple[int, int, int]:
        # Negated free memory so larger values sort first.
        return (-gpu.memory_free_mb, gpu.utilization_pct, gpu.index)

    ordered = list(statuses)
    ordered.sort(key=preference)
    return ordered
49
+
50
+
51
def select(statuses: Iterable[GpuStatus], req: Requirements) -> Selection:
    """Choose ``req.count`` GPUs that satisfy ``req``.

    Eligible devices are ranked best-first and the top ``req.count`` are
    taken; the returned indices are sorted ascending.

    Raises :class:`NotEnoughGPUs` if not enough eligible devices exist.
    """

    usable = rank(filter(lambda gpu: eligible(gpu, req), statuses))
    found = len(usable)
    if found < req.count:
        raise NotEnoughGPUs(requested=req.count, available=found)
    picked = sorted(gpu.index for gpu in usable[: req.count])
    return Selection(indices=tuple(picked))
@@ -0,0 +1,153 @@
1
+ Metadata-Version: 2.4
2
+ Name: gpu-gate
3
+ Version: 0.2.0
4
+ Summary: Wait for a free GPU, claim it, and run a command on it.
5
+ Project-URL: Homepage, https://github.com/jmweb-org/gpu-gate
6
+ Project-URL: Repository, https://github.com/jmweb-org/gpu-gate
7
+ Project-URL: Issues, https://github.com/jmweb-org/gpu-gate/issues
8
+ Author: José del Río
9
+ License: MIT License
10
+
11
+ Copyright (c) 2026 José del Río
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy
14
+ of this software and associated documentation files (the "Software"), to deal
15
+ in the Software without restriction, including without limitation the rights
16
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
+ copies of the Software, and to permit persons to whom the Software is
18
+ furnished to do so, subject to the following conditions:
19
+
20
+ The above copyright notice and this permission notice shall be included in all
21
+ copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
+ SOFTWARE.
30
+ License-File: LICENSE
31
+ Keywords: cli,cuda,gpu,nvidia,nvml,scheduler
32
+ Classifier: Development Status :: 4 - Beta
33
+ Classifier: Environment :: GPU :: NVIDIA CUDA
34
+ Classifier: Intended Audience :: Developers
35
+ Classifier: License :: OSI Approved :: MIT License
36
+ Classifier: Operating System :: POSIX :: Linux
37
+ Classifier: Programming Language :: Python :: 3.10
38
+ Classifier: Programming Language :: Python :: 3.11
39
+ Classifier: Programming Language :: Python :: 3.12
40
+ Classifier: Topic :: System :: Distributed Computing
41
+ Classifier: Topic :: Utilities
42
+ Requires-Python: >=3.10
43
+ Requires-Dist: filelock>=3.12
44
+ Requires-Dist: nvidia-ml-py>=12.535
45
+ Requires-Dist: rich>=13.0
46
+ Requires-Dist: typer>=0.12
47
+ Description-Content-Type: text/markdown
48
+
49
+ # gpu-gate
50
+
51
+ [![CI](https://github.com/jmweb-org/gpu-gate/actions/workflows/ci.yml/badge.svg)](https://github.com/jmweb-org/gpu-gate/actions/workflows/ci.yml)
52
+ [![PyPI](https://img.shields.io/pypi/v/gpu-gate.svg)](https://pypi.org/project/gpu-gate/)
53
+ [![Python](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org)
54
+ [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
55
+
56
+ Wait for a free GPU, claim it, set `CUDA_VISIBLE_DEVICES`, and run your command.
57
+
58
+ On a shared multi-GPU box without a cluster scheduler, starting a job usually
59
+ means watching `nvidia-smi`, picking a card by hand, exporting the env var, and
60
+ remembering to actually launch. `gpu-gate` is the small wait-pick-export-run
61
+ loop that does this for you, with a cooperative lock so two invocations on the
62
+ same host do not grab the same just-freed card. No daemon, no server, nothing
63
+ to administer.
64
+
65
+ ```console
66
+ $ gpu-gate run --min-free-mb 8000 -- python train.py
67
+ gpu-gate: waiting for a free GPU ...
68
+ # ... blocks until a card has >= 8 GB free, then runs train.py with
69
+ # CUDA_VISIBLE_DEVICES set to the chosen index
70
+ ```
71
+
72
+ ## Install
73
+
74
+ ```console
75
+ $ pip install gpu-gate # from PyPI, once released
76
+ $ pip install git+https://github.com/jmweb-org/gpu-gate # latest, available now
77
+ ```
78
+
79
+ It requires an NVIDIA driver at run time. The NVML binding
80
+ (`nvidia-ml-py`) is pulled in automatically; the package still installs and
81
+ imports on machines without a GPU, so it is safe to add to shared requirements.
82
+
83
+ ## Usage
84
+
85
+ ### Run a command on a free GPU
86
+
87
+ ```console
88
+ $ gpu-gate run -n 1 --min-free-mb 8000 -- python train.py --epochs 50
89
+ ```
90
+
91
+ Everything after `--` is the command. `gpu-gate` blocks until the requirements
92
+ are met, claims the chosen device(s), exports `CUDA_VISIBLE_DEVICES`, and runs
93
+ the command. Its own exit code is the command's exit code, so it drops cleanly
94
+ into scripts and CI.
95
+
96
+ Common options:
97
+
98
+ | Option | Meaning |
99
+ | --- | --- |
100
+ | `-n, --count` | Number of GPUs to claim (default 1) |
101
+ | `--min-free-mb` | Require at least this much free memory |
102
+ | `--max-util` | Skip cards busier than this percent |
103
+ | `--only 0,1` | Restrict the search to these indices |
104
+ | `--exclude 2,3` | Never pick these indices |
105
+ | `--poll` | Seconds between checks (default 5) |
106
+ | `--timeout` | Give up after N seconds (exit 124) |
107
+
108
+ ### Just wait, then use the result yourself
109
+
110
+ ```console
111
+ $ export CUDA_VISIBLE_DEVICES=$(gpu-gate wait --min-free-mb 8000)
112
+ ```
113
+
114
+ ### Inspect the current state
115
+
116
+ ```console
117
+ $ gpu-gate status
118
+ idx name free total util
119
+ 0 NVIDIA L40S 44211 MiB 46068 MiB 3%
120
+ 1 NVIDIA L40S 812 MiB 46068 MiB 97%
121
+
122
+ $ gpu-gate status --json
123
+ ```
124
+
125
+ ## Exit codes
126
+
127
+ | Code | Meaning |
128
+ | --- | --- |
129
+ | 0 | Command ran (its own code is forwarded) |
130
+ | 2 | Bad invocation (for example, no command after `--`) |
131
+ | 124 | Timed out waiting for a GPU |
132
+ | 3 | Requirements could never be met (reserved; not emitted by this version) |
133
+ | 4 | Could not read GPU state (no driver / NVML error) |
134
+
135
+ ## How selection works
136
+
137
+ A GPU is eligible when it has enough free memory, is below the utilization
138
+ ceiling, is not excluded, and is not currently locked by another `gpu-gate`
139
+ caller. Eligible cards are ranked by most free memory, then lowest
140
+ utilization, then index, and the top `--count` are chosen. The ordering is
141
+ fully deterministic.
142
+
143
+ ## Locking
144
+
145
+ While a command runs, `gpu-gate` holds an advisory file lock per claimed
146
+ device under `$GPU_GATE_LOCK_DIR` (default: `$XDG_RUNTIME_DIR/gpu-gate-locks`, falling back to the system temp directory). Other
147
+ `gpu-gate` invocations skip locked devices, which avoids the classic race where
148
+ two jobs both see the same card free at the same instant. The lock is advisory:
149
+ it coordinates `gpu-gate` callers, not arbitrary CUDA programs.
150
+
151
+ ## License
152
+
153
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,13 @@
1
+ gpu_gate/__init__.py,sha256=aheQ9bzv_7YxyEay-6KgLe-jlqHvW8tJ6VY3TSNi4OM,334
2
+ gpu_gate/__main__.py,sha256=-K0PP9Dejd0YsIKDNGCSVQizMGx1-YYAAE0DGp9q7x8,81
3
+ gpu_gate/cli.py,sha256=ELdGznx1BPq_rzzP3NZrMVQhvgXMIlqUAIpYEgLS_R8,6687
4
+ gpu_gate/lock.py,sha256=B96J2K5hHWMxXbWCggOC8wt2HdYjLwjEFRp_VfrK8Bw,2008
5
+ gpu_gate/models.py,sha256=WggqmEYyNjltLVABwFccbE50E-y04l1TFl7-jiG5e7I,1691
6
+ gpu_gate/probe.py,sha256=rmV-PPSgFMatxDTN_SAnhxGTjYEgb95oNVJ3AyYqE54,2880
7
+ gpu_gate/runner.py,sha256=Lkv6MtaTLh1lGfrXSUeWBaANjwo3sHhAhkhkzWEGRrQ,3429
8
+ gpu_gate/selector.py,sha256=RYcigKR_ZgC3LSvMILEi1v6LDE_GyRvVI937SXeuDog,2021
9
+ gpu_gate-0.2.0.dist-info/METADATA,sha256=jZVdkWP7PXozFatELomuJG2LmCSsfBgGCcEsJ9r7p50,6050
10
+ gpu_gate-0.2.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
11
+ gpu_gate-0.2.0.dist-info/entry_points.txt,sha256=mR4F0k-HdSWtzhVUc9Q4iwkMQIxuabiTjICD5jBDaN0,53
12
+ gpu_gate-0.2.0.dist-info/licenses/LICENSE,sha256=N4nJy_wSxYwULjDvuE2GupQWZSSwgOOU_HJSzuxHBsI,1071
13
+ gpu_gate-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ gpu-gate = gpu_gate.cli:entrypoint
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 José del Río
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.