gpu-gate 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpu_gate/__init__.py +15 -0
- gpu_gate/__main__.py +4 -0
- gpu_gate/cli.py +213 -0
- gpu_gate/lock.py +72 -0
- gpu_gate/models.py +61 -0
- gpu_gate/probe.py +92 -0
- gpu_gate/runner.py +109 -0
- gpu_gate/selector.py +61 -0
- gpu_gate-0.2.0.dist-info/METADATA +153 -0
- gpu_gate-0.2.0.dist-info/RECORD +13 -0
- gpu_gate-0.2.0.dist-info/WHEEL +4 -0
- gpu_gate-0.2.0.dist-info/entry_points.txt +2 -0
- gpu_gate-0.2.0.dist-info/licenses/LICENSE +21 -0
gpu_gate/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""gpu-gate: wait for a free GPU, claim it, and run a command on it."""
|
|
2
|
+
|
|
3
|
+
from gpu_gate.models import GpuStatus, Requirements, Selection
|
|
4
|
+
from gpu_gate.selector import NotEnoughGPUs, select
|
|
5
|
+
|
|
6
|
+
# Version string of this release; the CLI prints it for ``--version``.
__version__ = "0.2.0"

# Public API of the package: the data models, the selection entry point and
# its failure exception, plus the version string itself.
__all__ = ["GpuStatus", "NotEnoughGPUs", "Requirements", "Selection", "__version__", "select"]
|
gpu_gate/__main__.py
ADDED
gpu_gate/cli.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
"""Command-line interface for gpu-gate."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
import typer
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
from rich.table import Table
|
|
11
|
+
|
|
12
|
+
from gpu_gate import __version__
|
|
13
|
+
from gpu_gate.models import GpuStatus, Requirements
|
|
14
|
+
from gpu_gate.probe import NvmlProbe, Probe, ProbeError
|
|
15
|
+
from gpu_gate.runner import (
|
|
16
|
+
WaitConfig,
|
|
17
|
+
WaitTimeout,
|
|
18
|
+
run_with_gpus,
|
|
19
|
+
wait_for_selection,
|
|
20
|
+
)
|
|
21
|
+
from gpu_gate.selector import NotEnoughGPUs
|
|
22
|
+
|
|
23
|
+
# Process exit statuses used by the commands below.
EXIT_OK = 0
# NOTE(review): EXIT_NO_GPU is defined but no command in this module raises it.
EXIT_NO_GPU = 3
EXIT_PROBE = 4
EXIT_TIMEOUT = 124

# Results are written to stdout and diagnostics to stderr so that command
# output can be captured without the status messages.
_out = Console()
_err = Console(stderr=True)

# Root Typer application; the subcommands register themselves on it.
app = typer.Typer(
    help="Wait for a free GPU, claim it, and run a command on it.",
    no_args_is_help=True,
    add_completion=False,
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _requirements(
    count: int,
    min_free_mb: int,
    max_util: int,
    exclude: str | None,
    only: str | None,
) -> Requirements:
    """Translate raw CLI option values into a validated ``Requirements``.

    Args:
        count: Number of GPUs requested.
        min_free_mb: Minimum free memory per GPU, in MiB.
        max_util: Maximum allowed utilization percentage.
        exclude: Comma-separated indices to skip, or ``None``.
        only: Comma-separated indices to restrict the search to, or ``None``
            (also when empty) for no restriction.

    Returns:
        The ``Requirements`` built from the options.

    Raises:
        typer.BadParameter: If an index list contains a non-integer entry or
            a numeric option fails ``Requirements`` validation.
    """
    try:
        return Requirements(
            count=count,
            min_free_mb=min_free_mb,
            max_utilization_pct=max_util,
            exclude=_parse_indices(exclude),
            include=_parse_indices(only) if only else None,
        )
    except ValueError as exc:
        # Both int() parsing and Requirements.__post_init__ signal bad input
        # with ValueError; report it as a usage error instead of a traceback.
        raise typer.BadParameter(str(exc)) from exc
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _parse_indices(value: str | None) -> frozenset[int]:
    """Parse a comma-separated list of GPU indices, e.g. ``"0, 2"``.

    ``None`` and the empty string produce an empty set. Whitespace around
    entries is stripped and empty entries (as in ``"0,,1"``) are ignored.
    An entry that is not an integer makes ``int()`` raise ``ValueError``.
    """
    if not value:
        return frozenset()
    pieces = (raw.strip() for raw in value.split(","))
    return frozenset(int(piece) for piece in pieces if piece)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _make_probe() -> Probe:
    """Create the probe the commands query; always an ``NvmlProbe``."""
    probe: Probe = NvmlProbe()
    return probe
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _status_rows(statuses: list[GpuStatus]) -> Table:
    """Render GPU snapshots as a borderless table, one row per GPU.

    Columns are index, name, free memory, total memory and utilization;
    every column except the name is right-aligned.
    """
    table = Table(box=None, pad_edge=False)
    table.add_column("idx", justify="right")
    table.add_column("name")
    for heading in ("free", "total", "util"):
        table.add_column(heading, justify="right")

    for gpu in statuses:
        cells = (
            str(gpu.index),
            gpu.name,
            f"{gpu.memory_free_mb} MiB",
            f"{gpu.memory_total_mb} MiB",
            f"{gpu.utilization_pct}%",
        )
        table.add_row(*cells)
    return table
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _version_callback(value: bool) -> None:
    """Handle ``--version``: print the version and exit when the flag is set.

    Does nothing when ``value`` is false.
    """
    if not value:
        return
    _out.print(f"gpu-gate {__version__}")
    raise typer.Exit()
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# Root callback: it only declares the global ``--version`` option; the actual
# work is done by the subcommands registered on ``app``.
@app.callback()
def main(
    # ``is_eager=True`` asks Typer/Click to process this option ahead of the
    # others; ``_version_callback`` prints the version and exits when set.
    _version: bool = typer.Option(
        False,
        "--version",
        callback=_version_callback,
        is_eager=True,
        help="Show the version and exit.",
    ),
) -> None:
    """gpu-gate command-line interface."""
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@app.command("status")
def status(
    as_json: bool = typer.Option(False, "--json", help="Emit machine-readable JSON."),
) -> None:
    """Show the current state of every visible GPU."""

    # A probe failure is reported on stderr and mapped to EXIT_PROBE.
    try:
        gpus = _make_probe().query()
    except ProbeError as exc:
        _err.print(f"gpu-gate: {exc}")
        raise typer.Exit(EXIT_PROBE) from exc

    if not as_json:
        _out.print(_status_rows(gpus))
        return

    # Key order here is the key order of each emitted JSON object.
    keys = (
        "index",
        "name",
        "uuid",
        "memory_total_mb",
        "memory_used_mb",
        "memory_free_mb",
        "utilization_pct",
    )
    payload = [{key: getattr(gpu, key) for key in keys} for gpu in gpus]
    _out.print_json(json.dumps(payload))
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@app.command("wait")
def wait(
    count: int = typer.Option(1, "-n", "--count", help="Number of GPUs to wait for."),
    min_free_mb: int = typer.Option(0, "--min-free-mb", help="Minimum free memory."),
    max_util: int = typer.Option(100, "--max-util", help="Maximum utilization percent."),
    exclude: str | None = typer.Option(None, "--exclude", help="Indices to skip, e.g. 0,1."),
    only: str | None = typer.Option(None, "--only", help="Restrict to these indices."),
    poll: float = typer.Option(5.0, "--poll", help="Seconds between polls."),
    timeout: float | None = typer.Option(None, "--timeout", help="Give up after N seconds."),
    as_json: bool = typer.Option(False, "--json", help="Print the chosen indices as JSON."),
) -> None:
    """Block until enough GPUs are free, then print the chosen indices."""

    # Exit statuses: EXIT_TIMEOUT when --timeout elapses first, EXIT_PROBE
    # when GPU state cannot be read. On success the choice goes to stdout.
    req = _requirements(count, min_free_mb, max_util, exclude, only)
    config = WaitConfig(requirements=req, poll_interval=poll, timeout=timeout)
    try:
        selection = wait_for_selection(_make_probe(), config)
    except WaitTimeout as exc:
        _err.print(f"gpu-gate: {exc}")
        raise typer.Exit(EXIT_TIMEOUT) from exc
    except ProbeError as exc:
        _err.print(f"gpu-gate: {exc}")
        raise typer.Exit(EXIT_PROBE) from exc
    if as_json:
        _out.print_json(json.dumps({"indices": list(selection.indices)}))
    else:
        # This value is meant to be captured, e.g. CUDA_VISIBLE_DEVICES=$(...).
        # Write it as plain text so rich highlighting or wrapping can never
        # add escape sequences or line breaks to it.
        typer.echo(selection.cuda_visible_devices)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
@app.command(
    "run",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def run(
    ctx: typer.Context,
    count: int = typer.Option(1, "-n", "--count", help="Number of GPUs to claim."),
    min_free_mb: int = typer.Option(0, "--min-free-mb", help="Minimum free memory."),
    max_util: int = typer.Option(100, "--max-util", help="Maximum utilization percent."),
    exclude: str | None = typer.Option(None, "--exclude", help="Indices to skip."),
    only: str | None = typer.Option(None, "--only", help="Restrict to these indices."),
    poll: float = typer.Option(5.0, "--poll", help="Seconds between polls."),
    timeout: float | None = typer.Option(None, "--timeout", help="Give up after N seconds."),
    quiet: bool = typer.Option(False, "--quiet", help="Do not announce the choice."),
) -> None:
    """Wait for GPUs, set CUDA_VISIBLE_DEVICES, and run COMMAND.

    Put the command after a literal ``--``, for example:

        gpu-gate run -n 1 --min-free-mb 8000 -- python train.py
    """

    # The extra arguments collected by the context are the command to run.
    argv = list(ctx.args)
    if not argv:
        _err.print("gpu-gate: no command given; pass it after --")
        raise typer.Exit(2)

    config = WaitConfig(
        requirements=_requirements(count, min_free_mb, max_util, exclude, only),
        poll_interval=poll,
        timeout=timeout,
    )

    def announce(_exc: NotEnoughGPUs) -> None:
        # Passed as ``on_wait``: reports that the wait is still in progress.
        if quiet:
            return
        _err.print("gpu-gate: waiting for a free GPU ...")

    try:
        exit_code = run_with_gpus(_make_probe(), config, argv, on_wait=announce)
    except (WaitTimeout, ProbeError) as exc:
        _err.print(f"gpu-gate: {exc}")
        failure = EXIT_TIMEOUT if isinstance(exc, WaitTimeout) else EXIT_PROBE
        raise typer.Exit(failure) from exc

    # Forward the child's exit status as our own.
    raise typer.Exit(exit_code)
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def entrypoint() -> None:
    """Run the Typer app, turning Ctrl-C into a short message and exit 130."""
    try:
        app()
    except KeyboardInterrupt:  # pragma: no cover - interactive only
        notice = "gpu-gate: interrupted"
        print(notice, file=sys.stderr)
        raise SystemExit(130) from None
|
gpu_gate/lock.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Cooperative, host-local locks so two gpu-gate runs do not grab the same
|
|
2
|
+
just-freed card.
|
|
3
|
+
|
|
4
|
+
A lock is a small file under a shared directory (``$GPU_GATE_LOCK_DIR`` or a
|
|
5
|
+
default under ``$XDG_RUNTIME_DIR`` or the system temp directory). Holding the lock means "I am about to use this GPU";
|
|
6
|
+
selection skips devices whose lock is currently held by a live process.
|
|
7
|
+
The scheme is advisory: it only coordinates other gpu-gate callers, not
|
|
8
|
+
arbitrary CUDA programs.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
import tempfile
|
|
15
|
+
from contextlib import ExitStack
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
from filelock import FileLock, Timeout
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def lock_dir() -> Path:
    """Return the directory holding the lock files, creating it if needed.

    ``$GPU_GATE_LOCK_DIR`` is used when set and non-empty. Otherwise the
    directory is ``gpu-gate-locks`` under ``$XDG_RUNTIME_DIR``, or under the
    system temp directory when that variable is unset or empty.
    """
    override = os.environ.get("GPU_GATE_LOCK_DIR")
    if override:
        directory = Path(override)
    else:
        root = os.environ.get("XDG_RUNTIME_DIR") or tempfile.gettempdir()
        directory = Path(root, "gpu-gate-locks")
    directory.mkdir(parents=True, exist_ok=True)
    return directory
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _lock_path(index: int) -> Path:
    """Return the lock file path for GPU ``index`` inside ``lock_dir()``."""
    directory = lock_dir()
    return directory / f"gpu{index}.lock"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def is_locked(index: int) -> bool:
    """Report whether the lock for GPU ``index`` is held by someone else.

    The check is a non-blocking acquire: if it times out the lock is held,
    otherwise it is released again immediately and the GPU counts as free.
    """
    trial = FileLock(str(_lock_path(index)), timeout=0)
    try:
        trial.acquire()
    except Timeout:
        return True
    trial.release()
    return False
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class GpuClaim:
    """Hold locks for a set of GPU indices for the lifetime of the context.

    Entering acquires one non-blocking file lock per index, in the order
    given. Leaving the context releases every lock that was acquired.
    """

    def __init__(self, indices: tuple[int, ...]) -> None:
        """Remember ``indices``; no lock is taken until the context is entered."""
        self.indices = indices
        # Holds the release callbacks while the claim is active, else None.
        self._stack: ExitStack | None = None

    def __enter__(self) -> GpuClaim:
        """Acquire all locks, or none.

        Raises:
            filelock.Timeout: If any lock is already held. Locks acquired so
                far are released first.
        """
        with ExitStack() as stack:
            for index in self.indices:
                lock = FileLock(str(_lock_path(index)), timeout=0)
                lock.acquire()
                stack.callback(lock.release)
            # Reached only when every lock was acquired: move the callbacks
            # out so leaving this ``with`` releases nothing. On any exception
            # (Timeout, OSError, ...) the ``with`` unwinds and releases the
            # locks already taken, so a failed claim never leaks a lock.
            self._stack = stack.pop_all()
        return self

    def __exit__(self, *exc: object) -> None:
        """Release every held lock; a no-op if the claim is not active."""
        if self._stack is not None:
            self._stack.close()
            self._stack = None
|
gpu_gate/models.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Plain data structures shared across gpu-gate.
|
|
2
|
+
|
|
3
|
+
These types carry no behaviour and no NVML dependency, which keeps the
|
|
4
|
+
selection logic in :mod:`gpu_gate.selector` pure and trivially testable.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True, slots=True)
|
|
13
|
+
class GpuStatus:
|
|
14
|
+
"""A point-in-time snapshot of a single GPU."""
|
|
15
|
+
|
|
16
|
+
index: int
|
|
17
|
+
name: str
|
|
18
|
+
uuid: str
|
|
19
|
+
memory_total_mb: int
|
|
20
|
+
memory_used_mb: int
|
|
21
|
+
utilization_pct: int
|
|
22
|
+
|
|
23
|
+
@property
|
|
24
|
+
def memory_free_mb(self) -> int:
|
|
25
|
+
return max(0, self.memory_total_mb - self.memory_used_mb)
|
|
26
|
+
|
|
27
|
+
@property
|
|
28
|
+
def memory_used_pct(self) -> float:
|
|
29
|
+
if self.memory_total_mb <= 0:
|
|
30
|
+
return 0.0
|
|
31
|
+
return 100.0 * self.memory_used_mb / self.memory_total_mb
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass(frozen=True, slots=True)
|
|
35
|
+
class Requirements:
|
|
36
|
+
"""What a caller needs before a GPU is considered usable."""
|
|
37
|
+
|
|
38
|
+
count: int = 1
|
|
39
|
+
min_free_mb: int = 0
|
|
40
|
+
max_utilization_pct: int = 100
|
|
41
|
+
exclude: frozenset[int] = field(default_factory=frozenset)
|
|
42
|
+
include: frozenset[int] | None = None
|
|
43
|
+
|
|
44
|
+
def __post_init__(self) -> None:
|
|
45
|
+
if self.count < 1:
|
|
46
|
+
raise ValueError("count must be >= 1")
|
|
47
|
+
if self.min_free_mb < 0:
|
|
48
|
+
raise ValueError("min_free_mb must be >= 0")
|
|
49
|
+
if not 0 <= self.max_utilization_pct <= 100:
|
|
50
|
+
raise ValueError("max_utilization_pct must be between 0 and 100")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass(frozen=True, slots=True)
|
|
54
|
+
class Selection:
|
|
55
|
+
"""The result of a successful selection."""
|
|
56
|
+
|
|
57
|
+
indices: tuple[int, ...]
|
|
58
|
+
|
|
59
|
+
@property
|
|
60
|
+
def cuda_visible_devices(self) -> str:
|
|
61
|
+
return ",".join(str(i) for i in self.indices)
|
gpu_gate/probe.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""Read GPU state from the system.
|
|
2
|
+
|
|
3
|
+
The real implementation talks to NVIDIA's NVML through ``pynvml``. NVML is
|
|
4
|
+
imported lazily so the package installs and imports on machines without a
|
|
5
|
+
driver; the dependency only matters when you actually probe hardware.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Protocol
|
|
11
|
+
|
|
12
|
+
from gpu_gate.models import GpuStatus
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Probe(Protocol):
    """Structural interface for any source of GPU status snapshots."""

    def query(self) -> list[GpuStatus]:
        """Return the current status of the GPUs this probe can see."""
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ProbeError(RuntimeError):
    """Signals that the state of the GPUs could not be obtained."""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class NvmlProbe:
    """Query local NVIDIA GPUs via NVML.

    A single instance keeps NVML initialized between calls; use it as a
    context manager (or call :meth:`close`) to release the handle.
    """

    def __init__(self) -> None:
        # The imported ``pynvml`` module once NVML is initialized, else None.
        self._nvml = None

    def _ensure_init(self):
        """Import ``pynvml`` and initialize NVML once; return the module.

        Raises:
            ProbeError: If ``pynvml`` cannot be imported or NVML fails to
                initialize.
        """
        if self._nvml is not None:
            return self._nvml
        try:
            import pynvml
        except ImportError as exc:  # pragma: no cover - import guard
            raise ProbeError(
                "pynvml is not installed; install gpu-gate with its default "
                "dependencies to read GPU state"
            ) from exc
        try:
            pynvml.nvmlInit()
        except Exception as exc:  # pragma: no cover - needs a driver
            raise ProbeError("could not initialize NVML; is an NVIDIA driver present?") from exc
        self._nvml = pynvml
        return pynvml

    def query(self) -> list[GpuStatus]:
        """Return one ``GpuStatus`` per device NVML enumerates.

        Memory figures are converted from bytes to whole MiB.

        Raises:
            ProbeError: If NVML cannot be initialized or any NVML call fails
                while reading device state.
        """
        pynvml = self._ensure_init()
        out: list[GpuStatus] = []
        try:
            count = pynvml.nvmlDeviceGetCount()
            for index in range(count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(index)
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                out.append(
                    GpuStatus(
                        index=index,
                        name=_as_text(pynvml.nvmlDeviceGetName(handle)),
                        uuid=_as_text(pynvml.nvmlDeviceGetUUID(handle)),
                        memory_total_mb=int(mem.total // (1024 * 1024)),
                        memory_used_mb=int(mem.used // (1024 * 1024)),
                        utilization_pct=int(util.gpu),
                    )
                )
        except pynvml.NVMLError as exc:
            # Callers only handle ProbeError; without this a device that is
            # lost or unsupported mid-query would surface as a raw traceback.
            raise ProbeError(f"NVML query failed: {exc}") from exc
        return out

    def close(self) -> None:
        """Shut NVML down if it was initialized; safe to call repeatedly."""
        if self._nvml is not None:
            try:
                self._nvml.nvmlShutdown()
            finally:
                # Forget the module even if shutdown raised.
                self._nvml = None

    def __enter__(self) -> NvmlProbe:
        """Initialize NVML eagerly and return the probe."""
        self._ensure_init()
        return self

    def __exit__(self, *exc: object) -> None:
        """Release NVML when the context ends."""
        self.close()
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _as_text(value: object) -> str:
    """Coerce an NVML string result to ``str``.

    ``bytes`` are decoded as UTF-8 with undecodable bytes replaced; any other
    value is passed through ``str()``.
    """
    return value.decode("utf-8", "replace") if isinstance(value, bytes) else str(value)
|
gpu_gate/runner.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Wait for eligible GPUs, claim them, and hand control to the command.
|
|
2
|
+
|
|
3
|
+
The wait loop takes its clock and sleep function as arguments so tests can
|
|
4
|
+
drive timeouts deterministically without real delays.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
import sys
|
|
11
|
+
import time
|
|
12
|
+
from collections.abc import Callable, Sequence
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
|
|
15
|
+
from gpu_gate.lock import GpuClaim, is_locked
|
|
16
|
+
from gpu_gate.models import GpuStatus, Requirements, Selection
|
|
17
|
+
from gpu_gate.probe import Probe
|
|
18
|
+
from gpu_gate.selector import NotEnoughGPUs, select
|
|
19
|
+
|
|
20
|
+
# Injectable time sources for the wait loop: ``Sleep`` pauses for a number of
# seconds and ``Clock`` returns the current time in seconds.
Sleep = Callable[[float], None]
Clock = Callable[[], float]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class WaitTimeout(Exception):
    """Signals that the deadline passed before enough GPUs became eligible.

    ``seconds`` keeps the timeout value that was exceeded.
    """

    def __init__(self, seconds: float) -> None:
        message = f"timed out after {seconds:g}s waiting for a free GPU"
        super().__init__(message)
        self.seconds = seconds
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass(frozen=True, slots=True)
|
|
33
|
+
class WaitConfig:
|
|
34
|
+
requirements: Requirements
|
|
35
|
+
poll_interval: float = 5.0
|
|
36
|
+
timeout: float | None = None
|
|
37
|
+
respect_locks: bool = True
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _without_locked(statuses: Sequence[GpuStatus]) -> list[GpuStatus]:
    """Return the statuses whose GPU lock is not currently held, in order."""
    unlocked: list[GpuStatus] = []
    for gpu in statuses:
        if is_locked(gpu.index):
            continue
        unlocked.append(gpu)
    return unlocked
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def wait_for_selection(
    probe: Probe,
    config: WaitConfig,
    *,
    clock: Clock = time.monotonic,
    sleep: Sleep = time.sleep,
    on_wait: Callable[[NotEnoughGPUs], None] | None = None,
) -> Selection:
    """Poll ``probe`` until ``config.requirements`` can be met, then return it.

    Args:
        probe: Source of GPU status snapshots.
        config: Requirements, poll interval, timeout and lock handling.
        clock: Returns the current time in seconds; injectable for tests.
        sleep: Pauses for the given number of seconds; injectable for tests.
        on_wait: Called with the ``NotEnoughGPUs`` error after every
            unsuccessful poll.

    Returns:
        The first selection that satisfies the requirements.

    Raises:
        WaitTimeout: If a timeout is set and elapses first.
    """

    start = clock()
    while True:
        statuses = probe.query()
        if config.respect_locks:
            statuses = _without_locked(statuses)
        try:
            return select(statuses, config.requirements)
        except NotEnoughGPUs as exc:
            if on_wait is not None:
                on_wait(exc)
            delay = config.poll_interval
            if config.timeout is not None:
                remaining = config.timeout - (clock() - start)
                if remaining <= 0:
                    raise WaitTimeout(config.timeout) from exc
                # Never sleep past the deadline: a poll interval longer than
                # the time left would otherwise overshoot the timeout by up
                # to one full interval. One last poll happens at the deadline.
                delay = min(delay, remaining)
        sleep(delay)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def build_child_env(selection: Selection, base: dict[str, str] | None = None) -> dict[str, str]:
    """Return an environment with ``CUDA_VISIBLE_DEVICES`` set to the choice.

    Args:
        selection: The chosen GPUs.
        base: Environment to start from; defaults to ``os.environ``. It is
            copied, never modified.

    Returns:
        A new dict with ``CUDA_VISIBLE_DEVICES`` set to the selected indices
        and ``CUDA_DEVICE_ORDER`` set to ``PCI_BUS_ID``.
    """

    env = dict(os.environ if base is None else base)
    env["CUDA_VISIBLE_DEVICES"] = selection.cuda_visible_devices
    # The indices are NVML enumeration indices. CUDA numbers devices
    # fastest-first by default, so on a host with mixed cards the same index
    # can mean a different GPU unless CUDA is told to use PCI bus order.
    env["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    return env
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def run_with_gpus(
    probe: Probe,
    config: WaitConfig,
    command: Sequence[str],
    *,
    clock: Clock = time.monotonic,
    sleep: Sleep = time.sleep,
    exec_fn: Callable[[Sequence[str], dict[str, str]], int] | None = None,
    on_wait: Callable[[NotEnoughGPUs], None] | None = None,
) -> int:
    """Wait for eligible GPUs, lock them, run ``command``, return its exit code.

    The locks are held for as long as the command runs. ``exec_fn`` replaces
    the default subprocess launcher; it receives the command and the child
    environment.

    Raises:
        ValueError: If ``command`` is empty.
        WaitTimeout: If the configured timeout elapses while waiting.

    The exception ``GpuClaim`` raises when a chosen device got locked between
    selection and claiming is not handled here and propagates to the caller.
    """

    if not command:
        raise ValueError("command must not be empty")
    launch = exec_fn if exec_fn else _spawn
    chosen = wait_for_selection(probe, config, clock=clock, sleep=sleep, on_wait=on_wait)
    with GpuClaim(chosen.indices):
        return launch(command, build_child_env(chosen))
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _spawn(command: Sequence[str], env: dict[str, str]) -> int:
    """Run ``command`` as a child process with ``env`` and return its exit code.

    Launch failures are reported on stderr and mapped to the usual shell
    statuses instead of raising: 127 when the program does not exist and 126
    when it exists but cannot be executed.
    """
    import subprocess

    try:
        completed = subprocess.run(list(command), env=env, check=False)  # noqa: S603
    except FileNotFoundError:
        print(f"gpu-gate: command not found: {command[0]}", file=sys.stderr)
        return 127
    except PermissionError:
        # e.g. a script without the executable bit; without this branch the
        # error would escape as a traceback.
        print(f"gpu-gate: permission denied: {command[0]}", file=sys.stderr)
        return 126
    return completed.returncode
|
gpu_gate/selector.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Pure GPU selection logic.
|
|
2
|
+
|
|
3
|
+
Given a list of :class:`~gpu_gate.models.GpuStatus` and a set of
|
|
4
|
+
:class:`~gpu_gate.models.Requirements`, decide which devices to claim.
|
|
5
|
+
No NVML, no I/O, no global state, so it can be exercised exhaustively
|
|
6
|
+
in unit tests without a GPU present.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from collections.abc import Iterable
|
|
12
|
+
|
|
13
|
+
from gpu_gate.models import GpuStatus, Requirements, Selection
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class NotEnoughGPUs(Exception):
    """Signals that fewer GPUs qualified than the caller asked for.

    ``requested`` and ``available`` carry the two counts.
    """

    def __init__(self, requested: int, available: int) -> None:
        message = f"requested {requested} GPU(s) but only {available} met the requirements"
        super().__init__(message)
        self.requested = requested
        self.available = available
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def eligible(status: GpuStatus, req: Requirements) -> bool:
    """Decide whether one GPU meets every requirement.

    A GPU qualifies when it is in ``req.include`` (if that is set), is not in
    ``req.exclude``, has at least ``req.min_free_mb`` MiB free, and its
    utilization does not exceed ``req.max_utilization_pct``.
    """
    permitted = req.include is None or status.index in req.include
    return (
        permitted
        and status.index not in req.exclude
        and not status.memory_free_mb < req.min_free_mb
        and not status.utilization_pct > req.max_utilization_pct
    )
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def rank(statuses: Iterable[GpuStatus]) -> list[GpuStatus]:
    """Return the GPUs sorted best-first.

    More free memory wins; ties go to the lower utilization, and remaining
    ties to the lower index, which makes the order deterministic.
    """

    def preference(gpu: GpuStatus) -> tuple[int, int, int]:
        return (-gpu.memory_free_mb, gpu.utilization_pct, gpu.index)

    ordered = list(statuses)
    ordered.sort(key=preference)
    return ordered
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def select(statuses: Iterable[GpuStatus], req: Requirements) -> Selection:
    """Pick the ``req.count`` best GPUs among those that satisfy ``req``.

    The chosen indices are returned in ascending order.

    Raises:
        NotEnoughGPUs: If fewer than ``req.count`` GPUs are eligible.
    """
    usable = [gpu for gpu in statuses if eligible(gpu, req)]
    ranked = rank(usable)
    available = len(ranked)
    if available < req.count:
        raise NotEnoughGPUs(requested=req.count, available=available)
    picked = sorted(gpu.index for gpu in ranked[: req.count])
    return Selection(indices=tuple(picked))
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gpu-gate
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Wait for a free GPU, claim it, and run a command on it.
|
|
5
|
+
Project-URL: Homepage, https://github.com/jmweb-org/gpu-gate
|
|
6
|
+
Project-URL: Repository, https://github.com/jmweb-org/gpu-gate
|
|
7
|
+
Project-URL: Issues, https://github.com/jmweb-org/gpu-gate/issues
|
|
8
|
+
Author: José del Río
|
|
9
|
+
License: MIT License
|
|
10
|
+
|
|
11
|
+
Copyright (c) 2026 José del Río
|
|
12
|
+
|
|
13
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
14
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
15
|
+
in the Software without restriction, including without limitation the rights
|
|
16
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
17
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
18
|
+
furnished to do so, subject to the following conditions:
|
|
19
|
+
|
|
20
|
+
The above copyright notice and this permission notice shall be included in all
|
|
21
|
+
copies or substantial portions of the Software.
|
|
22
|
+
|
|
23
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
24
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
25
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
26
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
27
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
28
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
29
|
+
SOFTWARE.
|
|
30
|
+
License-File: LICENSE
|
|
31
|
+
Keywords: cli,cuda,gpu,nvidia,nvml,scheduler
|
|
32
|
+
Classifier: Development Status :: 4 - Beta
|
|
33
|
+
Classifier: Environment :: GPU :: NVIDIA CUDA
|
|
34
|
+
Classifier: Intended Audience :: Developers
|
|
35
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
36
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
40
|
+
Classifier: Topic :: System :: Distributed Computing
|
|
41
|
+
Classifier: Topic :: Utilities
|
|
42
|
+
Requires-Python: >=3.10
|
|
43
|
+
Requires-Dist: filelock>=3.12
|
|
44
|
+
Requires-Dist: nvidia-ml-py>=12.535
|
|
45
|
+
Requires-Dist: rich>=13.0
|
|
46
|
+
Requires-Dist: typer>=0.12
|
|
47
|
+
Description-Content-Type: text/markdown
|
|
48
|
+
|
|
49
|
+
# gpu-gate
|
|
50
|
+
|
|
51
|
+
[CI](https://github.com/jmweb-org/gpu-gate/actions/workflows/ci.yml)
|
|
52
|
+
[PyPI](https://pypi.org/project/gpu-gate/)
|
|
53
|
+
[Python](https://www.python.org)
|
|
54
|
+
[License: MIT](LICENSE)
|
|
55
|
+
|
|
56
|
+
Wait for a free GPU, claim it, set `CUDA_VISIBLE_DEVICES`, and run your command.
|
|
57
|
+
|
|
58
|
+
On a shared multi-GPU box without a cluster scheduler, starting a job usually
|
|
59
|
+
means watching `nvidia-smi`, picking a card by hand, exporting the env var, and
|
|
60
|
+
remembering to actually launch. `gpu-gate` is the small wait-pick-export-run
|
|
61
|
+
loop that does this for you, with a cooperative lock so two invocations on the
|
|
62
|
+
same host do not grab the same just-freed card. No daemon, no server, nothing
|
|
63
|
+
to administer.
|
|
64
|
+
|
|
65
|
+
```console
|
|
66
|
+
$ gpu-gate run --min-free-mb 8000 -- python train.py
|
|
67
|
+
gpu-gate: waiting for a free GPU ...
|
|
68
|
+
# ... blocks until a card has >= 8 GB free, then runs train.py with
|
|
69
|
+
# CUDA_VISIBLE_DEVICES set to the chosen index
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Install
|
|
73
|
+
|
|
74
|
+
```console
|
|
75
|
+
$ pip install gpu-gate # from PyPI, once released
|
|
76
|
+
$ pip install git+https://github.com/jmweb-org/gpu-gate # latest, available now
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
It requires an NVIDIA driver at run time. The NVML binding
|
|
80
|
+
(`nvidia-ml-py`) is pulled in automatically; the package still installs and
|
|
81
|
+
imports on machines without a GPU, so it is safe to add to shared requirements.
|
|
82
|
+
|
|
83
|
+
## Usage
|
|
84
|
+
|
|
85
|
+
### Run a command on a free GPU
|
|
86
|
+
|
|
87
|
+
```console
|
|
88
|
+
$ gpu-gate run -n 1 --min-free-mb 8000 -- python train.py --epochs 50
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Everything after `--` is the command. `gpu-gate` blocks until the requirements
|
|
92
|
+
are met, claims the chosen device(s), exports `CUDA_VISIBLE_DEVICES`, and runs
|
|
93
|
+
the command. Its own exit code is the command's exit code, so it drops cleanly
|
|
94
|
+
into scripts and CI.
|
|
95
|
+
|
|
96
|
+
Common options:
|
|
97
|
+
|
|
98
|
+
| Option | Meaning |
|
|
99
|
+
| --- | --- |
|
|
100
|
+
| `-n, --count` | Number of GPUs to claim (default 1) |
|
|
101
|
+
| `--min-free-mb` | Require at least this much free memory |
|
|
102
|
+
| `--max-util` | Skip cards busier than this percent |
|
|
103
|
+
| `--only 0,1` | Restrict the search to these indices |
|
|
104
|
+
| `--exclude 2,3` | Never pick these indices |
|
|
105
|
+
| `--poll` | Seconds between checks (default 5) |
|
|
106
|
+
| `--timeout` | Give up after N seconds (exit 124) |
|
|
107
|
+
|
|
108
|
+
### Just wait, then use the result yourself
|
|
109
|
+
|
|
110
|
+
```console
|
|
111
|
+
$ export CUDA_VISIBLE_DEVICES=$(gpu-gate wait --min-free-mb 8000)
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Inspect the current state
|
|
115
|
+
|
|
116
|
+
```console
|
|
117
|
+
$ gpu-gate status
|
|
118
|
+
idx name free total util
|
|
119
|
+
0 NVIDIA L40S 44211 MiB 46068 MiB 3%
|
|
120
|
+
1 NVIDIA L40S 812 MiB 46068 MiB 97%
|
|
121
|
+
|
|
122
|
+
$ gpu-gate status --json
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## Exit codes
|
|
126
|
+
|
|
127
|
+
| Code | Meaning |
|
|
128
|
+
| --- | --- |
|
|
129
|
+
| 0 | Command ran (its own code is forwarded) |
|
|
130
|
+
| 2 | Bad invocation (for example, no command after `--`) |
|
|
131
|
+
| 124 | Timed out waiting for a GPU |
|
|
132
|
+
| 3 | Requirements could never be met (reserved; no command currently returns it) |
|
|
133
|
+
| 4 | Could not read GPU state (no driver / NVML error) |
|
|
134
|
+
|
|
135
|
+
## How selection works
|
|
136
|
+
|
|
137
|
+
A GPU is eligible when it has enough free memory, is below the utilization
|
|
138
|
+
ceiling, is not excluded, and is not currently locked by another `gpu-gate`
|
|
139
|
+
caller. Eligible cards are ranked by most free memory, then lowest
|
|
140
|
+
utilization, then index, and the top `--count` are chosen. The ordering is
|
|
141
|
+
fully deterministic.
|
|
142
|
+
|
|
143
|
+
## Locking
|
|
144
|
+
|
|
145
|
+
While a command runs, `gpu-gate` holds an advisory file lock per claimed
|
|
146
|
+
device under `$GPU_GATE_LOCK_DIR` (by default `gpu-gate-locks` inside `$XDG_RUNTIME_DIR`, or the system temp directory when that is unset). Other
|
|
147
|
+
`gpu-gate` invocations skip locked devices, which avoids the classic race where
|
|
148
|
+
two jobs both see the same card free at the same instant. The lock is advisory:
|
|
149
|
+
it coordinates `gpu-gate` callers, not arbitrary CUDA programs.
|
|
150
|
+
|
|
151
|
+
## License
|
|
152
|
+
|
|
153
|
+
MIT. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
gpu_gate/__init__.py,sha256=aheQ9bzv_7YxyEay-6KgLe-jlqHvW8tJ6VY3TSNi4OM,334
|
|
2
|
+
gpu_gate/__main__.py,sha256=-K0PP9Dejd0YsIKDNGCSVQizMGx1-YYAAE0DGp9q7x8,81
|
|
3
|
+
gpu_gate/cli.py,sha256=ELdGznx1BPq_rzzP3NZrMVQhvgXMIlqUAIpYEgLS_R8,6687
|
|
4
|
+
gpu_gate/lock.py,sha256=B96J2K5hHWMxXbWCggOC8wt2HdYjLwjEFRp_VfrK8Bw,2008
|
|
5
|
+
gpu_gate/models.py,sha256=WggqmEYyNjltLVABwFccbE50E-y04l1TFl7-jiG5e7I,1691
|
|
6
|
+
gpu_gate/probe.py,sha256=rmV-PPSgFMatxDTN_SAnhxGTjYEgb95oNVJ3AyYqE54,2880
|
|
7
|
+
gpu_gate/runner.py,sha256=Lkv6MtaTLh1lGfrXSUeWBaANjwo3sHhAhkhkzWEGRrQ,3429
|
|
8
|
+
gpu_gate/selector.py,sha256=RYcigKR_ZgC3LSvMILEi1v6LDE_GyRvVI937SXeuDog,2021
|
|
9
|
+
gpu_gate-0.2.0.dist-info/METADATA,sha256=jZVdkWP7PXozFatELomuJG2LmCSsfBgGCcEsJ9r7p50,6050
|
|
10
|
+
gpu_gate-0.2.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
11
|
+
gpu_gate-0.2.0.dist-info/entry_points.txt,sha256=mR4F0k-HdSWtzhVUc9Q4iwkMQIxuabiTjICD5jBDaN0,53
|
|
12
|
+
gpu_gate-0.2.0.dist-info/licenses/LICENSE,sha256=N4nJy_wSxYwULjDvuE2GupQWZSSwgOOU_HJSzuxHBsI,1071
|
|
13
|
+
gpu_gate-0.2.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 José del Río
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|