gpusched 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpusched-0.3.0/LICENSE +21 -0
- gpusched-0.3.0/PKG-INFO +320 -0
- gpusched-0.3.0/README.md +299 -0
- gpusched-0.3.0/pyproject.toml +34 -0
- gpusched-0.3.0/setup.cfg +4 -0
- gpusched-0.3.0/src/gpusched/__init__.py +13 -0
- gpusched-0.3.0/src/gpusched/allocation.py +142 -0
- gpusched-0.3.0/src/gpusched/backend.py +130 -0
- gpusched-0.3.0/src/gpusched/cli.py +112 -0
- gpusched-0.3.0/src/gpusched/jobspec.py +161 -0
- gpusched-0.3.0/src/gpusched/journal.py +106 -0
- gpusched-0.3.0/src/gpusched/scheduler.py +568 -0
- gpusched-0.3.0/src/gpusched/simjob.py +63 -0
- gpusched-0.3.0/src/gpusched/testing.py +86 -0
- gpusched-0.3.0/src/gpusched.egg-info/PKG-INFO +320 -0
- gpusched-0.3.0/src/gpusched.egg-info/SOURCES.txt +23 -0
- gpusched-0.3.0/src/gpusched.egg-info/dependency_links.txt +1 -0
- gpusched-0.3.0/src/gpusched.egg-info/entry_points.txt +2 -0
- gpusched-0.3.0/src/gpusched.egg-info/requires.txt +3 -0
- gpusched-0.3.0/src/gpusched.egg-info/top_level.txt +1 -0
- gpusched-0.3.0/tests/test_allocation.py +85 -0
- gpusched-0.3.0/tests/test_jobspec.py +46 -0
- gpusched-0.3.0/tests/test_live_queue.py +237 -0
- gpusched-0.3.0/tests/test_scheduler.py +171 -0
- gpusched-0.3.0/tests/test_spike_buffer.py +136 -0
gpusched-0.3.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 gpusched contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
gpusched-0.3.0/PKG-INFO
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gpusched
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: VRAM-aware single-node GPU job scheduler: queue shell commands, place them by free GPU memory, and verify declared-vs-actual VRAM per job.
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/ceruleane/gpusched
|
|
7
|
+
Project-URL: Issues, https://github.com/ceruleane/gpusched/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/ceruleane/gpusched/blob/main/CHANGELOG.md
|
|
9
|
+
Keywords: gpu,scheduler,vram,nvidia,queue,cuda
|
|
10
|
+
Classifier: Environment :: Console
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: System :: Distributed Computing
|
|
15
|
+
Requires-Python: >=3.10
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
# gpusched
|
|
23
|
+
|
|
24
|
+
A VRAM-aware GPU job scheduler for a single machine. You write shell commands
|
|
25
|
+
in a text file, optionally declaring how much GPU memory each needs; gpusched
|
|
26
|
+
runs them, placing each job on a GPU with enough free memory the moment one
|
|
27
|
+
opens up, measures how much VRAM each job actually used, and tells you when
|
|
28
|
+
your declarations were wrong.
|
|
29
|
+
|
|
30
|
+
```
|
|
31
|
+
$ cat jobs.txt
|
|
32
|
+
[vram=18G] python train.py --config a.yaml
|
|
33
|
+
[vram=18G] python train.py --config b.yaml
|
|
34
|
+
python preprocess.py
|
|
35
|
+
|
|
36
|
+
$ gpusched jobs.txt --watch -v
|
|
37
|
+
[14:02:10] job 1 started on gpu [0] (1/3 dispatched, declared 18432 MiB/gpu) — ...
|
|
38
|
+
[14:02:10] job 2 started on gpu [1] (2/3 dispatched, declared 18432 MiB/gpu) — ...
|
|
39
|
+
[14:31:44] job 1 finished [OK] in 1774s — peak vram gpu0:17910 MiB | declared 18432 MiB → within ±10% | avg gpu util 96%
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## What this is, and what it is not
|
|
43
|
+
|
|
44
|
+
gpusched is a **single-node, single-user research tool**: roughly a thousand
|
|
45
|
+
lines you can read in an afternoon, with no daemon, no database, and zero
|
|
46
|
+
Python dependencies. It exists for one workflow — you have a box with a few
|
|
47
|
+
GPUs, a pile of training/eval commands, and you want them to run unattended
|
|
48
|
+
with minimal GPU idle time and without OOM-ing each other.
|
|
49
|
+
|
|
50
|
+
It is **not** a cluster scheduler, and several of its guarantees are honest
|
|
51
|
+
best-effort rather than enforcement:
|
|
52
|
+
|
|
53
|
+
- **Declarations are advisory.** gpusched places jobs based on what you
|
|
54
|
+
declare and warns loudly when reality diverges, but it cannot cap a
|
|
55
|
+
process's VRAM (nothing can, short of MIG partitions or MPS memory limits). A job that blows
|
|
56
|
+
through its declaration can still OOM a neighbor — you get a warning, not
|
|
57
|
+
protection. `--exclusive` (one scheduled job per GPU) is the zero-risk mode.
|
|
58
|
+
- **It polls.** GPU state is sampled every `--poll` seconds (default 5) via
|
|
59
|
+
`nvidia-smi`. VRAM spikes shorter than the poll interval are invisible to
|
|
60
|
+
both placement and peak reporting.
|
|
61
|
+
- **Co-locating compute-heavy jobs can be slower than serializing them.**
|
|
62
|
+
Packing optimizes VRAM occupancy, not throughput. Two trainers that each
|
|
63
|
+
saturate the SMs will roughly halve each other's speed; packing pays off
|
|
64
|
+
for memory-light or compute-light neighbors (eval scripts, dataloader-bound
|
|
65
|
+
jobs, inference). The per-job utilization report helps you see which case
|
|
66
|
+
you're in.
|
|
67
|
+
- **Per-job VRAM attribution can fail in some container setups** where
|
|
68
|
+
`nvidia-smi` reports host-namespace PIDs that don't match the container's.
|
|
69
|
+
gpusched then reports the peak as `n/a` rather than a wrong number;
|
|
70
|
+
placement still works because it uses device-level totals.
|
|
71
|
+
- **Testing honesty:** the test suite (65 tests) exercises the full scheduler
|
|
72
|
+
against simulated GPU backends and real subprocesses; the `nvidia-smi`
|
|
73
|
+
parsing layer is straightforward but only thinly tested. Do a small smoke run on your
|
|
74
|
+
hardware before trusting it with a week of compute.
|
|
75
|
+
|
|
76
|
+
## If your needs are bigger than this
|
|
77
|
+
|
|
78
|
+
Use the right tool instead of stretching this one:
|
|
79
|
+
|
|
80
|
+
| Need | Look at |
|
|
81
|
+
|---|---|
|
|
82
|
+
| Multiple machines, multiple users, fairness, accounting | Slurm (or PBS/LSF) |
|
|
83
|
+
| Hyperparameter optimization with early stopping / ASHA / PBT | Ray Tune, Optuna, W&B Sweeps |
|
|
84
|
+
| Distributed training orchestration | Ray, torchrun + Slurm, Kubernetes + device plugins |
|
|
85
|
+
| Workflow DAGs (job B consumes job A's output) | Snakemake, Make, Airflow |
|
|
86
|
+
| Hard VRAM/compute isolation between jobs | NVIDIA MIG (partitioning), MPS (limits) |
|
|
87
|
+
| Just a per-GPU FIFO queue, even simpler than this | task-spooler (`ts`), simple_gpu_scheduler |
|
|
88
|
+
|
|
89
|
+
For sweeps specifically, the intended pattern is: a 10-line script (or
|
|
90
|
+
Optuna in ask-and-tell mode) *generates* the jobs file; gpusched stays a dumb
|
|
91
|
+
command queue. The moment you want trials stopped early based on metrics, you
|
|
92
|
+
have outgrown this tool — that requires bidirectional communication with
|
|
93
|
+
running jobs, which is deliberately out of scope.
|
|
94
|
+
|
|
95
|
+
## Install
|
|
96
|
+
|
|
97
|
+
```
|
|
98
|
+
uv tool install gpusched # isolated env, `gpusched` on PATH
|
|
99
|
+
# or: pip install gpusched
|
|
100
|
+
# or, from a clone: uv tool install .
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Requires Python ≥ 3.10 and Linux with `nvidia-smi` (the scheduler itself has
|
|
104
|
+
zero Python dependencies). `uvx gpusched jobs.txt` runs it without installing.
|
|
105
|
+
|
|
106
|
+
## Sixty-second tour (no GPU required)
|
|
107
|
+
|
|
108
|
+
```
|
|
109
|
+
echo '[vram=8G] python3 -m gpusched.simjob --vram 8000 --ramp 2 --hold 5' > jobs.txt
|
|
110
|
+
gpusched jobs.txt --sim 2 --poll 0.5 -v
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
`--sim N` runs against N simulated 24 GiB GPUs; `gpusched.simjob` is a fake
|
|
114
|
+
GPU job that ramps and holds a declared amount of fake VRAM. Everything below
|
|
115
|
+
behaves identically in sim and on real hardware.
|
|
116
|
+
|
|
117
|
+
## The jobs file
|
|
118
|
+
|
|
119
|
+
One shell command per line; blank lines and `#` comments are skipped. An
|
|
120
|
+
optional leading `[...]` block declares per-job attributes:
|
|
121
|
+
|
|
122
|
+
```
|
|
123
|
+
# no declaration -> runs only on a fully idle GPU, alone
|
|
124
|
+
python preprocess.py
|
|
125
|
+
|
|
126
|
+
# declared max VRAM (per GPU): may share a GPU when the declared amount fits
|
|
127
|
+
[vram=18000] python train.py --config a.yaml
|
|
128
|
+
[vram=22G] bash run_eval.sh
|
|
129
|
+
|
|
130
|
+
# multi-GPU: 2 GPUs, EACH with >= 30 GiB free; CUDA_VISIBLE_DEVICES gets both
|
|
131
|
+
[vram=30G gpus=2] torchrun --nproc_per_node=2 train_big.py
|
|
132
|
+
|
|
133
|
+
# auto-retry on CUDA OOM; declaration bumped to ~1.25x of max(observed peak, old declaration) per retry
|
|
134
|
+
[vram=8G retries=2] python sweep.py --seed 3
|
|
135
|
+
|
|
136
|
+
# opt-in walltime: SIGTERM at 2h, SIGKILL +10s. No timeout attribute = runs
|
|
137
|
+
# forever; the scheduler never guesses which long-running jobs are hung.
|
|
138
|
+
[timeout=2h] python flaky_eval.py
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
`vram` accepts MiB integers or `G`/`GiB` suffixes and is always per GPU.
|
|
142
|
+
Declare your honest worst case; the completion report tells you how close you
|
|
143
|
+
were, so declarations converge after a run or two.
|
|
144
|
+
|
|
145
|
+
## How placement works
|
|
146
|
+
|
|
147
|
+
A job **without** a declaration gets a GPU only when that GPU is fully idle (below
|
|
148
|
+
`--idle-threshold`, default 200 MiB, with no other scheduled job) — the safe
|
|
149
|
+
default when you don't know what a job needs.
|
|
150
|
+
|
|
151
|
+
A job **with** a declaration of E MiB can be placed wherever *effective
|
|
152
|
+
headroom* ≥ E + `--margin` (default 512 MiB). Effective headroom accounts for
|
|
153
|
+
three things. First, a just-launched job that hasn't allocated its CUDA
|
|
154
|
+
context yet still reserves everything it declared — this closes the classic
|
|
155
|
+
double-booking race in poll-based schedulers, where a GPU looks empty for the
|
|
156
|
+
few seconds before a process materializes. Second, every process gpusched did
|
|
157
|
+
not launch is tracked to its observed per-GPU peak and held to
|
|
158
|
+
`peak × (1 + --spike-buffer)` until it exits, so a fluctuating external
|
|
159
|
+
process's momentary trough is not treated as packable space. Third, a
|
|
160
|
+
scheduled job that exceeds its own declaration stops being trusted: its
|
|
161
|
+
budget escalates from the declaration to its buffered observed peak.
|
|
162
|
+
|
|
163
|
+
Queue order is file order with backfill: if the next job can't fit right now,
|
|
164
|
+
smaller jobs behind it run first. A job that could not fit even on a
|
|
165
|
+
completely empty GPU fails immediately as `INFEASIBLE` rather than stalling
|
|
166
|
+
the queue. Multi-GPU jobs take N distinct GPUs, each meeting the per-GPU
|
|
167
|
+
requirement, chosen best-fit to preserve large contiguous headroom.
|
|
168
|
+
|
|
169
|
+
(One physical reality worth knowing: PyTorch's caching allocator rarely
|
|
170
|
+
returns VRAM to the driver, so for torch jobs `nvidia-smi` already reads near
|
|
171
|
+
the high-water mark — the spike-buffer machinery matters most for processes
|
|
172
|
+
that genuinely release memory between phases.)
|
|
173
|
+
|
|
174
|
+
## Monitoring: declared vs actual
|
|
175
|
+
|
|
176
|
+
Each scheduled job runs in its own session (`setsid`), so all its descendant
|
|
177
|
+
processes share one process-group id; each poll, gpusched maps `nvidia-smi`'s
|
|
178
|
+
per-process VRAM onto jobs by pgid and tracks per-GPU peaks. Two asymmetric
|
|
179
|
+
notifications, tuned by `--tolerance` (default ±10%):
|
|
180
|
+
|
|
181
|
+
**Under-declaration warns immediately** — the first poll where actual exceeds
|
|
182
|
+
declared, you get `WARN job N EXCEEDS declared VRAM ... neighbors may OOM`,
|
|
183
|
+
because at that moment the packing math other jobs were placed under is
|
|
184
|
+
already violated. **Over-declaration is reported at completion** — a
|
|
185
|
+
fluctuating job may legitimately peak late, so it can only be judged once it
|
|
186
|
+
exits: `declared 12288 MiB → over-declared (-59%); lowering it frees packing
|
|
187
|
+
headroom`.
|
|
188
|
+
|
|
189
|
+
Every completion line is streamed the moment the job finishes and includes
|
|
190
|
+
its per-GPU peak and average device-level GPU utilization (device-level: when
|
|
191
|
+
two jobs share a GPU the number is confounded — treat it as a diagnostic for
|
|
192
|
+
spotting dataloader-bound runs, not a per-process metric). `--verbose` adds
|
|
193
|
+
live usage lines as a job's peak grows.
|
|
194
|
+
|
|
195
|
+
## Live queue, resume, and the status board
|
|
196
|
+
|
|
197
|
+
The jobs file is **user-owned and never written by the scheduler**; it is
|
|
198
|
+
re-read every poll. Each line has a stable identity (hash of its command text
|
|
199
|
+
plus an occurrence counter for duplicate lines), and an append-only journal
|
|
200
|
+
(`<log_dir>/journal.jsonl`) records attempts and outcomes per identity.
|
|
201
|
+
"Pending" is defined as: lines in the file that are neither running nor
|
|
202
|
+
terminal in the journal. Everything follows from that one definition —
|
|
203
|
+
append a line from any terminal and it is dispatched within a poll; delete a
|
|
204
|
+
pending line and it is dequeued; reorder pending lines and you have reordered
|
|
205
|
+
the queue (file order among pending IS the priority; there is no separate
|
|
206
|
+
priority mechanism); edit a pending line and you have replaced it. Edits to
|
|
207
|
+
running or completed lines do nothing. A malformed mid-edit save is rejected
|
|
208
|
+
with a warning and the last good queue is kept; in-flight jobs are never
|
|
209
|
+
affected.
|
|
210
|
+
|
|
211
|
+
Re-running the same `gpusched` command after a crash or Ctrl-C skips everything the
|
|
212
|
+
journal marks done — that is resume. `--fresh` wipes the journal to re-run
|
|
213
|
+
all; to re-run one job, change its line trivially (new identity). `--watch`
|
|
214
|
+
keeps the scheduler alive after the queue drains, waiting for appended lines.
|
|
215
|
+
|
|
216
|
+
A live board is rendered to `<log_dir>/status.txt` every poll
|
|
217
|
+
(`▶` running, `·` pending, `↻` retrying after OOM, `✓`/`✗` done):
|
|
218
|
+
|
|
219
|
+
```
|
|
220
|
+
watch -n2 cat gpusched_logs/status.txt
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
Known limitation: with continuous submission, a large blocked job can be
|
|
224
|
+
starved by a stream of small backfilled ones. There is no aging policy — you
|
|
225
|
+
are the priority mechanism. Move the big job's line up and hold small
|
|
226
|
+
submissions, or run it under `--exclusive`.
|
|
227
|
+
|
|
228
|
+
## OOM retry and timeouts
|
|
229
|
+
|
|
230
|
+
A failed job whose log tail matches CUDA OOM signatures and that declared
|
|
231
|
+
`[retries=N]` (or ran under `--oom-retries N`) is requeued instead of
|
|
232
|
+
terminal-failed: its declaration is bumped to ~1.25× of max(observed peak,
|
|
233
|
+
old declaration), recorded in the journal (so it survives scheduler
|
|
234
|
+
restarts), and applied on the next attempt — the retry is scheduled with
|
|
235
|
+
honest requirements instead of repeating the same collision. Non-OOM failures
|
|
236
|
+
never consume retries, so retry loops cannot mask code bugs.
|
|
237
|
+
|
|
238
|
+
Timeouts are strictly per-job opt-in (`[timeout=90s|15m|2h|1d]`).
|
|
239
|
+
Distinguishing a hung process from a legitimate three-day run is your
|
|
240
|
+
declaration, never a heuristic — heuristics (e.g. "0% util for 10 minutes")
|
|
241
|
+
kill legitimate CPU phases like preprocessing and checkpoint serialization,
|
|
242
|
+
so none are included.
|
|
243
|
+
|
|
244
|
+
## Running detached (tmux)
|
|
245
|
+
|
|
246
|
+
The scheduler is an ordinary foreground process — run it inside tmux and
|
|
247
|
+
disconnect freely:
|
|
248
|
+
|
|
249
|
+
```
|
|
250
|
+
tmux new -s sched
|
|
251
|
+
gpusched jobs.txt --watch -v # pane 1
|
|
252
|
+
# Ctrl-b c -> pane 2:
|
|
253
|
+
watch -n2 cat gpusched_logs/status.txt
|
|
254
|
+
# Ctrl-b d to detach; later: tmux attach -t sched
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
You can also append jobs over ssh without attaching at all — the file is
|
|
258
|
+
re-read every poll. Failure-mode hierarchy, honestly: tmux protects
|
|
259
|
+
everything from SSH drops; the journal protects queue state from scheduler
|
|
260
|
+
death; nothing recovers *tracking* of jobs orphaned by a dead scheduler (they
|
|
261
|
+
keep running in their own sessions, but their exit codes are lost and a
|
|
262
|
+
restarted scheduler treats them as not-done).
|
|
263
|
+
|
|
264
|
+
## CLI reference
|
|
265
|
+
|
|
266
|
+
```
|
|
267
|
+
gpusched jobs.txt
|
|
268
|
+
--gpus 0,1,3 restrict to these GPU indices (default: all visible)
|
|
269
|
+
--idle-threshold 200 MiB below which a GPU counts as idle (undeclared jobs)
|
|
270
|
+
--margin 512 MiB safety margin added to every declaration
|
|
271
|
+
--tolerance 0.10 band before flagging over/under-declaration
|
|
272
|
+
--spike-buffer 0.10 buffer over observed VRAM maxima of fluctuating processes
|
|
273
|
+
--poll 5 seconds between scheduling rounds
|
|
274
|
+
--exclusive one scheduled job per GPU, even when declarations fit
|
|
275
|
+
--watch keep running after drain; pick up appended lines
|
|
276
|
+
--oom-retries N default CUDA-OOM auto-retries ([retries=N] overrides)
|
|
277
|
+
--fresh ignore + remove the journal: re-run everything
|
|
278
|
+
--log-dir DIR per-job logs, journal, status board (default: gpusched_logs)
|
|
279
|
+
-v / --verbose stream live per-job VRAM as peaks grow
|
|
280
|
+
--sim N dry-run on N simulated 24 GiB GPUs (no hardware)
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
Exit code: 0 if every job succeeded; otherwise the highest exit code among the failed jobs.
|
|
284
|
+
|
|
285
|
+
## Architecture and extending
|
|
286
|
+
|
|
287
|
+
```
|
|
288
|
+
src/gpusched/
|
|
289
|
+
jobspec.py parsing: [vram=.. gpus=.. timeout=.. retries=..] cmd -> JobSpec
|
|
290
|
+
backend.py GpuBackend protocol; NvidiaSmiBackend (2 queries/poll); pgid attribution
|
|
291
|
+
allocation.py PURE placement function: headroom, reservations, best-fit
|
|
292
|
+
journal.py append-only JSONL: attempts + terminal outcomes per job identity
|
|
293
|
+
scheduler.py tick loop: snapshot -> attribute -> warn -> timeouts -> reap -> dispatch
|
|
294
|
+
testing.py FakeBackend (unit tests), SimBackend (integration / --sim)
|
|
295
|
+
simjob.py simulated GPU job for tests and dry runs
|
|
296
|
+
cli.py argparse front end
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
The deliberate seams: `allocation.find_allocation` is pure (snapshot +
|
|
300
|
+
occupants in, GPU list out), so new placement rules — GPU-type constraints,
|
|
301
|
+
NVLink-aware pairing — are filters there plus an attribute in the jobspec
|
|
302
|
+
parser, with nothing else touched. Alternative monitors (pynvml, DCGM)
|
|
303
|
+
implement the two-method `GpuBackend` protocol. The journal is the only
|
|
304
|
+
persistent state.
|
|
305
|
+
|
|
306
|
+
## Development
|
|
307
|
+
|
|
308
|
+
```
|
|
309
|
+
git clone <repo> && cd gpusched
|
|
310
|
+
uv venv && uv pip install -e ".[dev]"
|
|
311
|
+
uv run pytest -q # 65 tests, ~20s, no GPU required
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
Tests drive the scheduler against fake/simulated backends with real
|
|
315
|
+
subprocesses; several are timing-based (sub-second sim jobs with fast polls),
|
|
316
|
+
so a heavily loaded machine can occasionally need a re-run.
|
|
317
|
+
|
|
318
|
+
## License
|
|
319
|
+
|
|
320
|
+
MIT.
|
gpusched-0.3.0/README.md
ADDED
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
# gpusched
|
|
2
|
+
|
|
3
|
+
A VRAM-aware GPU job scheduler for a single machine. You write shell commands
|
|
4
|
+
in a text file, optionally declaring how much GPU memory each needs; gpusched
|
|
5
|
+
runs them, placing each job on a GPU with enough free memory the moment one
|
|
6
|
+
opens up, measures how much VRAM each job actually used, and tells you when
|
|
7
|
+
your declarations were wrong.
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
$ cat jobs.txt
|
|
11
|
+
[vram=18G] python train.py --config a.yaml
|
|
12
|
+
[vram=18G] python train.py --config b.yaml
|
|
13
|
+
python preprocess.py
|
|
14
|
+
|
|
15
|
+
$ gpusched jobs.txt --watch -v
|
|
16
|
+
[14:02:10] job 1 started on gpu [0] (1/3 dispatched, declared 18432 MiB/gpu) — ...
|
|
17
|
+
[14:02:10] job 2 started on gpu [1] (2/3 dispatched, declared 18432 MiB/gpu) — ...
|
|
18
|
+
[14:31:44] job 1 finished [OK] in 1774s — peak vram gpu0:17910 MiB | declared 18432 MiB → within ±10% | avg gpu util 96%
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## What this is, and what it is not
|
|
22
|
+
|
|
23
|
+
gpusched is a **single-node, single-user research tool**: roughly a thousand
|
|
24
|
+
lines you can read in an afternoon, with no daemon, no database, and zero
|
|
25
|
+
Python dependencies. It exists for one workflow — you have a box with a few
|
|
26
|
+
GPUs, a pile of training/eval commands, and you want them to run unattended
|
|
27
|
+
with minimal GPU idle time and without OOM-ing each other.
|
|
28
|
+
|
|
29
|
+
It is **not** a cluster scheduler, and several of its guarantees are honest
|
|
30
|
+
best-effort rather than enforcement:
|
|
31
|
+
|
|
32
|
+
- **Declarations are advisory.** gpusched places jobs based on what you
|
|
33
|
+
declare and warns loudly when reality diverges, but it cannot cap a
|
|
34
|
+
process's VRAM (nothing can, short of MIG partitions or MPS memory limits). A job that blows
|
|
35
|
+
through its declaration can still OOM a neighbor — you get a warning, not
|
|
36
|
+
protection. `--exclusive` (one scheduled job per GPU) is the zero-risk mode.
|
|
37
|
+
- **It polls.** GPU state is sampled every `--poll` seconds (default 5) via
|
|
38
|
+
`nvidia-smi`. VRAM spikes shorter than the poll interval are invisible to
|
|
39
|
+
both placement and peak reporting.
|
|
40
|
+
- **Co-locating compute-heavy jobs can be slower than serializing them.**
|
|
41
|
+
Packing optimizes VRAM occupancy, not throughput. Two trainers that each
|
|
42
|
+
saturate the SMs will roughly halve each other's speed; packing pays off
|
|
43
|
+
for memory-light or compute-light neighbors (eval scripts, dataloader-bound
|
|
44
|
+
jobs, inference). The per-job utilization report helps you see which case
|
|
45
|
+
you're in.
|
|
46
|
+
- **Per-job VRAM attribution can fail in some container setups** where
|
|
47
|
+
`nvidia-smi` reports host-namespace PIDs that don't match the container's.
|
|
48
|
+
gpusched then reports the peak as `n/a` rather than a wrong number;
|
|
49
|
+
placement still works because it uses device-level totals.
|
|
50
|
+
- **Testing honesty:** the test suite (65 tests) exercises the full scheduler
|
|
51
|
+
against simulated GPU backends and real subprocesses; the `nvidia-smi`
|
|
52
|
+
parsing layer is straightforward but only thinly tested. Do a small smoke run on your
|
|
53
|
+
hardware before trusting it with a week of compute.
|
|
54
|
+
|
|
55
|
+
## If your needs are bigger than this
|
|
56
|
+
|
|
57
|
+
Use the right tool instead of stretching this one:
|
|
58
|
+
|
|
59
|
+
| Need | Look at |
|
|
60
|
+
|---|---|
|
|
61
|
+
| Multiple machines, multiple users, fairness, accounting | Slurm (or PBS/LSF) |
|
|
62
|
+
| Hyperparameter optimization with early stopping / ASHA / PBT | Ray Tune, Optuna, W&B Sweeps |
|
|
63
|
+
| Distributed training orchestration | Ray, torchrun + Slurm, Kubernetes + device plugins |
|
|
64
|
+
| Workflow DAGs (job B consumes job A's output) | Snakemake, Make, Airflow |
|
|
65
|
+
| Hard VRAM/compute isolation between jobs | NVIDIA MIG (partitioning), MPS (limits) |
|
|
66
|
+
| Just a per-GPU FIFO queue, even simpler than this | task-spooler (`ts`), simple_gpu_scheduler |
|
|
67
|
+
|
|
68
|
+
For sweeps specifically, the intended pattern is: a 10-line script (or
|
|
69
|
+
Optuna in ask-and-tell mode) *generates* the jobs file; gpusched stays a dumb
|
|
70
|
+
command queue. The moment you want trials stopped early based on metrics, you
|
|
71
|
+
have outgrown this tool — that requires bidirectional communication with
|
|
72
|
+
running jobs, which is deliberately out of scope.
|
|
73
|
+
|
|
74
|
+
## Install
|
|
75
|
+
|
|
76
|
+
```
|
|
77
|
+
uv tool install gpusched # isolated env, `gpusched` on PATH
|
|
78
|
+
# or: pip install gpusched
|
|
79
|
+
# or, from a clone: uv tool install .
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Requires Python ≥ 3.10 and Linux with `nvidia-smi` (the scheduler itself has
|
|
83
|
+
zero Python dependencies). `uvx gpusched jobs.txt` runs it without installing.
|
|
84
|
+
|
|
85
|
+
## Sixty-second tour (no GPU required)
|
|
86
|
+
|
|
87
|
+
```
|
|
88
|
+
echo '[vram=8G] python3 -m gpusched.simjob --vram 8000 --ramp 2 --hold 5' > jobs.txt
|
|
89
|
+
gpusched jobs.txt --sim 2 --poll 0.5 -v
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
`--sim N` runs against N simulated 24 GiB GPUs; `gpusched.simjob` is a fake
|
|
93
|
+
GPU job that ramps and holds a declared amount of fake VRAM. Everything below
|
|
94
|
+
behaves identically in sim and on real hardware.
|
|
95
|
+
|
|
96
|
+
## The jobs file
|
|
97
|
+
|
|
98
|
+
One shell command per line; blank lines and `#` comments are skipped. An
|
|
99
|
+
optional leading `[...]` block declares per-job attributes:
|
|
100
|
+
|
|
101
|
+
```
|
|
102
|
+
# no declaration -> runs only on a fully idle GPU, alone
|
|
103
|
+
python preprocess.py
|
|
104
|
+
|
|
105
|
+
# declared max VRAM (per GPU): may share a GPU when the declared amount fits
|
|
106
|
+
[vram=18000] python train.py --config a.yaml
|
|
107
|
+
[vram=22G] bash run_eval.sh
|
|
108
|
+
|
|
109
|
+
# multi-GPU: 2 GPUs, EACH with >= 30 GiB free; CUDA_VISIBLE_DEVICES gets both
|
|
110
|
+
[vram=30G gpus=2] torchrun --nproc_per_node=2 train_big.py
|
|
111
|
+
|
|
112
|
+
# auto-retry on CUDA OOM; declaration bumped to ~1.25x of max(observed peak, old declaration) per retry
|
|
113
|
+
[vram=8G retries=2] python sweep.py --seed 3
|
|
114
|
+
|
|
115
|
+
# opt-in walltime: SIGTERM at 2h, SIGKILL +10s. No timeout attribute = runs
|
|
116
|
+
# forever; the scheduler never guesses which long-running jobs are hung.
|
|
117
|
+
[timeout=2h] python flaky_eval.py
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
`vram` accepts MiB integers or `G`/`GiB` suffixes and is always per GPU.
|
|
121
|
+
Declare your honest worst case; the completion report tells you how close you
|
|
122
|
+
were, so declarations converge after a run or two.
|
|
123
|
+
|
|
124
|
+
## How placement works
|
|
125
|
+
|
|
126
|
+
A job **without** a declaration gets a GPU only when that GPU is fully idle (below
|
|
127
|
+
`--idle-threshold`, default 200 MiB, with no other scheduled job) — the safe
|
|
128
|
+
default when you don't know what a job needs.
|
|
129
|
+
|
|
130
|
+
A job **with** a declaration of E MiB can be placed wherever *effective
|
|
131
|
+
headroom* ≥ E + `--margin` (default 512 MiB). Effective headroom accounts for
|
|
132
|
+
three things. First, a just-launched job that hasn't allocated its CUDA
|
|
133
|
+
context yet still reserves everything it declared — this closes the classic
|
|
134
|
+
double-booking race in poll-based schedulers, where a GPU looks empty for the
|
|
135
|
+
few seconds before a process materializes. Second, every process gpusched did
|
|
136
|
+
not launch is tracked to its observed per-GPU peak and held to
|
|
137
|
+
`peak × (1 + --spike-buffer)` until it exits, so a fluctuating external
|
|
138
|
+
process's momentary trough is not treated as packable space. Third, a
|
|
139
|
+
scheduled job that exceeds its own declaration stops being trusted: its
|
|
140
|
+
budget escalates from the declaration to its buffered observed peak.
|
|
141
|
+
|
|
142
|
+
Queue order is file order with backfill: if the next job can't fit right now,
|
|
143
|
+
smaller jobs behind it run first. A job that could not fit even on a
|
|
144
|
+
completely empty GPU fails immediately as `INFEASIBLE` rather than stalling
|
|
145
|
+
the queue. Multi-GPU jobs take N distinct GPUs, each meeting the per-GPU
|
|
146
|
+
requirement, chosen best-fit to preserve large contiguous headroom.
|
|
147
|
+
|
|
148
|
+
(One physical reality worth knowing: PyTorch's caching allocator rarely
|
|
149
|
+
returns VRAM to the driver, so for torch jobs `nvidia-smi` already reads near
|
|
150
|
+
the high-water mark — the spike-buffer machinery matters most for processes
|
|
151
|
+
that genuinely release memory between phases.)
|
|
152
|
+
|
|
153
|
+
## Monitoring: declared vs actual
|
|
154
|
+
|
|
155
|
+
Each scheduled job runs in its own session (`setsid`), so all its descendant
|
|
156
|
+
processes share one process-group id; each poll, gpusched maps `nvidia-smi`'s
|
|
157
|
+
per-process VRAM onto jobs by pgid and tracks per-GPU peaks. Two asymmetric
|
|
158
|
+
notifications, tuned by `--tolerance` (default ±10%):
|
|
159
|
+
|
|
160
|
+
**Under-declaration warns immediately** — the first poll where actual exceeds
|
|
161
|
+
declared, you get `WARN job N EXCEEDS declared VRAM ... neighbors may OOM`,
|
|
162
|
+
because at that moment the packing math other jobs were placed under is
|
|
163
|
+
already violated. **Over-declaration is reported at completion** — a
|
|
164
|
+
fluctuating job may legitimately peak late, so it can only be judged once it
|
|
165
|
+
exits: `declared 12288 MiB → over-declared (-59%); lowering it frees packing
|
|
166
|
+
headroom`.
|
|
167
|
+
|
|
168
|
+
Every completion line is streamed the moment the job finishes and includes
|
|
169
|
+
its per-GPU peak and average device-level GPU utilization (device-level: when
|
|
170
|
+
two jobs share a GPU the number is confounded — treat it as a diagnostic for
|
|
171
|
+
spotting dataloader-bound runs, not a per-process metric). `--verbose` adds
|
|
172
|
+
live usage lines as a job's peak grows.
|
|
173
|
+
|
|
174
|
+
## Live queue, resume, and the status board
|
|
175
|
+
|
|
176
|
+
The jobs file is **user-owned and never written by the scheduler**; it is
|
|
177
|
+
re-read every poll. Each line has a stable identity (hash of its command text
|
|
178
|
+
plus an occurrence counter for duplicate lines), and an append-only journal
|
|
179
|
+
(`<log_dir>/journal.jsonl`) records attempts and outcomes per identity.
|
|
180
|
+
"Pending" is defined as: lines in the file that are neither running nor
|
|
181
|
+
terminal in the journal. Everything follows from that one definition —
|
|
182
|
+
append a line from any terminal and it is dispatched within a poll; delete a
|
|
183
|
+
pending line and it is dequeued; reorder pending lines and you have reordered
|
|
184
|
+
the queue (file order among pending IS the priority; there is no separate
|
|
185
|
+
priority mechanism); edit a pending line and you have replaced it. Edits to
|
|
186
|
+
running or completed lines do nothing. A malformed mid-edit save is rejected
|
|
187
|
+
with a warning and the last good queue is kept; in-flight jobs are never
|
|
188
|
+
affected.
|
|
189
|
+
|
|
190
|
+
Re-running the same command after a crash or Ctrl-C skips everything the
|
|
191
|
+
journal marks done — that is resume. `--fresh` wipes the journal to re-run
|
|
192
|
+
all; to re-run one job, change its line trivially (new identity). `--watch`
|
|
193
|
+
keeps the scheduler alive after the queue drains, waiting for appended lines.
|
|
194
|
+
|
|
195
|
+
A live board is rendered to `<log_dir>/status.txt` every poll
|
|
196
|
+
(`▶` running, `·` pending, `↻` retrying after OOM, `✓`/`✗` done):
|
|
197
|
+
|
|
198
|
+
```
|
|
199
|
+
watch -n2 cat gpusched_logs/status.txt
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
Known limitation: with continuous submission, a large blocked job can be
|
|
203
|
+
starved by a stream of small backfilled ones. There is no aging policy — you
|
|
204
|
+
are the priority mechanism. Move the big job's line up and hold small
|
|
205
|
+
submissions, or run it under `--exclusive`.
|
|
206
|
+
|
|
207
|
+
## OOM retry and timeouts
|
|
208
|
+
|
|
209
|
+
A failed job whose log tail matches CUDA OOM signatures and that declared
|
|
210
|
+
`[retries=N]` (or ran under `--oom-retries N`) is requeued instead of
|
|
211
|
+
being marked terminally failed: its declaration is bumped to ~1.25× max(observed peak,
|
|
212
|
+
old declaration), recorded in the journal (so it survives scheduler
|
|
213
|
+
restarts), and applied on the next attempt — the retry is scheduled with
|
|
214
|
+
honest requirements instead of repeating the same collision. Non-OOM failures
|
|
215
|
+
never consume retries, so retry loops cannot mask code bugs.
|
|
216
|
+
|
|
217
|
+
Timeouts are strictly per-job opt-in (`[timeout=90s|15m|2h|1d]`).
|
|
218
|
+
Distinguishing a hung process from a legitimate three-day run is left to your
|
|
219
|
+
declaration, never a heuristic — heuristics (e.g. "0% util for 10 minutes")
|
|
220
|
+
kill legitimate CPU phases like preprocessing and checkpoint serialization,
|
|
221
|
+
so none are included.
|
|
222
|
+
|
|
223
|
+
## Running detached (tmux)
|
|
224
|
+
|
|
225
|
+
The scheduler is an ordinary foreground process — run it inside tmux and
|
|
226
|
+
disconnect freely:
|
|
227
|
+
|
|
228
|
+
```
|
|
229
|
+
tmux new -s sched
|
|
230
|
+
gpusched jobs.txt --watch -v # window 1
|
|
231
|
+
# Ctrl-b c -> window 2:
|
|
232
|
+
watch -n2 cat gpusched_logs/status.txt
|
|
233
|
+
# Ctrl-b d to detach; later: tmux attach -t sched
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
You can also append jobs over ssh without attaching at all — the file is
|
|
237
|
+
re-read every poll. Failure-mode hierarchy, honestly: tmux protects
|
|
238
|
+
everything from SSH drops; the journal protects queue state from scheduler
|
|
239
|
+
death; nothing recovers *tracking* of jobs orphaned by a dead scheduler (they
|
|
240
|
+
keep running in their own sessions, but their exit codes are lost and a
|
|
241
|
+
restarted scheduler treats them as not-done).
|
|
242
|
+
|
|
243
|
+
## CLI reference
|
|
244
|
+
|
|
245
|
+
```
|
|
246
|
+
gpusched jobs.txt
|
|
247
|
+
--gpus 0,1,3 restrict to these GPU indices (default: all visible)
|
|
248
|
+
--idle-threshold 200 MiB below which a GPU counts as idle (undeclared jobs)
|
|
249
|
+
--margin 512 MiB safety margin added to every declaration
|
|
250
|
+
--tolerance 0.10 band before flagging over/under-declaration
|
|
251
|
+
--spike-buffer 0.10 buffer over observed VRAM maxima of fluctuating processes
|
|
252
|
+
--poll 5 seconds between scheduling rounds
|
|
253
|
+
--exclusive one scheduled job per GPU, even when declarations fit
|
|
254
|
+
--watch keep running after drain; pick up appended lines
|
|
255
|
+
--oom-retries N default CUDA-OOM auto-retries ([retries=N] overrides)
|
|
256
|
+
--fresh ignore + remove the journal: re-run everything
|
|
257
|
+
--log-dir DIR per-job logs, journal, status board (default: gpusched_logs)
|
|
258
|
+
-v / --verbose stream live per-job VRAM as peaks grow
|
|
259
|
+
--sim N dry-run on N simulated 24 GiB GPUs (no hardware)
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
Exit code: 0 if every job succeeded, otherwise the highest exit code among the failed jobs.
|
|
263
|
+
|
|
264
|
+
## Architecture and extending
|
|
265
|
+
|
|
266
|
+
```
|
|
267
|
+
src/gpusched/
|
|
268
|
+
jobspec.py parsing: [vram=.. gpus=.. timeout=.. retries=..] cmd -> JobSpec
|
|
269
|
+
backend.py GpuBackend protocol; NvidiaSmiBackend (2 queries/poll); pgid attribution
|
|
270
|
+
allocation.py PURE placement function: headroom, reservations, best-fit
|
|
271
|
+
journal.py append-only JSONL: attempts + terminal outcomes per job identity
|
|
272
|
+
scheduler.py tick loop: snapshot -> attribute -> warn -> timeouts -> reap -> dispatch
|
|
273
|
+
testing.py FakeBackend (unit tests), SimBackend (integration / --sim)
|
|
274
|
+
simjob.py simulated GPU job for tests and dry runs
|
|
275
|
+
cli.py argparse front end
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
The deliberate seams: `allocation.find_allocation` is pure (snapshot +
|
|
279
|
+
occupants in, GPU list out), so new placement rules — GPU-type constraints,
|
|
280
|
+
NVLink-aware pairing — are filters there plus an attribute in the jobspec
|
|
281
|
+
parser, with nothing else touched. Alternative monitors (pynvml, DCGM)
|
|
282
|
+
implement the two-method `GpuBackend` protocol. The journal is the only
|
|
283
|
+
persistent state.
|
|
284
|
+
|
|
285
|
+
## Development
|
|
286
|
+
|
|
287
|
+
```
|
|
288
|
+
git clone <repo> && cd gpusched
|
|
289
|
+
uv venv && uv pip install -e ".[dev]"
|
|
290
|
+
uv run pytest -q # 65 tests, ~20s, no GPU required
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
Tests drive the scheduler against fake/simulated backends with real
|
|
294
|
+
subprocesses; several are timing-based (sub-second sim jobs with fast polls),
|
|
295
|
+
so a heavily loaded machine can occasionally need a re-run.
|
|
296
|
+
|
|
297
|
+
## License
|
|
298
|
+
|
|
299
|
+
MIT.
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "gpusched"
|
|
7
|
+
version = "0.3.0"
|
|
8
|
+
description = "VRAM-aware single-node GPU job scheduler: queue shell commands, place them by free GPU memory, and verify declared-vs-actual VRAM per job."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
keywords = ["gpu", "scheduler", "vram", "nvidia", "queue", "cuda"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Environment :: Console",
|
|
15
|
+
"Intended Audience :: Science/Research",
|
|
16
|
+
"Operating System :: POSIX :: Linux",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Topic :: System :: Distributed Computing",
|
|
19
|
+
]
|
|
20
|
+
dependencies = []
|
|
21
|
+
|
|
22
|
+
[project.urls]
|
|
23
|
+
Homepage = "https://github.com/ceruleane/gpusched"
|
|
24
|
+
Issues = "https://github.com/ceruleane/gpusched/issues"
|
|
25
|
+
Changelog = "https://github.com/ceruleane/gpusched/blob/main/CHANGELOG.md"
|
|
26
|
+
|
|
27
|
+
[project.optional-dependencies]
|
|
28
|
+
dev = ["pytest>=7"]
|
|
29
|
+
|
|
30
|
+
[project.scripts]
|
|
31
|
+
gpusched = "gpusched.cli:main"
|
|
32
|
+
|
|
33
|
+
[tool.setuptools.packages.find]
|
|
34
|
+
where = ["src"]
|
gpusched-0.3.0/setup.cfg
ADDED