gpusched 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpusched/__init__.py +13 -0
- gpusched/allocation.py +142 -0
- gpusched/backend.py +130 -0
- gpusched/cli.py +112 -0
- gpusched/jobspec.py +161 -0
- gpusched/journal.py +106 -0
- gpusched/scheduler.py +568 -0
- gpusched/simjob.py +63 -0
- gpusched/testing.py +86 -0
- gpusched-0.3.0.dist-info/METADATA +320 -0
- gpusched-0.3.0.dist-info/RECORD +15 -0
- gpusched-0.3.0.dist-info/WHEEL +5 -0
- gpusched-0.3.0.dist-info/entry_points.txt +2 -0
- gpusched-0.3.0.dist-info/licenses/LICENSE +21 -0
- gpusched-0.3.0.dist-info/top_level.txt +1 -0
gpusched/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""gpusched — VRAM-aware single-node GPU job scheduler."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.3.0"
|
|
4
|
+
|
|
5
|
+
from .allocation import AllocOptions
|
|
6
|
+
from .backend import NvidiaSmiBackend
|
|
7
|
+
from .jobspec import JobSpec, parse_jobs_file
|
|
8
|
+
from .scheduler import Scheduler, SchedulerOptions
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"AllocOptions", "JobSpec", "NvidiaSmiBackend",
|
|
12
|
+
"Scheduler", "SchedulerOptions", "parse_jobs_file", "__version__",
|
|
13
|
+
]
|
gpusched/allocation.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""Pure GPU allocation logic.
|
|
2
|
+
|
|
3
|
+
Separated from the scheduler loop so placement rules are testable with no
|
|
4
|
+
subprocesses and no clock.
|
|
5
|
+
|
|
6
|
+
Semantics (v0.2: high-water-mark aware)
|
|
7
|
+
---------------------------------------
|
|
8
|
+
* A job **without** a vram declaration requires a *fully idle* GPU:
|
|
9
|
+
effective external usage < idle_threshold and no scheduler job placed there.
|
|
10
|
+
* A job **with** a declaration of E MiB (per GPU) can be placed on any GPU
|
|
11
|
+
whose *effective headroom* >= E + margin.
|
|
12
|
+
* Effective headroom = total - effective_external - sum(own effective budgets).
|
|
13
|
+
* A running job's effective budget per GPU is its declaration while its
|
|
14
|
+
observed peak stays within it; once the peak exceeds the declaration, the
|
|
15
|
+
declaration is no longer trusted and the budget escalates to
|
|
16
|
+
``peak * (1 + spike_buffer)``. A job that has not ramped yet (actual 0)
|
|
17
|
+
still blocks its full budget — this closes the launch double-booking race.
|
|
18
|
+
* ``effective_external`` is supplied by the scheduler as
|
|
19
|
+
``max(instantaneous_external, sum(per-pid external peaks) * (1 + spike_buffer))``
|
|
20
|
+
so that fluctuating external processes are held to their observed maxima,
|
|
21
|
+
not their momentary troughs. When not supplied (unit tests / simple use),
|
|
22
|
+
it falls back to instantaneous usage minus attributed own usage.
|
|
23
|
+
* A GPU hosting an *undeclared* scheduler job is never eligible for others.
|
|
24
|
+
* ``exclusive=True`` additionally forbids two scheduler jobs on one GPU,
|
|
25
|
+
while still honoring headroom vs external processes.
|
|
26
|
+
* Multi-GPU jobs (n_gpus=N) need N distinct GPUs, each individually
|
|
27
|
+
eligible; selection is best-fit (smallest sufficient headroom first) to
|
|
28
|
+
preserve large contiguous headroom for big jobs.
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
from __future__ import annotations
|
|
32
|
+
|
|
33
|
+
import math
|
|
34
|
+
from dataclasses import dataclass, field
|
|
35
|
+
|
|
36
|
+
from .backend import GpuSnapshot
|
|
37
|
+
from .jobspec import JobSpec
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass(frozen=True)
class Occupant:
    """What the allocator needs to know about one running scheduler job."""

    # GPU indices the job is placed on.
    gpu_indices: tuple[int, ...]
    # Declared per-GPU estimate in MiB; None means the job declared nothing.
    vram_mib: int | None
    # Most recently attributed usage in MiB, keyed by GPU index.
    actual_mib: dict[int, int]
    # Highest usage observed so far in MiB, keyed by GPU index.
    peak_mib: dict[int, int] = field(default_factory=dict)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass(frozen=True)
class AllocOptions:
    """Tunable knobs for the placement rules."""

    # External usage below this many MiB still counts as an idle GPU.
    idle_threshold_mib: int = 200
    # Extra MiB required on top of every declaration.
    margin_mib: int = 512
    # When set, at most one scheduler job is placed per GPU.
    exclusive: bool = False
    # Fractional buffer applied on top of observed maxima.
    spike_buffer: float = 0.10
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def effective_budget(occ: Occupant, gpu: int, spike_buffer: float) -> int | None:
    """Return the MiB to reserve for `occ` on `gpu`.

    None means the job is undeclared and therefore claims the whole device.
    Otherwise the reservation is the declaration while the observed peak stays
    within it, or the buffered peak once the declaration has been exceeded,
    and never less than the usage currently attributed to the job.
    """
    declared = occ.vram_mib
    if declared is None:
        return None

    observed_peak = occ.peak_mib.get(gpu, 0)
    current = occ.actual_mib.get(gpu, 0)

    if observed_peak > declared:
        # Declaration violated: trust the empirical maximum plus the buffer.
        # Rounding to 6 places first keeps float noise from bumping the ceil.
        reserved = math.ceil(round(observed_peak * (1 + spike_buffer), 6))
    else:
        reserved = declared
    return max(reserved, current)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _fallback_external(gpu: int, snapshot: GpuSnapshot, occupants: list[Occupant]) -> int:
    """Estimate external usage on `gpu` when the caller supplied none.

    This is the device's instantaneous usage minus the usage attributed to
    scheduler jobs placed on it, floored at zero.
    """
    attributed = 0
    for occ in occupants:
        if gpu in occ.gpu_indices:
            attributed += occ.actual_mib.get(gpu, 0)
    remainder = snapshot.gpus[gpu].used_mib - attributed
    return remainder if remainder > 0 else 0
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def effective_headroom(
    gpu: int,
    snapshot: GpuSnapshot,
    occupants: list[Occupant],
    opts: AllocOptions,
    external_mib: dict[int, int] | None = None,
) -> int | None:
    """Compute the MiB a new *declared* job could still claim on `gpu`.

    The result is the device total minus external usage minus the budgets of
    every scheduler job already placed there. External usage comes from
    `external_mib` when given, otherwise from the instantaneous fallback.

    Returns None when an undeclared job sits on the GPU, which makes the
    device off-limits.
    """
    stat = snapshot.gpus[gpu]

    reserved = 0
    for occ in occupants:
        if gpu in occ.gpu_indices:
            held = effective_budget(occ, gpu, opts.spike_buffer)
            if held is None:
                # An undeclared job owns the entire device.
                return None
            reserved += held

    if external_mib is None:
        foreign = _fallback_external(gpu, snapshot, occupants)
    else:
        foreign = external_mib[gpu]
    return stat.total_mib - foreign - reserved
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def find_allocation(
    spec: JobSpec,
    snapshot: GpuSnapshot,
    occupants: list[Occupant],
    allowed_gpus: set[int],
    opts: AllocOptions,
    external_mib: dict[int, int] | None = None,
) -> list[int] | None:
    """Choose GPUs for `spec`, or return None when it cannot start right now.

    Undeclared jobs take the lowest-indexed GPUs that host no scheduler job
    and whose external usage is below the idle threshold. Declared jobs take
    the GPUs with the smallest headroom that still covers declaration plus
    margin (best fit); the chosen indices are returned in ascending order.
    """
    pool = [g for g in sorted(snapshot.gpus) if g in allowed_gpus]

    occupied: set[int] = set()
    for occ in occupants:
        occupied.update(occ.gpu_indices)

    wanted = spec.n_gpus

    if spec.vram_mib is None:
        # Undeclared: only fully idle devices qualify, judged on the
        # effective external usage when the caller provides it.
        idle = []
        for g in pool:
            if g in occupied:
                continue
            if external_mib is not None:
                foreign = external_mib[g]
            else:
                foreign = _fallback_external(g, snapshot, occupants)
            if foreign < opts.idle_threshold_mib:
                idle.append(g)
        if len(idle) < wanted:
            return None
        return idle[:wanted]

    required = spec.vram_mib + opts.margin_mib
    fits: list[tuple[int, int]] = []  # (headroom, gpu)
    for g in pool:
        if opts.exclusive and g in occupied:
            continue
        room = effective_headroom(g, snapshot, occupants, opts, external_mib)
        if room is None or room < required:
            continue
        fits.append((room, g))

    if len(fits) < wanted:
        return None
    # Best fit: least sufficient headroom first, GPU index breaks ties.
    chosen = sorted(fits)[:wanted]
    return sorted(g for _, g in chosen)
|
gpusched/backend.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""GPU state backends.
|
|
2
|
+
|
|
3
|
+
The scheduler only ever sees a :class:`GpuSnapshot`; how it is produced is
|
|
4
|
+
behind the :class:`GpuBackend` protocol. The real backend shells out to
|
|
5
|
+
``nvidia-smi``; test backends fabricate snapshots (see ``gpusched.testing``).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import subprocess
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from typing import Protocol
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True)
class GpuStat:
    """Device-level figures for one GPU at snapshot time."""

    index: int
    total_mib: int
    used_mib: int
    # Device-level SM utilization in percent, when the backend knows it.
    util_pct: int | None = None

    @property
    def free_mib(self) -> int:
        """Memory not currently in use: total minus used."""
        total, used = self.total_mib, self.used_mib
        return total - used
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(frozen=True)
class ProcStat:
    """Memory used by one process on one GPU."""

    gpu_index: int  # GPU the process is using
    pid: int        # process id
    used_mib: int   # memory used by the process on that GPU (MiB)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class GpuSnapshot:
    """One point-in-time view of all GPUs and the processes running on them."""

    # Device stats keyed by GPU index.
    gpus: dict[int, GpuStat] = field(default_factory=dict)
    # Per-process stats, one entry per (GPU, process) pair reported.
    procs: list[ProcStat] = field(default_factory=list)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class GpuBackend(Protocol):
    """Structural interface for anything that can produce a GpuSnapshot."""

    def snapshot(self) -> GpuSnapshot:
        """Return the current GPU state."""
        ...
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class BackendError(RuntimeError):
    """Raised when a GPU backend cannot produce a snapshot."""
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def pgid_of(pid: int) -> int | None:
    """Look up the process-group id of *pid* through /proc (Linux).

    Returns None when the process is gone, unreadable, or its stat line
    cannot be parsed.

    This is how GPU compute processes are attributed to scheduler-launched
    jobs: each job is started in its own session (``os.setsid``), so every
    descendant — including those spawned through ``sh -c`` — shares the
    job's pgid.
    """
    stat_path = f"/proc/{pid}/stat"
    try:
        with open(stat_path, "rb") as stat_file:
            raw = stat_file.read()
        text = raw.decode("utf-8", "replace")
        # Layout: "pid (comm) state ppid pgrp session ...". comm may contain
        # spaces or parentheses, so only what follows the LAST ')' is split.
        fields = text.rsplit(")", 1)[1].split()
        # fields: [0]=state, [1]=ppid, [2]=pgrp
        return int(fields[2])
    except (FileNotFoundError, ProcessLookupError, PermissionError, IndexError, ValueError):
        return None
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class NvidiaSmiBackend:
    """Real backend: two nvidia-smi queries per snapshot.

    * ``--query-gpu=index,uuid,memory.total,memory.used,utilization.gpu`` for
      GPU-level stats
    * ``--query-compute-apps=gpu_uuid,pid,used_memory`` for per-process stats

    Every failure to run ``nvidia-smi`` surfaces as :class:`BackendError`.
    Output rows that cannot be parsed are skipped instead of aborting the
    whole snapshot.
    """

    def __init__(self, nvidia_smi: str = "nvidia-smi", timeout: float = 10.0):
        self._bin = nvidia_smi    # executable name or path
        self._timeout = timeout   # seconds allowed per nvidia-smi invocation

    def _run(self, args: list[str]) -> str:
        """Run nvidia-smi with `args` and return its stdout as text.

        Raises:
            BackendError: the binary is missing or cannot be executed, exits
                non-zero, or does not finish within the timeout.
        """
        try:
            return subprocess.check_output(
                [self._bin, *args], text=True, timeout=self._timeout,
                stderr=subprocess.PIPE,
            )
        except FileNotFoundError as e:
            raise BackendError(f"{self._bin} not found — is the NVIDIA driver installed?") from e
        except subprocess.CalledProcessError as e:
            raise BackendError(f"{self._bin} failed: {e.stderr or e}") from e
        except subprocess.TimeoutExpired as e:
            raise BackendError(f"{self._bin} timed out after {self._timeout}s") from e
        except OSError as e:
            # e.g. PermissionError for a non-executable path: callers only
            # expect BackendError from this backend.
            raise BackendError(f"{self._bin} could not be executed: {e}") from e

    def snapshot(self) -> GpuSnapshot:
        """Query nvidia-smi twice and assemble a GpuSnapshot.

        Raises:
            BackendError: propagated from either nvidia-smi invocation.
        """
        snap = GpuSnapshot()
        uuid_to_index: dict[str, int] = {}

        out = self._run([
            "--query-gpu=index,uuid,memory.total,memory.used,utilization.gpu",
            "--format=csv,noheader,nounits",
        ])
        for line in out.strip().splitlines():
            parts = [p.strip() for p in line.split(",")]
            if len(parts) != 5:
                continue
            idx, uuid, total, used, util = parts
            try:
                index, total_mib, used_mib = int(idx), int(total), int(used)
            except ValueError:
                # Non-numeric index or memory figures (e.g. "[N/A]"): the GPU
                # cannot be reasoned about, so leave it out of the snapshot
                # rather than crash, as the process loop below already does.
                continue
            try:
                util_pct: int | None = int(util)
            except ValueError:
                util_pct = None  # "[N/A]" on some virtualized setups
            snap.gpus[index] = GpuStat(index=index, total_mib=total_mib,
                                       used_mib=used_mib, util_pct=util_pct)
            uuid_to_index[uuid] = index

        out = self._run([
            "--query-compute-apps=gpu_uuid,pid,used_memory",
            "--format=csv,noheader,nounits",
        ])
        for line in out.strip().splitlines():
            parts = [p.strip() for p in line.split(",")]
            # Rows for GPUs missing from the first query cannot be attributed.
            if len(parts) != 3 or parts[0] not in uuid_to_index:
                continue
            try:
                snap.procs.append(ProcStat(
                    gpu_index=uuid_to_index[parts[0]],
                    pid=int(parts[1]),
                    used_mib=int(parts[2]),
                ))
            except ValueError:
                # used_memory can be "[N/A]" (e.g. inside some containers /
                # for graphics processes); skip — attribution degrades to n/a.
                continue
        return snap
|
gpusched/cli.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""Command-line interface: ``gpusched jobs.txt [options]``."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
from . import __version__
|
|
9
|
+
from .allocation import AllocOptions
|
|
10
|
+
from .backend import NvidiaSmiBackend
|
|
11
|
+
from .jobspec import JobSpecError, parse_jobs_file
|
|
12
|
+
from .scheduler import Scheduler, SchedulerOptions
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def build_parser() -> argparse.ArgumentParser:
    """Construct the argument parser for the ``gpusched`` command."""
    summary = (
        "VRAM-aware GPU job scheduler. Reads one shell command per line; "
        "lines may declare expected max VRAM: '[vram=18G] python train.py' "
        "or multiple GPUs: '[vram=30G gpus=2] torchrun ...'. Declared jobs "
        "are packed onto GPUs with enough free memory; undeclared jobs get "
        "a fully idle GPU. Actual per-job VRAM is monitored and compared "
        "against declarations."
    )
    parser = argparse.ArgumentParser(prog="gpusched", description=summary)
    add = parser.add_argument

    # Input and GPU selection.
    add("jobs_file", help="file with one shell command per line")
    add(
        "--gpus", default=None, metavar="0,1,3",
        help="comma-separated GPU indices to use (default: all visible)",
    )

    # Placement and monitoring tunables.
    add(
        "--idle-threshold", type=int, default=200, metavar="MIB",
        help="GPU counts as idle below this usage, for undeclared jobs (default: 200)",
    )
    add(
        "--margin", type=int, default=512, metavar="MIB",
        help="safety margin added to every vram declaration (default: 512)",
    )
    add(
        "--tolerance", type=float, default=0.10, metavar="FRAC",
        help="relative band before flagging over/under-declaration (default: 0.10)",
    )
    add(
        "--poll", type=float, default=5.0, metavar="SEC",
        help="polling interval in seconds (default: 5)",
    )
    add(
        "--spike-buffer", type=float, default=0.10, metavar="FRAC",
        help="headroom buffer applied over observed VRAM maxima of fluctuating "
             "processes — both external ones and own jobs that exceeded their "
             "declaration (default: 0.10)",
    )
    add(
        "--exclusive", action="store_true",
        help="never co-locate two scheduler jobs on one GPU, even if declarations fit",
    )

    # Run-mode and bookkeeping options.
    add(
        "--log-dir", default="gpusched_logs",
        help="directory for per-job stdout/stderr logs (default: gpusched_logs)",
    )
    add(
        "--watch", action="store_true",
        help="keep running after the queue drains, picking up lines appended "
             "to the jobs file (the jobs file is re-read every poll either way)",
    )
    add(
        "--oom-retries", type=int, default=0, metavar="N",
        help="default CUDA-OOM auto-retries per job; [retries=N] overrides (default: 0)",
    )
    add(
        "--fresh", action="store_true",
        help="ignore and remove the existing journal: re-run everything",
    )
    add(
        "-v", "--verbose", action="store_true",
        help="stream live per-job VRAM usage as peaks grow",
    )
    add(
        "--sim", type=int, default=None, metavar="N_GPUS",
        help="dry-run against N simulated 24 GiB GPUs (no hardware needed); "
             "pair with 'python -m gpusched.simjob' commands",
    )
    add("--version", action="version", version=f"gpusched {__version__}")
    return parser
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the ``gpusched`` command line and return the process exit status.

    Args:
        argv: argument list to parse; None lets argparse read ``sys.argv``.

    Returns:
        2 for an input problem (unreadable, malformed or empty jobs file, or
        an invalid ``--gpus`` value), 130 when interrupted with Ctrl-C,
        otherwise whatever ``Scheduler.run()`` returns.
    """
    args = build_parser().parse_args(argv)

    try:
        jobs = parse_jobs_file(args.jobs_file)
    except JobSpecError as e:
        print(f"gpusched: {e}", file=sys.stderr)
        return 2
    except OSError as e:
        print(f"gpusched: cannot read jobs file: {e}", file=sys.stderr)
        return 2
    if not jobs and not args.watch:
        print("gpusched: jobs file contains no jobs (use --watch to wait for some)", file=sys.stderr)
        return 2

    # Validate --gpus before any side effect (in particular before --fresh
    # deletes the journal), and report a bad value as a usage error instead of
    # letting int() raise an uncaught ValueError.
    allowed: set[int] | None = None
    if args.gpus:
        try:
            allowed = {int(tok) for tok in args.gpus.split(",")}
        except ValueError:
            allowed = None
        # A negative index can never match a device, so reject it as well.
        if allowed is None or min(allowed) < 0:
            print(
                f"gpusched: invalid --gpus value {args.gpus!r} "
                "(expected comma-separated non-negative integers, e.g. 0,1,3)",
                file=sys.stderr,
            )
            return 2

    if args.fresh:
        import pathlib
        # The journal lives in the log directory; removing it forgets all
        # previously recorded job state.
        pathlib.Path(args.log_dir, "journal.jsonl").unlink(missing_ok=True)

    if args.sim is not None:
        # Imported lazily so normal runs never load the simulation helpers.
        from .testing import SimBackend
        backend = SimBackend(n_gpus=args.sim)
    else:
        backend = NvidiaSmiBackend()

    options = SchedulerOptions(
        alloc=AllocOptions(
            idle_threshold_mib=args.idle_threshold,
            margin_mib=args.margin,
            exclusive=args.exclusive,
            spike_buffer=args.spike_buffer,
        ),
        poll_interval=args.poll,
        tolerance=args.tolerance,
        verbose=args.verbose,
        log_dir=args.log_dir,
        watch=args.watch,
        oom_retries_default=args.oom_retries,
    )
    sched = Scheduler(backend, jobs_path=args.jobs_file, allowed_gpus=allowed, options=options)
    try:
        return sched.run()
    except KeyboardInterrupt:
        return 130  # conventional exit status for SIGINT (128 + 2)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
gpusched/jobspec.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
"""Job specification parsing.
|
|
2
|
+
|
|
3
|
+
Jobs-file syntax (one job per line):
|
|
4
|
+
|
|
5
|
+
# comment / blank lines are skipped
|
|
6
|
+
python train.py --config a.yaml # no estimate -> needs an idle GPU
|
|
7
|
+
[vram=18000] python train.py # declares max 18000 MiB
|
|
8
|
+
[vram=22G] bash run_eval.sh # G / GiB suffix accepted
|
|
9
|
+
[vram=30G gpus=2] torchrun train_big.py # 2 GPUs, each with >= 30 GiB free
|
|
10
|
+
[timeout=2h] python flaky_eval.py # SIGTERM after 2 hours (opt-in only)
|
|
11
|
+
[vram=8G retries=2] python sweep.py # auto-retry on CUDA OOM, up to 2x
|
|
12
|
+
|
|
13
|
+
Attribute block must be a single leading ``[key=value ...]`` group.
|
|
14
|
+
``vram`` is interpreted **per GPU** for multi-GPU jobs. ``timeout`` accepts
|
|
15
|
+
s/m/h/d suffixes (default seconds). Jobs are identified by a hash of their
|
|
16
|
+
command text (plus an occurrence counter for duplicate lines), which is what
|
|
17
|
+
makes live-edited queue files and resume-after-restart well defined.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import re
|
|
23
|
+
from dataclasses import dataclass, field
|
|
24
|
+
|
|
25
|
+
# A leading "[...]" attribute group followed by the command text.
_ATTR_BLOCK = re.compile(r"^\[(?P<attrs>[^\]]*)\]\s*(?P<cmd>.*)$", re.DOTALL)
# A number with an optional G/GB/GiB or M/MB/MiB unit (case-insensitive).
_VRAM_VALUE = re.compile(r"^(?P<num>\d+(?:\.\d+)?)\s*(?P<unit>g|gb|gib|m|mb|mib)?$", re.IGNORECASE)
# A number with an optional s/m/h/d unit (case-insensitive).
_DURATION = re.compile(r"^(?P<num>\d+(?:\.\d+)?)\s*(?P<unit>s|m|h|d)?$", re.IGNORECASE)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class JobSpecError(ValueError):
    """A malformed jobs-file line; remembers the 1-based line number."""

    def __init__(self, lineno: int, message: str):
        # The line number is both embedded in the message and kept as an
        # attribute for programmatic use.
        text = f"jobs file line {lineno}: {message}"
        super().__init__(text)
        self.lineno = lineno
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass(frozen=True)
class JobSpec:
    """One job taken from the queue file."""

    # 1-based position among the jobs in the file.
    index: int
    # Shell command to run.
    command: str
    # Declared max VRAM per GPU in MiB; None means no declaration.
    vram_mib: int | None = None
    # How many GPUs the job needs.
    n_gpus: int = 1
    # Walltime in seconds before SIGTERM; None (the default) means never.
    timeout_s: float | None = None
    # Automatic retries on CUDA OOM.
    retries: int = 0
    # Stable identity (hash#occurrence); ignored by equality comparisons.
    key: str = field(default="", compare=False)
    # Source line number; ignored by equality comparisons.
    lineno: int = field(default=0, compare=False)

    @property
    def label(self) -> str:
        """Short display name built from the job's index."""
        return "job " + str(self.index)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def parse_vram(value: str, lineno: int = 0) -> int:
    """Convert a vram value such as '12000', '12.5G', '24GiB' or '8000MiB' to MiB.

    A bare number is taken as MiB. Raises JobSpecError (tagged with `lineno`)
    when the text does not parse or rounds to a non-positive amount.
    """
    match = _VRAM_VALUE.match(value.strip())
    if match is None:
        raise JobSpecError(lineno, f"cannot parse vram value {value!r} (use MiB or a G/GiB suffix)")

    amount = float(match.group("num"))
    unit = match.group("unit")
    if unit is not None and unit.lower().startswith("g"):
        amount = amount * 1024  # GiB -> MiB

    whole = int(round(amount))
    if whole <= 0:
        raise JobSpecError(lineno, f"vram must be positive, got {value!r}")
    return whole
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def parse_duration(value: str, lineno: int = 0) -> float:
    """Convert '90', '90s', '15m', '2h' or '1.5d' to seconds.

    A bare number is taken as seconds. Raises JobSpecError (tagged with
    `lineno`) when the text does not parse or the result is not positive.
    """
    match = _DURATION.match(value.strip())
    if match is None:
        raise JobSpecError(lineno, f"cannot parse duration {value!r} (use s/m/h/d)")

    seconds_per_unit = {"s": 1, "m": 60, "h": 3600, "d": 86400}
    unit = (match.group("unit") or "s").lower()
    total = float(match.group("num")) * seconds_per_unit[unit]
    if total <= 0:
        raise JobSpecError(lineno, f"timeout must be positive, got {value!r}")
    return total
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def job_key(command: str, occurrence: int) -> str:
    """Build the identity of a command line.

    The key is the first 12 hex digits of the SHA-1 of the stripped command,
    then '#', then `occurrence`, which tells duplicate lines apart.
    """
    import hashlib

    digest = hashlib.sha1(command.strip().encode()).hexdigest()
    return "{}#{}".format(digest[:12], occurrence)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def parse_line(line: str, lineno: int, index: int) -> JobSpec | None:
    """Turn one jobs-file line into a JobSpec; blanks and comments give None.

    Raises JobSpecError for an unterminated or malformed attribute block, an
    unknown attribute, an invalid attribute value, or an attribute block with
    no command after it.
    """
    text = line.strip()
    if text == "" or text.startswith("#"):
        return None

    block = _ATTR_BLOCK.match(text)
    if block is None:
        if text.startswith("["):
            # A line beginning with '[' but never closed is almost certainly
            # a torn mid-edit attribute block — refuse to execute it as shell.
            raise JobSpecError(lineno, "unterminated '[...]' attribute block")
        return JobSpec(index=index, command=text, vram_mib=None, n_gpus=1,
                       timeout_s=None, retries=0, lineno=lineno)

    command = block.group("cmd").strip()
    if not command:
        raise JobSpecError(lineno, "attribute block present but command is empty")

    def whole_number(name: str, raw: str, minimum: int) -> int:
        # Shared integer parsing for the 'retries' and 'gpus' attributes.
        try:
            number = int(raw)
        except ValueError:
            raise JobSpecError(lineno, f"{name} must be an integer, got {raw!r}") from None
        if number < minimum:
            raise JobSpecError(lineno, f"{name} must be >= {minimum}, got {number}")
        return number

    vram_mib: int | None = None
    timeout_s: float | None = None
    n_gpus = 1
    retries = 0

    for token in block.group("attrs").split():
        name, sep, raw = token.partition("=")
        if not sep:
            raise JobSpecError(lineno, f"malformed attribute {token!r} (expected key=value)")
        name = name.lower()
        if name == "vram":
            vram_mib = parse_vram(raw, lineno)
        elif name == "timeout":
            timeout_s = parse_duration(raw, lineno)
        elif name == "retries":
            retries = whole_number("retries", raw, 0)
        elif name == "gpus":
            n_gpus = whole_number("gpus", raw, 1)
        else:
            raise JobSpecError(lineno, f"unknown attribute {name!r} (known: vram, gpus, timeout, retries)")

    return JobSpec(index=index, command=command, vram_mib=vram_mib, n_gpus=n_gpus,
                   timeout_s=timeout_s, retries=retries, lineno=lineno)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def assign_keys(specs: list[JobSpec]) -> list[JobSpec]:
    """Attach stable identity keys (command hash + occurrence index).

    Args:
        specs: job specs in file order.

    Returns:
        New JobSpec objects, in the same order, each with ``key`` set. The
        input specs are not modified.
    """
    from dataclasses import replace

    counts: dict[str, int] = {}
    keyed = []
    for spec in specs:
        # Count occurrences on the stripped command because job_key() hashes
        # the stripped command. Counting on the raw text would hand the same
        # "<hash>#0" key to commands that differ only in surrounding whitespace.
        normalized = spec.command.strip()
        occurrence = counts.get(normalized, 0)
        counts[normalized] = occurrence + 1
        keyed.append(replace(spec, key=job_key(spec.command, occurrence)))
    return keyed
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def parse_jobs_file(path: str) -> list[JobSpec]:
    """Parse a jobs file into an ordered list of JobSpecs with identity keys.

    Args:
        path: location of the jobs file, read as UTF-8.

    Returns:
        One JobSpec per job line, in file order; blank and comment lines are
        skipped and do not consume a job index.

    Raises:
        OSError: the file cannot be opened.
        JobSpecError: a line is malformed.
    """
    specs: list[JobSpec] = []
    # "utf-8-sig" drops a leading byte-order mark if present. With plain
    # "utf-8" the BOM stays glued to the first line, so a first-line comment
    # or attribute block is no longer recognized and is parsed as a command.
    with open(path, encoding="utf-8-sig") as f:
        for lineno, line in enumerate(f, start=1):
            spec = parse_line(line, lineno, index=len(specs) + 1)
            if spec is not None:
                specs.append(spec)
    return assign_keys(specs)
|
gpusched/journal.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Append-only job journal (JSONL).
|
|
2
|
+
|
|
3
|
+
One file per log directory. Each line is an event:
|
|
4
|
+
|
|
5
|
+
{"key": "<hash#occ>", "event": "seen", "no": 7, "command": "..."}
|
|
6
|
+
{"key": "...", "event": "oom", "attempts": 1, "next_vram": 9750}
|
|
7
|
+
{"key": "...", "event": "done", "status": "ok", "returncode": 0,
|
|
8
|
+
"peak_mib": {"0": 7800}, "avg_util": 84.0}
|
|
9
|
+
|
|
10
|
+
Folding the events yields per-key state. Terminal statuses are never
|
|
11
|
+
re-dispatched; non-terminal keys remain eligible (this is what makes both
|
|
12
|
+
``--resume``-style restarts and OOM retries fall out of one mechanism).
|
|
13
|
+
The jobs file itself is never written by the scheduler — it stays entirely
|
|
14
|
+
user-owned, and this journal is the scheduler's only persistent state.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
import os
|
|
21
|
+
from dataclasses import dataclass, field
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
# Statuses that count as terminal; JobState.terminal tests membership here.
TERMINAL = {"ok", "failed", "timeout", "infeasible", "failed_oom"}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class JobState:
    """Per-key state obtained by folding journal events."""

    # Persistent display number, assigned in first-seen order.
    no: int | None = None
    command: str = ""
    # Attempts completed so far.
    attempts: int = 0
    # One of: pending, running, or a terminal status.
    status: str = "pending"
    returncode: int | None = None
    # Bumped declaration to use for the next attempt, if any.
    next_vram: int | None = None
    peak_mib: dict[str, int] = field(default_factory=dict)

    @property
    def terminal(self) -> bool:
        """True when the current status is one of the TERMINAL statuses."""
        current = self.status
        return current in TERMINAL
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class Journal:
    """Append-only JSONL event log plus the per-key state folded from it.

    Existing events are replayed on construction. Every ``record_*`` /
    ``ensure_seen`` call appends one line to disk (flushed and fsynced) and
    then folds the same event into memory.
    """

    # True when the file on disk ends without a newline (torn last line), so
    # the next append must start on a fresh line.
    _needs_newline = False

    def __init__(self, path: str | os.PathLike):
        self.path = Path(path)
        self.states: dict[str, JobState] = {}
        self._next_no = 1
        if self.path.exists():
            text = self.path.read_text(encoding="utf-8")
            self._needs_newline = bool(text) and not text.endswith("\n")
            for line in text.splitlines():
                if not line.strip():
                    continue
                try:
                    ev = json.loads(line)
                except json.JSONDecodeError:
                    continue  # tolerate a torn last line after a crash
                # Skip lines that parse as JSON but are not keyed event
                # objects, instead of failing on them.
                if not isinstance(ev, dict) or "key" not in ev:
                    continue
                self._fold(ev)

    def _fold(self, ev: dict) -> None:
        """Apply one event to the in-memory state of its key."""
        st = self.states.setdefault(ev["key"], JobState())
        kind = ev.get("event")
        if kind == "seen":
            st.no = ev.get("no")
            st.command = ev.get("command", "")
            # Keep the display-number counter ahead of every number seen.
            self._next_no = max(self._next_no, (st.no or 0) + 1)
        elif kind == "oom":
            st.attempts = ev.get("attempts", st.attempts + 1)
            st.next_vram = ev.get("next_vram", st.next_vram)
            st.status = "pending"  # an OOM retry leaves the job eligible
        elif kind == "done":
            st.attempts += 1
            st.status = ev.get("status", "failed")
            st.returncode = ev.get("returncode")
            st.peak_mib = ev.get("peak_mib", {})

    def _append(self, ev: dict) -> None:
        """Durably append one event as a single JSON line."""
        self.path.parent.mkdir(parents=True, exist_ok=True)
        with open(self.path, "a", encoding="utf-8") as f:
            if self._needs_newline:
                # Terminate the torn fragment first; otherwise this event
                # would be glued onto it and dropped as undecodable on the
                # next load.
                f.write("\n")
            f.write(json.dumps(ev) + "\n")
            f.flush()
            os.fsync(f.fileno())
        self._needs_newline = False

    # ------------------------------------------------------------- API
    def state(self, key: str) -> JobState:
        """Return the state for `key`, or a fresh default state if unknown."""
        return self.states.get(key, JobState())

    def ensure_seen(self, key: str, command: str) -> int:
        """Assign a persistent display number on first sight; return it."""
        st = self.states.get(key)
        if st is not None and st.no is not None:
            return st.no
        no = self._next_no
        ev = {"key": key, "event": "seen", "no": no, "command": command}
        self._append(ev)
        self._fold(ev)
        return no

    def record_oom_retry(self, key: str, attempts: int, next_vram: int | None) -> None:
        """Record an OOM event with the attempt count and the next declaration."""
        ev = {"key": key, "event": "oom", "attempts": attempts, "next_vram": next_vram}
        self._append(ev)
        self._fold(ev)

    def record_done(self, key: str, status: str, returncode: int,
                    peak_mib: dict[int, int], avg_util: float | None) -> None:
        """Record a "done" event carrying the job's final status and measurements."""
        ev = {
            "key": key, "event": "done", "status": status, "returncode": returncode,
            # GPU indices are written as strings in the journal.
            "peak_mib": {str(g): m for g, m in peak_mib.items()}, "avg_util": avg_util,
        }
        self._append(ev)
        self._fold(ev)
|