gpusched 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gpusched/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ """gpusched — VRAM-aware single-node GPU job scheduler."""
2
+
3
+ __version__ = "0.3.0"
4
+
5
+ from .allocation import AllocOptions
6
+ from .backend import NvidiaSmiBackend
7
+ from .jobspec import JobSpec, parse_jobs_file
8
+ from .scheduler import Scheduler, SchedulerOptions
9
+
10
+ __all__ = [
11
+ "AllocOptions", "JobSpec", "NvidiaSmiBackend",
12
+ "Scheduler", "SchedulerOptions", "parse_jobs_file", "__version__",
13
+ ]
gpusched/allocation.py ADDED
@@ -0,0 +1,142 @@
1
+ """Pure GPU allocation logic.
2
+
3
+ Separated from the scheduler loop so placement rules are testable with no
4
+ subprocesses and no clock.
5
+
6
+ Semantics (v0.2: high-water-mark aware)
7
+ ---------------------------------------
8
+ * A job **without** a vram declaration requires a *fully idle* GPU:
9
+ effective external usage < idle_threshold and no scheduler job placed there.
10
+ * A job **with** a declaration of E MiB (per GPU) can be placed on any GPU
11
+ whose *effective headroom* >= E + margin.
12
+ * Effective headroom = total - effective_external - sum(own effective budgets).
13
+ * A running job's effective budget per GPU is its declaration while its
14
+ observed peak stays within it; once the peak exceeds the declaration, the
15
+ declaration is no longer trusted and the budget escalates to
16
+ ``peak * (1 + spike_buffer)``. A job that has not ramped yet (actual 0)
17
+ still blocks its full budget — this closes the launch double-booking race.
18
+ * ``effective_external`` is supplied by the scheduler as
19
+ ``max(instantaneous_external, sum(per-pid external peaks) * (1 + spike_buffer))``
20
+ so that fluctuating external processes are held to their observed maxima,
21
+ not their momentary troughs. When not supplied (unit tests / simple use),
22
+ it falls back to instantaneous usage minus attributed own usage.
23
+ * A GPU hosting an *undeclared* scheduler job is never eligible for others.
24
+ * ``exclusive=True`` additionally forbids two scheduler jobs on one GPU,
25
+ while still honoring headroom vs external processes.
26
+ * Multi-GPU jobs (n_gpus=N) need N distinct GPUs, each individually
27
+ eligible; selection is best-fit (smallest sufficient headroom first) to
28
+ preserve large contiguous headroom for big jobs.
29
+ """
30
+
31
+ from __future__ import annotations
32
+
33
+ import math
34
+ from dataclasses import dataclass, field
35
+
36
+ from .backend import GpuSnapshot
37
+ from .jobspec import JobSpec
38
+
39
+
40
@dataclass(frozen=True)
class Occupant:
    """Allocation-time view of one running scheduler job.

    Holds only what the placement rules read: where the job sits, what it
    declared, and what has been observed of it so far.
    """

    # GPU indices the job is placed on.
    gpu_indices: tuple[int, ...]
    # Declared per-GPU estimate in MiB; None marks an undeclared job.
    vram_mib: int | None
    # Most recently attributed usage in MiB, keyed by GPU index.
    actual_mib: dict[int, int]
    # Highest usage observed so far in MiB, keyed by GPU index.
    peak_mib: dict[int, int] = field(default_factory=dict)
48
+
49
+
50
@dataclass(frozen=True)
class AllocOptions:
    """Tunable knobs for the placement rules."""

    # A GPU is treated as idle while its usage stays below this many MiB.
    idle_threshold_mib: int = 200
    # Extra MiB required on top of every declaration as a safety margin.
    margin_mib: int = 512
    # When set, at most one scheduler job may occupy a GPU.
    exclusive: bool = False
    # Fractional headroom added on top of observed maxima.
    spike_buffer: float = 0.10
56
+
57
+
58
def effective_budget(occ: Occupant, gpu: int, spike_buffer: float) -> int | None:
    """Return the MiB to reserve for `occ` on `gpu`.

    None means the job is undeclared and therefore claims the whole device.
    A declared job is held to its declaration until its observed peak on
    this GPU exceeds it; from then on the buffered peak is used instead.
    The result is never lower than the job's currently attributed usage.
    """
    declared = occ.vram_mib
    if declared is None:
        return None

    observed_peak = occ.peak_mib.get(gpu, 0)
    if observed_peak > declared:
        # Declaration violated: trust the empirical maximum plus the buffer.
        # Rounding to 6 places first keeps float noise from bumping the ceil.
        held = math.ceil(round(observed_peak * (1 + spike_buffer), 6))
    else:
        held = declared

    current = occ.actual_mib.get(gpu, 0)
    return held if held >= current else current
68
+
69
+
70
+ def _fallback_external(gpu: int, snapshot: GpuSnapshot, occupants: list[Occupant]) -> int:
71
+ own_actual = sum(occ.actual_mib.get(gpu, 0) for occ in occupants if gpu in occ.gpu_indices)
72
+ return max(0, snapshot.gpus[gpu].used_mib - own_actual)
73
+
74
+
75
def effective_headroom(
    gpu: int,
    snapshot: GpuSnapshot,
    occupants: list[Occupant],
    opts: AllocOptions,
    external_mib: dict[int, int] | None = None,
) -> int | None:
    """Compute the MiB still available to a new *declared* job on `gpu`.

    The result is the device total minus external usage minus the budgets
    held for every occupant on this GPU. None is returned when any occupant
    on the GPU is undeclared, which makes the device off-limits.

    `external_mib`, when given, supplies the external usage per GPU index;
    otherwise it is derived from the snapshot and the occupants' attributed
    usage.
    """
    total = snapshot.gpus[gpu].total_mib

    reserved = 0
    for occ in (o for o in occupants if gpu in o.gpu_indices):
        held = effective_budget(occ, gpu, opts.spike_buffer)
        if held is None:
            # An undeclared job owns the entire device.
            return None
        reserved += held

    if external_mib is None:
        outside = _fallback_external(gpu, snapshot, occupants)
    else:
        outside = external_mib[gpu]

    return total - outside - reserved
100
+
101
+
102
def find_allocation(
    spec: JobSpec,
    snapshot: GpuSnapshot,
    occupants: list[Occupant],
    allowed_gpus: set[int],
    opts: AllocOptions,
    external_mib: dict[int, int] | None = None,
) -> list[int] | None:
    """Pick the GPUs `spec` should run on, or None if it cannot start yet.

    Undeclared jobs get the lowest-indexed GPUs that host no scheduler job
    and whose external usage is below the idle threshold. Declared jobs get
    the GPUs with the smallest headroom that still covers the declaration
    plus the margin (best fit), returned in ascending index order.
    """
    pool = sorted(g for g in snapshot.gpus if g in allowed_gpus)
    occupied: set[int] = set()
    for occ in occupants:
        occupied.update(occ.gpu_indices)
    wanted = spec.n_gpus

    if spec.vram_mib is None:
        # Undeclared: only fully idle devices qualify, judged on the
        # effective external usage so a live external process is held to
        # its peak.
        idle: list[int] = []
        for g in pool:
            if g in occupied:
                continue
            if external_mib is not None:
                outside = external_mib[g]
            else:
                outside = _fallback_external(g, snapshot, occupants)
            if outside < opts.idle_threshold_mib:
                idle.append(g)
        if len(idle) < wanted:
            return None
        return idle[:wanted]

    required = spec.vram_mib + opts.margin_mib
    fits: list[tuple[int, int]] = []  # (headroom, gpu)
    for g in pool:
        if opts.exclusive and g in occupied:
            continue
        room = effective_headroom(g, snapshot, occupants, opts, external_mib)
        if room is None or room < required:
            continue
        fits.append((room, g))

    if len(fits) < wanted:
        return None

    # Best fit: tightest sufficient headroom first, GPU index breaks ties.
    fits.sort()
    chosen = [g for _, g in fits[:wanted]]
    chosen.sort()
    return chosen
gpusched/backend.py ADDED
@@ -0,0 +1,130 @@
1
+ """GPU state backends.
2
+
3
+ The scheduler only ever sees a :class:`GpuSnapshot`; how it is produced is
4
+ behind the :class:`GpuBackend` protocol. The real backend shells out to
5
+ ``nvidia-smi``; test backends fabricate snapshots (see ``gpusched.testing``).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import subprocess
11
+ from dataclasses import dataclass, field
12
+ from typing import Protocol
13
+
14
+
15
@dataclass(frozen=True)
class GpuStat:
    """Memory and utilisation reading for a single GPU."""

    index: int
    total_mib: int
    used_mib: int
    # Device-level SM utilisation in percent; None when it is not known.
    util_pct: int | None = None

    @property
    def free_mib(self) -> int:
        """Total memory minus used memory, in MiB."""
        unused = self.total_mib - self.used_mib
        return unused
25
+
26
+
27
@dataclass(frozen=True)
class ProcStat:
    """Memory used by one process on one GPU."""

    gpu_index: int  # GPU the process is running on
    pid: int        # process id of the compute process
    used_mib: int   # memory the process holds on that GPU, in MiB
32
+
33
+
34
@dataclass
class GpuSnapshot:
    """One observation of GPU state: per-device stats and per-process usage."""

    # Device stats keyed by GPU index.
    gpus: dict[int, GpuStat] = field(default_factory=dict)
    # Per-process usage entries.
    procs: list[ProcStat] = field(default_factory=list)
38
+
39
+
40
class GpuBackend(Protocol):
    """Structural interface for anything that can produce a GpuSnapshot."""

    def snapshot(self) -> GpuSnapshot:
        """Return the current GPU state."""
        ...
42
+
43
+
44
class BackendError(RuntimeError):
    """Raised when a GPU backend cannot produce a snapshot."""
46
+
47
+
48
def pgid_of(pid: int) -> int | None:
    """Look up the process-group id of *pid* through /proc (Linux).

    Returns None when the stat file is missing, unreadable or unparsable,
    which includes the process having exited.

    The pgid is what ties GPU compute processes back to scheduler-launched
    jobs: each job is started in its own session (``os.setsid``), so every
    descendant, including those spawned through ``sh -c``, shares the job's
    pgid.
    """
    stat_path = f"/proc/{pid}/stat"
    try:
        with open(stat_path, "rb") as fh:
            text = fh.read().decode("utf-8", "replace")
        # Layout is "pid (comm) state ppid pgrp session ...". comm may hold
        # spaces and parentheses, so only the text after the LAST ')' is
        # safe to split on whitespace.
        _, closing, tail = text.rpartition(")")
        if not closing:
            return None
        columns = tail.split()
        return int(columns[2])  # columns: 0=state, 1=ppid, 2=pgrp
    except (FileNotFoundError, ProcessLookupError, PermissionError, IndexError, ValueError):
        return None
64
+
65
+
66
class NvidiaSmiBackend:
    """Real backend: two nvidia-smi queries per snapshot.

    * ``--query-gpu=index,uuid,memory.total,memory.used,utilization.gpu``
      for GPU-level stats
    * ``--query-compute-apps=gpu_uuid,pid,used_memory`` for per-process stats

    Both queries use ``--format=csv,noheader,nounits``. Rows that cannot be
    parsed are skipped rather than aborting the whole snapshot.
    """

    def __init__(self, nvidia_smi: str = "nvidia-smi", timeout: float = 10.0):
        """Remember the executable to invoke and the per-call timeout (seconds)."""
        self._bin = nvidia_smi
        self._timeout = timeout

    def _run(self, args: list[str]) -> str:
        """Run the configured binary with `args` and return its stdout text.

        Raises:
            BackendError: the binary is missing, exits non-zero, or exceeds
                the timeout.
        """
        try:
            return subprocess.check_output(
                [self._bin, *args], text=True, timeout=self._timeout,
                stderr=subprocess.PIPE,
            )
        except FileNotFoundError as e:
            raise BackendError(f"{self._bin} not found — is the NVIDIA driver installed?") from e
        except subprocess.CalledProcessError as e:
            raise BackendError(f"{self._bin} failed: {e.stderr or e}") from e
        except subprocess.TimeoutExpired as e:
            raise BackendError(f"{self._bin} timed out after {self._timeout}s") from e

    def snapshot(self) -> GpuSnapshot:
        """Query nvidia-smi and assemble a GpuSnapshot.

        Raises:
            BackendError: propagated from either nvidia-smi invocation.
        """
        snap = GpuSnapshot()
        uuid_to_index: dict[str, int] = {}

        out = self._run([
            "--query-gpu=index,uuid,memory.total,memory.used,utilization.gpu",
            "--format=csv,noheader,nounits",
        ])
        for line in out.strip().splitlines():
            parts = [p.strip() for p in line.split(",")]
            if len(parts) != 5:
                continue
            idx, uuid, total, used, util = parts
            try:
                index, total_mib, used_mib = int(idx), int(total), int(used)
            except ValueError:
                # Index/memory reported as non-numeric (e.g. "[N/A]"): the
                # device cannot be reasoned about, so leave it out of the
                # snapshot instead of crashing with a bare ValueError. Its
                # compute-app rows are dropped below via the uuid lookup.
                continue
            try:
                util_pct: int | None = int(util)
            except ValueError:
                util_pct = None  # "[N/A]" on some virtualized setups
            snap.gpus[index] = GpuStat(index=index, total_mib=total_mib,
                                       used_mib=used_mib, util_pct=util_pct)
            uuid_to_index[uuid] = index

        out = self._run([
            "--query-compute-apps=gpu_uuid,pid,used_memory",
            "--format=csv,noheader,nounits",
        ])
        for line in out.strip().splitlines():
            parts = [p.strip() for p in line.split(",")]
            if len(parts) != 3 or parts[0] not in uuid_to_index:
                continue
            try:
                snap.procs.append(ProcStat(
                    gpu_index=uuid_to_index[parts[0]],
                    pid=int(parts[1]),
                    used_mib=int(parts[2]),
                ))
            except ValueError:
                # used_memory can be "[N/A]" (e.g. inside some containers /
                # for graphics processes); skip — attribution degrades to n/a.
                continue
        return snap
gpusched/cli.py ADDED
@@ -0,0 +1,112 @@
1
+ """Command-line interface: ``gpusched jobs.txt [options]``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import sys
7
+
8
+ from . import __version__
9
+ from .allocation import AllocOptions
10
+ from .backend import NvidiaSmiBackend
11
+ from .jobspec import JobSpecError, parse_jobs_file
12
+ from .scheduler import Scheduler, SchedulerOptions
13
+
14
+
15
def build_parser() -> argparse.ArgumentParser:
    """Construct the argument parser for the ``gpusched`` command."""
    summary = (
        "VRAM-aware GPU job scheduler. Reads one shell command per line; "
        "lines may declare expected max VRAM: '[vram=18G] python train.py' "
        "or multiple GPUs: '[vram=30G gpus=2] torchrun ...'. Declared jobs "
        "are packed onto GPUs with enough free memory; undeclared jobs get "
        "a fully idle GPU. Actual per-job VRAM is monitored and compared "
        "against declarations."
    )
    parser = argparse.ArgumentParser(prog="gpusched", description=summary)
    add = parser.add_argument

    # Positional input.
    add("jobs_file", help="file with one shell command per line")

    # Placement tuning.
    add(
        "--gpus", default=None, metavar="0,1,3",
        help="comma-separated GPU indices to use (default: all visible)",
    )
    add(
        "--idle-threshold", type=int, default=200, metavar="MIB",
        help="GPU counts as idle below this usage, for undeclared jobs (default: 200)",
    )
    add(
        "--margin", type=int, default=512, metavar="MIB",
        help="safety margin added to every vram declaration (default: 512)",
    )
    add(
        "--tolerance", type=float, default=0.10, metavar="FRAC",
        help="relative band before flagging over/under-declaration (default: 0.10)",
    )
    add(
        "--poll", type=float, default=5.0, metavar="SEC",
        help="polling interval in seconds (default: 5)",
    )
    add(
        "--spike-buffer", type=float, default=0.10, metavar="FRAC",
        help="headroom buffer applied over observed VRAM maxima of fluctuating "
             "processes — both external ones and own jobs that exceeded their "
             "declaration (default: 0.10)",
    )
    add(
        "--exclusive", action="store_true",
        help="never co-locate two scheduler jobs on one GPU, even if declarations fit",
    )

    # Run behaviour.
    add(
        "--log-dir", default="gpusched_logs",
        help="directory for per-job stdout/stderr logs (default: gpusched_logs)",
    )
    add(
        "--watch", action="store_true",
        help="keep running after the queue drains, picking up lines appended "
             "to the jobs file (the jobs file is re-read every poll either way)",
    )
    add(
        "--oom-retries", type=int, default=0, metavar="N",
        help="default CUDA-OOM auto-retries per job; [retries=N] overrides (default: 0)",
    )
    add(
        "--fresh", action="store_true",
        help="ignore and remove the existing journal: re-run everything",
    )
    add(
        "-v", "--verbose", action="store_true",
        help="stream live per-job VRAM usage as peaks grow",
    )
    add(
        "--sim", type=int, default=None, metavar="N_GPUS",
        help="dry-run against N simulated 24 GiB GPUs (no hardware needed); "
             "pair with 'python -m gpusched.simjob' commands",
    )
    add("--version", action="version", version=f"gpusched {__version__}")
    return parser
60
+
61
+
62
def main(argv: list[str] | None = None) -> int:
    """Entry point for the ``gpusched`` command; returns the process exit code.

    Args:
        argv: argument list to parse; None lets argparse read ``sys.argv``.

    Returns:
        2 for unusable input (unreadable or malformed jobs file, an empty
        jobs file without ``--watch``, or a malformed ``--gpus`` value),
        130 when interrupted from the keyboard, otherwise whatever
        ``Scheduler.run()`` returns.
    """
    args = build_parser().parse_args(argv)

    try:
        jobs = parse_jobs_file(args.jobs_file)
    except JobSpecError as e:
        print(f"gpusched: {e}", file=sys.stderr)
        return 2
    except OSError as e:
        print(f"gpusched: cannot read jobs file: {e}", file=sys.stderr)
        return 2
    if not jobs and not args.watch:
        print("gpusched: jobs file contains no jobs (use --watch to wait for some)", file=sys.stderr)
        return 2

    # Validate --gpus before any side effect (notably --fresh deleting the
    # journal) so a typo is reported as a usage error, not a traceback.
    allowed = None
    if args.gpus:
        try:
            allowed = {int(g) for g in args.gpus.split(",")}
        except ValueError:
            print(
                f"gpusched: invalid --gpus value {args.gpus!r} "
                "(expected comma-separated integers, e.g. 0,1,3)",
                file=sys.stderr,
            )
            return 2

    if args.fresh:
        import pathlib
        pathlib.Path(args.log_dir, "journal.jsonl").unlink(missing_ok=True)

    if args.sim is not None:
        # Imported lazily: the simulated backend is only needed for dry runs.
        from .testing import SimBackend
        backend = SimBackend(n_gpus=args.sim)
    else:
        backend = NvidiaSmiBackend()

    options = SchedulerOptions(
        alloc=AllocOptions(
            idle_threshold_mib=args.idle_threshold,
            margin_mib=args.margin,
            exclusive=args.exclusive,
            spike_buffer=args.spike_buffer,
        ),
        poll_interval=args.poll,
        tolerance=args.tolerance,
        verbose=args.verbose,
        log_dir=args.log_dir,
        watch=args.watch,
        oom_retries_default=args.oom_retries,
    )
    sched = Scheduler(backend, jobs_path=args.jobs_file, allowed_gpus=allowed, options=options)
    try:
        return sched.run()
    except KeyboardInterrupt:
        return 130
109
+
110
+
111
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
gpusched/jobspec.py ADDED
@@ -0,0 +1,161 @@
1
+ """Job specification parsing.
2
+
3
+ Jobs-file syntax (one job per line):
4
+
5
+ # comment / blank lines are skipped
6
+ python train.py --config a.yaml # no estimate -> needs an idle GPU
7
+ [vram=18000] python train.py # declares max 18000 MiB
8
+ [vram=22G] bash run_eval.sh # G / GiB suffix accepted
9
+ [vram=30G gpus=2] torchrun train_big.py # 2 GPUs, each with >= 30 GiB free
10
+ [timeout=2h] python flaky_eval.py # SIGTERM after 2 hours (opt-in only)
11
+ [vram=8G retries=2] python sweep.py # auto-retry on CUDA OOM, up to 2x
12
+
13
+ Attribute block must be a single leading ``[key=value ...]`` group.
14
+ ``vram`` is interpreted **per GPU** for multi-GPU jobs. ``timeout`` accepts
15
+ s/m/h/d suffixes (default seconds). Jobs are identified by a hash of their
16
+ command text (plus an occurrence counter for duplicate lines), which is what
17
+ makes live-edited queue files and resume-after-restart well defined.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import re
23
+ from dataclasses import dataclass, field
24
+
25
# A leading "[...]" attribute group followed by the command text. The group
# body may not contain ']'; DOTALL lets the command part match any character.
_ATTR_BLOCK = re.compile(r"^\[(?P<attrs>[^\]]*)\]\s*(?P<cmd>.*)$", re.DOTALL)
# Unsigned decimal number with an optional g/gb/gib or m/mb/mib unit
# (case-insensitive).
_VRAM_VALUE = re.compile(r"^(?P<num>\d+(?:\.\d+)?)\s*(?P<unit>g|gb|gib|m|mb|mib)?$", re.IGNORECASE)
# Unsigned decimal number with an optional s/m/h/d unit (case-insensitive).
_DURATION = re.compile(r"^(?P<num>\d+(?:\.\d+)?)\s*(?P<unit>s|m|h|d)?$", re.IGNORECASE)
28
+
29
+
30
class JobSpecError(ValueError):
    """Signals a malformed jobs-file line.

    The message is prefixed with the 1-based line number, which is also kept
    on the exception as ``lineno``.
    """

    def __init__(self, lineno: int, message: str):
        text = f"jobs file line {lineno}: {message}"
        super().__init__(text)
        self.lineno = lineno
36
+
37
+
38
@dataclass(frozen=True)
class JobSpec:
    """One queued job as read from the jobs file.

    ``key`` and ``lineno`` are excluded from comparisons.
    """

    # 1-based position among the jobs in the file.
    index: int
    # Shell command to execute.
    command: str
    # Declared maximum VRAM per GPU in MiB; None means undeclared.
    vram_mib: int | None = None
    # How many GPUs the job needs.
    n_gpus: int = 1
    # Walltime in seconds before SIGTERM; None disables the timeout (opt-in).
    timeout_s: float | None = None
    # Number of automatic retries on CUDA OOM.
    retries: int = 0
    # Stable identity of the form "hash#occurrence".
    key: str = field(default="", compare=False)
    # Line number recorded by the parser.
    lineno: int = field(default=0, compare=False)

    @property
    def label(self) -> str:
        """Short human-readable name, e.g. ``job 3``."""
        return "job {}".format(self.index)
54
+
55
+
56
def parse_vram(value: str, lineno: int = 0) -> int:
    """Convert a vram value such as '12000', '12.5G', '24GiB' or '8000MiB' to MiB.

    A bare number is taken as MiB; a unit starting with 'g' multiplies by
    1024. The result is rounded to a whole number of MiB.

    Raises:
        JobSpecError: the value does not parse, or rounds to zero or less.
    """
    parsed = _VRAM_VALUE.match(value.strip())
    if parsed is None:
        raise JobSpecError(lineno, f"cannot parse vram value {value!r} (use MiB or a G/GiB suffix)")

    amount = float(parsed.group("num"))
    suffix = parsed.group("unit")
    if suffix is not None and suffix.lower().startswith("g"):
        amount = amount * 1024

    whole_mib = int(round(amount))
    if whole_mib <= 0:
        raise JobSpecError(lineno, f"vram must be positive, got {value!r}")
    return whole_mib
68
+
69
+
70
def parse_duration(value: str, lineno: int = 0) -> float:
    """Convert a duration such as '90', '90s', '15m', '2h' or '1.5d' to seconds.

    A bare number is taken as seconds.

    Raises:
        JobSpecError: the value does not parse, or is not positive.
    """
    parsed = _DURATION.match(value.strip())
    if parsed is None:
        raise JobSpecError(lineno, f"cannot parse duration {value!r} (use s/m/h/d)")

    seconds_per_unit = {"s": 1, "m": 60, "h": 3600, "d": 86400}
    suffix = parsed.group("unit") or "s"
    seconds = float(parsed.group("num")) * seconds_per_unit[suffix.lower()]
    if seconds <= 0:
        raise JobSpecError(lineno, f"timeout must be positive, got {value!r}")
    return seconds
80
+
81
+
82
def job_key(command: str, occurrence: int) -> str:
    """Build the stable identity string for a command line.

    The key is the first 12 hex digits of the SHA-1 of the stripped command,
    then '#', then `occurrence`, which tells duplicate lines apart.
    """
    import hashlib

    digest = hashlib.sha1(command.strip().encode()).hexdigest()
    return "{}#{}".format(digest[:12], occurrence)
87
+
88
+
89
def parse_line(line: str, lineno: int, index: int) -> JobSpec | None:
    """Turn one jobs-file line into a JobSpec; blanks and comments give None.

    A line may start with a single ``[key=value ...]`` group carrying the
    attributes vram, gpus, timeout and retries; the rest is the command.

    Raises:
        JobSpecError: unterminated attribute block, empty command after a
            block, malformed or unknown attribute, or an invalid value.
    """
    stripped = line.strip()
    if stripped == "" or stripped.startswith("#"):
        return None

    def int_attr(name: str, raw: str, minimum: int) -> int:
        # Shared integer parsing for 'retries' and 'gpus'.
        try:
            number = int(raw)
        except ValueError:
            raise JobSpecError(lineno, f"{name} must be an integer, got {raw!r}") from None
        if number < minimum:
            raise JobSpecError(lineno, f"{name} must be >= {minimum}, got {number}")
        return number

    vram_mib: int | None = None
    n_gpus = 1
    timeout_s: float | None = None
    retries = 0
    command = stripped

    block = _ATTR_BLOCK.match(stripped)
    if block is None:
        if stripped.startswith("["):
            # Opens with '[' but never closes: most likely an attribute
            # block torn mid-edit, so refuse to run it as shell.
            raise JobSpecError(lineno, "unterminated '[...]' attribute block")
    else:
        command = block.group("cmd").strip()
        if not command:
            raise JobSpecError(lineno, "attribute block present but command is empty")
        for token in block.group("attrs").split():
            name, equals, raw = token.partition("=")
            if not equals:
                raise JobSpecError(lineno, f"malformed attribute {token!r} (expected key=value)")
            name = name.lower()
            if name == "vram":
                vram_mib = parse_vram(raw, lineno)
            elif name == "timeout":
                timeout_s = parse_duration(raw, lineno)
            elif name == "retries":
                retries = int_attr("retries", raw, 0)
            elif name == "gpus":
                n_gpus = int_attr("gpus", raw, 1)
            else:
                raise JobSpecError(lineno, f"unknown attribute {name!r} (known: vram, gpus, timeout, retries)")

    return JobSpec(
        index=index,
        command=command,
        vram_mib=vram_mib,
        n_gpus=n_gpus,
        timeout_s=timeout_s,
        retries=retries,
        lineno=lineno,
    )
139
+
140
+
141
def assign_keys(specs: list[JobSpec]) -> list[JobSpec]:
    """Return copies of `specs` carrying identity keys.

    Each key combines the command hash with a zero-based count of how many
    earlier specs in the list had the same command.
    """
    from dataclasses import replace

    times_seen: dict[str, int] = {}
    keyed = []
    for spec in specs:
        nth = times_seen.setdefault(spec.command, 0)
        times_seen[spec.command] = nth + 1
        keyed.append(replace(spec, key=job_key(spec.command, nth)))
    return keyed
151
+
152
+
153
def parse_jobs_file(path: str) -> list[JobSpec]:
    """Read the jobs file at `path` into an ordered, keyed list of JobSpecs.

    Blank and comment lines are skipped; job indices count only real jobs,
    starting at 1, while line numbers count every line of the file.
    """
    parsed: list[JobSpec] = []
    with open(path, encoding="utf-8") as fh:
        lineno = 0
        for raw in fh:
            lineno += 1
            job = parse_line(raw, lineno, index=len(parsed) + 1)
            if job is None:
                continue
            parsed.append(job)
    return assign_keys(parsed)
gpusched/journal.py ADDED
@@ -0,0 +1,106 @@
1
+ """Append-only job journal (JSONL).
2
+
3
+ One file per log directory. Each line is an event:
4
+
5
+ {"key": "<hash#occ>", "event": "seen", "no": 7, "command": "..."}
6
+ {"key": "...", "event": "oom", "attempts": 1, "next_vram": 9750}
7
+ {"key": "...", "event": "done", "status": "ok", "returncode": 0,
8
+ "peak_mib": {"0": 7800}, "avg_util": 84.0}
9
+
10
+ Folding the events yields per-key state. Terminal statuses are never
11
+ re-dispatched; non-terminal keys remain eligible (this is what makes both
12
+ ``--resume``-style restarts and OOM retries fall out of one mechanism).
13
+ The jobs file itself is never written by the scheduler — it stays entirely
14
+ user-owned, and this journal is the scheduler's only persistent state.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ import os
21
+ from dataclasses import dataclass, field
22
+ from pathlib import Path
23
+
24
# Statuses that count as finished; JobState.terminal tests membership here.
TERMINAL = {"ok", "failed", "timeout", "infeasible", "failed_oom"}
25
+
26
+
27
@dataclass
class JobState:
    """Per-key state obtained by folding journal events."""

    # Persistent display number (first-seen order).
    no: int | None = None
    command: str = ""
    # Attempts completed so far.
    attempts: int = 0
    # One of: pending, running, or a terminal status.
    status: str = "pending"
    returncode: int | None = None
    # Raised declaration to use for the next attempt, if any.
    next_vram: int | None = None
    peak_mib: dict[str, int] = field(default_factory=dict)

    @property
    def terminal(self) -> bool:
        """True once the status is one of the TERMINAL values."""
        finished = self.status in TERMINAL
        return finished
40
+
41
+
42
class Journal:
    """Append-only JSONL event log plus the per-key state folded from it.

    Existing events are replayed on construction; every ``record_*`` /
    ``ensure_seen`` call appends one line to the file (fsynced) and folds
    the same event into the in-memory state.
    """

    def __init__(self, path: str | os.PathLike):
        """Load and replay the journal at `path` if it already exists.

        Undecodable lines, JSON values that are not objects, and events
        without a ``key`` are skipped, so a line torn by a crash does not
        prevent a restart.
        """
        self.path = Path(path)
        self.states: dict[str, JobState] = {}
        self._next_no = 1
        # Set when the file ends without a newline (torn final write). The
        # next append must then start a new line, otherwise the fresh event
        # would be glued onto the fragment and lost on the following load.
        self._needs_newline = False
        if self.path.exists():
            text = self.path.read_text(encoding="utf-8", errors="replace")
            self._needs_newline = bool(text) and not text.endswith("\n")
            for line in text.splitlines():
                if not line.strip():
                    continue
                try:
                    ev = json.loads(line)
                except json.JSONDecodeError:
                    continue  # tolerate a torn last line after a crash
                if not isinstance(ev, dict):
                    continue  # valid JSON, but not an event object
                try:
                    self._fold(ev)
                except KeyError:
                    continue  # event without a "key"

    def _fold(self, ev: dict) -> None:
        """Apply one event to the in-memory state of its key.

        Raises:
            KeyError: the event has no ``key`` entry.
        """
        st = self.states.setdefault(ev["key"], JobState())
        kind = ev.get("event")
        if kind == "seen":
            st.no = ev.get("no")
            st.command = ev.get("command", "")
            # Keep the counter ahead of every number already handed out.
            self._next_no = max(self._next_no, (st.no or 0) + 1)
        elif kind == "oom":
            st.attempts = ev.get("attempts", st.attempts + 1)
            st.next_vram = ev.get("next_vram", st.next_vram)
            st.status = "pending"  # an OOM retry makes the key eligible again
        elif kind == "done":
            st.attempts += 1
            st.status = ev.get("status", "failed")
            st.returncode = ev.get("returncode")
            st.peak_mib = ev.get("peak_mib", {})

    def _append(self, ev: dict) -> None:
        """Write `ev` as one JSON line and fsync it, creating parent dirs."""
        self.path.parent.mkdir(parents=True, exist_ok=True)
        record = json.dumps(ev) + "\n"
        if self._needs_newline:
            # Terminate the unterminated tail left by an earlier torn write.
            record = "\n" + record
        with open(self.path, "a", encoding="utf-8") as f:
            f.write(record)
            f.flush()
            os.fsync(f.fileno())
        self._needs_newline = False

    # ------------------------------------------------------------- API
    def state(self, key: str) -> JobState:
        """Return the state for `key`, or a fresh default JobState if unknown."""
        return self.states.get(key, JobState())

    def ensure_seen(self, key: str, command: str) -> int:
        """Assign a persistent display number on first sight; return it."""
        st = self.states.get(key)
        if st is not None and st.no is not None:
            return st.no
        no = self._next_no
        ev = {"key": key, "event": "seen", "no": no, "command": command}
        self._append(ev)
        self._fold(ev)
        return no

    def record_oom_retry(self, key: str, attempts: int, next_vram: int | None) -> None:
        """Journal an ``oom`` event, which puts `key` back to pending."""
        ev = {"key": key, "event": "oom", "attempts": attempts, "next_vram": next_vram}
        self._append(ev)
        self._fold(ev)

    def record_done(self, key: str, status: str, returncode: int,
                    peak_mib: dict[int, int], avg_util: float | None) -> None:
        """Journal a ``done`` event with the final status and measurements.

        GPU indices in `peak_mib` are written as strings because JSON object
        keys must be strings.
        """
        ev = {
            "key": key, "event": "done", "status": status, "returncode": returncode,
            "peak_mib": {str(g): m for g, m in peak_mib.items()}, "avg_util": avg_util,
        }
        self._append(ev)
        self._fold(ev)