pysalvo 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pysalvo-0.2.0/.github/workflows/ci.yml +30 -0
- pysalvo-0.2.0/.github/workflows/release.yml +42 -0
- pysalvo-0.2.0/.gitignore +12 -0
- pysalvo-0.2.0/.pre-commit-config.yaml +13 -0
- pysalvo-0.2.0/CONTRIBUTING.md +40 -0
- pysalvo-0.2.0/LICENSE +21 -0
- pysalvo-0.2.0/PKG-INFO +150 -0
- pysalvo-0.2.0/README.md +105 -0
- pysalvo-0.2.0/SECURITY.md +5 -0
- pysalvo-0.2.0/examples/cluv_integration.py +83 -0
- pysalvo-0.2.0/examples/xgenius_integration.py +87 -0
- pysalvo-0.2.0/pyproject.toml +79 -0
- pysalvo-0.2.0/src/salvo/__init__.py +74 -0
- pysalvo-0.2.0/src/salvo/_version.py +1 -0
- pysalvo-0.2.0/src/salvo/cli.py +83 -0
- pysalvo-0.2.0/src/salvo/decorators.py +39 -0
- pysalvo-0.2.0/src/salvo/dispatch/__init__.py +7 -0
- pysalvo-0.2.0/src/salvo/dispatch/account.py +52 -0
- pysalvo-0.2.0/src/salvo/dispatch/caps.py +100 -0
- pysalvo-0.2.0/src/salvo/dispatch/partition.py +43 -0
- pysalvo-0.2.0/src/salvo/doctor.py +130 -0
- pysalvo-0.2.0/src/salvo/errors.py +49 -0
- pysalvo-0.2.0/src/salvo/job/__init__.py +3 -0
- pysalvo-0.2.0/src/salvo/job/handle.py +70 -0
- pysalvo-0.2.0/src/salvo/job/oom.py +134 -0
- pysalvo-0.2.0/src/salvo/job/preempt.py +34 -0
- pysalvo-0.2.0/src/salvo/job/render.py +68 -0
- pysalvo-0.2.0/src/salvo/job/spec.py +108 -0
- pysalvo-0.2.0/src/salvo/job/submit.py +108 -0
- pysalvo-0.2.0/src/salvo/manifest/__init__.py +4 -0
- pysalvo-0.2.0/src/salvo/manifest/schema.py +19 -0
- pysalvo-0.2.0/src/salvo/manifest/store.py +108 -0
- pysalvo-0.2.0/src/salvo/obs/__init__.py +3 -0
- pysalvo-0.2.0/src/salvo/obs/events.py +38 -0
- pysalvo-0.2.0/src/salvo/policy.py +46 -0
- pysalvo-0.2.0/src/salvo/py.typed +0 -0
- pysalvo-0.2.0/src/salvo/stage/__init__.py +6 -0
- pysalvo-0.2.0/src/salvo/stage/gate.py +24 -0
- pysalvo-0.2.0/src/salvo/topology/__init__.py +44 -0
- pysalvo-0.2.0/src/salvo/topology/detect.py +27 -0
- pysalvo-0.2.0/src/salvo/topology/loader.py +74 -0
- pysalvo-0.2.0/src/salvo/topology/presets/__init__.py +0 -0
- pysalvo-0.2.0/src/salvo/topology/presets/_national.yaml +7 -0
- pysalvo-0.2.0/src/salvo/topology/presets/beluga.yaml +28 -0
- pysalvo-0.2.0/src/salvo/topology/presets/cedar.yaml +28 -0
- pysalvo-0.2.0/src/salvo/topology/presets/mila.yaml +33 -0
- pysalvo-0.2.0/src/salvo/topology/presets/narval.yaml +28 -0
- pysalvo-0.2.0/src/salvo/topology/presets/rorqual.yaml +35 -0
- pysalvo-0.2.0/src/salvo/topology/schema.py +107 -0
- pysalvo-0.2.0/tests/__init__.py +0 -0
- pysalvo-0.2.0/tests/conftest.py +49 -0
- pysalvo-0.2.0/tests/fixtures/clusters/child.yaml +9 -0
- pysalvo-0.2.0/tests/fixtures/clusters/parent.yaml +7 -0
- pysalvo-0.2.0/tests/integration/__init__.py +0 -0
- pysalvo-0.2.0/tests/integration/conftest.py +1 -0
- pysalvo-0.2.0/tests/integration/golden/mila_cpu_basic.sh +15 -0
- pysalvo-0.2.0/tests/integration/golden/rorqual_gpu_basic.sh +16 -0
- pysalvo-0.2.0/tests/integration/test_library_surface.py +96 -0
- pysalvo-0.2.0/tests/integration/test_oom_chain.py +25 -0
- pysalvo-0.2.0/tests/integration/test_submit_e2e.py +26 -0
- pysalvo-0.2.0/tests/perf/__init__.py +0 -0
- pysalvo-0.2.0/tests/perf/test_perf_budgets.py +29 -0
- pysalvo-0.2.0/tests/test_version.py +5 -0
- pysalvo-0.2.0/tests/unit/__init__.py +0 -0
- pysalvo-0.2.0/tests/unit/test_account.py +44 -0
- pysalvo-0.2.0/tests/unit/test_caps.py +45 -0
- pysalvo-0.2.0/tests/unit/test_cli.py +87 -0
- pysalvo-0.2.0/tests/unit/test_decorator.py +11 -0
- pysalvo-0.2.0/tests/unit/test_doctor.py +16 -0
- pysalvo-0.2.0/tests/unit/test_errors.py +26 -0
- pysalvo-0.2.0/tests/unit/test_events.py +39 -0
- pysalvo-0.2.0/tests/unit/test_gate.py +37 -0
- pysalvo-0.2.0/tests/unit/test_handle_imports.py +2 -0
- pysalvo-0.2.0/tests/unit/test_manifest.py +51 -0
- pysalvo-0.2.0/tests/unit/test_oom_dsl.py +48 -0
- pysalvo-0.2.0/tests/unit/test_oom_retry.py +51 -0
- pysalvo-0.2.0/tests/unit/test_partition.py +43 -0
- pysalvo-0.2.0/tests/unit/test_preempt.py +29 -0
- pysalvo-0.2.0/tests/unit/test_presets.py +45 -0
- pysalvo-0.2.0/tests/unit/test_public_api.py +118 -0
- pysalvo-0.2.0/tests/unit/test_render.py +63 -0
- pysalvo-0.2.0/tests/unit/test_spec.py +65 -0
- pysalvo-0.2.0/tests/unit/test_topology_detect.py +27 -0
- pysalvo-0.2.0/tests/unit/test_topology_loader.py +29 -0
- pysalvo-0.2.0/tests/unit/test_topology_schema.py +63 -0
- pysalvo-0.2.0/uv.lock +904 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
name: ci
|
|
2
|
+
on:
|
|
3
|
+
push:
|
|
4
|
+
branches: [main]
|
|
5
|
+
pull_request:
|
|
6
|
+
concurrency:
|
|
7
|
+
group: ${{ github.workflow }}-${{ github.ref }}
|
|
8
|
+
cancel-in-progress: true
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python: ["3.11", "3.12", "3.13"]
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- uses: astral-sh/setup-uv@v3
|
|
18
|
+
- uses: actions/setup-python@v5
|
|
19
|
+
with:
|
|
20
|
+
python-version: ${{ matrix.python }}
|
|
21
|
+
- run: uv sync --all-extras
|
|
22
|
+
- run: uv run pre-commit run --all-files
|
|
23
|
+
- run: uv run mypy src/salvo
|
|
24
|
+
- run: uv run pytest -v
|
|
25
|
+
- run: uv build
|
|
26
|
+
- name: Smoke-import built wheel
|
|
27
|
+
run: |
|
|
28
|
+
python -m venv /tmp/wheel-smoke
|
|
29
|
+
/tmp/wheel-smoke/bin/pip install --quiet dist/pysalvo-*.whl
|
|
30
|
+
/tmp/wheel-smoke/bin/python -c "from salvo import JobSpec, render; from salvo.policy import apply_oom, OomContext; from salvo.topology import load_preset"
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
name: release
|
|
2
|
+
on:
|
|
3
|
+
push:
|
|
4
|
+
tags: ["v*"]
|
|
5
|
+
concurrency:
|
|
6
|
+
group: release-${{ github.ref }}
|
|
7
|
+
cancel-in-progress: false
|
|
8
|
+
jobs:
|
|
9
|
+
build:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
- uses: astral-sh/setup-uv@v3
|
|
14
|
+
- uses: actions/setup-python@v5
|
|
15
|
+
with:
|
|
16
|
+
python-version: "3.12"
|
|
17
|
+
- name: Verify tag matches pyproject version
|
|
18
|
+
env:
|
|
19
|
+
REF_NAME: ${{ github.ref_name }}
|
|
20
|
+
run: |
|
|
21
|
+
tag="${REF_NAME#v}"
|
|
22
|
+
ver=$(uv run python -c "import tomllib,pathlib;print(tomllib.loads(pathlib.Path('pyproject.toml').read_text())['project']['version'])")
|
|
23
|
+
[ "$tag" = "$ver" ] || { echo "tag=$tag pyproject=$ver"; exit 1; }
|
|
24
|
+
- run: uv build
|
|
25
|
+
- uses: actions/upload-artifact@v4
|
|
26
|
+
with:
|
|
27
|
+
name: dist
|
|
28
|
+
path: dist/
|
|
29
|
+
publish:
|
|
30
|
+
needs: build
|
|
31
|
+
runs-on: ubuntu-latest
|
|
32
|
+
environment:
|
|
33
|
+
name: pypi
|
|
34
|
+
url: https://pypi.org/p/pysalvo
|
|
35
|
+
permissions:
|
|
36
|
+
id-token: write
|
|
37
|
+
steps:
|
|
38
|
+
- uses: actions/download-artifact@v4
|
|
39
|
+
with:
|
|
40
|
+
name: dist
|
|
41
|
+
path: dist/
|
|
42
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
pysalvo-0.2.0/.gitignore
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
3
|
+
rev: v0.5.5
|
|
4
|
+
hooks:
|
|
5
|
+
- id: ruff
|
|
6
|
+
args: [--fix]
|
|
7
|
+
- id: ruff-format
|
|
8
|
+
- repo: https://github.com/pre-commit/mirrors-mypy
|
|
9
|
+
rev: v1.10.0
|
|
10
|
+
hooks:
|
|
11
|
+
- id: mypy
|
|
12
|
+
additional_dependencies: [pydantic>=2.6, types-pyyaml]
|
|
13
|
+
files: ^src/salvo
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Contributing
|
|
2
|
+
|
|
3
|
+
## Adding a cluster
|
|
4
|
+
|
|
5
|
+
The fastest contribution is a new cluster preset. Each preset is a single YAML file under `src/salvo/topology/presets/`. The schema lives in `src/salvo/topology/schema.py` (pydantic v2, frozen, `extra="forbid"`).
|
|
6
|
+
|
|
7
|
+
1. Copy an existing preset closest to your cluster (DRAC: `rorqual.yaml`; campus: `mila.yaml`).
|
|
8
|
+
2. Edit accounts, partitions, GPU types, walltime caps, defaults.
|
|
9
|
+
3. Add an entry to `src/salvo/topology/detect.py` for the hostname pattern.
|
|
10
|
+
4. Open a PR with the YAML + a one-line entry in the README cluster table.
|
|
11
|
+
|
|
12
|
+
Tests will exercise your preset via the parametrized suite in `tests/unit/test_presets.py`.
|
|
13
|
+
|
|
14
|
+
## Dev setup
|
|
15
|
+
|
|
16
|
+
git clone <repo>
|
|
17
|
+
cd salvo
|
|
18
|
+
uv sync --all-extras
|
|
19
|
+
uv run pre-commit install
|
|
20
|
+
uv run pytest
|
|
21
|
+
|
|
22
|
+
## Style
|
|
23
|
+
|
|
24
|
+
- Python 3.11+, ruff + mypy strict, line length 100.
|
|
25
|
+
- Pydantic v2 models are frozen and `extra="forbid"`.
|
|
26
|
+
- Public functions get type hints; tests get docstrings on intent (not behaviour).
|
|
27
|
+
- No mutable defaults, no f-strings in log messages.
|
|
28
|
+
|
|
29
|
+
## Test layers
|
|
30
|
+
|
|
31
|
+
- **Layer A** (`tests/unit/`): pure-Python, no subprocess.
|
|
32
|
+
- **Layer B** (`tests/integration/`): subprocess to fake `sbatch` via `tests/conftest.py:fake_sbatch`. Golden sbatch files in `tests/integration/golden/`.
|
|
33
|
+
- **Layer C** (cluster nightly): not run on PRs.
|
|
34
|
+
- **Perf** (`tests/perf/`): budget smoke tests.
|
|
35
|
+
|
|
36
|
+
PRs must keep total coverage at or above 80% and add tests for every new dispatch rule.
|
|
37
|
+
|
|
38
|
+
## Commit style
|
|
39
|
+
|
|
40
|
+
Conventional commits, scoped by module: `feat(job):`, `fix(dispatch):`, `test(integration):`, `docs:`, `chore:`.
|
pysalvo-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 W
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
pysalvo-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pysalvo
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Render + policy library for SLURM JobSpecs (Mila + DRAC clusters)
|
|
5
|
+
Project-URL: Homepage, https://github.com/wietzesuijker/salvo
|
|
6
|
+
Project-URL: Source, https://github.com/wietzesuijker/salvo
|
|
7
|
+
Project-URL: Issues, https://github.com/wietzesuijker/salvo/issues
|
|
8
|
+
Author: W
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: alliance-canada,drac,hpc,mila,sbatch,slurm
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Intended Audience :: System Administrators
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: MacOS
|
|
17
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering
|
|
24
|
+
Classifier: Topic :: System :: Clustering
|
|
25
|
+
Classifier: Topic :: System :: Distributed Computing
|
|
26
|
+
Classifier: Typing :: Typed
|
|
27
|
+
Requires-Python: >=3.11
|
|
28
|
+
Requires-Dist: pydantic>=2.6
|
|
29
|
+
Requires-Dist: pyyaml>=6.0
|
|
30
|
+
Requires-Dist: rich>=13.7
|
|
31
|
+
Requires-Dist: tomli-w>=1.0
|
|
32
|
+
Requires-Dist: typer>=0.12
|
|
33
|
+
Provides-Extra: all
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Requires-Dist: hypothesis>=6.100; extra == 'dev'
|
|
36
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
37
|
+
Requires-Dist: pre-commit>=3.7; extra == 'dev'
|
|
38
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
39
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
40
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
41
|
+
Requires-Dist: types-pyyaml; extra == 'dev'
|
|
42
|
+
Provides-Extra: drac
|
|
43
|
+
Provides-Extra: mila
|
|
44
|
+
Description-Content-Type: text/markdown
|
|
45
|
+
|
|
46
|
+
# salvo
|
|
47
|
+
|
|
48
|
+
Policy and render library for SLURM. salvo turns a `JobSpec` into byte-stable sbatch text and decides what to do when a job hits OOM or gets preempted. It does not move code to the cluster, does not open SSH connections, and is not a runner. For transport, pair it with [cluv](https://github.com/mila-iqia/cluv) or invoke `sbatch` directly.
|
|
49
|
+
|
|
50
|
+
## Install
|
|
51
|
+
|
|
52
|
+
pip install pysalvo
|
|
53
|
+
|
|
54
|
+
Two CLI commands ship for diagnostics and one-shot rendering. Everything else is meant to be imported.
|
|
55
|
+
|
|
56
|
+
salvo doctor # topology, ssh alias hygiene, manifest freshness
|
|
57
|
+
salvo render spec.yaml --cluster mila # JobSpec YAML to sbatch text on stdout
|
|
58
|
+
|
|
59
|
+
## Render
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from salvo import JobSpec, render
|
|
63
|
+
|
|
64
|
+
spec = JobSpec(
|
|
65
|
+
name="train",
|
|
66
|
+
cmd=["python", "train.py"],
|
|
67
|
+
gpus=1, cpus=8, mem="32G", time="2h",
|
|
68
|
+
on_oom=["bump_mem(1.5x, max=128G)", "fail"],
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
sbatch_text = render(
|
|
72
|
+
spec,
|
|
73
|
+
cluster_id="mila",
|
|
74
|
+
account="mila",
|
|
75
|
+
partition="unkillable",
|
|
76
|
+
) # str, byte-stable, no side effects
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
Same JobSpec, same cluster, same inputs, same bytes every run. Useful when you need to audit what was actually submitted six months later.
|
|
80
|
+
|
|
81
|
+
If you leave `account` and `partition` off, salvo picks them via `salvo.dispatch` by shelling out to `squeue` for live capacity. That path only works on a SLURM login node; library callers (cluv, xgenius) should pass both explicitly to keep `render()` pure.
|
|
82
|
+
|
|
83
|
+
## OOM policy
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from salvo.policy import parse, apply_oom, OomContext
|
|
87
|
+
|
|
88
|
+
steps = parse(["bump_mem(1.5x, max=128G)", "escalate_partition", "fail"])
|
|
89
|
+
|
|
90
|
+
new_spec, action = apply_oom(prev_spec, OomContext(kind="cpu", max_rss_mb=33_500))
|
|
91
|
+
# new_spec is a fresh JobSpec with bumped mem, or None if the policy says fail
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
The DSL is intentionally small. Steps run in order until one applies:
|
|
95
|
+
|
|
96
|
+
- `bump_mem(<factor>x, max=<size>)` — multiplicative bump, capped
|
|
97
|
+
- `escalate_partition` — clear partition so the next render picks a larger tier
|
|
98
|
+
- `fail` — terminal
|
|
99
|
+
- `bump_gpus(...)`, `callback(...)` — parse today, execute in a later release
|
|
100
|
+
|
|
101
|
+
## Cluster topology
|
|
102
|
+
|
|
103
|
+
Five presets ship in `salvo/topology/presets/`: `mila`, `rorqual`, `narval`, `beluga`, `cedar`. More DRAC clusters land as YAMLs are contributed. Each YAML lists accounts, partitions, and capacity rules.
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from salvo.topology import load_preset, list_presets
|
|
107
|
+
|
|
108
|
+
list_presets() # ['beluga', 'cedar', 'mila', 'narval', 'rorqual']
|
|
109
|
+
cluster = load_preset("mila") # ClusterTopology
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Contributing a new cluster is one YAML file. See [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
113
|
+
|
|
114
|
+
## Decorator (optional, Python-native)
|
|
115
|
+
|
|
116
|
+
If you'd rather write a function than a YAML spec:
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
from salvo import cluster
|
|
120
|
+
|
|
121
|
+
@cluster.submit(gpus=1, cpus=8, mem="32G", time="2h",
|
|
122
|
+
on_oom=["bump_mem(1.5x, max=128G)", "fail"])
|
|
123
|
+
def train(seed: int):
|
|
124
|
+
...
|
|
125
|
+
|
|
126
|
+
handle = train.submit(seed=42)
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
`train.submit(...)` runs the local-sbatch convenience path: it renders, calls `sbatch`, and returns a handle. Skip the decorator if you're embedding salvo inside another tool's submit flow; just call `render()` and let that tool do the submission.
|
|
130
|
+
|
|
131
|
+
## Use with cluv
|
|
132
|
+
|
|
133
|
+
cluv handles SSH, code sync, and the `sbatch` call. salvo handles the policy. The natural composition is cluv importing salvo's policy library when a user opts in:
|
|
134
|
+
|
|
135
|
+
```toml
|
|
136
|
+
# in your project's pyproject.toml
|
|
137
|
+
[tool.cluv.retry]
|
|
138
|
+
on_oom = ["bump_mem(1.5x, max=128G)", "fail"]
|
|
139
|
+
max_hops = 5
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
This is a proposal, not a shipped feature in cluv. An upstream issue is in draft.
|
|
143
|
+
|
|
144
|
+
## Use with anything that calls sbatch
|
|
145
|
+
|
|
146
|
+
salvo has no SSH or async dependencies. Its runtime dependencies are pydantic, PyYAML, typer, rich, and tomli-w; everything else is the standard library. Any tool that runs `sbatch` can import `salvo.render`, `salvo.policy.apply_oom`, and `salvo.topology.load_preset` independently. Two runnable examples under [`examples/`](examples/) show the wiring end-to-end: [`cluv_integration.py`](examples/cluv_integration.py) (policy + render) and [`xgenius_integration.py`](examples/xgenius_integration.py) (policy-only, when the host tool keeps its own renderer).
|
|
147
|
+
|
|
148
|
+
## License
|
|
149
|
+
|
|
150
|
+
MIT. See [LICENSE](LICENSE).
|
pysalvo-0.2.0/README.md
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# salvo
|
|
2
|
+
|
|
3
|
+
Policy and render library for SLURM. salvo turns a `JobSpec` into byte-stable sbatch text and decides what to do when a job hits OOM or gets preempted. It does not move code to the cluster, does not open SSH connections, and is not a runner. For transport, pair it with [cluv](https://github.com/mila-iqia/cluv) or invoke `sbatch` directly.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
pip install pysalvo
|
|
8
|
+
|
|
9
|
+
Two CLI commands ship for diagnostics and one-shot rendering. Everything else is meant to be imported.
|
|
10
|
+
|
|
11
|
+
salvo doctor # topology, ssh alias hygiene, manifest freshness
|
|
12
|
+
salvo render spec.yaml --cluster mila # JobSpec YAML to sbatch text on stdout
|
|
13
|
+
|
|
14
|
+
## Render
|
|
15
|
+
|
|
16
|
+
```python
|
|
17
|
+
from salvo import JobSpec, render
|
|
18
|
+
|
|
19
|
+
spec = JobSpec(
|
|
20
|
+
name="train",
|
|
21
|
+
cmd=["python", "train.py"],
|
|
22
|
+
gpus=1, cpus=8, mem="32G", time="2h",
|
|
23
|
+
on_oom=["bump_mem(1.5x, max=128G)", "fail"],
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
sbatch_text = render(
|
|
27
|
+
spec,
|
|
28
|
+
cluster_id="mila",
|
|
29
|
+
account="mila",
|
|
30
|
+
partition="unkillable",
|
|
31
|
+
) # str, byte-stable, no side effects
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Same JobSpec, same cluster, same inputs, same bytes every run. Useful when you need to audit what was actually submitted six months later.
|
|
35
|
+
|
|
36
|
+
If you leave `account` and `partition` off, salvo picks them via `salvo.dispatch` by shelling out to `squeue` for live capacity. That path only works on a SLURM login node; library callers (cluv, xgenius) should pass both explicitly to keep `render()` pure.
|
|
37
|
+
|
|
38
|
+
## OOM policy
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from salvo.policy import parse, apply_oom, OomContext
|
|
42
|
+
|
|
43
|
+
steps = parse(["bump_mem(1.5x, max=128G)", "escalate_partition", "fail"])
|
|
44
|
+
|
|
45
|
+
new_spec, action = apply_oom(prev_spec, OomContext(kind="cpu", max_rss_mb=33_500))
|
|
46
|
+
# new_spec is a fresh JobSpec with bumped mem, or None if the policy says fail
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
The DSL is intentionally small. Steps run in order until one applies:
|
|
50
|
+
|
|
51
|
+
- `bump_mem(<factor>x, max=<size>)` — multiplicative bump, capped
|
|
52
|
+
- `escalate_partition` — clear partition so the next render picks a larger tier
|
|
53
|
+
- `fail` — terminal
|
|
54
|
+
- `bump_gpus(...)`, `callback(...)` — parse today, execute in a later release
|
|
55
|
+
|
|
56
|
+
## Cluster topology
|
|
57
|
+
|
|
58
|
+
Five presets ship in `salvo/topology/presets/`: `mila`, `rorqual`, `narval`, `beluga`, `cedar`. More DRAC clusters land as YAMLs are contributed. Each YAML lists accounts, partitions, and capacity rules.
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from salvo.topology import load_preset, list_presets
|
|
62
|
+
|
|
63
|
+
list_presets() # ['beluga', 'cedar', 'mila', 'narval', 'rorqual']
|
|
64
|
+
cluster = load_preset("mila") # ClusterTopology
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Contributing a new cluster is one YAML file. See [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
68
|
+
|
|
69
|
+
## Decorator (optional, Python-native)
|
|
70
|
+
|
|
71
|
+
If you'd rather write a function than a YAML spec:
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
from salvo import cluster
|
|
75
|
+
|
|
76
|
+
@cluster.submit(gpus=1, cpus=8, mem="32G", time="2h",
|
|
77
|
+
on_oom=["bump_mem(1.5x, max=128G)", "fail"])
|
|
78
|
+
def train(seed: int):
|
|
79
|
+
...
|
|
80
|
+
|
|
81
|
+
handle = train.submit(seed=42)
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
`train.submit(...)` runs the local-sbatch convenience path: it renders, calls `sbatch`, and returns a handle. Skip the decorator if you're embedding salvo inside another tool's submit flow; just call `render()` and let that tool do the submission.
|
|
85
|
+
|
|
86
|
+
## Use with cluv
|
|
87
|
+
|
|
88
|
+
cluv handles SSH, code sync, and the `sbatch` call. salvo handles the policy. The natural composition is cluv importing salvo's policy library when a user opts in:
|
|
89
|
+
|
|
90
|
+
```toml
|
|
91
|
+
# in your project's pyproject.toml
|
|
92
|
+
[tool.cluv.retry]
|
|
93
|
+
on_oom = ["bump_mem(1.5x, max=128G)", "fail"]
|
|
94
|
+
max_hops = 5
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
This is a proposal, not a shipped feature in cluv. An upstream issue is in draft.
|
|
98
|
+
|
|
99
|
+
## Use with anything that calls sbatch
|
|
100
|
+
|
|
101
|
+
salvo has no SSH or async dependencies. Its runtime dependencies are pydantic, PyYAML, typer, rich, and tomli-w; everything else is the standard library. Any tool that runs `sbatch` can import `salvo.render`, `salvo.policy.apply_oom`, and `salvo.topology.load_preset` independently. Two runnable examples under [`examples/`](examples/) show the wiring end-to-end: [`cluv_integration.py`](examples/cluv_integration.py) (policy + render) and [`xgenius_integration.py`](examples/xgenius_integration.py) (policy-only, when the host tool keeps its own renderer).
|
|
102
|
+
|
|
103
|
+
## License
|
|
104
|
+
|
|
105
|
+
MIT. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
# Security policy
|
|
2
|
+
|
|
3
|
+
If you find a vulnerability in salvo, please report it privately rather than opening a public issue. Open a GitHub security advisory on this repo, or email the maintainer listed in `pyproject.toml`. Expect an acknowledgement within 7 days.
|
|
4
|
+
|
|
5
|
+
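salvo renders sbatch scripts for SLURM clusters and, on its local convenience path, invokes the `sbatch` and `squeue` binaries on the machine it runs on. It does not transmit credentials, secrets, or job payloads to any third party. Per the README, salvo does not open SSH connections itself; moving code to a cluster (SSH/scp/rsync) is left to the operator's own transport tooling against clusters the operator configures.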
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""End-to-end example: how cluv would call salvo as its retry-policy library.
|
|
2
|
+
|
|
3
|
+
A cluv maintainer should be able to read this file on a laptop and trust
|
|
4
|
+
that wiring salvo into cluv's submit flow is a small, well-defined change.
|
|
5
|
+
It walks the natural composition described in salvo's README ("Use with
|
|
6
|
+
cluv"): parse the on_oom DSL from pyproject, build a JobSpec from values
|
|
7
|
+
cluv already has, run apply_oom on a synthetic OOM, and render the bumped
|
|
8
|
+
spec to sbatch text. No SLURM, no network, no subprocess — just stdlib +
|
|
9
|
+
pydantic. Run with: ``python examples/cluv_integration.py``.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from salvo import JobSpec, render
|
|
15
|
+
from salvo.policy import OomContext, apply_oom, parse
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def main() -> None:
    """Walk a JobSpec through a simulated OOM retry ladder, then render it.

    The flow is: parse the ``on_oom`` DSL, build a JobSpec, apply the OOM
    policy for up to three simulated hops, render the resulting spec to
    sbatch text, and print the first lines of that text.
    """
    # Step 1: parse the policy DSL from a list of strings. In a cluv
    # integration this list would come from `[tool.cluv.retry].on_oom` in
    # the user's pyproject.toml.
    policy_lines: list[str] = ["bump_mem(1.5x, max=128G)", "fail"]
    parsed = parse(policy_lines)
    step_names = [type(s).__name__ for s in parsed]
    print(f"parsed {len(parsed)} policy step(s): {step_names}")

    # Step 2: build a minimal JobSpec from values the host tool already has
    # (entrypoint, gpu/cpu ask, memory string, and the on_oom list).
    job = JobSpec(
        name="train",
        cmd=["python", "train.py"],
        gpus=1,
        cpus=8,
        mem="32G",
        time="2h",
        on_oom=policy_lines,
        max_hops=5,
    )

    # Steps 3-4 run once per hop so the example shows the policy acting
    # across resubmissions. With a 1.5x factor from 32G the nominal ladder
    # is 32G -> 48G -> 72G -> 108G, all under the 128G cap.
    # `latest` is only ever rebound to a non-None spec, so no Optional
    # narrowing is needed.
    latest: JobSpec = job
    total_hops = 3
    for hop in range(1, total_hops + 1):
        previous_mem = latest.mem

        # Step 3: synthesize an OOM context. The observed RSS is pinned to
        # the current memory ask for the purposes of this example.
        oom = OomContext(
            kind="cpu",
            max_rss_mb=latest.mem_mb(),
            log_excerpt="slurmstepd: error: Detected 1 oom_kill event",
        )

        # Step 4: apply the policy. A None spec means "stop and surface to
        # the user"; otherwise the returned spec is what gets resubmitted.
        bumped, action = apply_oom(latest, oom)
        if bumped is None:
            print(f"hop {hop}: action={action!r} mem {previous_mem} -> (stop)")
            break

        print(f"hop {hop}: action={action!r} mem {previous_mem} -> {bumped.mem}")
        latest = bumped

    # Step 5: render the final spec to sbatch text. Account and partition
    # are both passed explicitly, which the README describes as the pure
    # path (no squeue lookup). Only the header is printed.
    script = render(
        latest,
        cluster_id="mila",
        account="mila",
        partition="unkillable",
    )
    print("\nrendered sbatch (header only):")
    header_lines = script.splitlines()[:8]
    for line in header_lines:
        print(f" {line}")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
if __name__ == "__main__":
|
|
83
|
+
main()
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""End-to-end example: how xgenius would call salvo as its OOM-policy library.
|
|
2
|
+
|
|
3
|
+
xgenius already has its own sbatch renderer (Jinja-style ``{{PLACEHOLDER}}``
|
|
4
|
+
templates in ``xgenius/templates.py``) and its own SafetyValidator with a
|
|
5
|
+
fixed ``max_memory_per_job`` ceiling. It has no dynamic OOM handling: an
|
|
6
|
+
OOM-killed job is observed by the watcher daemon but no corrective action
|
|
7
|
+
is taken. salvo.policy fills exactly that gap. This example shows the
|
|
8
|
+
policy-only integration path: xgenius keeps its template renderer; only
|
|
9
|
+
the watcher imports salvo. No SLURM, no network, no subprocess — just
|
|
10
|
+
stdlib + pydantic. Run with: ``python examples/xgenius_integration.py``.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from salvo import JobSpec
|
|
16
|
+
from salvo.policy import OomContext, OomDecision, apply_oom, parse
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def main() -> None:
    """Show the policy-only path: ask salvo what to do after an OOM.

    Parses an ``on_oom`` policy, wraps an experiment's resource ask in a
    JobSpec, applies the OOM policy once, and prints the parameters a host
    tool would hand back to its own renderer.
    """
    # Step 1: parse the policy. The example docs describe this as coming
    # from a future ``[retry] on_oom = [...]`` table in ``xgenius.toml``;
    # parsing up front is where a malformed step would be rejected.
    policy_lines: list[str] = ["bump_mem(2x, max=80G)", "fail"]
    parsed = parse(policy_lines)
    step_names = [type(s).__name__ for s in parsed]
    print(f"parsed {len(parsed)} policy step(s): {step_names}")

    # Step 2: the resource ask of the experiment that was OOM-killed.
    # ``cmd`` is a placeholder here; this example never renders the spec.
    experiment_gpus = 1
    experiment_cpus = 4
    experiment_mem = "16G"
    experiment_walltime = "4h"
    job = JobSpec(
        name="experiment-42",
        cmd=["xgenius-runner"],
        gpus=experiment_gpus,
        cpus=experiment_cpus,
        mem=experiment_mem,
        time=experiment_walltime,
        on_oom=policy_lines,
    )

    # Step 3: describe the OOM event. ``kind`` is "cpu" because host
    # memory is the resource being bumped, even for a job that asked for
    # GPUs.
    observed_max_rss_mb = 15_900
    oom = OomContext(
        kind="cpu",
        max_rss_mb=observed_max_rss_mb,
        log_excerpt="slurmstepd: error: Detected 1 oom_kill event",
    )

    # Step 4: ask salvo for a decision. This example reads the result by
    # attribute (``new_spec`` / ``action``).
    decision: OomDecision = apply_oom(job, oom)
    bumped = decision.new_spec
    if bumped is None:
        print(f"policy terminated: action={decision.action!r} — escalate to human")
        return

    # Step 5: hand the bumped values back to the host tool's own template
    # renderer; salvo renders nothing on this path.
    new_params = {
        "memory": bumped.mem,
        "gpus": bumped.gpus,
        "cpus": bumped.cpus,
        "walltime": bumped.time,
    }
    print(f"action={decision.action!r}")
    print(f" mem {experiment_mem} -> {bumped.mem}")
    print(" hand back to xgenius template renderer with params:")
    for key, value in new_params.items():
        print(f" {key}={value!r}")
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
if __name__ == "__main__":
|
|
87
|
+
main()
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pysalvo"
|
|
3
|
+
version = "0.2.0"
|
|
4
|
+
description = "Render + policy library for SLURM JobSpecs (Mila + DRAC clusters)"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = { text = "MIT" }
|
|
7
|
+
requires-python = ">=3.11"
|
|
8
|
+
authors = [{ name = "W" }]
|
|
9
|
+
keywords = ["slurm", "hpc", "sbatch", "mila", "drac", "alliance-canada"]
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Development Status :: 3 - Alpha",
|
|
12
|
+
"Intended Audience :: Science/Research",
|
|
13
|
+
"Intended Audience :: System Administrators",
|
|
14
|
+
"License :: OSI Approved :: MIT License",
|
|
15
|
+
"Operating System :: POSIX :: Linux",
|
|
16
|
+
"Operating System :: MacOS",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Programming Language :: Python :: 3.13",
|
|
22
|
+
"Topic :: Scientific/Engineering",
|
|
23
|
+
"Topic :: System :: Clustering",
|
|
24
|
+
"Topic :: System :: Distributed Computing",
|
|
25
|
+
"Typing :: Typed",
|
|
26
|
+
]
|
|
27
|
+
dependencies = [
|
|
28
|
+
"pydantic>=2.6",
|
|
29
|
+
"typer>=0.12",
|
|
30
|
+
"pyyaml>=6.0",
|
|
31
|
+
"rich>=13.7",
|
|
32
|
+
"tomli-w>=1.0",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[project.urls]
|
|
36
|
+
Homepage = "https://github.com/wietzesuijker/salvo"
|
|
37
|
+
Source = "https://github.com/wietzesuijker/salvo"
|
|
38
|
+
Issues = "https://github.com/wietzesuijker/salvo/issues"
|
|
39
|
+
|
|
40
|
+
[project.optional-dependencies]
|
|
41
|
+
dev = [
|
|
42
|
+
"pytest>=8.0",
|
|
43
|
+
"pytest-cov>=5.0",
|
|
44
|
+
"hypothesis>=6.100",
|
|
45
|
+
"ruff>=0.5",
|
|
46
|
+
"mypy>=1.10",
|
|
47
|
+
"pre-commit>=3.7",
|
|
48
|
+
"types-pyyaml",
|
|
49
|
+
]
|
|
50
|
+
mila = []
|
|
51
|
+
drac = []
|
|
52
|
+
all = ["pysalvo[mila,drac]"]
|
|
53
|
+
|
|
54
|
+
[project.scripts]
|
|
55
|
+
salvo = "salvo.cli:app"
|
|
56
|
+
|
|
57
|
+
[build-system]
|
|
58
|
+
requires = ["hatchling>=1.24"]
|
|
59
|
+
build-backend = "hatchling.build"
|
|
60
|
+
|
|
61
|
+
[tool.hatch.build.targets.wheel]
|
|
62
|
+
packages = ["src/salvo"]
|
|
63
|
+
|
|
64
|
+
[tool.ruff]
|
|
65
|
+
line-length = 100
|
|
66
|
+
target-version = "py311"
|
|
67
|
+
|
|
68
|
+
[tool.ruff.lint]
|
|
69
|
+
select = ["E", "F", "W", "I", "UP", "B", "SIM", "RUF"]
|
|
70
|
+
|
|
71
|
+
[tool.mypy]
|
|
72
|
+
python_version = "3.11"
|
|
73
|
+
strict = true
|
|
74
|
+
files = ["src/salvo"]
|
|
75
|
+
disallow_untyped_decorators = false
|
|
76
|
+
|
|
77
|
+
[tool.pytest.ini_options]
|
|
78
|
+
addopts = "-ra --strict-markers --cov=salvo --cov-report=term-missing --cov-fail-under=80"
|
|
79
|
+
testpaths = ["tests"]
|