pysalvo 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. pysalvo-0.2.0/.github/workflows/ci.yml +30 -0
  2. pysalvo-0.2.0/.github/workflows/release.yml +42 -0
  3. pysalvo-0.2.0/.gitignore +12 -0
  4. pysalvo-0.2.0/.pre-commit-config.yaml +13 -0
  5. pysalvo-0.2.0/CONTRIBUTING.md +40 -0
  6. pysalvo-0.2.0/LICENSE +21 -0
  7. pysalvo-0.2.0/PKG-INFO +150 -0
  8. pysalvo-0.2.0/README.md +105 -0
  9. pysalvo-0.2.0/SECURITY.md +5 -0
  10. pysalvo-0.2.0/examples/cluv_integration.py +83 -0
  11. pysalvo-0.2.0/examples/xgenius_integration.py +87 -0
  12. pysalvo-0.2.0/pyproject.toml +79 -0
  13. pysalvo-0.2.0/src/salvo/__init__.py +74 -0
  14. pysalvo-0.2.0/src/salvo/_version.py +1 -0
  15. pysalvo-0.2.0/src/salvo/cli.py +83 -0
  16. pysalvo-0.2.0/src/salvo/decorators.py +39 -0
  17. pysalvo-0.2.0/src/salvo/dispatch/__init__.py +7 -0
  18. pysalvo-0.2.0/src/salvo/dispatch/account.py +52 -0
  19. pysalvo-0.2.0/src/salvo/dispatch/caps.py +100 -0
  20. pysalvo-0.2.0/src/salvo/dispatch/partition.py +43 -0
  21. pysalvo-0.2.0/src/salvo/doctor.py +130 -0
  22. pysalvo-0.2.0/src/salvo/errors.py +49 -0
  23. pysalvo-0.2.0/src/salvo/job/__init__.py +3 -0
  24. pysalvo-0.2.0/src/salvo/job/handle.py +70 -0
  25. pysalvo-0.2.0/src/salvo/job/oom.py +134 -0
  26. pysalvo-0.2.0/src/salvo/job/preempt.py +34 -0
  27. pysalvo-0.2.0/src/salvo/job/render.py +68 -0
  28. pysalvo-0.2.0/src/salvo/job/spec.py +108 -0
  29. pysalvo-0.2.0/src/salvo/job/submit.py +108 -0
  30. pysalvo-0.2.0/src/salvo/manifest/__init__.py +4 -0
  31. pysalvo-0.2.0/src/salvo/manifest/schema.py +19 -0
  32. pysalvo-0.2.0/src/salvo/manifest/store.py +108 -0
  33. pysalvo-0.2.0/src/salvo/obs/__init__.py +3 -0
  34. pysalvo-0.2.0/src/salvo/obs/events.py +38 -0
  35. pysalvo-0.2.0/src/salvo/policy.py +46 -0
  36. pysalvo-0.2.0/src/salvo/py.typed +0 -0
  37. pysalvo-0.2.0/src/salvo/stage/__init__.py +6 -0
  38. pysalvo-0.2.0/src/salvo/stage/gate.py +24 -0
  39. pysalvo-0.2.0/src/salvo/topology/__init__.py +44 -0
  40. pysalvo-0.2.0/src/salvo/topology/detect.py +27 -0
  41. pysalvo-0.2.0/src/salvo/topology/loader.py +74 -0
  42. pysalvo-0.2.0/src/salvo/topology/presets/__init__.py +0 -0
  43. pysalvo-0.2.0/src/salvo/topology/presets/_national.yaml +7 -0
  44. pysalvo-0.2.0/src/salvo/topology/presets/beluga.yaml +28 -0
  45. pysalvo-0.2.0/src/salvo/topology/presets/cedar.yaml +28 -0
  46. pysalvo-0.2.0/src/salvo/topology/presets/mila.yaml +33 -0
  47. pysalvo-0.2.0/src/salvo/topology/presets/narval.yaml +28 -0
  48. pysalvo-0.2.0/src/salvo/topology/presets/rorqual.yaml +35 -0
  49. pysalvo-0.2.0/src/salvo/topology/schema.py +107 -0
  50. pysalvo-0.2.0/tests/__init__.py +0 -0
  51. pysalvo-0.2.0/tests/conftest.py +49 -0
  52. pysalvo-0.2.0/tests/fixtures/clusters/child.yaml +9 -0
  53. pysalvo-0.2.0/tests/fixtures/clusters/parent.yaml +7 -0
  54. pysalvo-0.2.0/tests/integration/__init__.py +0 -0
  55. pysalvo-0.2.0/tests/integration/conftest.py +1 -0
  56. pysalvo-0.2.0/tests/integration/golden/mila_cpu_basic.sh +15 -0
  57. pysalvo-0.2.0/tests/integration/golden/rorqual_gpu_basic.sh +16 -0
  58. pysalvo-0.2.0/tests/integration/test_library_surface.py +96 -0
  59. pysalvo-0.2.0/tests/integration/test_oom_chain.py +25 -0
  60. pysalvo-0.2.0/tests/integration/test_submit_e2e.py +26 -0
  61. pysalvo-0.2.0/tests/perf/__init__.py +0 -0
  62. pysalvo-0.2.0/tests/perf/test_perf_budgets.py +29 -0
  63. pysalvo-0.2.0/tests/test_version.py +5 -0
  64. pysalvo-0.2.0/tests/unit/__init__.py +0 -0
  65. pysalvo-0.2.0/tests/unit/test_account.py +44 -0
  66. pysalvo-0.2.0/tests/unit/test_caps.py +45 -0
  67. pysalvo-0.2.0/tests/unit/test_cli.py +87 -0
  68. pysalvo-0.2.0/tests/unit/test_decorator.py +11 -0
  69. pysalvo-0.2.0/tests/unit/test_doctor.py +16 -0
  70. pysalvo-0.2.0/tests/unit/test_errors.py +26 -0
  71. pysalvo-0.2.0/tests/unit/test_events.py +39 -0
  72. pysalvo-0.2.0/tests/unit/test_gate.py +37 -0
  73. pysalvo-0.2.0/tests/unit/test_handle_imports.py +2 -0
  74. pysalvo-0.2.0/tests/unit/test_manifest.py +51 -0
  75. pysalvo-0.2.0/tests/unit/test_oom_dsl.py +48 -0
  76. pysalvo-0.2.0/tests/unit/test_oom_retry.py +51 -0
  77. pysalvo-0.2.0/tests/unit/test_partition.py +43 -0
  78. pysalvo-0.2.0/tests/unit/test_preempt.py +29 -0
  79. pysalvo-0.2.0/tests/unit/test_presets.py +45 -0
  80. pysalvo-0.2.0/tests/unit/test_public_api.py +118 -0
  81. pysalvo-0.2.0/tests/unit/test_render.py +63 -0
  82. pysalvo-0.2.0/tests/unit/test_spec.py +65 -0
  83. pysalvo-0.2.0/tests/unit/test_topology_detect.py +27 -0
  84. pysalvo-0.2.0/tests/unit/test_topology_loader.py +29 -0
  85. pysalvo-0.2.0/tests/unit/test_topology_schema.py +63 -0
  86. pysalvo-0.2.0/uv.lock +904 -0
@@ -0,0 +1,30 @@
1
+ name: ci
2
+ on:
3
+ push:
4
+ branches: [main]
5
+ pull_request:
6
+ concurrency:
7
+ group: ${{ github.workflow }}-${{ github.ref }}
8
+ cancel-in-progress: true
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python: ["3.11", "3.12", "3.13"]
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ - uses: astral-sh/setup-uv@v3
18
+ - uses: actions/setup-python@v5
19
+ with:
20
+ python-version: ${{ matrix.python }}
21
+ - run: uv sync --all-extras
22
+ - run: uv run pre-commit run --all-files
23
+ - run: uv run mypy src/salvo
24
+ - run: uv run pytest -v
25
+ - run: uv build
26
+ - name: Smoke-import built wheel
27
+ run: |
28
+ python -m venv /tmp/wheel-smoke
29
+ /tmp/wheel-smoke/bin/pip install --quiet dist/pysalvo-*.whl
30
+ /tmp/wheel-smoke/bin/python -c "from salvo import JobSpec, render; from salvo.policy import apply_oom, OomContext; from salvo.topology import load_preset"
@@ -0,0 +1,42 @@
1
+ name: release
2
+ on:
3
+ push:
4
+ tags: ["v*"]
5
+ concurrency:
6
+ group: release-${{ github.ref }}
7
+ cancel-in-progress: false
8
+ jobs:
9
+ build:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+ - uses: astral-sh/setup-uv@v3
14
+ - uses: actions/setup-python@v5
15
+ with:
16
+ python-version: "3.12"
17
+ - name: Verify tag matches pyproject version
18
+ env:
19
+ REF_NAME: ${{ github.ref_name }}
20
+ run: |
21
+ tag="${REF_NAME#v}"
22
+ ver=$(uv run python -c "import tomllib,pathlib;print(tomllib.loads(pathlib.Path('pyproject.toml').read_text())['project']['version'])")
23
+ [ "$tag" = "$ver" ] || { echo "tag=$tag pyproject=$ver"; exit 1; }
24
+ - run: uv build
25
+ - uses: actions/upload-artifact@v4
26
+ with:
27
+ name: dist
28
+ path: dist/
29
+ publish:
30
+ needs: build
31
+ runs-on: ubuntu-latest
32
+ environment:
33
+ name: pypi
34
+ url: https://pypi.org/p/pysalvo
35
+ permissions:
36
+ id-token: write
37
+ steps:
38
+ - uses: actions/download-artifact@v4
39
+ with:
40
+ name: dist
41
+ path: dist/
42
+ - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,12 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ .venv/
4
+ .mypy_cache/
5
+ .ruff_cache/
6
+ .pytest_cache/
7
+ htmlcov/
8
+ .coverage
9
+ dist/
10
+ build/
11
+ *.egg-info/
12
+ .DS_Store
@@ -0,0 +1,13 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.5.5
4
+ hooks:
5
+ - id: ruff
6
+ args: [--fix]
7
+ - id: ruff-format
8
+ - repo: https://github.com/pre-commit/mirrors-mypy
9
+ rev: v1.10.0
10
+ hooks:
11
+ - id: mypy
12
+ additional_dependencies: [pydantic>=2.6, types-pyyaml]
13
+ files: ^src/salvo
@@ -0,0 +1,40 @@
1
+ # Contributing
2
+
3
+ ## Adding a cluster
4
+
5
+ The fastest contribution is a new cluster preset. Each preset is a single YAML file under `src/salvo/topology/presets/`. The schema lives in `src/salvo/topology/schema.py` (pydantic v2, frozen, `extra="forbid"`).
6
+
7
+ 1. Copy an existing preset closest to your cluster (DRAC: `rorqual.yaml`; campus: `mila.yaml`).
8
+ 2. Edit accounts, partitions, GPU types, walltime caps, defaults.
9
+ 3. Add an entry to `src/salvo/topology/detect.py` for the hostname pattern.
10
+ 4. Open a PR with the YAML + a one-line update to the preset list in the README's "Cluster topology" section.
11
+
12
+ Tests will exercise your preset via the parametrized suite in `tests/unit/test_presets.py`.
13
+
14
+ ## Dev setup
15
+
16
+ git clone <repo>
17
+ cd salvo
18
+ uv sync --all-extras
19
+ uv run pre-commit install
20
+ uv run pytest
21
+
22
+ ## Style
23
+
24
+ - Python 3.11+, ruff + mypy strict, line length 100.
25
+ - Pydantic v2 models are frozen and `extra="forbid"`.
26
+ - Public functions get type hints; tests get docstrings on intent (not behaviour).
27
+ - No mutable defaults, no f-strings in log messages.
28
+
29
+ ## Test layers
30
+
31
+ - **Layer A** (`tests/unit/`): pure-Python, no subprocess.
32
+ - **Layer B** (`tests/integration/`): subprocess to fake `sbatch` via `tests/conftest.py:fake_sbatch`. Golden sbatch files in `tests/integration/golden/`.
33
+ - **Layer C** (cluster nightly): not run on PRs.
34
+ - **Perf** (`tests/perf/`): budget smoke tests.
35
+
36
+ PRs must keep total coverage at or above 80% and add tests for every new dispatch rule.
37
+
38
+ ## Commit style
39
+
40
+ Conventional commits with module-scoped scope: `feat(job):`, `fix(dispatch):`, `test(integration):`, `docs:`, `chore:`.
pysalvo-0.2.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 W
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
pysalvo-0.2.0/PKG-INFO ADDED
@@ -0,0 +1,150 @@
1
+ Metadata-Version: 2.4
2
+ Name: pysalvo
3
+ Version: 0.2.0
4
+ Summary: Render + policy library for SLURM JobSpecs (Mila + DRAC clusters)
5
+ Project-URL: Homepage, https://github.com/wietzesuijker/salvo
6
+ Project-URL: Source, https://github.com/wietzesuijker/salvo
7
+ Project-URL: Issues, https://github.com/wietzesuijker/salvo/issues
8
+ Author: W
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: alliance-canada,drac,hpc,mila,sbatch,slurm
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Intended Audience :: System Administrators
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: MacOS
17
+ Classifier: Operating System :: POSIX :: Linux
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3 :: Only
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Topic :: Scientific/Engineering
24
+ Classifier: Topic :: System :: Clustering
25
+ Classifier: Topic :: System :: Distributed Computing
26
+ Classifier: Typing :: Typed
27
+ Requires-Python: >=3.11
28
+ Requires-Dist: pydantic>=2.6
29
+ Requires-Dist: pyyaml>=6.0
30
+ Requires-Dist: rich>=13.7
31
+ Requires-Dist: tomli-w>=1.0
32
+ Requires-Dist: typer>=0.12
33
+ Provides-Extra: all
34
+ Provides-Extra: dev
35
+ Requires-Dist: hypothesis>=6.100; extra == 'dev'
36
+ Requires-Dist: mypy>=1.10; extra == 'dev'
37
+ Requires-Dist: pre-commit>=3.7; extra == 'dev'
38
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
39
+ Requires-Dist: pytest>=8.0; extra == 'dev'
40
+ Requires-Dist: ruff>=0.5; extra == 'dev'
41
+ Requires-Dist: types-pyyaml; extra == 'dev'
42
+ Provides-Extra: drac
43
+ Provides-Extra: mila
44
+ Description-Content-Type: text/markdown
45
+
46
+ # salvo
47
+
48
+ Policy and render library for SLURM. salvo turns a `JobSpec` into byte-stable sbatch text and decides what to do when a job hits OOM or gets preempted. It does not move code to the cluster, does not open SSH connections, and is not a runner. For transport, pair it with [cluv](https://github.com/mila-iqia/cluv) or invoke `sbatch` directly.
49
+
50
+ ## Install
51
+
52
+ pip install pysalvo
53
+
54
+ Two CLI commands ship for diagnostics and one-shot rendering. Everything else is meant to be imported.
55
+
56
+ salvo doctor # topology, ssh alias hygiene, manifest freshness
57
+ salvo render spec.yaml --cluster mila # JobSpec YAML to sbatch text on stdout
58
+
59
+ ## Render
60
+
61
+ ```python
62
+ from salvo import JobSpec, render
63
+
64
+ spec = JobSpec(
65
+ name="train",
66
+ cmd=["python", "train.py"],
67
+ gpus=1, cpus=8, mem="32G", time="2h",
68
+ on_oom=["bump_mem(1.5x, max=128G)", "fail"],
69
+ )
70
+
71
+ sbatch_text = render(
72
+ spec,
73
+ cluster_id="mila",
74
+ account="mila",
75
+ partition="unkillable",
76
+ ) # str, byte-stable, no side effects
77
+ ```
78
+
79
+ Same JobSpec, same cluster, same inputs, same bytes every run. Useful when you need to audit what was actually submitted six months later.
80
+
81
+ If you leave `account` and `partition` off, salvo picks them via `salvo.dispatch` by shelling out to `squeue` for live capacity. That path only works on a SLURM login node; library callers (cluv, xgenius) should pass both explicitly to keep `render()` pure.
82
+
83
+ ## OOM policy
84
+
85
+ ```python
86
+ from salvo.policy import parse, apply_oom, OomContext
87
+
88
+ steps = parse(["bump_mem(1.5x, max=128G)", "escalate_partition", "fail"])
89
+
90
+ new_spec, action = apply_oom(prev_spec, OomContext(kind="cpu", max_rss_mb=33_500))
91
+ # new_spec is a fresh JobSpec with bumped mem, or None if the policy says fail
92
+ ```
93
+
94
+ The DSL is intentionally small. Steps run in order until one applies:
95
+
96
+ - `bump_mem(<factor>x, max=<size>)` — multiplicative bump, capped
97
+ - `escalate_partition` — clear partition so the next render picks a larger tier
98
+ - `fail` — terminal
99
+ - `bump_gpus(...)`, `callback(...)` — parse today, execute in a later release
100
+
101
+ ## Cluster topology
102
+
103
+ Five presets ship in `salvo/topology/presets/`: `mila`, `rorqual`, `narval`, `beluga`, `cedar`. More DRAC clusters land as YAMLs are contributed. Each YAML lists accounts, partitions, and capacity rules.
104
+
105
+ ```python
106
+ from salvo.topology import load_preset, list_presets
107
+
108
+ list_presets() # ['beluga', 'cedar', 'mila', 'narval', 'rorqual']
109
+ cluster = load_preset("mila") # ClusterTopology
110
+ ```
111
+
112
+ Contributing a new cluster is one YAML file. See [CONTRIBUTING.md](CONTRIBUTING.md).
113
+
114
+ ## Decorator (optional, Python-native)
115
+
116
+ If you'd rather write a function than a YAML spec:
117
+
118
+ ```python
119
+ from salvo import cluster
120
+
121
+ @cluster.submit(gpus=1, cpus=8, mem="32G", time="2h",
122
+ on_oom=["bump_mem(1.5x, max=128G)", "fail"])
123
+ def train(seed: int):
124
+ ...
125
+
126
+ handle = train.submit(seed=42)
127
+ ```
128
+
129
+ `train.submit(...)` runs the local-sbatch convenience path: it renders, calls `sbatch`, and returns a handle. Skip the decorator if you're embedding salvo inside another tool's submit flow; just call `render()` and let that tool do the submission.
130
+
131
+ ## Use with cluv
132
+
133
+ cluv handles SSH, code sync, and the `sbatch` call. salvo handles the policy. The natural composition is cluv importing salvo's policy library when a user opts in:
134
+
135
+ ```toml
136
+ # in your project's pyproject.toml
137
+ [tool.cluv.retry]
138
+ on_oom = ["bump_mem(1.5x, max=128G)", "fail"]
139
+ max_hops = 5
140
+ ```
141
+
142
+ This is a proposal, not a shipped feature in cluv. An upstream issue is in draft.
143
+
144
+ ## Use with anything that calls sbatch
145
+
146
+ salvo has no SSH or async dependencies. Its runtime requirements are pydantic, pyyaml, rich, tomli-w, and typer; everything else is stdlib. Any tool that runs `sbatch` can import `salvo.render`, `salvo.policy.apply_oom`, and `salvo.topology.load_preset` independently. Two runnable examples under [`examples/`](examples/) show the wiring end-to-end: [`cluv_integration.py`](examples/cluv_integration.py) (policy + render) and [`xgenius_integration.py`](examples/xgenius_integration.py) (policy-only, when the host tool keeps its own renderer).
147
+
148
+ ## License
149
+
150
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,105 @@
1
+ # salvo
2
+
3
+ Policy and render library for SLURM. salvo turns a `JobSpec` into byte-stable sbatch text and decides what to do when a job hits OOM or gets preempted. It does not move code to the cluster, does not open SSH connections, and is not a runner. For transport, pair it with [cluv](https://github.com/mila-iqia/cluv) or invoke `sbatch` directly.
4
+
5
+ ## Install
6
+
7
+ pip install pysalvo
8
+
9
+ Two CLI commands ship for diagnostics and one-shot rendering. Everything else is meant to be imported.
10
+
11
+ salvo doctor # topology, ssh alias hygiene, manifest freshness
12
+ salvo render spec.yaml --cluster mila # JobSpec YAML to sbatch text on stdout
13
+
14
+ ## Render
15
+
16
+ ```python
17
+ from salvo import JobSpec, render
18
+
19
+ spec = JobSpec(
20
+ name="train",
21
+ cmd=["python", "train.py"],
22
+ gpus=1, cpus=8, mem="32G", time="2h",
23
+ on_oom=["bump_mem(1.5x, max=128G)", "fail"],
24
+ )
25
+
26
+ sbatch_text = render(
27
+ spec,
28
+ cluster_id="mila",
29
+ account="mila",
30
+ partition="unkillable",
31
+ ) # str, byte-stable, no side effects
32
+ ```
33
+
34
+ Same JobSpec, same cluster, same inputs, same bytes every run. Useful when you need to audit what was actually submitted six months later.
35
+
36
+ If you leave `account` and `partition` off, salvo picks them via `salvo.dispatch` by shelling out to `squeue` for live capacity. That path only works on a SLURM login node; library callers (cluv, xgenius) should pass both explicitly to keep `render()` pure.
37
+
38
+ ## OOM policy
39
+
40
+ ```python
41
+ from salvo.policy import parse, apply_oom, OomContext
42
+
43
+ steps = parse(["bump_mem(1.5x, max=128G)", "escalate_partition", "fail"])
44
+
45
+ new_spec, action = apply_oom(prev_spec, OomContext(kind="cpu", max_rss_mb=33_500))
46
+ # new_spec is a fresh JobSpec with bumped mem, or None if the policy says fail
47
+ ```
48
+
49
+ The DSL is intentionally small. Steps run in order until one applies:
50
+
51
+ - `bump_mem(<factor>x, max=<size>)` — multiplicative bump, capped
52
+ - `escalate_partition` — clear partition so the next render picks a larger tier
53
+ - `fail` — terminal
54
+ - `bump_gpus(...)`, `callback(...)` — parse today, execute in a later release
55
+
56
+ ## Cluster topology
57
+
58
+ Five presets ship in `salvo/topology/presets/`: `mila`, `rorqual`, `narval`, `beluga`, `cedar`. More DRAC clusters land as YAMLs are contributed. Each YAML lists accounts, partitions, and capacity rules.
59
+
60
+ ```python
61
+ from salvo.topology import load_preset, list_presets
62
+
63
+ list_presets() # ['beluga', 'cedar', 'mila', 'narval', 'rorqual']
64
+ cluster = load_preset("mila") # ClusterTopology
65
+ ```
66
+
67
+ Contributing a new cluster is one YAML file. See [CONTRIBUTING.md](CONTRIBUTING.md).
68
+
69
+ ## Decorator (optional, Python-native)
70
+
71
+ If you'd rather write a function than a YAML spec:
72
+
73
+ ```python
74
+ from salvo import cluster
75
+
76
+ @cluster.submit(gpus=1, cpus=8, mem="32G", time="2h",
77
+ on_oom=["bump_mem(1.5x, max=128G)", "fail"])
78
+ def train(seed: int):
79
+ ...
80
+
81
+ handle = train.submit(seed=42)
82
+ ```
83
+
84
+ `train.submit(...)` runs the local-sbatch convenience path: it renders, calls `sbatch`, and returns a handle. Skip the decorator if you're embedding salvo inside another tool's submit flow; just call `render()` and let that tool do the submission.
85
+
86
+ ## Use with cluv
87
+
88
+ cluv handles SSH, code sync, and the `sbatch` call. salvo handles the policy. The natural composition is cluv importing salvo's policy library when a user opts in:
89
+
90
+ ```toml
91
+ # in your project's pyproject.toml
92
+ [tool.cluv.retry]
93
+ on_oom = ["bump_mem(1.5x, max=128G)", "fail"]
94
+ max_hops = 5
95
+ ```
96
+
97
+ This is a proposal, not a shipped feature in cluv. An upstream issue is in draft.
98
+
99
+ ## Use with anything that calls sbatch
100
+
101
+ salvo has no SSH or async dependencies. Its runtime requirements are pydantic, pyyaml, rich, tomli-w, and typer; everything else is stdlib. Any tool that runs `sbatch` can import `salvo.render`, `salvo.policy.apply_oom`, and `salvo.topology.load_preset` independently. Two runnable examples under [`examples/`](examples/) show the wiring end-to-end: [`cluv_integration.py`](examples/cluv_integration.py) (policy + render) and [`xgenius_integration.py`](examples/xgenius_integration.py) (policy-only, when the host tool keeps its own renderer).
102
+
103
+ ## License
104
+
105
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,5 @@
1
+ # Security policy
2
+
3
+ If you find a vulnerability in salvo, please report it privately rather than opening a public issue. Open a GitHub security advisory on this repo, or email the maintainer listed in `pyproject.toml`. Expect an acknowledgement within 7 days.
4
+
5
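+ salvo renders `sbatch` scripts and, on its optional local-submit path, invokes `sbatch` on the machine it runs on. It does not transmit credentials, secrets, or job payloads to any third party. salvo itself opens no SSH connections and does not move code to the cluster; any SSH/scp/rsync traffic comes from the transport tool the operator pairs it with (for example cluv), against clusters the operator configures.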
@@ -0,0 +1,83 @@
1
+ """End-to-end example: how cluv would call salvo as its retry-policy library.
2
+
3
+ A cluv maintainer should be able to read this file on a laptop and trust
4
+ that wiring salvo into cluv's submit flow is a small, well-defined change.
5
+ It walks the natural composition described in salvo's README ("Use with
6
+ cluv"): parse the on_oom DSL from pyproject, build a JobSpec from values
7
+ cluv already has, run apply_oom on a synthetic OOM, and render the bumped
8
+ spec to sbatch text. No SLURM, no network, no subprocess — just stdlib +
9
+ pydantic. Run with: ``python examples/cluv_integration.py``.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from salvo import JobSpec, render
15
+ from salvo.policy import OomContext, apply_oom, parse
16
+
17
+
18
+ def main() -> None:
19
+ # 1. Parse the policy DSL from a string list. In real cluv this list
20
+ # comes from `[tool.cluv.retry].on_oom` in the user's pyproject.toml.
21
+ on_oom_dsl: list[str] = ["bump_mem(1.5x, max=128G)", "fail"]
22
+ steps = parse(on_oom_dsl)
23
+ print(f"parsed {len(steps)} policy step(s): {[type(s).__name__ for s in steps]}")
24
+
25
+ # 2. Build a minimal JobSpec from values cluv already has. cluv knows
26
+ # the entrypoint, gpu/cpu ask, env-var-style mem string, and the
27
+ # user's on_oom list — exactly what JobSpec needs.
28
+ spec = JobSpec(
29
+ name="train",
30
+ cmd=["python", "train.py"],
31
+ gpus=1,
32
+ cpus=8,
33
+ mem="32G",
34
+ time="2h",
35
+ on_oom=on_oom_dsl,
36
+ max_hops=5,
37
+ )
38
+
39
+ # Walk all three hops of the bump_mem ladder (steps 3 and 4 below run once per hop) so the example shows
40
+ # the policy doing something across resubmissions, not just one call.
41
+ # 32G -> 48G -> 72G -> 108G (1.5x each, capped at 128G).
42
+ current: JobSpec | None = spec
43
+ for hop in range(1, 4):
44
+ assert current is not None # narrowing for type-checkers
45
+ mem_before = current.mem
46
+
47
+ # 3. Synthesize an OOM context. cluv builds this from the job state
48
+ # it already tracks (sacct max_rss, tail of stderr, cpu/gpu class).
49
+ ctx = OomContext(
50
+ kind="cpu",
51
+ max_rss_mb=current.mem_mb(), # observed RSS pinned to current ask
52
+ log_excerpt="slurmstepd: error: Detected 1 oom_kill event",
53
+ )
54
+
55
+ # 4. Apply the policy. Two outcomes:
56
+ # (new_spec, "bump_mem") -> resubmit with new_spec.mem
57
+ # (None, "fail") -> stop, surface to user
58
+ new_spec, action = apply_oom(current, ctx)
59
+ if new_spec is None:
60
+ print(f"hop {hop}: action={action!r} mem {mem_before} -> (stop)")
61
+ break
62
+
63
+ print(f"hop {hop}: action={action!r} mem {mem_before} -> {new_spec.mem}")
64
+ current = new_spec
65
+
66
+ # 5. Render the final bumped spec to sbatch text. With both account and
67
+ # partition supplied this is pure: no squeue, no SLURM, no I/O. cluv
68
+ # would write this string to a file and `sbatch` it; here we just
69
+ # print the header so the audit trail is visible.
70
+ assert current is not None
71
+ sbatch_text = render(
72
+ current,
73
+ cluster_id="mila",
74
+ account="mila",
75
+ partition="unkillable",
76
+ )
77
+ print("\nrendered sbatch (header only):")
78
+ for line in sbatch_text.splitlines()[:8]:
79
+ print(f" {line}")
80
+
81
+
82
+ if __name__ == "__main__":
83
+ main()
@@ -0,0 +1,87 @@
1
+ """End-to-end example: how xgenius would call salvo as its OOM-policy library.
2
+
3
+ xgenius already has its own sbatch renderer (Jinja-style ``{{PLACEHOLDER}}``
4
+ templates in ``xgenius/templates.py``) and its own SafetyValidator with a
5
+ fixed ``max_memory_per_job`` ceiling. It has no dynamic OOM handling: an
6
+ OOM-killed job is observed by the watcher daemon but no corrective action
7
+ is taken. salvo.policy fills exactly that gap. This example shows the
8
+ policy-only integration path: xgenius keeps its template renderer; only
9
+ the watcher imports salvo. No SLURM, no network, no subprocess — just
10
+ stdlib + pydantic. Run with: ``python examples/xgenius_integration.py``.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from salvo import JobSpec
16
+ from salvo.policy import OomContext, OomDecision, apply_oom, parse
17
+
18
+
19
+ def main() -> None:
20
+ # 1. Parse the policy from a future xgenius config knob, e.g. a new
21
+ # ``[retry] on_oom = [...]`` table in ``xgenius.toml``. Validation
22
+ # happens here: an unknown step raises ValueError pointing at the
23
+ # bad string, before any job runs.
24
+ on_oom_dsl: list[str] = ["bump_mem(2x, max=80G)", "fail"]
25
+ steps = parse(on_oom_dsl)
26
+ print(f"parsed {len(steps)} policy step(s): {[type(s).__name__ for s in steps]}")
27
+
28
+ # 2. The xgenius watcher detects OUT_OF_MEMORY for an experiment.
29
+ # It already knows the experiment's gpu/cpu/mem/walltime ask
30
+ # (from xgenius.toml + the running job record). Wrap that in a
31
+ # JobSpec — JobSpec is the only salvo type the watcher needs to
32
+ # build. The cmd field is unused by apply_oom; pass a sentinel.
33
+ experiment_gpus = 1
34
+ experiment_cpus = 4
35
+ experiment_mem = "16G"
36
+ experiment_walltime = "4h"
37
+
38
+ spec = JobSpec(
39
+ name="experiment-42",
40
+ cmd=["xgenius-runner"],
41
+ gpus=experiment_gpus,
42
+ cpus=experiment_cpus,
43
+ mem=experiment_mem,
44
+ time=experiment_walltime,
45
+ on_oom=on_oom_dsl,
46
+ )
47
+
48
+ # 3. Synthesize an OomContext from what the watcher already polls.
49
+ # xgenius's COMPLETION_EPILOG drops a trap-based marker on OOM;
50
+ # MaxRSS is read via sacct the same way cluv does. ``kind`` is
51
+ # "gpu" only when the job asked for gpus AND the OOM came from
52
+ # GPU memory exhaustion. For CPU-side OOM on a GPU job, "cpu" is
53
+ # still the right kind because that is the resource being bumped.
54
+ observed_max_rss_mb = 15_900
55
+ ctx = OomContext(
56
+ kind="cpu",
57
+ max_rss_mb=observed_max_rss_mb,
58
+ log_excerpt="slurmstepd: error: Detected 1 oom_kill event",
59
+ )
60
+
61
+ # 4. Ask salvo what to do. The return is a NamedTuple — both
62
+ # attribute and tuple-unpack work, so the watcher can pick
63
+ # whichever style reads cleaner in its existing code.
64
+ decision: OomDecision = apply_oom(spec, ctx)
65
+ if decision.new_spec is None:
66
+ print(f"policy terminated: action={decision.action!r} — escalate to human")
67
+ return
68
+
69
+ # 5. xgenius now re-runs its own template renderer with the bumped
70
+ # memory string. salvo does not render xgenius templates; the
71
+ # watcher just substitutes ``new_spec.mem`` back into the params
72
+ # dict it already builds in ``xgenius/jobs.py:submit``.
73
+ new_params = {
74
+ "memory": decision.new_spec.mem, # "32768M" after 16G * 2
75
+ "gpus": decision.new_spec.gpus,
76
+ "cpus": decision.new_spec.cpus,
77
+ "walltime": decision.new_spec.time,
78
+ }
79
+ print(f"action={decision.action!r}")
80
+ print(f" mem {experiment_mem} -> {decision.new_spec.mem}")
81
+ print(" hand back to xgenius template renderer with params:")
82
+ for k, v in new_params.items():
83
+ print(f" {k}={v!r}")
84
+
85
+
86
+ if __name__ == "__main__":
87
+ main()
@@ -0,0 +1,79 @@
1
+ [project]
2
+ name = "pysalvo"
3
+ version = "0.2.0"
4
+ description = "Render + policy library for SLURM JobSpecs (Mila + DRAC clusters)"
5
+ readme = "README.md"
6
+ license = { text = "MIT" }
7
+ requires-python = ">=3.11"
8
+ authors = [{ name = "W" }]
9
+ keywords = ["slurm", "hpc", "sbatch", "mila", "drac", "alliance-canada"]
10
+ classifiers = [
11
+ "Development Status :: 3 - Alpha",
12
+ "Intended Audience :: Science/Research",
13
+ "Intended Audience :: System Administrators",
14
+ "License :: OSI Approved :: MIT License",
15
+ "Operating System :: POSIX :: Linux",
16
+ "Operating System :: MacOS",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3 :: Only",
19
+ "Programming Language :: Python :: 3.11",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Programming Language :: Python :: 3.13",
22
+ "Topic :: Scientific/Engineering",
23
+ "Topic :: System :: Clustering",
24
+ "Topic :: System :: Distributed Computing",
25
+ "Typing :: Typed",
26
+ ]
27
+ dependencies = [
28
+ "pydantic>=2.6",
29
+ "typer>=0.12",
30
+ "pyyaml>=6.0",
31
+ "rich>=13.7",
32
+ "tomli-w>=1.0",
33
+ ]
34
+
35
+ [project.urls]
36
+ Homepage = "https://github.com/wietzesuijker/salvo"
37
+ Source = "https://github.com/wietzesuijker/salvo"
38
+ Issues = "https://github.com/wietzesuijker/salvo/issues"
39
+
40
+ [project.optional-dependencies]
41
+ dev = [
42
+ "pytest>=8.0",
43
+ "pytest-cov>=5.0",
44
+ "hypothesis>=6.100",
45
+ "ruff>=0.5",
46
+ "mypy>=1.10",
47
+ "pre-commit>=3.7",
48
+ "types-pyyaml",
49
+ ]
50
+ mila = []
51
+ drac = []
52
+ all = ["pysalvo[mila,drac]"]
53
+
54
+ [project.scripts]
55
+ salvo = "salvo.cli:app"
56
+
57
+ [build-system]
58
+ requires = ["hatchling>=1.24"]
59
+ build-backend = "hatchling.build"
60
+
61
+ [tool.hatch.build.targets.wheel]
62
+ packages = ["src/salvo"]
63
+
64
+ [tool.ruff]
65
+ line-length = 100
66
+ target-version = "py311"
67
+
68
+ [tool.ruff.lint]
69
+ select = ["E", "F", "W", "I", "UP", "B", "SIM", "RUF"]
70
+
71
+ [tool.mypy]
72
+ python_version = "3.11"
73
+ strict = true
74
+ files = ["src/salvo"]
75
+ disallow_untyped_decorators = false
76
+
77
+ [tool.pytest.ini_options]
78
+ addopts = "-ra --strict-markers --cov=salvo --cov-report=term-missing --cov-fail-under=80"
79
+ testpaths = ["tests"]