slurmkit 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- slurmkit-0.0.1/LICENSE +21 -0
- slurmkit-0.0.1/PKG-INFO +527 -0
- slurmkit-0.0.1/README.md +478 -0
- slurmkit-0.0.1/pyproject.toml +77 -0
- slurmkit-0.0.1/setup.cfg +4 -0
- slurmkit-0.0.1/src/slurmkit/__init__.py +43 -0
- slurmkit-0.0.1/src/slurmkit/_version.py +1 -0
- slurmkit-0.0.1/src/slurmkit/cli/__init__.py +9 -0
- slurmkit-0.0.1/src/slurmkit/cli/commands.py +1839 -0
- slurmkit-0.0.1/src/slurmkit/cli/main.py +864 -0
- slurmkit-0.0.1/src/slurmkit/cli/ui/__init__.py +32 -0
- slurmkit-0.0.1/src/slurmkit/cli/ui/backend.py +59 -0
- slurmkit-0.0.1/src/slurmkit/cli/ui/context.py +90 -0
- slurmkit-0.0.1/src/slurmkit/cli/ui/models.py +57 -0
- slurmkit-0.0.1/src/slurmkit/cli/ui/plain.py +97 -0
- slurmkit-0.0.1/src/slurmkit/cli/ui/reports.py +365 -0
- slurmkit-0.0.1/src/slurmkit/cli/ui/rich_backend.py +105 -0
- slurmkit-0.0.1/src/slurmkit/collections.py +1251 -0
- slurmkit-0.0.1/src/slurmkit/config.py +487 -0
- slurmkit-0.0.1/src/slurmkit/generate.py +773 -0
- slurmkit-0.0.1/src/slurmkit/notifications.py +1516 -0
- slurmkit-0.0.1/src/slurmkit/slurm.py +840 -0
- slurmkit-0.0.1/src/slurmkit/sync.py +425 -0
- slurmkit-0.0.1/src/slurmkit/utils/__init__.py +5 -0
- slurmkit-0.0.1/src/slurmkit/wandb_utils.py +425 -0
- slurmkit-0.0.1/src/slurmkit.egg-info/PKG-INFO +527 -0
- slurmkit-0.0.1/src/slurmkit.egg-info/SOURCES.txt +43 -0
- slurmkit-0.0.1/src/slurmkit.egg-info/dependency_links.txt +1 -0
- slurmkit-0.0.1/src/slurmkit.egg-info/entry_points.txt +2 -0
- slurmkit-0.0.1/src/slurmkit.egg-info/requires.txt +25 -0
- slurmkit-0.0.1/src/slurmkit.egg-info/top_level.txt +1 -0
- slurmkit-0.0.1/tests/test_cli_collection_analyze.py +477 -0
- slurmkit-0.0.1/tests/test_cli_collection_groups.py +59 -0
- slurmkit-0.0.1/tests/test_cli_collection_show.py +128 -0
- slurmkit-0.0.1/tests/test_cli_init.py +99 -0
- slurmkit-0.0.1/tests/test_cli_notify.py +259 -0
- slurmkit-0.0.1/tests/test_cli_notify_collection_final.py +493 -0
- slurmkit-0.0.1/tests/test_cli_resubmit.py +170 -0
- slurmkit-0.0.1/tests/test_cli_ui.py +154 -0
- slurmkit-0.0.1/tests/test_collections.py +440 -0
- slurmkit-0.0.1/tests/test_config.py +173 -0
- slurmkit-0.0.1/tests/test_generate.py +308 -0
- slurmkit-0.0.1/tests/test_notifications.py +830 -0
- slurmkit-0.0.1/tests/test_notifications_phase2.py +235 -0
- slurmkit-0.0.1/tests/test_slurm.py +199 -0
slurmkit-0.0.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Awni
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
slurmkit-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,527 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: slurmkit
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: CLI tools for managing and generating SLURM jobs
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/Awni00/slurmkit
|
|
7
|
+
Project-URL: Repository, https://github.com/Awni00/slurmkit
|
|
8
|
+
Project-URL: Issues, https://github.com/Awni00/slurmkit/issues
|
|
9
|
+
Keywords: slurm,hpc,job-management,cluster,batch
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering
|
|
22
|
+
Classifier: Topic :: System :: Clustering
|
|
23
|
+
Classifier: Topic :: Utilities
|
|
24
|
+
Requires-Python: >=3.8
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: pyyaml>=6.0
|
|
28
|
+
Requires-Dist: jinja2>=3.0
|
|
29
|
+
Requires-Dist: pandas>=1.3
|
|
30
|
+
Requires-Dist: tabulate>=0.9
|
|
31
|
+
Requires-Dist: requests>=2.31
|
|
32
|
+
Provides-Extra: ui
|
|
33
|
+
Requires-Dist: rich>=13.7; extra == "ui"
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
36
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
37
|
+
Provides-Extra: docs
|
|
38
|
+
Requires-Dist: mkdocs>=1.5; extra == "docs"
|
|
39
|
+
Requires-Dist: mkdocs-material>=9.5; extra == "docs"
|
|
40
|
+
Requires-Dist: mkdocstrings[python]>=0.24; extra == "docs"
|
|
41
|
+
Provides-Extra: all
|
|
42
|
+
Requires-Dist: rich>=13.7; extra == "all"
|
|
43
|
+
Requires-Dist: pytest>=7.0; extra == "all"
|
|
44
|
+
Requires-Dist: pytest-cov>=4.0; extra == "all"
|
|
45
|
+
Requires-Dist: mkdocs>=1.5; extra == "all"
|
|
46
|
+
Requires-Dist: mkdocs-material>=9.5; extra == "all"
|
|
47
|
+
Requires-Dist: mkdocstrings[python]>=0.24; extra == "all"
|
|
48
|
+
Dynamic: license-file
|
|
49
|
+
|
|
50
|
+

|
|
51
|
+
|
|
52
|
+
<p align="center">
|
|
53
|
+
<a href="https://github.com/Awni00/slurmkit/actions/workflows/tests.yml"><img src="https://github.com/Awni00/slurmkit/actions/workflows/tests.yml/badge.svg" alt="Unit Tests"></a>
|
|
54
|
+
<a href="https://github.com/Awni00/slurmkit/actions/workflows/docs.yml"><img src="https://github.com/Awni00/slurmkit/actions/workflows/docs.yml/badge.svg" alt="Docs"></a>
|
|
55
|
+
<img src="https://img.shields.io/badge/python-3.8%2B-blue" alt="Python 3.8+">
|
|
56
|
+
<img src="https://img.shields.io/badge/license-MIT-green" alt="MIT License">
|
|
57
|
+
</p>
|
|
58
|
+
|
|
59
|
+
<p align="center">
|
|
60
|
+
<a href="#installation">Install</a> •
|
|
61
|
+
<a href="#quick-start">Quick Start</a> •
|
|
62
|
+
<a href="#features">Features</a> •
|
|
63
|
+
<a href="https://awni00.github.io/slurmkit">Docs</a> •
|
|
64
|
+
<a href="https://deepwiki.com/Awni00/slurmkit">DeepWiki</a>
|
|
65
|
+
</p>
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
<!-- # slurmkit -->
|
|
70
|
+
|
|
71
|
+
A CLI toolkit for managing and generating SLURM jobs.
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
`slurmkit` provides tools for:
|
|
76
|
+
- Auto-discovering and tracking SLURM job status
|
|
77
|
+
- Generating job scripts from templates with parameter sweeps
|
|
78
|
+
- Organizing jobs into trackable collections
|
|
79
|
+
- Cross-cluster job synchronization
|
|
80
|
+
- Cleaning up failed jobs and W&B runs
|
|
81
|
+
|
|
82
|
+
## Installation
|
|
83
|
+
|
|
84
|
+
### Install Latest From GitHub
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
pip install git+https://github.com/Awni00/slurmkit.git
|
|
88
|
+
# include all optional extras (ui + dev + docs)
|
|
89
|
+
pip install "slurmkit[all] @ git+https://github.com/Awni00/slurmkit.git"
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### Clone and Install (Recommended for Development)
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
git clone https://github.com/Awni00/slurmkit.git
|
|
96
|
+
cd slurmkit
|
|
97
|
+
pip install -e ".[all]"
|
|
98
|
+
```
|
|
99
|
+
<!--
|
|
100
|
+
### From PyPI
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
pip install slurmkit
|
|
104
|
+
```
|
|
105
|
+
-->
|
|
106
|
+
|
|
107
|
+
### Dependencies
|
|
108
|
+
|
|
109
|
+
**Required:**
|
|
110
|
+
- Python 3.8+
|
|
111
|
+
- PyYAML
|
|
112
|
+
- Jinja2
|
|
113
|
+
- pandas
|
|
114
|
+
- tabulate
|
|
115
|
+
- requests
|
|
116
|
+
|
|
117
|
+
**Optional:**
|
|
118
|
+
- wandb (for W&B cleanup features)
|
|
119
|
+
- rich (enhanced CLI UI; install with `pip install -e ".[ui]"` from a clone)
|
|
120
|
+
- the `all` extra, which installs every optional group (`ui`, `dev`, `docs`)
|
|
121
|
+
|
|
122
|
+
## Quick Start
|
|
123
|
+
|
|
124
|
+
### 1. Initialize Project
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
cd your-project
|
|
128
|
+
slurmkit init
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
This creates `.slurm-kit/config.yaml` with your settings.
|
|
132
|
+
|
|
133
|
+
### 2. Check Job Status
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
slurmkit status my_experiment
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### 3. Generate Jobs from Template
|
|
140
|
+
|
|
141
|
+
Create a template `templates/train.job.j2`:
|
|
142
|
+
|
|
143
|
+
```jinja2
|
|
144
|
+
#!/bin/bash
|
|
145
|
+
#SBATCH --job-name={{ job_name }}
|
|
146
|
+
#SBATCH --partition={{ slurm.partition }}
|
|
147
|
+
#SBATCH --time={{ slurm.time }}
|
|
148
|
+
#SBATCH --output={{ logs_dir }}/{{ job_name }}.%j.out
|
|
149
|
+
|
|
150
|
+
python train.py --lr {{ learning_rate }} --bs {{ batch_size }}
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
Create a job spec `experiments/exp1/job_spec.yaml`:
|
|
154
|
+
|
|
155
|
+
```yaml
|
|
156
|
+
name: exp1
|
|
157
|
+
template: ../../templates/train.job.j2
|
|
158
|
+
output_dir: job_scripts
|
|
159
|
+
logs_dir: logs
|
|
160
|
+
|
|
161
|
+
parameters:
|
|
162
|
+
mode: grid
|
|
163
|
+
values:
|
|
164
|
+
learning_rate: [0.001, 0.01, 0.1]
|
|
165
|
+
batch_size: [32, 64]
|
|
166
|
+
# Optional: exclude incompatible combinations
|
|
167
|
+
filter:
|
|
168
|
+
file: params_filter.py
|
|
169
|
+
function: include_params
|
|
170
|
+
|
|
171
|
+
slurm_args:
|
|
172
|
+
defaults:
|
|
173
|
+
partition: gpu
|
|
174
|
+
time: "24:00:00"
|
|
175
|
+
|
|
176
|
+
job_name_pattern: "lr{{ learning_rate }}_bs{{ batch_size }}"
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
Generate jobs:
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
slurmkit generate experiments/exp1/job_spec.yaml --collection exp1
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
### 4. Submit Jobs
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
# Preview before actual submission
|
|
189
|
+
slurmkit submit --collection exp1 --dry-run
|
|
190
|
+
|
|
191
|
+
# Submit to SLURM
|
|
192
|
+
slurmkit submit --collection exp1
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
### 5. Monitor and Resubmit
|
|
196
|
+
|
|
197
|
+
```bash
|
|
198
|
+
# Update job states
|
|
199
|
+
slurmkit collection update exp1
|
|
200
|
+
|
|
201
|
+
# View collection status
|
|
202
|
+
slurmkit collection show exp1
|
|
203
|
+
|
|
204
|
+
# View latest effective attempts with primary/history context
|
|
205
|
+
slurmkit collection show exp1 --show-primary --show-history
|
|
206
|
+
|
|
207
|
+
# Rich UI (if installed)
|
|
208
|
+
slurmkit --ui rich collection analyze exp1
|
|
209
|
+
|
|
210
|
+
# Resubmit failed jobs
|
|
211
|
+
slurmkit resubmit --collection exp1 --filter failed
|
|
212
|
+
|
|
213
|
+
# Group-aware retry
|
|
214
|
+
slurmkit resubmit --collection exp1 --filter failed --submission-group retry_after_fix
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
## Testing and Showcase Workflows
|
|
218
|
+
|
|
219
|
+
### A) Local Demo (No SLURM Required)
|
|
220
|
+
|
|
221
|
+
Use the bundled demo project for a deterministic feature showcase:
|
|
222
|
+
|
|
223
|
+
```bash
|
|
224
|
+
cd examples/demo_project
|
|
225
|
+
python -m venv .venv
|
|
226
|
+
source .venv/bin/activate
|
|
227
|
+
pip install -e ../..
|
|
228
|
+
./setup_dummy_jobs.py --include-non-terminal
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
Then run:
|
|
232
|
+
|
|
233
|
+
```bash
|
|
234
|
+
slurmkit collection list
|
|
235
|
+
slurmkit collection show demo_terminal_failed
|
|
236
|
+
slurmkit collection analyze demo_terminal_failed
|
|
237
|
+
# Optional richer formatting (requires rich extra):
|
|
238
|
+
slurmkit --ui rich collection analyze demo_terminal_failed
|
|
239
|
+
slurmkit notify test --dry-run
|
|
240
|
+
slurmkit notify collection-final --collection demo_terminal_failed --job-id 990002 --no-refresh --dry-run
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
### B) Real Cluster Workflow
|
|
244
|
+
|
|
245
|
+
```bash
|
|
246
|
+
slurmkit generate experiments/exp1/job_spec.yaml --collection exp1
|
|
247
|
+
slurmkit submit --collection exp1 --dry-run
|
|
248
|
+
slurmkit submit --collection exp1
|
|
249
|
+
slurmkit status exp1
|
|
250
|
+
slurmkit collection update exp1
|
|
251
|
+
slurmkit collection show exp1
|
|
252
|
+
slurmkit collection analyze exp1 --attempt-mode latest
|
|
253
|
+
slurmkit collection groups exp1
|
|
254
|
+
slurmkit resubmit --collection exp1 --filter failed --dry-run
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
### C) Feature Checklist
|
|
258
|
+
|
|
259
|
+
| Goal | Command | Success signal |
|
|
260
|
+
|------|---------|----------------|
|
|
261
|
+
| Initialize config | `slurmkit init` | `.slurm-kit/config.yaml` created |
|
|
262
|
+
| Generate scripts | `slurmkit generate ... --collection exp1` | Job scripts written and collection updated |
|
|
263
|
+
| Preview submission | `slurmkit submit --collection exp1 --dry-run` | Candidate jobs listed with no submit |
|
|
264
|
+
| Inspect collection | `slurmkit collection show exp1` | Summary + jobs table rendered |
|
|
265
|
+
| Analyze outcomes | `slurmkit collection analyze exp1` | Parameter tables and risky/stable sections shown |
|
|
266
|
+
| Validate notifications | `slurmkit notify test --dry-run` | Route resolution and payload preview |
|
|
267
|
+
|
|
268
|
+
## Commands
|
|
269
|
+
|
|
270
|
+
| Command | Description |
|
|
271
|
+
|---------|-------------|
|
|
272
|
+
| `slurmkit init` | Initialize project configuration |
|
|
273
|
+
| `slurmkit status <exp>` | Show job status for experiment |
|
|
274
|
+
| `slurmkit find <job_id>` | Find output file for job ID |
|
|
275
|
+
| `slurmkit generate <spec>` | Generate job scripts from template |
|
|
276
|
+
| `slurmkit submit` | Submit job scripts |
|
|
277
|
+
| `slurmkit resubmit` | Resubmit failed jobs |
|
|
278
|
+
| `slurmkit notify` | Send job lifecycle notifications |
|
|
279
|
+
| `slurmkit collection` | Manage job collections |
|
|
280
|
+
| `slurmkit clean outputs` | Clean failed job outputs |
|
|
281
|
+
| `slurmkit clean wandb` | Clean failed W&B runs |
|
|
282
|
+
| `slurmkit sync` | Sync job states for cross-cluster |
|
|
283
|
+
|
|
284
|
+
Run `slurmkit <command> --help` for detailed usage.
|
|
285
|
+
|
|
286
|
+
## Configuration
|
|
287
|
+
|
|
288
|
+
Configuration is stored in `.slurm-kit/config.yaml`:
|
|
289
|
+
|
|
290
|
+
```yaml
|
|
291
|
+
jobs_dir: jobs/
|
|
292
|
+
collections_dir: .job-collections/
|
|
293
|
+
sync_dir: .slurm-kit/sync/
|
|
294
|
+
|
|
295
|
+
output_patterns:
|
|
296
|
+
- "{job_name}.{job_id}.out"
|
|
297
|
+
- "{job_name}.{job_id}.*.out"
|
|
298
|
+
- "slurm-{job_id}.out"
|
|
299
|
+
|
|
300
|
+
slurm_defaults:
|
|
301
|
+
partition: gpu
|
|
302
|
+
time: "24:00:00"
|
|
303
|
+
mem: "32G"
|
|
304
|
+
|
|
305
|
+
job_structure:
|
|
306
|
+
scripts_subdir: job_scripts/
|
|
307
|
+
logs_subdir: logs/
|
|
308
|
+
|
|
309
|
+
ui:
|
|
310
|
+
mode: plain # plain | rich | auto
|
|
311
|
+
|
|
312
|
+
notifications:
|
|
313
|
+
defaults:
|
|
314
|
+
events: [job_failed]
|
|
315
|
+
timeout_seconds: 5
|
|
316
|
+
max_attempts: 3
|
|
317
|
+
backoff_seconds: 0.5
|
|
318
|
+
output_tail_lines: 40
|
|
319
|
+
collection_final:
|
|
320
|
+
attempt_mode: latest
|
|
321
|
+
min_support: 3
|
|
322
|
+
top_k: 10
|
|
323
|
+
include_failed_output_tail_lines: 20
|
|
324
|
+
ai:
|
|
325
|
+
enabled: false
|
|
326
|
+
callback: null
|
|
327
|
+
routes:
|
|
328
|
+
- name: team_slack
|
|
329
|
+
type: slack
|
|
330
|
+
url: "${SLACK_WEBHOOK_URL}"
|
|
331
|
+
events: [job_failed, collection_failed]
|
|
332
|
+
- name: team_email
|
|
333
|
+
type: email
|
|
334
|
+
to: ["ops@example.com", "ml@example.com"]
|
|
335
|
+
from: "${SLURMKIT_EMAIL_FROM}"
|
|
336
|
+
smtp_host: "${SMTP_HOST}"
|
|
337
|
+
smtp_port: 587
|
|
338
|
+
smtp_username: "${SMTP_USER}"
|
|
339
|
+
smtp_password: "${SMTP_PASSWORD}"
|
|
340
|
+
smtp_starttls: true
|
|
341
|
+
smtp_ssl: false
|
|
342
|
+
events: [job_failed, collection_failed]
|
|
343
|
+
```
|
|
344
|
+
|
|
345
|
+
### Environment Variables
|
|
346
|
+
|
|
347
|
+
| Variable | Description |
|
|
348
|
+
|----------|-------------|
|
|
349
|
+
| `SLURMKIT_CONFIG` | Path to config file |
|
|
350
|
+
| `SLURMKIT_JOBS_DIR` | Jobs directory |
|
|
351
|
+
| `SLURMKIT_COLLECTIONS_DIR` | Collections directory |
|
|
352
|
+
| `SLURMKIT_WANDB_ENTITY` | W&B entity |
|
|
353
|
+
| `SLURMKIT_DRY_RUN` | Enable dry-run mode |
|
|
354
|
+
|
|
355
|
+
## Documentation
|
|
356
|
+
|
|
357
|
+
Full documentation is available at [https://awni00.github.io/slurmkit/](https://awni00.github.io/slurmkit/)
|
|
358
|
+
|
|
359
|
+
- [Getting Started](docs/getting-started.md)
|
|
360
|
+
- [Configuration](docs/configuration.md)
|
|
361
|
+
- [Job Generation](docs/job-generation.md)
|
|
362
|
+
- [Collections](docs/collections.md)
|
|
363
|
+
- [Notifications](docs/notifications.md)
|
|
364
|
+
- [Cross-Cluster Sync](docs/sync.md)
|
|
365
|
+
- [CLI Reference](docs/cli-reference.md)
|
|
366
|
+
|
|
367
|
+
## Project Structure
|
|
368
|
+
|
|
369
|
+
```
|
|
370
|
+
your-project/
|
|
371
|
+
├── .slurm-kit/
|
|
372
|
+
│ ├── config.yaml # Project configuration
|
|
373
|
+
│ └── sync/ # Cross-cluster sync files
|
|
374
|
+
├── .job-collections/ # Collection YAML files
|
|
375
|
+
├── jobs/
|
|
376
|
+
│ └── experiment1/
|
|
377
|
+
│ ├── job_scripts/ # Generated job scripts
|
|
378
|
+
│ └── logs/ # Job output files
|
|
379
|
+
└── templates/ # Jinja2 job templates
|
|
380
|
+
```
|
|
381
|
+
|
|
382
|
+
## Features
|
|
383
|
+
|
|
384
|
+
Key features at a glance:
|
|
385
|
+
|
|
386
|
+
**1) Job Creation**
|
|
387
|
+
|
|
388
|
+
- Generate parameterized job scripts and attach them to a collection: `slurmkit generate job_spec.yaml --collection exp1`
|
|
389
|
+
- Preview generation and submission safely: `slurmkit generate ... --dry-run`, `slurmkit submit ... --dry-run`
|
|
390
|
+
- Submit only unsubmitted collection jobs (default): `slurmkit submit --collection exp1 --filter unsubmitted`
|
|
391
|
+
|
|
392
|
+
**2) Collection Tracking and Analysis**
|
|
393
|
+
|
|
394
|
+
- Create, inspect, and refresh collections: `slurmkit collection create exp1`, `slurmkit collection show exp1`, `slurmkit collection update exp1`
|
|
395
|
+
- Analyze outcomes by parameter values and latest attempts: `slurmkit collection analyze exp1 --attempt-mode latest --top-k 10`
|
|
396
|
+
- Inspect resubmission waves and attempt history: `slurmkit collection groups exp1`, `slurmkit collection show exp1 --show-history`
|
|
397
|
+
- Resubmit failed jobs with optional selection and parameter callbacks that programmatically specify which jobs are resubmitted and whether to include additional parameters in the resubmission (e.g., checkpoint dir): `slurmkit resubmit --collection exp1 --filter failed --select-file callbacks.py --extra-params-file extra.py`
|
|
398
|
+
|
|
399
|
+
**3) Notifications and Cross-Cluster Sync**
|
|
400
|
+
|
|
401
|
+
- Validate routes and send job notifications: `slurmkit notify test`, `slurmkit notify job ...`
|
|
402
|
+
- Send one final collection-level summary when a collection reaches terminal state: `slurmkit notify collection-final ...`
|
|
403
|
+
- Sync collection/job state across clusters via git-backed files: `slurmkit sync --push`
|
|
404
|
+
|
|
405
|
+
### Job Collections
|
|
406
|
+
|
|
407
|
+
Track related jobs together:
|
|
408
|
+
|
|
409
|
+
```bash
|
|
410
|
+
# Create collection
|
|
411
|
+
slurmkit collection create my_exp --description "Training sweep"
|
|
412
|
+
|
|
413
|
+
# List collections
|
|
414
|
+
slurmkit collection list
|
|
415
|
+
|
|
416
|
+
# Show details
|
|
417
|
+
slurmkit collection show my_exp --state failed
|
|
418
|
+
slurmkit collection show my_exp --attempt-mode latest --show-primary
|
|
419
|
+
|
|
420
|
+
# Update states from SLURM
|
|
421
|
+
slurmkit collection update my_exp
|
|
422
|
+
|
|
423
|
+
# Submission-group summary
|
|
424
|
+
slurmkit collection groups my_exp
|
|
425
|
+
```
|
|
426
|
+
|
|
427
|
+
### Notifications
|
|
428
|
+
|
|
429
|
+
Send job lifecycle notifications to Slack, Discord, email, or generic webhooks:
|
|
430
|
+
|
|
431
|
+
```bash
|
|
432
|
+
# Validate route setup
|
|
433
|
+
slurmkit notify test
|
|
434
|
+
slurmkit notify test --route team_email --dry-run
|
|
435
|
+
|
|
436
|
+
# Typical end-of-job call from script (default: notify only on failure)
|
|
437
|
+
slurmkit notify job --job-id "$SLURM_JOB_ID" --exit-code "$rc"
|
|
438
|
+
|
|
439
|
+
# Collection-final summary notification (emits only when collection is terminal)
|
|
440
|
+
slurmkit notify collection-final --job-id "$SLURM_JOB_ID"
|
|
441
|
+
```
|
|
442
|
+
|
|
443
|
+
Recommended trap snippet inside a job script:
|
|
444
|
+
|
|
445
|
+
```bash
|
|
446
|
+
rc=$?
|
|
447
|
+
slurmkit notify job --job-id "${SLURM_JOB_ID}" --exit-code "${rc}"
|
|
448
|
+
slurmkit notify collection-final --job-id "${SLURM_JOB_ID}"
|
|
449
|
+
exit "${rc}"
|
|
450
|
+
```
|
|
451
|
+
|
|
452
|
+
### Parameter Sweeps
|
|
453
|
+
|
|
454
|
+
Generate jobs from parameter grids:
|
|
455
|
+
|
|
456
|
+
```yaml
|
|
457
|
+
parameters:
|
|
458
|
+
mode: grid
|
|
459
|
+
values:
|
|
460
|
+
learning_rate: [0.001, 0.01, 0.1]
|
|
461
|
+
batch_size: [32, 64, 128]
|
|
462
|
+
model: [resnet18, resnet50]
|
|
463
|
+
```
|
|
464
|
+
|
|
465
|
+
Or explicit lists:
|
|
466
|
+
|
|
467
|
+
```yaml
|
|
468
|
+
parameters:
|
|
469
|
+
mode: list
|
|
470
|
+
values:
|
|
471
|
+
- {lr: 0.001, bs: 32}
|
|
472
|
+
- {lr: 0.01, bs: 64}
|
|
473
|
+
```
|
|
474
|
+
|
|
475
|
+
### Dynamic SLURM Arguments
|
|
476
|
+
|
|
477
|
+
Use Python functions for complex resource logic:
|
|
478
|
+
|
|
479
|
+
```python
|
|
480
|
+
# slurm_logic.py
|
|
481
|
+
def get_slurm_args(params, defaults):
|
|
482
|
+
args = defaults.copy()
|
|
483
|
+
if params.get('model') == 'resnet50':
|
|
484
|
+
args['mem'] = '64G'
|
|
485
|
+
args['gpus'] = 2
|
|
486
|
+
return args
|
|
487
|
+
```
|
|
488
|
+
|
|
489
|
+
### Cross-Cluster Sync
|
|
490
|
+
|
|
491
|
+
Share job status across clusters via git:
|
|
492
|
+
|
|
493
|
+
```bash
|
|
494
|
+
# On cluster A
|
|
495
|
+
slurmkit sync --push
|
|
496
|
+
|
|
497
|
+
# On cluster B
|
|
498
|
+
git pull
|
|
499
|
+
slurmkit collection show my_exp
|
|
500
|
+
```
|
|
501
|
+
|
|
502
|
+
## Development
|
|
503
|
+
|
|
504
|
+
### Setup
|
|
505
|
+
|
|
506
|
+
We recommend using [uv](https://github.com/astral-sh/uv) to manage the development environment.
|
|
507
|
+
|
|
508
|
+
```bash
|
|
509
|
+
# Clone the repository
|
|
510
|
+
git clone https://github.com/Awni00/slurmkit.git
|
|
511
|
+
cd slurmkit
|
|
512
|
+
|
|
513
|
+
# Create a virtual environment and install dependencies in editable mode
|
|
514
|
+
uv venv
|
|
515
|
+
source .venv/bin/activate
|
|
516
|
+
uv pip install -e ".[dev]"
|
|
517
|
+
```
|
|
518
|
+
|
|
519
|
+
### Running Tests
|
|
520
|
+
|
|
521
|
+
```bash
|
|
522
|
+
pytest
|
|
523
|
+
```
|
|
524
|
+
|
|
525
|
+
## License
|
|
526
|
+
|
|
527
|
+
MIT License - see [LICENSE](LICENSE) for details.
|