snakemake-executor-plugin-vastai 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snakemake_executor_plugin_vastai-0.1.0/.claude/settings.local.json +11 -0
- snakemake_executor_plugin_vastai-0.1.0/.claude/skills/vastai-executor/SKILL.md +59 -0
- snakemake_executor_plugin_vastai-0.1.0/.gitignore +9 -0
- snakemake_executor_plugin_vastai-0.1.0/LICENSE +21 -0
- snakemake_executor_plugin_vastai-0.1.0/PKG-INFO +199 -0
- snakemake_executor_plugin_vastai-0.1.0/README.md +176 -0
- snakemake_executor_plugin_vastai-0.1.0/pyproject.toml +40 -0
- snakemake_executor_plugin_vastai-0.1.0/snakemake_executor_plugin_vastai/__init__.py +846 -0
- snakemake_executor_plugin_vastai-0.1.0/snakemake_executor_plugin_vastai/_common.py +38 -0
- snakemake_executor_plugin_vastai-0.1.0/snakemake_executor_plugin_vastai/sshtransfer.py +340 -0
- snakemake_executor_plugin_vastai-0.1.0/tests/test_credentials.py +44 -0
- snakemake_executor_plugin_vastai-0.1.0/tests/test_e2e.py +81 -0
- snakemake_executor_plugin_vastai-0.1.0/tests/test_image.py +41 -0
- snakemake_executor_plugin_vastai-0.1.0/tests/test_lifecycle.py +162 -0
- snakemake_executor_plugin_vastai-0.1.0/tests/test_query.py +69 -0
- snakemake_executor_plugin_vastai-0.1.0/tests/test_scripts.py +65 -0
- snakemake_executor_plugin_vastai-0.1.0/tests/test_sshtransfer.py +128 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"Bash(uv pip *)",
|
|
5
|
+
"Read(//private/tmp/vastai-smoke/**)",
|
|
6
|
+
"Bash(VAST_API_KEY=invalid-test-key /tmp/vastai-research/bin/snakemake --executor vastai --jobs 1 --default-storage-provider fs --default-storage-prefix /tmp/vastai-smoke-storage --verbose)",
|
|
7
|
+
"WebFetch(domain:docs.vast.ai)",
|
|
8
|
+
"WebSearch"
|
|
9
|
+
]
|
|
10
|
+
}
|
|
11
|
+
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: vastai-executor
|
|
3
|
+
description: Testing the snakemake-executor-plugin-vastai (unit + live e2e on real GPUs), debugging live instances, and the design decisions behind the plugin. Use when working on this repo, running its tests, or debugging Vast.ai job failures.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Testing
|
|
7
|
+
|
|
8
|
+
- Unit (free, fast): `uv run pytest` — covers query building, status state
|
|
9
|
+
machine, scripts, credentials.
|
|
10
|
+
- Live e2e (rents a real GPU, ~$0.02–0.05, needs `VAST_API_KEY` with credit):
|
|
11
|
+
`SNAKEMAKE_VASTAI_E2E=1 uv run pytest tests/test_e2e.py -s`
|
|
12
|
+
- After ANY live run, verify nothing is still billing:
|
|
13
|
+
`vastai show instances --raw` must show no `snakemake-*` labels. Destroy
|
|
14
|
+
stragglers immediately (`vastai destroy instance <id> -y`).
|
|
15
|
+
|
|
16
|
+
## Debugging a live run
|
|
17
|
+
|
|
18
|
+
- Run snakemake from a git-initialized scratch dir (source archiving needs git).
|
|
19
|
+
- Status: `VastAI().show_instance(id)` → `actual_status` + `status_msg`.
|
|
20
|
+
Container logs: `VastAI().logs(id, tail='200')`.
|
|
21
|
+
- SSH-mode instances: connect with the per-run key
|
|
22
|
+
`.snakemake/tmp.*/vastai_ssh_key`; remote workdir is `/snakemake-workdir`
|
|
23
|
+
(`job.sh`, `job.log`, `exit_code`, `env.sh`).
|
|
24
|
+
- Watch costs: prefer `--vastai-max-price`, small `--vastai-disk`, EU
|
|
25
|
+
datacenter offers (~$0.06–0.11/h for RTX 3060-class).
|
|
26
|
+
|
|
27
|
+
# Design decisions (all verified on real instances, 2026-06)
|
|
28
|
+
|
|
29
|
+
- **Two modes**, chosen by whether `--default-storage-provider` is set:
|
|
30
|
+
- *Storage mode*: entrypoint containers (`runtype="args"`); completion =
|
|
31
|
+
instance `exited` + `snakemake_vastai_exit_code=N` log sentinel; logs
|
|
32
|
+
fetched once at finalization.
|
|
33
|
+
- *SSH mode* (zero config): `runtype="ssh_proxy"` + per-job thread in
|
|
34
|
+
`sshtransfer.py` (scp sources/inputs, detached run, poll `exit_code`,
|
|
35
|
+
scp outputs). Enabled by `can_transfer_local_files=True`; in storage
|
|
36
|
+
mode the `Executor.common_settings` property presents a doctored copy
|
|
37
|
+
so core's source-deploy precommand is regenerated, and the executor
|
|
38
|
+
calls `workflow.upload_sources()` itself.
|
|
39
|
+
- Plain `runtype="ssh"` never gets the proxy tunnel wired — must be `ssh_proxy`.
|
|
40
|
+
- **Default image** `vastai/pytorch:@vastai-automatic-tag`: driver-matched
|
|
41
|
+
CUDA + OpenSSH (`snakemake/snakemake` has neither). Snakemake and the
|
|
42
|
+
locally installed storage plugins are pip-installed in-job, version-pinned
|
|
43
|
+
(their settings leak into spawned CLI args, e.g. `--storage-s3-retries`).
|
|
44
|
+
- **SSH keys via onstart** writing `authorized_keys` — the attach-key API on
|
|
45
|
+
a live instance is racy; account keys would leak across runs. apt-based
|
|
46
|
+
OpenSSH bootstrap for custom Debian images (conda installs are unusable).
|
|
47
|
+
- Non-interactive SSH doesn't get the image ENV PATH → `PYTHON_PATH_SETUP`
|
|
48
|
+
prelude discovers python, preferring `/venv/main` (where torch lives).
|
|
49
|
+
- **`datacenter=true` is the search default**: hobbyist hosts routinely hit
|
|
50
|
+
Docker Hub pull rate limits. Unrecoverable boot errors
|
|
51
|
+
(`FATAL_BOOT_ERRORS`) fail fast so `--retries` resubmits elsewhere; a
|
|
52
|
+
stuck proxy tunnel triggers one instance reboot.
|
|
53
|
+
- Interface bug: bool settings with `default=True` break CLI parsing
|
|
54
|
+
(argparse_dataclass renames the flag) → only default-False bools, hence
|
|
55
|
+
`--vastai-no-datacenter`, `--vastai-no-forward-credentials`.
|
|
56
|
+
- Credentials: snakemake core forwards storage-plugin settings itself; the
|
|
57
|
+
plugin additionally forwards ambient `AWS_*`/`AZURE_STORAGE_*` vars and
|
|
58
|
+
ships the GCP credentials *file content* base64-encoded (path-based
|
|
59
|
+
GOOGLE_APPLICATION_CREDENTIALS is useless remotely).
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 bards.ai
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: snakemake-executor-plugin-vastai
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Snakemake executor plugin for running jobs on Vast.ai GPU instances
|
|
5
|
+
Project-URL: Repository, https://github.com/bards-ai/snakemake-executor-plugin-vastai
|
|
6
|
+
Project-URL: Documentation, https://snakemake.github.io/snakemake-plugin-catalog/plugins/executor/vastai.html
|
|
7
|
+
Author-email: Michał Pogoda <michal.pogoda@bards.ai>
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: cloud,executor,gpu,plugin,snakemake,vastai
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: GPU :: NVIDIA CUDA
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
17
|
+
Classifier: Topic :: System :: Distributed Computing
|
|
18
|
+
Requires-Python: >=3.11
|
|
19
|
+
Requires-Dist: snakemake-interface-common<2.0.0,>=1.17.0
|
|
20
|
+
Requires-Dist: snakemake-interface-executor-plugins<10.0.0,>=9.3.0
|
|
21
|
+
Requires-Dist: vastai<2.0.0,>=1.0.13
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
# snakemake-executor-plugin-vastai
|
|
25
|
+
|
|
26
|
+
Run [Snakemake](https://snakemake.github.io) jobs on cheap
|
|
27
|
+
[Vast.ai](https://vast.ai) GPUs. For every job the plugin rents the cheapest
|
|
28
|
+
marketplace offer matching the job's resources, runs it in a Docker
|
|
29
|
+
container, ships the files back, and destroys the instance.
|
|
30
|
+
|
|
31
|
+
## Quickstart
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install snakemake-executor-plugin-vastai
|
|
35
|
+
export VAST_API_KEY=... # https://console.vast.ai/manage-keys/
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Write a Snakefile as usual — resources decide what gets rented:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
rule train:
|
|
42
|
+
input: "data/train.parquet"
|
|
43
|
+
output: "models/model.pt"
|
|
44
|
+
threads: 8
|
|
45
|
+
resources:
|
|
46
|
+
gpu=1,
|
|
47
|
+
gpu_model="RTX_4090",
|
|
48
|
+
mem_mb=32000,
|
|
49
|
+
shell:
|
|
50
|
+
"python train.py {input} {output}"
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Run it:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
snakemake --executor vastai --jobs 4
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
That's all — no bucket, no shared filesystem, no cluster setup. Jobs run in
|
|
60
|
+
Vast.ai's PyTorch CUDA image, and every instance is destroyed as soon as
|
|
61
|
+
its job finishes, fails, or you hit Ctrl-C.
|
|
62
|
+
|
|
63
|
+
### What ends up on the machine
|
|
64
|
+
|
|
65
|
+
Each instance receives two uploads before the job starts:
|
|
66
|
+
|
|
67
|
+
1. **Your code**, as Snakemake's source archive: the Snakefile (plus
|
|
68
|
+
anything you `include:`), config files, `script:`/`notebook:` files, and
|
|
69
|
+
**every git-tracked file** in the workflow directory. In the example
|
|
70
|
+
above, `train.py` gets there this way — which is why the workflow must
|
|
71
|
+
be a git repository and why an untracked script would be
|
|
72
|
+
missing remotely (`git add` is enough, no commit needed). Files over
|
|
73
|
+
10 MB are skipped — declare those as `input:` instead.
|
|
74
|
+
2. **The job's declared `input:` files** (`data/train.parquet` above),
|
|
75
|
+
preserving their relative paths.
|
|
76
|
+
|
|
77
|
+
The job then runs in the same relative layout, and only its declared
|
|
78
|
+
`output:` (and `log:`) files are copied back to your machine. Anything else
|
|
79
|
+
written on the instance is discarded when the instance is destroyed.
|
|
80
|
+
|
|
81
|
+
## Configuration
|
|
82
|
+
|
|
83
|
+
Pin your defaults in `profiles/default/config.yaml` next to the Snakefile —
|
|
84
|
+
Snakemake loads it automatically, so the command stays plain `snakemake`:
|
|
85
|
+
|
|
86
|
+
```yaml
|
|
87
|
+
executor: vastai
|
|
88
|
+
jobs: 4
|
|
89
|
+
vastai-max-price: 1.0 # $/h cap per instance
|
|
90
|
+
vastai-geolocation: EU # region shortcut or country codes (PL,DE,CZ)
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
| Option | Default | Description |
|
|
94
|
+
|---|---|---|
|
|
95
|
+
| `--vastai-api-key` | – | API key (or `VAST_API_KEY` / `SNAKEMAKE_VASTAI_API_KEY`) |
|
|
96
|
+
| `--vastai-gpu-name` | any | Default GPU model, e.g. `RTX_4090`, `H100_SXM` |
|
|
97
|
+
| `--vastai-max-price` | – | Max price per instance in $/h (`dph_total`) |
|
|
98
|
+
| `--vastai-geolocation` | any | Region shortcut (EU, NA, AS, AF, LC, OC) or country codes |
|
|
99
|
+
| `--vastai-disk` | 40 | Disk allocation per instance (GB) |
|
|
100
|
+
| `--vastai-image` | `vastai/pytorch:@vastai-automatic-tag` | Docker image for jobs |
|
|
101
|
+
| `--vastai-reliability` | 0.98 | Minimum host reliability (0 disables) |
|
|
102
|
+
| `--vastai-no-datacenter` | off | Also allow non-datacenter (hobbyist) hosts |
|
|
103
|
+
| `--vastai-order` | `dph_total` | Offer ranking (e.g. `dlperf_usd-` for perf/$) |
|
|
104
|
+
| `--vastai-search-query` | – | Extra offer filters (vastai query syntax) |
|
|
105
|
+
| `--vastai-boot-timeout` | 1800 | Max seconds for an instance to start running |
|
|
106
|
+
| `--vastai-no-forward-credentials` | off | Don't forward local cloud credentials to jobs |
|
|
107
|
+
| `--vastai-keep-instances` | off | Don't destroy instances (debugging; **keeps billing!**) |
|
|
108
|
+
|
|
109
|
+
By default only verified datacenter hosts are rented — slightly pricier,
|
|
110
|
+
but they avoid the most common marketplace flakiness (Docker Hub pull rate
|
|
111
|
+
limits, slow residential uplinks). `--vastai-no-datacenter` unlocks the
|
|
112
|
+
cheapest hobbyist offers.
|
|
113
|
+
|
|
114
|
+
### Per-job resources
|
|
115
|
+
|
|
116
|
+
| Resource | Effect on the offer search |
|
|
117
|
+
|---|---|
|
|
118
|
+
| `gpu` / `nvidia_gpu` | `num_gpus=N` (minimum 1 — Vast.ai only rents GPU machines) |
|
|
119
|
+
| `gpu_model` | GPU model (Vast.ai naming, underscores for spaces) |
|
|
120
|
+
| `threads` | `cpu_cores_effective>=N` |
|
|
121
|
+
| `mem_mb` | minimum system RAM |
|
|
122
|
+
| `disk_mb` | minimum disk and rented allocation |
|
|
123
|
+
| `vastai_query` | extra filters, appended verbatim (e.g. `"cuda_vers>=12.4"`) |
|
|
124
|
+
|
|
125
|
+
### Container image
|
|
126
|
+
|
|
127
|
+
Jobs run in `vastai/pytorch:@vastai-automatic-tag` by default — Vast.ai's
|
|
128
|
+
curated PyTorch image with CUDA matched to each machine's driver, usually
|
|
129
|
+
cached on hosts. Snakemake is pip-installed into the container automatically
|
|
130
|
+
(pinned to your local version, ~30 s per instance).
|
|
131
|
+
|
|
132
|
+
For other stacks set `--vastai-image`. Requirements: `python` on `PATH`;
|
|
133
|
+
for SSH mode also OpenSSH (auto-installed via apt on Debian/Ubuntu images
|
|
134
|
+
if missing). Bake `pip install snakemake` into the image to skip the
|
|
135
|
+
bootstrap cost.
|
|
136
|
+
|
|
137
|
+
## File transfer
|
|
138
|
+
|
|
139
|
+
**SSH mode** (the default, used when no storage is configured): the source
|
|
140
|
+
archive and input files are scp'd from your machine to each instance, and
|
|
141
|
+
outputs are scp'd back (see "What ends up on the machine" above). Zero
|
|
142
|
+
setup, ideal for getting started and small/medium data. Caveats: everything
|
|
143
|
+
flows through your uplink, intermediate files between dependent jobs
|
|
144
|
+
round-trip through your machine, and the Snakemake process must stay
|
|
145
|
+
online.
|
|
146
|
+
|
|
147
|
+
**Storage mode** (recommended for real pipelines): the same things move,
|
|
148
|
+
but through a bucket instead of your machine — Snakemake uploads the source
|
|
149
|
+
archive once per run, and each job downloads the archive plus its inputs
|
|
150
|
+
from the bucket and uploads its outputs there (your laptop only orchestrates).
|
|
151
|
+
This is faster and resumable. Example profile:
|
|
152
|
+
|
|
153
|
+
```yaml
|
|
154
|
+
executor: vastai
|
|
155
|
+
jobs: 4
|
|
156
|
+
default-storage-provider: s3
|
|
157
|
+
default-storage-prefix: s3://my-bucket/my-workflow
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Install the matching storage plugin (`pip install
|
|
161
|
+
snakemake-storage-plugin-s3`, or `-gcs` / `-azure`). Any S3-compatible
|
|
162
|
+
service works: AWS S3, MinIO, Cloudflare R2, Backblaze B2, …
|
|
163
|
+
|
|
164
|
+
### Credentials
|
|
165
|
+
|
|
166
|
+
Storage credentials reach the jobs automatically — no `--envvars` needed:
|
|
167
|
+
local `AWS_*` and `AZURE_STORAGE_*` variables are forwarded into the job
|
|
168
|
+
containers, and the Google credentials file
|
|
169
|
+
(`GOOGLE_APPLICATION_CREDENTIALS` or gcloud ADC) is shipped and materialized
|
|
170
|
+
inside the container. Settings configured on the storage plugin itself
|
|
171
|
+
(`SNAKEMAKE_STORAGE_S3_ACCESS_KEY` etc.) are forwarded by Snakemake core.
|
|
172
|
+
|
|
173
|
+
Notes: credentials living only in `~/.aws/credentials` are not forwarded —
|
|
174
|
+
export them as `AWS_*` variables. Other secrets your rules need (e.g.
|
|
175
|
+
`HF_TOKEN`) go through the standard `envvars:` directive or `--envvars`.
|
|
176
|
+
Forwarded credentials are visible inside containers on third-party hosts —
|
|
177
|
+
use scoped, revocable keys.
|
|
178
|
+
|
|
179
|
+
## Debugging & costs
|
|
180
|
+
|
|
181
|
+
- Remote job logs land in `.snakemake/auxiliary/vastai-logs/`; `--verbose`
|
|
182
|
+
prints the generated offer queries.
|
|
183
|
+
- `--vastai-keep-instances` keeps instances alive for inspection
|
|
184
|
+
(`vastai logs <id>`) — destroy them manually, they bill until then.
|
|
185
|
+
- Flaky hosts happen on a marketplace: unrecoverable boot errors fail fast,
|
|
186
|
+
so run with `--retries 2` to resubmit on a different machine.
|
|
187
|
+
- The plugin destroys every instance it rents, even on failure or Ctrl-C.
|
|
188
|
+
After a hard kill (`kill -9`), check
|
|
189
|
+
https://console.vast.ai/instances/ for leftovers.
|
|
190
|
+
- Each job rents its own instance, so prefer fewer, larger jobs (or job
|
|
191
|
+
grouping) over many tiny ones — boot overhead is paid per job.
|
|
192
|
+
|
|
193
|
+
## Development
|
|
194
|
+
|
|
195
|
+
```bash
|
|
196
|
+
uv sync
|
|
197
|
+
uv run pytest # unit tests, free
|
|
198
|
+
SNAKEMAKE_VASTAI_E2E=1 uv run pytest tests/test_e2e.py -s # rents a real GPU (~$0.05)
|
|
199
|
+
```
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
# snakemake-executor-plugin-vastai
|
|
2
|
+
|
|
3
|
+
Run [Snakemake](https://snakemake.github.io) jobs on cheap
|
|
4
|
+
[Vast.ai](https://vast.ai) GPUs. For every job the plugin rents the cheapest
|
|
5
|
+
marketplace offer matching the job's resources, runs it in a Docker
|
|
6
|
+
container, ships the files back, and destroys the instance.
|
|
7
|
+
|
|
8
|
+
## Quickstart
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
pip install snakemake-executor-plugin-vastai
|
|
12
|
+
export VAST_API_KEY=... # https://console.vast.ai/manage-keys/
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Write a Snakefile as usual — resources decide what gets rented:
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
rule train:
|
|
19
|
+
input: "data/train.parquet"
|
|
20
|
+
output: "models/model.pt"
|
|
21
|
+
threads: 8
|
|
22
|
+
resources:
|
|
23
|
+
gpu=1,
|
|
24
|
+
gpu_model="RTX_4090",
|
|
25
|
+
mem_mb=32000,
|
|
26
|
+
shell:
|
|
27
|
+
"python train.py {input} {output}"
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Run it:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
snakemake --executor vastai --jobs 4
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
That's all — no bucket, no shared filesystem, no cluster setup. Jobs run in
|
|
37
|
+
Vast.ai's PyTorch CUDA image, and every instance is destroyed as soon as
|
|
38
|
+
its job finishes, fails, or you hit Ctrl-C.
|
|
39
|
+
|
|
40
|
+
### What ends up on the machine
|
|
41
|
+
|
|
42
|
+
Each instance receives two uploads before the job starts:
|
|
43
|
+
|
|
44
|
+
1. **Your code**, as Snakemake's source archive: the Snakefile (plus
|
|
45
|
+
anything you `include:`), config files, `script:`/`notebook:` files, and
|
|
46
|
+
**every git-tracked file** in the workflow directory. In the example
|
|
47
|
+
above, `train.py` gets there this way — which is why the workflow must
|
|
48
|
+
be a git repository and why an untracked script would be
|
|
49
|
+
missing remotely (`git add` is enough, no commit needed). Files over
|
|
50
|
+
10 MB are skipped — declare those as `input:` instead.
|
|
51
|
+
2. **The job's declared `input:` files** (`data/train.parquet` above),
|
|
52
|
+
preserving their relative paths.
|
|
53
|
+
|
|
54
|
+
The job then runs in the same relative layout, and only its declared
|
|
55
|
+
`output:` (and `log:`) files are copied back to your machine. Anything else
|
|
56
|
+
written on the instance is discarded when the instance is destroyed.
|
|
57
|
+
|
|
58
|
+
## Configuration
|
|
59
|
+
|
|
60
|
+
Pin your defaults in `profiles/default/config.yaml` next to the Snakefile —
|
|
61
|
+
Snakemake loads it automatically, so the command stays plain `snakemake`:
|
|
62
|
+
|
|
63
|
+
```yaml
|
|
64
|
+
executor: vastai
|
|
65
|
+
jobs: 4
|
|
66
|
+
vastai-max-price: 1.0 # $/h cap per instance
|
|
67
|
+
vastai-geolocation: EU # region shortcut or country codes (PL,DE,CZ)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
| Option | Default | Description |
|
|
71
|
+
|---|---|---|
|
|
72
|
+
| `--vastai-api-key` | – | API key (or `VAST_API_KEY` / `SNAKEMAKE_VASTAI_API_KEY`) |
|
|
73
|
+
| `--vastai-gpu-name` | any | Default GPU model, e.g. `RTX_4090`, `H100_SXM` |
|
|
74
|
+
| `--vastai-max-price` | – | Max price per instance in $/h (`dph_total`) |
|
|
75
|
+
| `--vastai-geolocation` | any | Region shortcut (EU, NA, AS, AF, LC, OC) or country codes |
|
|
76
|
+
| `--vastai-disk` | 40 | Disk allocation per instance (GB) |
|
|
77
|
+
| `--vastai-image` | `vastai/pytorch:@vastai-automatic-tag` | Docker image for jobs |
|
|
78
|
+
| `--vastai-reliability` | 0.98 | Minimum host reliability (0 disables) |
|
|
79
|
+
| `--vastai-no-datacenter` | off | Also allow non-datacenter (hobbyist) hosts |
|
|
80
|
+
| `--vastai-order` | `dph_total` | Offer ranking (e.g. `dlperf_usd-` for perf/$) |
|
|
81
|
+
| `--vastai-search-query` | – | Extra offer filters (vastai query syntax) |
|
|
82
|
+
| `--vastai-boot-timeout` | 1800 | Max seconds for an instance to start running |
|
|
83
|
+
| `--vastai-no-forward-credentials` | off | Don't forward local cloud credentials to jobs |
|
|
84
|
+
| `--vastai-keep-instances` | off | Don't destroy instances (debugging; **keeps billing!**) |
|
|
85
|
+
|
|
86
|
+
By default only verified datacenter hosts are rented — slightly pricier,
|
|
87
|
+
but they avoid the most common marketplace flakiness (Docker Hub pull rate
|
|
88
|
+
limits, slow residential uplinks). `--vastai-no-datacenter` unlocks the
|
|
89
|
+
cheapest hobbyist offers.
|
|
90
|
+
|
|
91
|
+
### Per-job resources
|
|
92
|
+
|
|
93
|
+
| Resource | Effect on the offer search |
|
|
94
|
+
|---|---|
|
|
95
|
+
| `gpu` / `nvidia_gpu` | `num_gpus=N` (minimum 1 — Vast.ai only rents GPU machines) |
|
|
96
|
+
| `gpu_model` | GPU model (Vast.ai naming, underscores for spaces) |
|
|
97
|
+
| `threads` | `cpu_cores_effective>=N` |
|
|
98
|
+
| `mem_mb` | minimum system RAM |
|
|
99
|
+
| `disk_mb` | minimum disk and rented allocation |
|
|
100
|
+
| `vastai_query` | extra filters, appended verbatim (e.g. `"cuda_vers>=12.4"`) |
|
|
101
|
+
|
|
102
|
+
### Container image
|
|
103
|
+
|
|
104
|
+
Jobs run in `vastai/pytorch:@vastai-automatic-tag` by default — Vast.ai's
|
|
105
|
+
curated PyTorch image with CUDA matched to each machine's driver, usually
|
|
106
|
+
cached on hosts. Snakemake is pip-installed into the container automatically
|
|
107
|
+
(pinned to your local version, ~30 s per instance).
|
|
108
|
+
|
|
109
|
+
For other stacks set `--vastai-image`. Requirements: `python` on `PATH`;
|
|
110
|
+
for SSH mode also OpenSSH (auto-installed via apt on Debian/Ubuntu images
|
|
111
|
+
if missing). Bake `pip install snakemake` into the image to skip the
|
|
112
|
+
bootstrap cost.
|
|
113
|
+
|
|
114
|
+
## File transfer
|
|
115
|
+
|
|
116
|
+
**SSH mode** (the default, used when no storage is configured): the source
|
|
117
|
+
archive and input files are scp'd from your machine to each instance, and
|
|
118
|
+
outputs are scp'd back (see "What ends up on the machine" above). Zero
|
|
119
|
+
setup, ideal for getting started and small/medium data. Caveats: everything
|
|
120
|
+
flows through your uplink, intermediate files between dependent jobs
|
|
121
|
+
round-trip through your machine, and the Snakemake process must stay
|
|
122
|
+
online.
|
|
123
|
+
|
|
124
|
+
**Storage mode** (recommended for real pipelines): the same things move,
|
|
125
|
+
but through a bucket instead of your machine — Snakemake uploads the source
|
|
126
|
+
archive once per run, and each job downloads the archive plus its inputs
|
|
127
|
+
from the bucket and uploads its outputs there (your laptop only orchestrates).
|
|
128
|
+
This is faster and resumable. Example profile:
|
|
129
|
+
|
|
130
|
+
```yaml
|
|
131
|
+
executor: vastai
|
|
132
|
+
jobs: 4
|
|
133
|
+
default-storage-provider: s3
|
|
134
|
+
default-storage-prefix: s3://my-bucket/my-workflow
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
Install the matching storage plugin (`pip install
|
|
138
|
+
snakemake-storage-plugin-s3`, or `-gcs` / `-azure`). Any S3-compatible
|
|
139
|
+
service works: AWS S3, MinIO, Cloudflare R2, Backblaze B2, …
|
|
140
|
+
|
|
141
|
+
### Credentials
|
|
142
|
+
|
|
143
|
+
Storage credentials reach the jobs automatically — no `--envvars` needed:
|
|
144
|
+
local `AWS_*` and `AZURE_STORAGE_*` variables are forwarded into the job
|
|
145
|
+
containers, and the Google credentials file
|
|
146
|
+
(`GOOGLE_APPLICATION_CREDENTIALS` or gcloud ADC) is shipped and materialized
|
|
147
|
+
inside the container. Settings configured on the storage plugin itself
|
|
148
|
+
(`SNAKEMAKE_STORAGE_S3_ACCESS_KEY` etc.) are forwarded by Snakemake core.
|
|
149
|
+
|
|
150
|
+
Notes: credentials living only in `~/.aws/credentials` are not forwarded —
|
|
151
|
+
export them as `AWS_*` variables. Other secrets your rules need (e.g.
|
|
152
|
+
`HF_TOKEN`) go through the standard `envvars:` directive or `--envvars`.
|
|
153
|
+
Forwarded credentials are visible inside containers on third-party hosts —
|
|
154
|
+
use scoped, revocable keys.
|
|
155
|
+
|
|
156
|
+
## Debugging & costs
|
|
157
|
+
|
|
158
|
+
- Remote job logs land in `.snakemake/auxiliary/vastai-logs/`; `--verbose`
|
|
159
|
+
prints the generated offer queries.
|
|
160
|
+
- `--vastai-keep-instances` keeps instances alive for inspection
|
|
161
|
+
(`vastai logs <id>`) — destroy them manually, they bill until then.
|
|
162
|
+
- Flaky hosts happen on a marketplace: unrecoverable boot errors fail fast,
|
|
163
|
+
so run with `--retries 2` to resubmit on a different machine.
|
|
164
|
+
- The plugin destroys every instance it rents, even on failure or Ctrl-C.
|
|
165
|
+
After a hard kill (`kill -9`), check
|
|
166
|
+
https://console.vast.ai/instances/ for leftovers.
|
|
167
|
+
- Each job rents its own instance, so prefer fewer, larger jobs (or job
|
|
168
|
+
grouping) over many tiny ones — boot overhead is paid per job.
|
|
169
|
+
|
|
170
|
+
## Development
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
uv sync
|
|
174
|
+
uv run pytest # unit tests, free
|
|
175
|
+
SNAKEMAKE_VASTAI_E2E=1 uv run pytest tests/test_e2e.py -s # rents a real GPU (~$0.05)
|
|
176
|
+
```
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "snakemake-executor-plugin-vastai"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A Snakemake executor plugin for running jobs on Vast.ai GPU instances"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
authors = [{ name = "Michał Pogoda", email = "michal.pogoda@bards.ai" }]
|
|
12
|
+
keywords = ["snakemake", "plugin", "executor", "vastai", "gpu", "cloud"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Development Status :: 4 - Beta",
|
|
15
|
+
"Environment :: GPU :: NVIDIA CUDA",
|
|
16
|
+
"Intended Audience :: Science/Research",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
20
|
+
"Topic :: System :: Distributed Computing",
|
|
21
|
+
]
|
|
22
|
+
requires-python = ">=3.11"
|
|
23
|
+
dependencies = [
|
|
24
|
+
"snakemake-interface-common >=1.17.0,<2.0.0",
|
|
25
|
+
"snakemake-interface-executor-plugins >=9.3.0,<10.0.0",
|
|
26
|
+
"vastai >=1.0.13,<2.0.0",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
[project.urls]
|
|
30
|
+
Repository = "https://github.com/bards-ai/snakemake-executor-plugin-vastai"
|
|
31
|
+
Documentation = "https://snakemake.github.io/snakemake-plugin-catalog/plugins/executor/vastai.html"
|
|
32
|
+
|
|
33
|
+
[dependency-groups]
|
|
34
|
+
dev = [
|
|
35
|
+
"pytest >=8",
|
|
36
|
+
"snakemake >=9.22.0",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[tool.hatch.build.targets.wheel]
|
|
40
|
+
packages = ["snakemake_executor_plugin_vastai"]
|