snakemake-executor-plugin-vastai 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(uv pip *)",
5
+ "Read(//private/tmp/vastai-smoke/**)",
6
+ "Bash(VAST_API_KEY=invalid-test-key /tmp/vastai-research/bin/snakemake --executor vastai --jobs 1 --default-storage-provider fs --default-storage-prefix /tmp/vastai-smoke-storage --verbose)",
7
+ "WebFetch(domain:docs.vast.ai)",
8
+ "WebSearch"
9
+ ]
10
+ }
11
+ }
@@ -0,0 +1,59 @@
1
+ ---
2
+ name: vastai-executor
3
+ description: Testing the snakemake-executor-plugin-vastai (unit + live e2e on real GPUs), debugging live instances, and the design decisions behind the plugin. Use when working on this repo, running its tests, or debugging Vast.ai job failures.
4
+ ---
5
+
6
+ # Testing
7
+
8
+ - Unit (free, fast): `uv run pytest` — covers query building, status state
9
+ machine, scripts, credentials.
10
+ - Live e2e (rents a real GPU, ~$0.02–0.05, needs `VAST_API_KEY` with credit):
11
+ `SNAKEMAKE_VASTAI_E2E=1 uv run pytest tests/test_e2e.py -s`
12
+ - After ANY live run, verify nothing is still billing:
13
+ `vastai show instances --raw` must show no `snakemake-*` labels. Destroy
14
+ stragglers immediately (`vastai destroy instance <id> -y`).
15
+
16
+ ## Debugging a live run
17
+
18
+ - Run snakemake from a git-initialized scratch dir (source archiving needs git).
19
+ - Status: `VastAI().show_instance(id)` → `actual_status` + `status_msg`.
20
+ Container logs: `VastAI().logs(id, tail='200')`.
21
+ - SSH-mode instances: connect with the per-run key
22
+ `.snakemake/tmp.*/vastai_ssh_key`; remote workdir is `/snakemake-workdir`
23
+ (`job.sh`, `job.log`, `exit_code`, `env.sh`).
24
+ - Watch costs: prefer `--vastai-max-price`, small `--vastai-disk`, EU
25
+ datacenter offers (~$0.06–0.11/h for RTX 3060-class).
26
+
27
+ # Design decisions (all verified on real instances, 2026-06)
28
+
29
+ - **Two modes**, chosen by whether `--default-storage-provider` is set:
30
+ - *Storage mode*: entrypoint containers (`runtype="args"`); completion =
31
+ instance `exited` + `snakemake_vastai_exit_code=N` log sentinel; logs
32
+ fetched once at finalization.
33
+ - *SSH mode* (zero config): `runtype="ssh_proxy"` + per-job thread in
34
+ `sshtransfer.py` (scp sources/inputs, detached run, poll `exit_code`,
35
+ scp outputs). Enabled by `can_transfer_local_files=True`; in storage
36
+ mode the `Executor.common_settings` property presents a doctored copy
37
+ so core's source-deploy precommand is regenerated, and the executor
38
+ calls `workflow.upload_sources()` itself.
39
+ - Plain `runtype="ssh"` never gets the proxy tunnel wired — must be `ssh_proxy`.
40
+ - **Default image** `vastai/pytorch:@vastai-automatic-tag`: driver-matched
41
+ CUDA + OpenSSH (`snakemake/snakemake` has neither). Snakemake and the
42
+ locally installed storage plugins are pip-installed in-job, version-pinned
43
+ (their settings leak into spawned CLI args, e.g. `--storage-s3-retries`).
44
+ - **SSH keys via onstart** writing `authorized_keys` — the attach-key API on
45
+ a live instance is racy; account keys would leak across runs. Custom
46
+ Debian images get an apt-based OpenSSH bootstrap (conda installs are unusable).
47
+ - Non-interactive SSH doesn't get the image ENV PATH → `PYTHON_PATH_SETUP`
48
+ prelude discovers python, preferring `/venv/main` (where torch lives).
49
+ - **`datacenter=true` is the search default**: hobbyist hosts routinely hit
50
+ Docker Hub pull rate limits. Unrecoverable boot errors
51
+ (`FATAL_BOOT_ERRORS`) fail fast so `--retries` resubmits elsewhere; a
52
+ stuck proxy tunnel triggers one instance reboot.
53
+ - Interface bug: bool settings with `default=True` break CLI parsing
54
+ (argparse_dataclass renames the flag) → only default-False bools, hence
55
+ `--vastai-no-datacenter`, `--vastai-no-forward-credentials`.
56
+ - Credentials: snakemake core forwards storage-plugin settings itself; the
57
+ plugin additionally forwards ambient `AWS_*`/`AZURE_STORAGE_*` vars and
58
+ ships the GCP credentials *file content* base64-encoded (path-based
59
+ GOOGLE_APPLICATION_CREDENTIALS is useless remotely).
@@ -0,0 +1,9 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .venv/
7
+ .pytest_cache/
8
+ .snakemake/
9
+ uv.lock
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 bards.ai
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,199 @@
1
+ Metadata-Version: 2.4
2
+ Name: snakemake-executor-plugin-vastai
3
+ Version: 0.1.0
4
+ Summary: A Snakemake executor plugin for running jobs on Vast.ai GPU instances
5
+ Project-URL: Repository, https://github.com/bards-ai/snakemake-executor-plugin-vastai
6
+ Project-URL: Documentation, https://snakemake.github.io/snakemake-plugin-catalog/plugins/executor/vastai.html
7
+ Author-email: Michał Pogoda <michal.pogoda@bards.ai>
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: cloud,executor,gpu,plugin,snakemake,vastai
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Environment :: GPU :: NVIDIA CUDA
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
17
+ Classifier: Topic :: System :: Distributed Computing
18
+ Requires-Python: >=3.11
19
+ Requires-Dist: snakemake-interface-common<2.0.0,>=1.17.0
20
+ Requires-Dist: snakemake-interface-executor-plugins<10.0.0,>=9.3.0
21
+ Requires-Dist: vastai<2.0.0,>=1.0.13
22
+ Description-Content-Type: text/markdown
23
+
24
+ # snakemake-executor-plugin-vastai
25
+
26
+ Run [Snakemake](https://snakemake.github.io) jobs on cheap
27
+ [Vast.ai](https://vast.ai) GPUs. For every job the plugin rents the cheapest
28
+ marketplace offer matching the job's resources, runs it in a Docker
29
+ container, ships the files back, and destroys the instance.
30
+
31
+ ## Quickstart
32
+
33
+ ```bash
34
+ pip install snakemake-executor-plugin-vastai
35
+ export VAST_API_KEY=... # https://console.vast.ai/manage-keys/
36
+ ```
37
+
38
+ Write a Snakefile as usual — resources decide what gets rented:
39
+
40
+ ```python
41
+ rule train:
42
+ input: "data/train.parquet"
43
+ output: "models/model.pt"
44
+ threads: 8
45
+ resources:
46
+ gpu=1,
47
+ gpu_model="RTX_4090",
48
+ mem_mb=32000,
49
+ shell:
50
+ "python train.py {input} {output}"
51
+ ```
52
+
53
+ Run it:
54
+
55
+ ```bash
56
+ snakemake --executor vastai --jobs 4
57
+ ```
58
+
59
+ That's all — no bucket, no shared filesystem, no cluster setup. Jobs run in
60
+ Vast.ai's PyTorch CUDA image, and every instance is destroyed as soon as
61
+ its job finishes, fails, or you hit Ctrl-C.
62
+
63
+ ### What ends up on the machine
64
+
65
+ Each instance receives two uploads before the job starts:
66
+
67
+ 1. **Your code**, as Snakemake's source archive: the Snakefile (plus
68
+ anything you `include:`), config files, `script:`/`notebook:` files, and
69
+ **every git-tracked file** in the workflow directory. In the example
70
+ above, `train.py` gets there this way — which is why the workflow must
71
+ be a git repository and why an untracked script would be
72
+ missing remotely (`git add` is enough, no commit needed). Files over
73
+ 10 MB are skipped — declare those as `input:` instead.
74
+ 2. **The job's declared `input:` files** (`data/train.parquet` above),
75
+ preserving their relative paths.
76
+
77
+ The job then runs in the same relative layout, and only its declared
78
+ `output:` (and `log:`) files are copied back to your machine. Anything else
79
+ written on the instance is discarded when the instance is destroyed.
80
+
81
+ ## Configuration
82
+
83
+ Pin your defaults in `profiles/default/config.yaml` next to the Snakefile —
84
+ Snakemake loads it automatically, so the command stays plain `snakemake`:
85
+
86
+ ```yaml
87
+ executor: vastai
88
+ jobs: 4
89
+ vastai-max-price: 1.0 # $/h cap per instance
90
+ vastai-geolocation: EU # region shortcut or country codes (PL,DE,CZ)
91
+ ```
92
+
93
+ | Option | Default | Description |
94
+ |---|---|---|
95
+ | `--vastai-api-key` | – | API key (or `VAST_API_KEY` / `SNAKEMAKE_VASTAI_API_KEY`) |
96
+ | `--vastai-gpu-name` | any | Default GPU model, e.g. `RTX_4090`, `H100_SXM` |
97
+ | `--vastai-max-price` | – | Max price per instance in $/h (`dph_total`) |
98
+ | `--vastai-geolocation` | any | Region shortcut (EU, NA, AS, AF, LC, OC) or country codes |
99
+ | `--vastai-disk` | 40 | Disk allocation per instance (GB) |
100
+ | `--vastai-image` | `vastai/pytorch:@vastai-automatic-tag` | Docker image for jobs |
101
+ | `--vastai-reliability` | 0.98 | Minimum host reliability (0 disables) |
102
+ | `--vastai-no-datacenter` | off | Also allow non-datacenter (hobbyist) hosts |
103
+ | `--vastai-order` | `dph_total` | Offer ranking (e.g. `dlperf_usd-` for perf/$) |
104
+ | `--vastai-search-query` | – | Extra offer filters (vastai query syntax) |
105
+ | `--vastai-boot-timeout` | 1800 | Max seconds for an instance to start running |
106
+ | `--vastai-no-forward-credentials` | off | Don't forward local cloud credentials to jobs |
107
+ | `--vastai-keep-instances` | off | Don't destroy instances (debugging; **keeps billing!**) |
108
+
109
+ By default only verified datacenter hosts are rented — slightly pricier,
110
+ but they avoid the most common marketplace flakiness (Docker Hub pull rate
111
+ limits, slow residential uplinks). `--vastai-no-datacenter` unlocks the
112
+ cheapest hobbyist offers.
113
+
114
+ ### Per-job resources
115
+
116
+ | Resource | Effect on the offer search |
117
+ |---|---|
118
+ | `gpu` / `nvidia_gpu` | `num_gpus=N` (minimum 1 — Vast.ai only rents GPU machines) |
119
+ | `gpu_model` | GPU model (Vast.ai naming, underscores for spaces) |
120
+ | `threads` (the rule's `threads:` directive) | `cpu_cores_effective>=N` |
121
+ | `mem_mb` | minimum system RAM |
122
+ | `disk_mb` | minimum disk and rented allocation |
123
+ | `vastai_query` | extra filters, appended verbatim (e.g. `"cuda_vers>=12.4"`) |
124
+
125
+ ### Container image
126
+
127
+ Jobs run in `vastai/pytorch:@vastai-automatic-tag` by default — Vast.ai's
128
+ curated PyTorch image with CUDA matched to each machine's driver, usually
129
+ cached on hosts. Snakemake is pip-installed into the container automatically
130
+ (pinned to your local version, ~30 s per instance).
131
+
132
+ For other stacks set `--vastai-image`. Requirements: `python` on `PATH`;
133
+ for SSH mode also OpenSSH (auto-installed via apt on Debian/Ubuntu images
134
+ if missing). Bake `pip install snakemake` into the image to skip the
135
+ bootstrap cost.
136
+
137
+ ## File transfer
138
+
139
+ **SSH mode** (the default, used when no storage is configured): the source
140
+ archive and input files are scp'd from your machine to each instance, and
141
+ outputs are scp'd back (see "What ends up on the machine" above). Zero
142
+ setup, ideal for getting started and small/medium data. Caveats: everything
143
+ flows through your uplink, intermediate files between dependent jobs
144
+ round-trip through your machine, and the Snakemake process must stay
145
+ online.
146
+
147
+ **Storage mode** (recommended for real pipelines): the same things move,
148
+ but through a bucket instead of your machine — Snakemake uploads the source
149
+ archive once per run, and each job downloads the archive plus its inputs
150
+ from the bucket and uploads its outputs there (your laptop only orchestrates).
151
+ Faster, resumable:
152
+
153
+ ```yaml
154
+ executor: vastai
155
+ jobs: 4
156
+ default-storage-provider: s3
157
+ default-storage-prefix: s3://my-bucket/my-workflow
158
+ ```
159
+
160
+ Install the matching storage plugin (`pip install
161
+ snakemake-storage-plugin-s3`, or `-gcs` / `-azure`). Any S3-compatible
162
+ service works: AWS S3, MinIO, Cloudflare R2, Backblaze B2, …
163
+
164
+ ### Credentials
165
+
166
+ Storage credentials reach the jobs automatically — no `--envvars` needed:
167
+ local `AWS_*` and `AZURE_STORAGE_*` variables are forwarded into the job
168
+ containers, and the Google credentials file
169
+ (`GOOGLE_APPLICATION_CREDENTIALS` or gcloud ADC) is shipped and materialized
170
+ inside the container. Settings configured on the storage plugin itself
171
+ (`SNAKEMAKE_STORAGE_S3_ACCESS_KEY` etc.) are forwarded by Snakemake core.
172
+
173
+ Notes: credentials living only in `~/.aws/credentials` are not forwarded —
174
+ export them as `AWS_*` variables. Other secrets your rules need (e.g.
175
+ `HF_TOKEN`) go through the standard `envvars:` directive or `--envvars`.
176
+ Forwarded credentials are visible inside containers on third-party hosts —
177
+ use scoped, revocable keys.
178
+
179
+ ## Debugging & costs
180
+
181
+ - Remote job logs land in `.snakemake/auxiliary/vastai-logs/`; `--verbose`
182
+ prints the generated offer queries.
183
+ - `--vastai-keep-instances` keeps instances alive for inspection
184
+ (`vastai logs <id>`) — destroy them manually, they bill until then.
185
+ - Flaky hosts happen on a marketplace: unrecoverable boot errors fail fast,
186
+ so run with `--retries 2` to resubmit on a different machine.
187
+ - The plugin destroys every instance it rents, even on failure or Ctrl-C.
188
+ After a hard kill (`kill -9`), check
189
+ https://console.vast.ai/instances/ for leftovers.
190
+ - Each job rents its own instance, so prefer fewer, larger jobs (or job
191
+ grouping) over many tiny ones — boot overhead is paid per job.
192
+
193
+ ## Development
194
+
195
+ ```bash
196
+ uv sync
197
+ uv run pytest # unit tests, free
198
+ SNAKEMAKE_VASTAI_E2E=1 uv run pytest tests/test_e2e.py -s # rents a real GPU (~$0.05)
199
+ ```
@@ -0,0 +1,176 @@
1
+ # snakemake-executor-plugin-vastai
2
+
3
+ Run [Snakemake](https://snakemake.github.io) jobs on cheap
4
+ [Vast.ai](https://vast.ai) GPUs. For every job the plugin rents the cheapest
5
+ marketplace offer matching the job's resources, runs it in a Docker
6
+ container, ships the files back, and destroys the instance.
7
+
8
+ ## Quickstart
9
+
10
+ ```bash
11
+ pip install snakemake-executor-plugin-vastai
12
+ export VAST_API_KEY=... # https://console.vast.ai/manage-keys/
13
+ ```
14
+
15
+ Write a Snakefile as usual — resources decide what gets rented:
16
+
17
+ ```python
18
+ rule train:
19
+ input: "data/train.parquet"
20
+ output: "models/model.pt"
21
+ threads: 8
22
+ resources:
23
+ gpu=1,
24
+ gpu_model="RTX_4090",
25
+ mem_mb=32000,
26
+ shell:
27
+ "python train.py {input} {output}"
28
+ ```
29
+
30
+ Run it:
31
+
32
+ ```bash
33
+ snakemake --executor vastai --jobs 4
34
+ ```
35
+
36
+ That's all — no bucket, no shared filesystem, no cluster setup. Jobs run in
37
+ Vast.ai's PyTorch CUDA image, and every instance is destroyed as soon as
38
+ its job finishes, fails, or you hit Ctrl-C.
39
+
40
+ ### What ends up on the machine
41
+
42
+ Each instance receives two uploads before the job starts:
43
+
44
+ 1. **Your code**, as Snakemake's source archive: the Snakefile (plus
45
+ anything you `include:`), config files, `script:`/`notebook:` files, and
46
+ **every git-tracked file** in the workflow directory. In the example
47
+ above, `train.py` gets there this way — which is why the workflow must
48
+ be a git repository and why an untracked script would be
49
+ missing remotely (`git add` is enough, no commit needed). Files over
50
+ 10 MB are skipped — declare those as `input:` instead.
51
+ 2. **The job's declared `input:` files** (`data/train.parquet` above),
52
+ preserving their relative paths.
53
+
54
+ The job then runs in the same relative layout, and only its declared
55
+ `output:` (and `log:`) files are copied back to your machine. Anything else
56
+ written on the instance is discarded when the instance is destroyed.
57
+
58
+ ## Configuration
59
+
60
+ Pin your defaults in `profiles/default/config.yaml` next to the Snakefile —
61
+ Snakemake loads it automatically, so the command stays plain `snakemake`:
62
+
63
+ ```yaml
64
+ executor: vastai
65
+ jobs: 4
66
+ vastai-max-price: 1.0 # $/h cap per instance
67
+ vastai-geolocation: EU # region shortcut or country codes (PL,DE,CZ)
68
+ ```
69
+
70
+ | Option | Default | Description |
71
+ |---|---|---|
72
+ | `--vastai-api-key` | – | API key (or `VAST_API_KEY` / `SNAKEMAKE_VASTAI_API_KEY`) |
73
+ | `--vastai-gpu-name` | any | Default GPU model, e.g. `RTX_4090`, `H100_SXM` |
74
+ | `--vastai-max-price` | – | Max price per instance in $/h (`dph_total`) |
75
+ | `--vastai-geolocation` | any | Region shortcut (EU, NA, AS, AF, LC, OC) or country codes |
76
+ | `--vastai-disk` | 40 | Disk allocation per instance (GB) |
77
+ | `--vastai-image` | `vastai/pytorch:@vastai-automatic-tag` | Docker image for jobs |
78
+ | `--vastai-reliability` | 0.98 | Minimum host reliability (0 disables) |
79
+ | `--vastai-no-datacenter` | off | Also allow non-datacenter (hobbyist) hosts |
80
+ | `--vastai-order` | `dph_total` | Offer ranking (e.g. `dlperf_usd-` for perf/$) |
81
+ | `--vastai-search-query` | – | Extra offer filters (vastai query syntax) |
82
+ | `--vastai-boot-timeout` | 1800 | Max seconds for an instance to start running |
83
+ | `--vastai-no-forward-credentials` | off | Don't forward local cloud credentials to jobs |
84
+ | `--vastai-keep-instances` | off | Don't destroy instances (debugging; **keeps billing!**) |
85
+
86
+ By default only verified datacenter hosts are rented — slightly pricier,
87
+ but they avoid the most common marketplace flakiness (Docker Hub pull rate
88
+ limits, slow residential uplinks). `--vastai-no-datacenter` unlocks the
89
+ cheapest hobbyist offers.
90
+
91
+ ### Per-job resources
92
+
93
+ | Resource | Effect on the offer search |
94
+ |---|---|
95
+ | `gpu` / `nvidia_gpu` | `num_gpus=N` (minimum 1 — Vast.ai only rents GPU machines) |
96
+ | `gpu_model` | GPU model (Vast.ai naming, underscores for spaces) |
97
+ | `threads` (the rule's `threads:` directive) | `cpu_cores_effective>=N` |
98
+ | `mem_mb` | minimum system RAM |
99
+ | `disk_mb` | minimum disk and rented allocation |
100
+ | `vastai_query` | extra filters, appended verbatim (e.g. `"cuda_vers>=12.4"`) |
101
+
102
+ ### Container image
103
+
104
+ Jobs run in `vastai/pytorch:@vastai-automatic-tag` by default — Vast.ai's
105
+ curated PyTorch image with CUDA matched to each machine's driver, usually
106
+ cached on hosts. Snakemake is pip-installed into the container automatically
107
+ (pinned to your local version, ~30 s per instance).
108
+
109
+ For other stacks set `--vastai-image`. Requirements: `python` on `PATH`;
110
+ for SSH mode also OpenSSH (auto-installed via apt on Debian/Ubuntu images
111
+ if missing). Bake `pip install snakemake` into the image to skip the
112
+ bootstrap cost.
113
+
114
+ ## File transfer
115
+
116
+ **SSH mode** (the default, used when no storage is configured): the source
117
+ archive and input files are scp'd from your machine to each instance, and
118
+ outputs are scp'd back (see "What ends up on the machine" above). Zero
119
+ setup, ideal for getting started and small/medium data. Caveats: everything
120
+ flows through your uplink, intermediate files between dependent jobs
121
+ round-trip through your machine, and the Snakemake process must stay
122
+ online.
123
+
124
+ **Storage mode** (recommended for real pipelines): the same things move,
125
+ but through a bucket instead of your machine — Snakemake uploads the source
126
+ archive once per run, and each job downloads the archive plus its inputs
127
+ from the bucket and uploads its outputs there (your laptop only orchestrates).
128
+ Faster, resumable:
129
+
130
+ ```yaml
131
+ executor: vastai
132
+ jobs: 4
133
+ default-storage-provider: s3
134
+ default-storage-prefix: s3://my-bucket/my-workflow
135
+ ```
136
+
137
+ Install the matching storage plugin (`pip install
138
+ snakemake-storage-plugin-s3`, or `-gcs` / `-azure`). Any S3-compatible
139
+ service works: AWS S3, MinIO, Cloudflare R2, Backblaze B2, …
140
+
141
+ ### Credentials
142
+
143
+ Storage credentials reach the jobs automatically — no `--envvars` needed:
144
+ local `AWS_*` and `AZURE_STORAGE_*` variables are forwarded into the job
145
+ containers, and the Google credentials file
146
+ (`GOOGLE_APPLICATION_CREDENTIALS` or gcloud ADC) is shipped and materialized
147
+ inside the container. Settings configured on the storage plugin itself
148
+ (`SNAKEMAKE_STORAGE_S3_ACCESS_KEY` etc.) are forwarded by Snakemake core.
149
+
150
+ Notes: credentials living only in `~/.aws/credentials` are not forwarded —
151
+ export them as `AWS_*` variables. Other secrets your rules need (e.g.
152
+ `HF_TOKEN`) go through the standard `envvars:` directive or `--envvars`.
153
+ Forwarded credentials are visible inside containers on third-party hosts —
154
+ use scoped, revocable keys.
155
+
156
+ ## Debugging & costs
157
+
158
+ - Remote job logs land in `.snakemake/auxiliary/vastai-logs/`; `--verbose`
159
+ prints the generated offer queries.
160
+ - `--vastai-keep-instances` keeps instances alive for inspection
161
+ (`vastai logs <id>`) — destroy them manually, they bill until then.
162
+ - Flaky hosts happen on a marketplace: unrecoverable boot errors fail fast,
163
+ so run with `--retries 2` to resubmit on a different machine.
164
+ - The plugin destroys every instance it rents, even on failure or Ctrl-C.
165
+ After a hard kill (`kill -9`), check
166
+ https://console.vast.ai/instances/ for leftovers.
167
+ - Each job rents its own instance, so prefer fewer, larger jobs (or job
168
+ grouping) over many tiny ones — boot overhead is paid per job.
169
+
170
+ ## Development
171
+
172
+ ```bash
173
+ uv sync
174
+ uv run pytest # unit tests, free
175
+ SNAKEMAKE_VASTAI_E2E=1 uv run pytest tests/test_e2e.py -s # rents a real GPU (~$0.05)
176
+ ```
@@ -0,0 +1,40 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "snakemake-executor-plugin-vastai"
7
+ version = "0.1.0"
8
+ description = "A Snakemake executor plugin for running jobs on Vast.ai GPU instances"
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ authors = [{ name = "Michał Pogoda", email = "michal.pogoda@bards.ai" }]
12
+ keywords = ["snakemake", "plugin", "executor", "vastai", "gpu", "cloud"]
13
+ classifiers = [
14
+ "Development Status :: 4 - Beta",
15
+ "Environment :: GPU :: NVIDIA CUDA",
16
+ "Intended Audience :: Science/Research",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Programming Language :: Python :: 3",
19
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
20
+ "Topic :: System :: Distributed Computing",
21
+ ]
22
+ requires-python = ">=3.11"
23
+ dependencies = [
24
+ "snakemake-interface-common >=1.17.0,<2.0.0",
25
+ "snakemake-interface-executor-plugins >=9.3.0,<10.0.0",
26
+ "vastai >=1.0.13,<2.0.0",
27
+ ]
28
+
29
+ [project.urls]
30
+ Repository = "https://github.com/bards-ai/snakemake-executor-plugin-vastai"
31
+ Documentation = "https://snakemake.github.io/snakemake-plugin-catalog/plugins/executor/vastai.html"
32
+
33
+ [dependency-groups]
34
+ dev = [
35
+ "pytest >=8",
36
+ "snakemake >=9.22.0",
37
+ ]
38
+
39
+ [tool.hatch.build.targets.wheel]
40
+ packages = ["snakemake_executor_plugin_vastai"]