bellhop-py 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bellhop_py-0.3.0/.gitignore +10 -0
- bellhop_py-0.3.0/LICENSE +21 -0
- bellhop_py-0.3.0/PKG-INFO +285 -0
- bellhop_py-0.3.0/README.md +258 -0
- bellhop_py-0.3.0/_testcode/go.py +16 -0
- bellhop_py-0.3.0/pyproject.toml +42 -0
- bellhop_py-0.3.0/src/bellhop/__init__.py +34 -0
- bellhop_py-0.3.0/src/bellhop/backend.py +80 -0
- bellhop_py-0.3.0/src/bellhop/cli.py +124 -0
- bellhop_py-0.3.0/src/bellhop/errors.py +65 -0
- bellhop_py-0.3.0/src/bellhop/graphql.py +66 -0
- bellhop_py-0.3.0/src/bellhop/modal_box.py +258 -0
- bellhop_py-0.3.0/src/bellhop/pod.py +424 -0
- bellhop_py-0.3.0/src/bellhop/probes.py +110 -0
- bellhop_py-0.3.0/src/bellhop/rest.py +84 -0
- bellhop_py-0.3.0/src/bellhop/run.py +161 -0
- bellhop_py-0.3.0/tests/integration_live.py +56 -0
- bellhop_py-0.3.0/tests/integration_modal.py +57 -0
- bellhop_py-0.3.0/tests/test_offline.py +252 -0
bellhop_py-0.3.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Daniel Tan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: bellhop-py
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Async Python library that checks your code into an ephemeral box (RunPod pod or Modal sandbox), runs it, brings the results back, and checks out.
|
|
5
|
+
Project-URL: Homepage, https://github.com/dtch1997/bellhop
|
|
6
|
+
Project-URL: Repository, https://github.com/dtch1997/bellhop
|
|
7
|
+
Project-URL: Issues, https://github.com/dtch1997/bellhop/issues
|
|
8
|
+
Author: Daniel Tan
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: compute,ephemeral,gpu,modal,runpod,sandbox
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Framework :: AsyncIO
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
17
|
+
Classifier: Topic :: System :: Distributed Computing
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Requires-Dist: httpx>=0.27
|
|
20
|
+
Provides-Extra: dev
|
|
21
|
+
Requires-Dist: modal>=1.0; extra == 'dev'
|
|
22
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
23
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
24
|
+
Provides-Extra: modal
|
|
25
|
+
Requires-Dist: modal>=1.0; extra == 'modal'
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# bellhop
|
|
29
|
+
|
|
30
|
+
**Check your code into an ephemeral box, run it, bring the results back, and
|
|
31
|
+
check out** — an async Python library for disposable compute. Two backends:
|
|
32
|
+
a [RunPod](https://runpod.io) pod or a [Modal](https://modal.com) sandbox.
|
|
33
|
+
|
|
34
|
+
Like a hotel bellhop: it books a room (provisions the box), waits until it's
|
|
35
|
+
actually ready, carries your luggage up (uploads your code), and when you leave
|
|
36
|
+
it brings your bags back down (pulls results) and checks out (tears the box
|
|
37
|
+
down) — so you never leave a box (or a bill) running by accident.
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
import asyncio
|
|
41
|
+
from bellhop import pod, PodConfig
|
|
42
|
+
|
|
43
|
+
async def main():
|
|
44
|
+
async with pod(PodConfig(gpu="RTX4090")) as p:
|
|
45
|
+
await p.push("./mycode", "/workspace/job")
|
|
46
|
+
r = await p.exec("cd /workspace/job && python train.py")
|
|
47
|
+
print(r.stdout)
|
|
48
|
+
await p.pull("/workspace/job/out", "./results")
|
|
49
|
+
# pod is gone here — even if the body raised
|
|
50
|
+
|
|
51
|
+
asyncio.run(main())
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
The same code runs on Modal by swapping the config — `sandbox(ModalConfig(...))`
|
|
55
|
+
instead of `pod(PodConfig(...))` (see [Two backends](#two-backends) below).
|
|
56
|
+
|
|
57
|
+
The RunPod backend talks to the RunPod **REST API** (`rest.runpod.io/v1`)
|
|
58
|
+
directly over `httpx`, falling back to the **GraphQL API** only to set native
|
|
59
|
+
safety timers. No `runpodctl`, no vendored SDK. The Modal backend drives a
|
|
60
|
+
Modal **Sandbox** via the `modal` SDK.
|
|
61
|
+
|
|
62
|
+
## Install
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
pip install bellhop-py # RunPod backend (or: pip install git+https://github.com/dtch1997/bellhop)
|
|
66
|
+
pip install 'bellhop-py[modal]' # add the Modal backend
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
(The PyPI distribution is `bellhop-py` — the bare `bellhop` name is an
|
|
70
|
+
unrelated package — but the import name and CLI are plain `bellhop`.)
|
|
71
|
+
|
|
72
|
+
For the **RunPod** backend, set `RUNPOD_API_KEY`. Connection uses your SSH
|
|
73
|
+
keypair (`~/.ssh/id_ed25519` by default): bellhop injects the public key as the
|
|
74
|
+
pod's `PUBLIC_KEY` env so `root@pod` is reachable. For the **Modal** backend,
|
|
75
|
+
configure Modal auth (`modal token new`, or `MODAL_TOKEN_ID` /
|
|
76
|
+
`MODAL_TOKEN_SECRET`). (GCS upload, if you enable it, needs `gcloud` on your
|
|
77
|
+
`PATH` either way.)
|
|
78
|
+
|
|
79
|
+
## Two backends
|
|
80
|
+
|
|
81
|
+
Both backends implement the same `ExecBox` contract — `exec` / `push` / `pull`
|
|
82
|
+
/ `exists_remote` / `teardown` — so the high-level `run()` / `run_many()`
|
|
83
|
+
pipeline (below) is provider-agnostic: hand it a `PodConfig` for RunPod or a
|
|
84
|
+
`ModalConfig` for Modal and everything else is identical.
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
from bellhop import sandbox, ModalConfig
|
|
88
|
+
|
|
89
|
+
async with sandbox(ModalConfig(gpu="A10G")) as b: # CPU box: omit gpu
|
|
90
|
+
await b.push("./mycode", "/workspace/job")
|
|
91
|
+
r = await b.exec("cd /workspace/job && python train.py")
|
|
92
|
+
print(r.stdout)
|
|
93
|
+
await b.pull("/workspace/job/out", "./results")
|
|
94
|
+
# sandbox terminated on exit (pass keep=True to leave it up)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
The whole common surface is spelled the same on both configs:
|
|
98
|
+
|
|
99
|
+
- **`gpu=`** — a canonical short name (`"A100"`, `"H100"`, `"L4"`, …); `None`
|
|
100
|
+
means a CPU box. On RunPod the name expands through `GPU_ALIASES` to the
|
|
101
|
+
*list* of matching gpuTypeIds (e.g. `"A100"` → PCIe *and* SXM), which the
|
|
102
|
+
REST API accepts wholesale — better stock availability than naming one SKU.
|
|
103
|
+
A full RunPod id (`"NVIDIA GeForce RTX 4090"`) still passes verbatim.
|
|
104
|
+
- **`max_lifetime=`** — the hard server-side kill switch, `timedelta` on both
|
|
105
|
+
(maps to `terminate_after` on RunPod, `timeout` on Modal).
|
|
106
|
+
- **`image=` / `image_preset=`** — the `pytorch-cuda` preset is pinned to the
|
|
107
|
+
same torch 2.4.0 + CUDA 12.4 environment on both backends.
|
|
108
|
+
|
|
109
|
+
What genuinely differs stays backend-specific:
|
|
110
|
+
|
|
111
|
+
| | RunPod (`PodConfig`, `pod()`) | Modal (`ModalConfig`, `sandbox()`) |
|
|
112
|
+
|---|---|---|
|
|
113
|
+
| Readiness | SSH/probe wait (below) | none — `create()` returns an execable box |
|
|
114
|
+
| Extra TTL | `stop_after` (wall-clock compute halt) | `idle_timeout` (kill after inactivity) |
|
|
115
|
+
| Image extras | — | `pip=` / `apt=`, `modal.Image`, `secrets=`, `volumes=` |
|
|
116
|
+
| Placement | `cloud=` SECURE/COMMUNITY (+fallback) | `region=`, `cpu=`, `memory=` |
|
|
117
|
+
| Auth | `RUNPOD_API_KEY` + SSH keypair | Modal token (`modal token new`) |
|
|
118
|
+
|
|
119
|
+
(`stop_after` and `idle_timeout` are deliberately *not* unified — one is a
|
|
120
|
+
wall-clock timer, the other an inactivity timer; pretending they're the same
|
|
121
|
+
concept would be a trap. `gpu_id=` remains as a legacy spelling of a verbatim
|
|
122
|
+
RunPod id.)
|
|
123
|
+
|
|
124
|
+
## "Return when functional" — the hard part (RunPod only)
|
|
125
|
+
|
|
126
|
+
`desiredStatus == RUNNING` is necessary but **not sufficient**: sshd / your
|
|
127
|
+
server typically lags the RUNNING state by 30–60s. So once a pod is routable
|
|
128
|
+
(RUNNING + public IP + mapped port), bellhop runs a **readiness probe** until it
|
|
129
|
+
passes before handing the pod to you. "Functional" is caller-specific, so it's
|
|
130
|
+
pluggable:
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
from bellhop import SshProbe, TcpProbe, HttpProbe, LogMarkerProbe
|
|
134
|
+
|
|
135
|
+
PodConfig(..., ready=SshProbe("true")) # ssh job pods (default)
|
|
136
|
+
PodConfig(..., ready=HttpProbe(8000, "/health")) # a served endpoint
|
|
137
|
+
PodConfig(..., ready=LogMarkerProbe("server up")) # headless pods
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
(Modal sandboxes are execable as soon as `create()` returns, so there's no
|
|
141
|
+
probe step on that backend.)
|
|
142
|
+
|
|
143
|
+
## Two ways to use it
|
|
144
|
+
|
|
145
|
+
### Composable pod — multi-step / interactive
|
|
146
|
+
|
|
147
|
+
Keep one pod alive and run many steps against it:
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
async with pod(PodConfig(gpu="RTX4090")) as p:
|
|
151
|
+
await p.push("./code", "/workspace/job")
|
|
152
|
+
await p.exec("cd /workspace/job && python train.py", env={"HF_TOKEN": tok})
|
|
153
|
+
await p.exec("python eval.py") # same pod, no re-provision
|
|
154
|
+
await p.pull("/workspace/job/results", "./out")
|
|
155
|
+
print(p.proxy_url(8000)) # https://<id>-8000.proxy.runpod.net
|
|
156
|
+
# torn down on exit (pass keep=True to leave it up)
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### One-shot — provision, run, collect, done
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
import asyncio
|
|
163
|
+
from bellhop import run, RunSpec, PodConfig
|
|
164
|
+
|
|
165
|
+
res = asyncio.run(run(
|
|
166
|
+
RunSpec(slug="demo", codebase="./mycode", run="python go.py"),
|
|
167
|
+
PodConfig(gpu="A100"), # ModalConfig(gpu="A100") runs the same pipeline on Modal
|
|
168
|
+
))
|
|
169
|
+
print(res.remote_exit, res.local_results)
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
`run()` provisions → waits until functional → uploads the codebase (local dir *or* git
|
|
173
|
+
URL) → runs `setup` then `run` (tee'd to `results/run.log`) → pulls the results
|
|
174
|
+
dir back → optionally uploads to GCS → tears down → returns a `RunResult`. Pass
|
|
175
|
+
a `ModalConfig` instead of a `PodConfig` to run the exact same pipeline on a
|
|
176
|
+
Modal sandbox.
|
|
177
|
+
|
|
178
|
+
CLI equivalent — the same `--gpu` flag works on both backends (omit it for a
|
|
179
|
+
CPU box):
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
bellhop run --slug demo --codebase ./mycode --run "python go.py" --gpu A100
|
|
183
|
+
bellhop run --backend modal --slug demo --codebase ./mycode --run "python go.py" --gpu A100
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
### Fan out a sweep
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
from dataclasses import replace
|
|
190
|
+
from bellhop import RunSpec, run_many
|
|
191
|
+
|
|
192
|
+
base = RunSpec(slug="sweep", codebase="./code", run="python train.py")
|
|
193
|
+
specs = [replace(base, slug=f"lr{lr}", run=f"python train.py --lr {lr}")
|
|
194
|
+
for lr in (1e-4, 3e-4, 1e-3)]
|
|
195
|
+
results = await run_many(specs, gpu_cfg, max_concurrency=4)  # gpu_cfg: a PodConfig or ModalConfig
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
## Cleanup: two layers
|
|
199
|
+
|
|
200
|
+
| When | Handled by |
|
|
201
|
+
|------|------------|
|
|
202
|
+
| Normal exit, exception, Ctrl-C | the `async with` block's `finally` — **always** tears the pod down (unless `keep=True`) |
|
|
203
|
+
| The host process itself dies (kill -9, crash, reboot) | native RunPod safety timers (below) |
|
|
204
|
+
|
|
205
|
+
The context manager is the primary guarantee and covers essentially everything.
|
|
206
|
+
The timers are a backstop for the one case `finally` can't reach.
|
|
207
|
+
|
|
208
|
+
### Native safety timers
|
|
209
|
+
|
|
210
|
+
Every GPU pod is created with RunPod's own server-side timers, set atomically at
|
|
211
|
+
creation so they hold even if your process dies the instant after:
|
|
212
|
+
|
|
213
|
+
```python
|
|
214
|
+
from datetime import timedelta
|
|
215
|
+
PodConfig(
|
|
216
|
+
stop_after=timedelta(hours=24), # halt compute billing; disk persists, restartable
|
|
217
|
+
terminate_after=timedelta(hours=72), # delete the pod; all billing stops
|
|
218
|
+
)
|
|
219
|
+
# set either to None to disable
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
These use the GraphQL `podFindAndDeployOnDemand` mutation (REST has no TTL
|
|
223
|
+
field), so setting a timer routes pod creation through GraphQL automatically.
|
|
224
|
+
|
|
225
|
+
> **Granularity caveat.** RunPod enforces these on a coarse schedule, *not*
|
|
226
|
+
> minute-precise — a short timer may fire well after its deadline. Treat them as
|
|
227
|
+
> an hours-scale backstop, not a precise kill switch. The `async with` cleanup
|
|
228
|
+
> is what you should rely on for prompt teardown. Native TTL currently applies
|
|
229
|
+
> to GPU pods only (the on-demand path); CPU pods rely on `finally` alone.
|
|
230
|
+
|
|
231
|
+
On the **Modal** backend the equivalents are first-class `create` kwargs:
|
|
232
|
+
`ModalConfig(timeout=timedelta(hours=24))` is the hard max lifetime and
|
|
233
|
+
`idle_timeout=timedelta(minutes=30)` terminates the sandbox after inactivity —
|
|
234
|
+
no GraphQL detour, and they apply to CPU and GPU sandboxes alike.
|
|
235
|
+
|
|
236
|
+
The backend-agnostic spelling of the hard kill is
|
|
237
|
+
**`max_lifetime=timedelta(...)`** — set it on either config (or
|
|
238
|
+
`--max-lifetime-hours` on the CLI) and it maps to `terminate_after` on RunPod
|
|
239
|
+
and `timeout` on Modal, taking precedence over those fields.
|
|
240
|
+
|
|
241
|
+
## Optional: persist results to GCS
|
|
242
|
+
|
|
243
|
+
Off by default. Pass `gcs_base` (or `--gcs-base`) to upload the pulled results
|
|
244
|
+
to Google Cloud Storage from your machine (credentials never touch the pod):
|
|
245
|
+
|
|
246
|
+
```python
|
|
247
|
+
RunSpec(slug="demo", codebase="./code", run="python go.py",
|
|
248
|
+
gcs_base="gs://your-bucket/experiments")
|
|
249
|
+
# res.gcs_uri and res.retrieve_cmd are populated
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
## Typed errors
|
|
253
|
+
|
|
254
|
+
`BellhopError` subclasses let you branch on failure mode:
|
|
255
|
+
`PreflightError` (bad config / missing key / `modal` not installed),
|
|
256
|
+
`ProvisionError` (pod or sandbox create failed), `PodNotReadyError` (never became
|
|
257
|
+
functional), `RemoteJobError` (carries `.remote_exit` + `.log_tail`),
|
|
258
|
+
`ResultsMissingError`, `GcsUploadError`. (`RunpodError` is a back-compat alias
|
|
259
|
+
for `BellhopError`.)
|
|
260
|
+
|
|
261
|
+
## Notes
|
|
262
|
+
|
|
263
|
+
- Code/result transfer is **tar-over-ssh** on RunPod and **tar-over-exec** on
|
|
264
|
+
Modal — only needs `tar` in the image (no rsync; on RunPod also `ssh`).
|
|
265
|
+
- Env vars passed to `exec(env=...)` never appear in the box's process list:
|
|
266
|
+
RunPod exports them inside a script fed over stdin; Modal passes them over its
|
|
267
|
+
API, not argv.
|
|
268
|
+
- On out-of-stock, a RunPod `COMMUNITY` request retries on `SECURE` automatically
|
|
269
|
+
(disable with `cloud_fallback=False`).
|
|
270
|
+
- The Modal default image is `debian_slim` with `git` + `tar`; add packages with
|
|
271
|
+
`ModalConfig(pip=[...], apt=[...])`, or supply your own `modal.Image` /
|
|
272
|
+
registry ref (assumed to already have `tar`).
|
|
273
|
+
|
|
274
|
+
## Development
|
|
275
|
+
|
|
276
|
+
```bash
|
|
277
|
+
pip install -e ".[dev]"
|
|
278
|
+
pytest # offline unit tests (no pod/sandbox, no cost)
|
|
279
|
+
RUNPOD_LIVE=1 pytest tests/integration_live.py -s # billed RunPod end-to-end test
|
|
280
|
+
MODAL_LIVE=1 pytest tests/integration_modal.py -s # billed Modal end-to-end test
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
## License
|
|
284
|
+
|
|
285
|
+
MIT
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
# bellhop
|
|
2
|
+
|
|
3
|
+
**Check your code into an ephemeral box, run it, bring the results back, and
|
|
4
|
+
check out** — an async Python library for disposable compute. Two backends:
|
|
5
|
+
a [RunPod](https://runpod.io) pod or a [Modal](https://modal.com) sandbox.
|
|
6
|
+
|
|
7
|
+
Like a hotel bellhop: it books a room (provisions the box), waits until it's
|
|
8
|
+
actually ready, carries your luggage up (uploads your code), and when you leave
|
|
9
|
+
it brings your bags back down (pulls results) and checks out (tears the box
|
|
10
|
+
down) — so you never leave a box (or a bill) running by accident.
|
|
11
|
+
|
|
12
|
+
```python
|
|
13
|
+
import asyncio
|
|
14
|
+
from bellhop import pod, PodConfig
|
|
15
|
+
|
|
16
|
+
async def main():
|
|
17
|
+
async with pod(PodConfig(gpu="RTX4090")) as p:
|
|
18
|
+
await p.push("./mycode", "/workspace/job")
|
|
19
|
+
r = await p.exec("cd /workspace/job && python train.py")
|
|
20
|
+
print(r.stdout)
|
|
21
|
+
await p.pull("/workspace/job/out", "./results")
|
|
22
|
+
# pod is gone here — even if the body raised
|
|
23
|
+
|
|
24
|
+
asyncio.run(main())
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
The same code runs on Modal by swapping the config — `sandbox(ModalConfig(...))`
|
|
28
|
+
instead of `pod(PodConfig(...))` (see [Two backends](#two-backends) below).
|
|
29
|
+
|
|
30
|
+
The RunPod backend talks to the RunPod **REST API** (`rest.runpod.io/v1`)
|
|
31
|
+
directly over `httpx`, falling back to the **GraphQL API** only to set native
|
|
32
|
+
safety timers. No `runpodctl`, no vendored SDK. The Modal backend drives a
|
|
33
|
+
Modal **Sandbox** via the `modal` SDK.
|
|
34
|
+
|
|
35
|
+
## Install
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install bellhop-py # RunPod backend (or: pip install git+https://github.com/dtch1997/bellhop)
|
|
39
|
+
pip install 'bellhop-py[modal]' # add the Modal backend
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
(The PyPI distribution is `bellhop-py` — the bare `bellhop` name is an
|
|
43
|
+
unrelated package — but the import name and CLI are plain `bellhop`.)
|
|
44
|
+
|
|
45
|
+
For the **RunPod** backend, set `RUNPOD_API_KEY`. Connection uses your SSH
|
|
46
|
+
keypair (`~/.ssh/id_ed25519` by default): bellhop injects the public key as the
|
|
47
|
+
pod's `PUBLIC_KEY` env so `root@pod` is reachable. For the **Modal** backend,
|
|
48
|
+
configure Modal auth (`modal token new`, or `MODAL_TOKEN_ID` /
|
|
49
|
+
`MODAL_TOKEN_SECRET`). (GCS upload, if you enable it, needs `gcloud` on your
|
|
50
|
+
`PATH` either way.)
|
|
51
|
+
|
|
52
|
+
## Two backends
|
|
53
|
+
|
|
54
|
+
Both backends implement the same `ExecBox` contract — `exec` / `push` / `pull`
|
|
55
|
+
/ `exists_remote` / `teardown` — so the high-level `run()` / `run_many()`
|
|
56
|
+
pipeline (below) is provider-agnostic: hand it a `PodConfig` for RunPod or a
|
|
57
|
+
`ModalConfig` for Modal and everything else is identical.
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from bellhop import sandbox, ModalConfig
|
|
61
|
+
|
|
62
|
+
async with sandbox(ModalConfig(gpu="A10G")) as b: # CPU box: omit gpu
|
|
63
|
+
await b.push("./mycode", "/workspace/job")
|
|
64
|
+
r = await b.exec("cd /workspace/job && python train.py")
|
|
65
|
+
print(r.stdout)
|
|
66
|
+
await b.pull("/workspace/job/out", "./results")
|
|
67
|
+
# sandbox terminated on exit (pass keep=True to leave it up)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
The whole common surface is spelled the same on both configs:
|
|
71
|
+
|
|
72
|
+
- **`gpu=`** — a canonical short name (`"A100"`, `"H100"`, `"L4"`, …); `None`
|
|
73
|
+
means a CPU box. On RunPod the name expands through `GPU_ALIASES` to the
|
|
74
|
+
*list* of matching gpuTypeIds (e.g. `"A100"` → PCIe *and* SXM), which the
|
|
75
|
+
REST API accepts wholesale — better stock availability than naming one SKU.
|
|
76
|
+
A full RunPod id (`"NVIDIA GeForce RTX 4090"`) still passes verbatim.
|
|
77
|
+
- **`max_lifetime=`** — the hard server-side kill switch, `timedelta` on both
|
|
78
|
+
(maps to `terminate_after` on RunPod, `timeout` on Modal).
|
|
79
|
+
- **`image=` / `image_preset=`** — the `pytorch-cuda` preset is pinned to the
|
|
80
|
+
same torch 2.4.0 + CUDA 12.4 environment on both backends.
|
|
81
|
+
|
|
82
|
+
What genuinely differs stays backend-specific:
|
|
83
|
+
|
|
84
|
+
| | RunPod (`PodConfig`, `pod()`) | Modal (`ModalConfig`, `sandbox()`) |
|
|
85
|
+
|---|---|---|
|
|
86
|
+
| Readiness | SSH/probe wait (below) | none — `create()` returns an execable box |
|
|
87
|
+
| Extra TTL | `stop_after` (wall-clock compute halt) | `idle_timeout` (kill after inactivity) |
|
|
88
|
+
| Image extras | — | `pip=` / `apt=`, `modal.Image`, `secrets=`, `volumes=` |
|
|
89
|
+
| Placement | `cloud=` SECURE/COMMUNITY (+fallback) | `region=`, `cpu=`, `memory=` |
|
|
90
|
+
| Auth | `RUNPOD_API_KEY` + SSH keypair | Modal token (`modal token new`) |
|
|
91
|
+
|
|
92
|
+
(`stop_after` and `idle_timeout` are deliberately *not* unified — one is a
|
|
93
|
+
wall-clock timer, the other an inactivity timer; pretending they're the same
|
|
94
|
+
concept would be a trap. `gpu_id=` remains as a legacy spelling of a verbatim
|
|
95
|
+
RunPod id.)
|
|
96
|
+
|
|
97
|
+
## "Return when functional" — the hard part (RunPod only)
|
|
98
|
+
|
|
99
|
+
`desiredStatus == RUNNING` is necessary but **not sufficient**: sshd / your
|
|
100
|
+
server typically lags the RUNNING state by 30–60s. So once a pod is routable
|
|
101
|
+
(RUNNING + public IP + mapped port), bellhop runs a **readiness probe** until it
|
|
102
|
+
passes before handing the pod to you. "Functional" is caller-specific, so it's
|
|
103
|
+
pluggable:
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from bellhop import SshProbe, TcpProbe, HttpProbe, LogMarkerProbe
|
|
107
|
+
|
|
108
|
+
PodConfig(..., ready=SshProbe("true")) # ssh job pods (default)
|
|
109
|
+
PodConfig(..., ready=HttpProbe(8000, "/health")) # a served endpoint
|
|
110
|
+
PodConfig(..., ready=LogMarkerProbe("server up")) # headless pods
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
(Modal sandboxes are execable as soon as `create()` returns, so there's no
|
|
114
|
+
probe step on that backend.)
|
|
115
|
+
|
|
116
|
+
## Two ways to use it
|
|
117
|
+
|
|
118
|
+
### Composable pod — multi-step / interactive
|
|
119
|
+
|
|
120
|
+
Keep one pod alive and run many steps against it:
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
async with pod(PodConfig(gpu="RTX4090")) as p:
|
|
124
|
+
await p.push("./code", "/workspace/job")
|
|
125
|
+
await p.exec("cd /workspace/job && python train.py", env={"HF_TOKEN": tok})
|
|
126
|
+
await p.exec("python eval.py") # same pod, no re-provision
|
|
127
|
+
await p.pull("/workspace/job/results", "./out")
|
|
128
|
+
print(p.proxy_url(8000)) # https://<id>-8000.proxy.runpod.net
|
|
129
|
+
# torn down on exit (pass keep=True to leave it up)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### One-shot — provision, run, collect, done
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
import asyncio
|
|
136
|
+
from bellhop import run, RunSpec, PodConfig
|
|
137
|
+
|
|
138
|
+
res = asyncio.run(run(
|
|
139
|
+
RunSpec(slug="demo", codebase="./mycode", run="python go.py"),
|
|
140
|
+
PodConfig(gpu="A100"), # ModalConfig(gpu="A100") runs the same pipeline on Modal
|
|
141
|
+
))
|
|
142
|
+
print(res.remote_exit, res.local_results)
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
`run()` provisions → waits until functional → uploads the codebase (local dir *or* git
|
|
146
|
+
URL) → runs `setup` then `run` (tee'd to `results/run.log`) → pulls the results
|
|
147
|
+
dir back → optionally uploads to GCS → tears down → returns a `RunResult`. Pass
|
|
148
|
+
a `ModalConfig` instead of a `PodConfig` to run the exact same pipeline on a
|
|
149
|
+
Modal sandbox.
|
|
150
|
+
|
|
151
|
+
CLI equivalent — the same `--gpu` flag works on both backends (omit it for a
|
|
152
|
+
CPU box):
|
|
153
|
+
|
|
154
|
+
```bash
|
|
155
|
+
bellhop run --slug demo --codebase ./mycode --run "python go.py" --gpu A100
|
|
156
|
+
bellhop run --backend modal --slug demo --codebase ./mycode --run "python go.py" --gpu A100
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### Fan out a sweep
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
from dataclasses import replace
|
|
163
|
+
from bellhop import RunSpec, run_many
|
|
164
|
+
|
|
165
|
+
base = RunSpec(slug="sweep", codebase="./code", run="python train.py")
|
|
166
|
+
specs = [replace(base, slug=f"lr{lr}", run=f"python train.py --lr {lr}")
|
|
167
|
+
for lr in (1e-4, 3e-4, 1e-3)]
|
|
168
|
+
results = await run_many(specs, gpu_cfg, max_concurrency=4)  # gpu_cfg: a PodConfig or ModalConfig
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## Cleanup: two layers
|
|
172
|
+
|
|
173
|
+
| When | Handled by |
|
|
174
|
+
|------|------------|
|
|
175
|
+
| Normal exit, exception, Ctrl-C | the `async with` block's `finally` — **always** tears the pod down (unless `keep=True`) |
|
|
176
|
+
| The host process itself dies (kill -9, crash, reboot) | native RunPod safety timers (below) |
|
|
177
|
+
|
|
178
|
+
The context manager is the primary guarantee and covers essentially everything.
|
|
179
|
+
The timers are a backstop for the one case `finally` can't reach.
|
|
180
|
+
|
|
181
|
+
### Native safety timers
|
|
182
|
+
|
|
183
|
+
Every GPU pod is created with RunPod's own server-side timers, set atomically at
|
|
184
|
+
creation so they hold even if your process dies the instant after:
|
|
185
|
+
|
|
186
|
+
```python
|
|
187
|
+
from datetime import timedelta
|
|
188
|
+
PodConfig(
|
|
189
|
+
stop_after=timedelta(hours=24), # halt compute billing; disk persists, restartable
|
|
190
|
+
terminate_after=timedelta(hours=72), # delete the pod; all billing stops
|
|
191
|
+
)
|
|
192
|
+
# set either to None to disable
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
These use the GraphQL `podFindAndDeployOnDemand` mutation (REST has no TTL
|
|
196
|
+
field), so setting a timer routes pod creation through GraphQL automatically.
|
|
197
|
+
|
|
198
|
+
> **Granularity caveat.** RunPod enforces these on a coarse schedule, *not*
|
|
199
|
+
> minute-precise — a short timer may fire well after its deadline. Treat them as
|
|
200
|
+
> an hours-scale backstop, not a precise kill switch. The `async with` cleanup
|
|
201
|
+
> is what you should rely on for prompt teardown. Native TTL currently applies
|
|
202
|
+
> to GPU pods only (the on-demand path); CPU pods rely on `finally` alone.
|
|
203
|
+
|
|
204
|
+
On the **Modal** backend the equivalents are first-class `create` kwargs:
|
|
205
|
+
`ModalConfig(timeout=timedelta(hours=24))` is the hard max lifetime and
|
|
206
|
+
`idle_timeout=timedelta(minutes=30)` terminates the sandbox after inactivity —
|
|
207
|
+
no GraphQL detour, and they apply to CPU and GPU sandboxes alike.
|
|
208
|
+
|
|
209
|
+
The backend-agnostic spelling of the hard kill is
|
|
210
|
+
**`max_lifetime=timedelta(...)`** — set it on either config (or
|
|
211
|
+
`--max-lifetime-hours` on the CLI) and it maps to `terminate_after` on RunPod
|
|
212
|
+
and `timeout` on Modal, taking precedence over those fields.
|
|
213
|
+
|
|
214
|
+
## Optional: persist results to GCS
|
|
215
|
+
|
|
216
|
+
Off by default. Pass `gcs_base` (or `--gcs-base`) to upload the pulled results
|
|
217
|
+
to Google Cloud Storage from your machine (credentials never touch the pod):
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
RunSpec(slug="demo", codebase="./code", run="python go.py",
|
|
221
|
+
gcs_base="gs://your-bucket/experiments")
|
|
222
|
+
# res.gcs_uri and res.retrieve_cmd are populated
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
## Typed errors
|
|
226
|
+
|
|
227
|
+
`BellhopError` subclasses let you branch on failure mode:
|
|
228
|
+
`PreflightError` (bad config / missing key / `modal` not installed),
|
|
229
|
+
`ProvisionError` (pod or sandbox create failed), `PodNotReadyError` (never became
|
|
230
|
+
functional), `RemoteJobError` (carries `.remote_exit` + `.log_tail`),
|
|
231
|
+
`ResultsMissingError`, `GcsUploadError`. (`RunpodError` is a back-compat alias
|
|
232
|
+
for `BellhopError`.)
|
|
233
|
+
|
|
234
|
+
## Notes
|
|
235
|
+
|
|
236
|
+
- Code/result transfer is **tar-over-ssh** on RunPod and **tar-over-exec** on
|
|
237
|
+
Modal — only needs `tar` in the image (no rsync; on RunPod also `ssh`).
|
|
238
|
+
- Env vars passed to `exec(env=...)` never appear in the box's process list:
|
|
239
|
+
RunPod exports them inside a script fed over stdin; Modal passes them over its
|
|
240
|
+
API, not argv.
|
|
241
|
+
- On out-of-stock, a RunPod `COMMUNITY` request retries on `SECURE` automatically
|
|
242
|
+
(disable with `cloud_fallback=False`).
|
|
243
|
+
- The Modal default image is `debian_slim` with `git` + `tar`; add packages with
|
|
244
|
+
`ModalConfig(pip=[...], apt=[...])`, or supply your own `modal.Image` /
|
|
245
|
+
registry ref (assumed to already have `tar`).
|
|
246
|
+
|
|
247
|
+
## Development
|
|
248
|
+
|
|
249
|
+
```bash
|
|
250
|
+
pip install -e ".[dev]"
|
|
251
|
+
pytest # offline unit tests (no pod/sandbox, no cost)
|
|
252
|
+
RUNPOD_LIVE=1 pytest tests/integration_live.py -s # billed RunPod end-to-end test
|
|
253
|
+
MODAL_LIVE=1 pytest tests/integration_modal.py -s # billed Modal end-to-end test
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
## License
|
|
257
|
+
|
|
258
|
+
MIT
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import pathlib
|
|
3
|
+
import subprocess
|
|
4
|
+
|
|
5
|
+
pathlib.Path("results").mkdir(exist_ok=True)
|
|
6
|
+
secret = os.environ.get("MY_SECRET", "<MISSING>")
|
|
7
|
+
msg = f"job ran on pod; MY_SECRET={secret}"
|
|
8
|
+
print(msg)
|
|
9
|
+
pathlib.Path("results/out.txt").write_text(msg + "\n")
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
g = subprocess.check_output(["nvidia-smi", "-L"], text=True).strip()
|
|
13
|
+
except Exception as e:
|
|
14
|
+
g = f"no nvidia-smi: {e}"
|
|
15
|
+
print("GPU:", g)
|
|
16
|
+
pathlib.Path("results/gpu.txt").write_text(g + "\n")
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
# Distribution name is bellhop-py ("bellhop" is squatted on PyPI by an
|
|
3
|
+
# unrelated 2022 package); the import name and CLI remain `bellhop`.
|
|
4
|
+
name = "bellhop-py"
|
|
5
|
+
version = "0.3.0"
|
|
6
|
+
description = "Async Python library that checks your code into an ephemeral box (RunPod pod or Modal sandbox), runs it, brings the results back, and checks out."
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
license = "MIT"
|
|
9
|
+
authors = [{ name = "Daniel Tan" }]
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
dependencies = ["httpx>=0.27"]
|
|
12
|
+
keywords = ["runpod", "modal", "gpu", "ephemeral", "compute", "sandbox"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Development Status :: 4 - Beta",
|
|
15
|
+
"Framework :: AsyncIO",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Topic :: Software Development :: Libraries",
|
|
19
|
+
"Topic :: System :: Distributed Computing",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[project.urls]
|
|
23
|
+
Homepage = "https://github.com/dtch1997/bellhop"
|
|
24
|
+
Repository = "https://github.com/dtch1997/bellhop"
|
|
25
|
+
Issues = "https://github.com/dtch1997/bellhop/issues"
|
|
26
|
+
|
|
27
|
+
[project.scripts]
|
|
28
|
+
bellhop = "bellhop.cli:main"
|
|
29
|
+
|
|
30
|
+
[project.optional-dependencies]
|
|
31
|
+
modal = ["modal>=1.0"]
|
|
32
|
+
dev = ["pytest>=8", "pytest-asyncio>=0.23", "modal>=1.0"]
|
|
33
|
+
|
|
34
|
+
[build-system]
|
|
35
|
+
requires = ["hatchling"]
|
|
36
|
+
build-backend = "hatchling.build"
|
|
37
|
+
|
|
38
|
+
[tool.hatch.build.targets.wheel]
|
|
39
|
+
packages = ["src/bellhop"]
|
|
40
|
+
|
|
41
|
+
[tool.pytest.ini_options]
|
|
42
|
+
asyncio_mode = "auto"
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""bellhop: check your code into an ephemeral box (RunPod pod or Modal sandbox), run it, bring results back, check out."""
|
|
2
|
+
|
|
3
|
+
from .backend import ExecBox, ExecResult, open_box
|
|
4
|
+
from .errors import (
|
|
5
|
+
BellhopError,
|
|
6
|
+
GcsUploadError,
|
|
7
|
+
PodNotReadyError,
|
|
8
|
+
PreflightError,
|
|
9
|
+
ProvisionError,
|
|
10
|
+
RemoteJobError,
|
|
11
|
+
ResultsMissingError,
|
|
12
|
+
RunpodError,
|
|
13
|
+
)
|
|
14
|
+
from .graphql import RunpodGraphQL
|
|
15
|
+
from .modal_box import ModalConfig, Sandbox, sandbox
|
|
16
|
+
from .pod import GPU_ALIASES, IMAGE_PRESETS, Pod, PodConfig, pod
|
|
17
|
+
from .probes import HttpProbe, LogMarkerProbe, ReadyProbe, SshProbe, TcpProbe
|
|
18
|
+
from .rest import RunpodRest
|
|
19
|
+
from .run import RunResult, RunSpec, run, run_many
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
# backend-agnostic surface
|
|
23
|
+
"run", "run_many", "RunSpec", "RunResult",
|
|
24
|
+
"open_box", "ExecBox", "ExecResult",
|
|
25
|
+
# RunPod backend
|
|
26
|
+
"pod", "Pod", "PodConfig", "IMAGE_PRESETS", "GPU_ALIASES",
|
|
27
|
+
"RunpodRest", "RunpodGraphQL",
|
|
28
|
+
"ReadyProbe", "SshProbe", "TcpProbe", "HttpProbe", "LogMarkerProbe",
|
|
29
|
+
# Modal backend
|
|
30
|
+
"sandbox", "Sandbox", "ModalConfig",
|
|
31
|
+
# errors
|
|
32
|
+
"BellhopError", "RunpodError", "PreflightError", "ProvisionError", "PodNotReadyError",
|
|
33
|
+
"RemoteJobError", "ResultsMissingError", "GcsUploadError",
|
|
34
|
+
]
|