aws-bootstrap-g4dn 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aws_bootstrap/__init__.py +1 -0
- aws_bootstrap/cli.py +438 -0
- aws_bootstrap/config.py +24 -0
- aws_bootstrap/ec2.py +341 -0
- aws_bootstrap/resources/__init__.py +0 -0
- aws_bootstrap/resources/gpu_benchmark.py +839 -0
- aws_bootstrap/resources/gpu_smoke_test.ipynb +340 -0
- aws_bootstrap/resources/remote_setup.sh +188 -0
- aws_bootstrap/resources/requirements.txt +8 -0
- aws_bootstrap/ssh.py +513 -0
- aws_bootstrap/tests/__init__.py +0 -0
- aws_bootstrap/tests/test_cli.py +528 -0
- aws_bootstrap/tests/test_config.py +35 -0
- aws_bootstrap/tests/test_ec2.py +313 -0
- aws_bootstrap/tests/test_ssh_config.py +297 -0
- aws_bootstrap/tests/test_ssh_gpu.py +138 -0
- aws_bootstrap_g4dn-0.1.0.dist-info/METADATA +308 -0
- aws_bootstrap_g4dn-0.1.0.dist-info/RECORD +22 -0
- aws_bootstrap_g4dn-0.1.0.dist-info/WHEEL +5 -0
- aws_bootstrap_g4dn-0.1.0.dist-info/entry_points.txt +2 -0
- aws_bootstrap_g4dn-0.1.0.dist-info/licenses/LICENSE +21 -0
- aws_bootstrap_g4dn-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""Tests for GPU info queries via SSH (get_ssh_host_details, query_gpu_info)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
import subprocess
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from unittest.mock import patch
|
|
7
|
+
|
|
8
|
+
from aws_bootstrap.ssh import (
|
|
9
|
+
_GPU_ARCHITECTURES,
|
|
10
|
+
GpuInfo,
|
|
11
|
+
add_ssh_host,
|
|
12
|
+
get_ssh_host_details,
|
|
13
|
+
query_gpu_info,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Public key used throughout these tests; the matching private key is the
# same path with the ".pub" suffix stripped (see the identity_file assertion).
KEY_PATH = Path("/home/user/.ssh/id_ed25519.pub")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# ---------------------------------------------------------------------------
|
|
21
|
+
# get_ssh_host_details
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_get_ssh_host_details_found(tmp_path):
    """A managed SSH config block round-trips hostname, user, and identity file."""
    config_file = tmp_path / ".ssh" / "config"
    add_ssh_host("i-abc123", "1.2.3.4", "ubuntu", KEY_PATH, config_path=config_file)

    result = get_ssh_host_details("i-abc123", config_path=config_file)

    assert result is not None
    assert result.hostname == "1.2.3.4"
    assert result.user == "ubuntu"
    # The private key path is the public key path minus its ".pub" suffix.
    assert result.identity_file == Path("/home/user/.ssh/id_ed25519")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_get_ssh_host_details_not_found(tmp_path):
    """An empty SSH config yields None for any instance id."""
    config_file = tmp_path / ".ssh" / "config"
    config_file.parent.mkdir(parents=True, exist_ok=True)
    config_file.write_text("")

    assert get_ssh_host_details("i-missing", config_path=config_file) is None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_get_ssh_host_details_nonexistent_file(tmp_path):
    """A missing SSH config file yields None."""
    missing = tmp_path / "no_such_file"
    assert get_ssh_host_details("i-abc123", config_path=missing) is None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
# query_gpu_info
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
|
|
56
|
+
# Canned remote output: a CSV line (driver version, GPU name, compute
# capability), then the driver's max CUDA version, then the nvcc toolkit
# version — matching the fields asserted in test_query_gpu_info_success.
NVIDIA_SMI_OUTPUT = "560.35.03, Tesla T4, 7.5\n12.8\n12.6\n"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@patch("aws_bootstrap.ssh.subprocess.run")
def test_query_gpu_info_success(mock_run):
    """A clean nvidia-smi + nvcc response is parsed into a complete GpuInfo."""
    mock_run.return_value = subprocess.CompletedProcess(
        args=[], returncode=0, stdout=NVIDIA_SMI_OUTPUT, stderr=""
    )

    result = query_gpu_info("1.2.3.4", "ubuntu", Path("/home/user/.ssh/id_ed25519"))

    assert result is not None
    assert isinstance(result, GpuInfo)
    # Every field comes straight from the canned NVIDIA_SMI_OUTPUT fixture.
    expected = {
        "driver_version": "560.35.03",
        "cuda_driver_version": "12.8",
        "cuda_toolkit_version": "12.6",
        "gpu_name": "Tesla T4",
        "compute_capability": "7.5",
        "architecture": "Turing",
    }
    for attr, value in expected.items():
        assert getattr(result, attr) == value
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@patch("aws_bootstrap.ssh.subprocess.run")
def test_query_gpu_info_no_nvcc(mock_run):
    """An 'N/A' toolkit line leaves cuda_toolkit_version as None."""
    stdout = "560.35.03, Tesla T4, 7.5\n12.8\nN/A\n"
    mock_run.return_value = subprocess.CompletedProcess(
        args=[], returncode=0, stdout=stdout, stderr=""
    )

    result = query_gpu_info("1.2.3.4", "ubuntu", Path("/home/user/.ssh/id_ed25519"))

    assert result is not None
    assert result.cuda_driver_version == "12.8"
    assert result.cuda_toolkit_version is None
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@patch("aws_bootstrap.ssh.subprocess.run")
def test_query_gpu_info_ssh_failure(mock_run):
    """An SSH command that exits non-zero yields None."""
    mock_run.return_value = subprocess.CompletedProcess(
        args=[], returncode=255, stdout="", stderr="Connection refused"
    )

    assert query_gpu_info("1.2.3.4", "ubuntu", Path("/home/user/.ssh/id_ed25519")) is None
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@patch("aws_bootstrap.ssh.subprocess.run", side_effect=subprocess.TimeoutExpired(cmd="ssh", timeout=15))
def test_query_gpu_info_timeout(mock_run):
    """A TimeoutExpired raised by the SSH call yields None."""
    assert query_gpu_info("1.2.3.4", "ubuntu", Path("/home/user/.ssh/id_ed25519")) is None
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@patch("aws_bootstrap.ssh.subprocess.run")
def test_query_gpu_info_malformed_output(mock_run):
    """Unparseable command output yields None."""
    mock_run.return_value = subprocess.CompletedProcess(
        args=[], returncode=0, stdout="not valid gpu output\n", stderr=""
    )

    assert query_gpu_info("1.2.3.4", "ubuntu", Path("/home/user/.ssh/id_ed25519")) is None
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# ---------------------------------------------------------------------------
|
|
115
|
+
# GPU architecture mapping
|
|
116
|
+
# ---------------------------------------------------------------------------
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def test_gpu_architecture_mapping():
    """Known compute capabilities resolve to the expected architecture names."""
    expected = {
        "7.0": "Volta",
        "7.5": "Turing",
        "8.0": "Ampere",
        "8.6": "Ampere",
        "8.9": "Ada Lovelace",
        "9.0": "Hopper",
    }
    for capability, architecture in expected.items():
        assert _GPU_ARCHITECTURES[capability] == architecture
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@patch("aws_bootstrap.ssh.subprocess.run")
def test_query_gpu_info_unknown_architecture(mock_run):
    """A compute capability outside the known table maps to 'Unknown (<cc>)'."""
    mock_run.return_value = subprocess.CompletedProcess(
        args=[], returncode=0, stdout="550.00.00, Future GPU, 10.0\n13.0\n13.0\n", stderr=""
    )

    result = query_gpu_info("1.2.3.4", "ubuntu", Path("/home/user/.ssh/id_ed25519"))

    assert result is not None
    assert result.architecture == "Unknown (10.0)"
|
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aws-bootstrap-g4dn
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Bootstrap AWS EC2 GPU instances for hybrid local-remote development
|
|
5
|
+
Author: Adam Ever-Hadani
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/promptromp/aws-bootstrap-g4dn
|
|
8
|
+
Project-URL: Issues, https://github.com/promptromp/aws-bootstrap-g4dn/issues
|
|
9
|
+
Keywords: aws,ec2,gpu,cuda,deep-learning,spot-instances,cli
|
|
10
|
+
Requires-Python: >=3.14
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: boto3>=1.35
|
|
14
|
+
Requires-Dist: click>=8.1
|
|
15
|
+
Dynamic: license-file
|
|
16
|
+
|
|
17
|
+
# aws-bootstrap-g4dn
|
|
18
|
+
|
|
19
|
+
--------------------------------------------------------------------------------
|
|
20
|
+
|
|
21
|
+
[](https://github.com/promptromp/aws-bootstrap-g4dn/actions/workflows/ci.yml)
|
|
22
|
+
[](https://github.com/promptromp/aws-bootstrap-g4dn/blob/main/LICENSE)
|
|
23
|
+
[](https://pypi.org/project/aws-bootstrap-g4dn/)
|
|
24
|
+
[](https://pypi.org/project/aws-bootstrap-g4dn/)
|
|
25
|
+
|
|
26
|
+
One command to go from zero to a **fully configured GPU dev box** on AWS — with CUDA-matched PyTorch, Jupyter, SSH aliases, and a GPU benchmark ready to run.
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
aws-bootstrap launch # Spot g4dn.xlarge in ~3 minutes
|
|
30
|
+
ssh aws-gpu1 # You're in, venv activated, PyTorch works
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### ✨ Key Features
|
|
34
|
+
|
|
35
|
+
| | Feature | Details |
|
|
36
|
+
|---|---|---|
|
|
37
|
+
| 🚀 | **One-command launch** | Spot (default) or on-demand, with automatic fallback on capacity errors |
|
|
38
|
+
| 🔑 | **Auto SSH config** | Adds `aws-gpu1` alias to `~/.ssh/config` — no IP juggling. Cleaned up on terminate |
|
|
39
|
+
| 🐍 | **CUDA-aware PyTorch** | Detects the installed CUDA toolkit (`nvcc`) and installs PyTorch from the matching wheel index — no more `torch.version.cuda` mismatches |
|
|
40
|
+
| ✅ | **PyTorch smoke test** | Runs a quick `torch.cuda` matmul after setup to verify the GPU stack works end-to-end |
|
|
41
|
+
| 📊 | **GPU benchmark included** | CNN (MNIST) + Transformer benchmarks with FP16/FP32/BF16 precision and tqdm progress |
|
|
42
|
+
| 📓 | **Jupyter ready** | Lab server auto-starts as a systemd service on port 8888 — just SSH tunnel and open |
|
|
43
|
+
| 🖥️ | **`status --gpu`** | Shows CUDA toolkit version, driver max, GPU architecture, spot pricing, uptime, and estimated cost |
|
|
44
|
+
| 🗑️ | **Clean terminate** | Stops instances, removes SSH aliases, shows shutting-down state until fully gone |
|
|
45
|
+
|
|
46
|
+
### 🎯 Target Workflows
|
|
47
|
+
|
|
48
|
+
1. **Jupyter server-client** — Jupyter runs on the instance, connect from your local browser
|
|
49
|
+
2. **VSCode Remote SSH** — `ssh aws-gpu1` just works with the Remote SSH extension
|
|
50
|
+
3. **NVIDIA Nsight remote debugging** — GPU debugging over SSH
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## Requirements
|
|
55
|
+
|
|
56
|
+
1. AWS profile configured with relevant permissions (profile name can be passed via `--profile` or read from `AWS_PROFILE` env var)
|
|
57
|
+
2. AWS CLI v2 — see [here](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)
|
|
58
|
+
3. Python 3.14+ and [uv](https://github.com/astral-sh/uv)
|
|
59
|
+
4. An SSH key pair (see below)
|
|
60
|
+
|
|
61
|
+
## Installation
|
|
62
|
+
|
|
63
|
+
### From PyPI
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install aws-bootstrap-g4dn
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### From source (development)
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
git clone https://github.com/promptromp/aws-bootstrap-g4dn.git
|
|
73
|
+
cd aws-bootstrap-g4dn
|
|
74
|
+
uv venv
|
|
75
|
+
uv sync
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
Both methods install the `aws-bootstrap` CLI.
|
|
79
|
+
|
|
80
|
+
## SSH Key Setup
|
|
81
|
+
|
|
82
|
+
The CLI expects an Ed25519 SSH public key at `~/.ssh/id_ed25519.pub` by default. If you don't have one, generate it:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
ssh-keygen -t ed25519
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Accept the default path (`~/.ssh/id_ed25519`) and optionally set a passphrase. The key pair will be imported into AWS automatically on first launch.
|
|
89
|
+
|
|
90
|
+
To use a different key, pass `--key-path`:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
aws-bootstrap launch --key-path ~/.ssh/my_other_key.pub
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Usage
|
|
97
|
+
|
|
98
|
+
### 🚀 Launching an Instance
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
# Show available commands
|
|
102
|
+
aws-bootstrap --help
|
|
103
|
+
|
|
104
|
+
# Dry run — validates AMI lookup, key import, and security group without launching
|
|
105
|
+
aws-bootstrap launch --dry-run
|
|
106
|
+
|
|
107
|
+
# Launch a spot g4dn.xlarge (default)
|
|
108
|
+
aws-bootstrap launch
|
|
109
|
+
|
|
110
|
+
# Launch on-demand in a specific region with a custom instance type
|
|
111
|
+
aws-bootstrap launch --on-demand --instance-type g5.xlarge --region us-east-1
|
|
112
|
+
|
|
113
|
+
# Launch without running the remote setup script
|
|
114
|
+
aws-bootstrap launch --no-setup
|
|
115
|
+
|
|
116
|
+
# Use a specific AWS profile
|
|
117
|
+
aws-bootstrap launch --profile my-aws-profile
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
After launch, the CLI:
|
|
121
|
+
|
|
122
|
+
1. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
|
|
123
|
+
2. **Runs remote setup** — installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
|
|
124
|
+
3. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
|
|
125
|
+
4. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
ssh aws-gpu1 # venv auto-activates on login
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### 🔧 What Remote Setup Does
|
|
132
|
+
|
|
133
|
+
The setup script runs automatically on the instance after SSH becomes available:
|
|
134
|
+
|
|
135
|
+
| Step | What |
|
|
136
|
+
|------|------|
|
|
137
|
+
| **GPU verify** | Confirms `nvidia-smi` and `nvcc` are working |
|
|
138
|
+
| **Utilities** | Installs `htop`, `tmux`, `tree`, `jq` |
|
|
139
|
+
| **Python venv** | Creates `~/venv` with `uv`, auto-activates in `~/.bashrc` |
|
|
140
|
+
| **CUDA-aware PyTorch** | Detects CUDA toolkit version → installs PyTorch from the matching `cu{TAG}` wheel index |
|
|
141
|
+
| **CUDA smoke test** | Runs `torch.cuda.is_available()` + GPU matmul to verify the stack |
|
|
142
|
+
| **GPU benchmark** | Copies `gpu_benchmark.py` to `~/gpu_benchmark.py` |
|
|
143
|
+
| **GPU smoke test notebook** | Copies `gpu_smoke_test.ipynb` to `~/gpu_smoke_test.ipynb` (open in JupyterLab) |
|
|
144
|
+
| **Jupyter** | Configures and starts JupyterLab as a systemd service on port 8888 |
|
|
145
|
+
| **SSH keepalive** | Configures server-side keepalive to prevent idle disconnects |
|
|
146
|
+
|
|
147
|
+
### 📊 GPU Benchmark
|
|
148
|
+
|
|
149
|
+
A GPU throughput benchmark is pre-installed at `~/gpu_benchmark.py` on every instance:
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
# Run both CNN and Transformer benchmarks (default)
|
|
153
|
+
ssh aws-gpu1 'python ~/gpu_benchmark.py'
|
|
154
|
+
|
|
155
|
+
# CNN only, quick run
|
|
156
|
+
ssh aws-gpu1 'python ~/gpu_benchmark.py --mode cnn --benchmark-batches 20'
|
|
157
|
+
|
|
158
|
+
# Transformer only with custom batch size
|
|
159
|
+
ssh aws-gpu1 'python ~/gpu_benchmark.py --mode transformer --transformer-batch-size 16'
|
|
160
|
+
|
|
161
|
+
# Run CUDA diagnostics first (tests FP16/FP32 matmul, autocast, etc.)
|
|
162
|
+
ssh aws-gpu1 'python ~/gpu_benchmark.py --diagnose'
|
|
163
|
+
|
|
164
|
+
# Force FP32 precision (if FP16 has issues on your GPU)
|
|
165
|
+
ssh aws-gpu1 'python ~/gpu_benchmark.py --precision fp32'
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
Reports: iterations/sec, samples/sec, peak GPU memory, and avg batch time for each model.
|
|
169
|
+
|
|
170
|
+
### 📓 Jupyter (via SSH Tunnel)
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
ssh -NL 8888:localhost:8888 aws-gpu1
|
|
174
|
+
# Then open: http://localhost:8888
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
Or with explicit key/IP:
|
|
178
|
+
```bash
|
|
179
|
+
ssh -i ~/.ssh/id_ed25519 -NL 8888:localhost:8888 ubuntu@<public-ip>
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
A **GPU smoke test notebook** (`~/gpu_smoke_test.ipynb`) is pre-installed on every instance. Open it in JupyterLab to interactively verify the CUDA stack, run FP32/FP16 matmuls, train a small CNN on MNIST, and visualise training loss and GPU memory usage.
|
|
183
|
+
|
|
184
|
+
### 📋 Listing Resources
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
# List all g4dn instance types (default)
|
|
188
|
+
aws-bootstrap list instance-types
|
|
189
|
+
|
|
190
|
+
# List a different instance family
|
|
191
|
+
aws-bootstrap list instance-types --prefix p3
|
|
192
|
+
|
|
193
|
+
# List Deep Learning AMIs (default filter)
|
|
194
|
+
aws-bootstrap list amis
|
|
195
|
+
|
|
196
|
+
# List AMIs with a custom filter
|
|
197
|
+
aws-bootstrap list amis --filter "ubuntu/images/hvm-ssd-gp3/ubuntu-noble*"
|
|
198
|
+
|
|
199
|
+
# Use a specific region
|
|
200
|
+
aws-bootstrap list instance-types --region us-east-1
|
|
201
|
+
aws-bootstrap list amis --region us-east-1
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
### 🖥️ Managing Instances
|
|
205
|
+
|
|
206
|
+
```bash
|
|
207
|
+
# Show all aws-bootstrap instances (including shutting-down)
|
|
208
|
+
aws-bootstrap status
|
|
209
|
+
|
|
210
|
+
# Include GPU info (CUDA toolkit + driver version, GPU name, architecture) via SSH
|
|
211
|
+
aws-bootstrap status --gpu
|
|
212
|
+
|
|
213
|
+
# List instances in a specific region
|
|
214
|
+
aws-bootstrap status --region us-east-1
|
|
215
|
+
|
|
216
|
+
# Terminate all aws-bootstrap instances (with confirmation prompt)
|
|
217
|
+
aws-bootstrap terminate
|
|
218
|
+
|
|
219
|
+
# Terminate specific instances
|
|
220
|
+
aws-bootstrap terminate i-abc123 i-def456
|
|
221
|
+
|
|
222
|
+
# Skip confirmation prompt
|
|
223
|
+
aws-bootstrap terminate --yes
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
`status --gpu` reports both the **installed CUDA toolkit** version (from `nvcc`) and the **maximum CUDA version supported by the driver** (from `nvidia-smi`), so you can see at a glance whether they match:
|
|
227
|
+
|
|
228
|
+
```
|
|
229
|
+
CUDA: 12.8 (driver supports up to 13.0)
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances.
|
|
233
|
+
|
|
234
|
+
## EC2 vCPU Quotas
|
|
235
|
+
|
|
236
|
+
AWS accounts have [service quotas](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-resource-limits.html) that limit how many vCPUs you can run per instance family. New or lightly-used accounts often have a **default quota of 0 vCPUs** for GPU instance families (G and VT), which will cause errors on launch:
|
|
237
|
+
|
|
238
|
+
- **Spot**: `MaxSpotInstanceCountExceeded`
|
|
239
|
+
- **On-Demand**: `VcpuLimitExceeded`
|
|
240
|
+
|
|
241
|
+
Check your current quotas (g4dn.xlarge requires at least 4 vCPUs):
|
|
242
|
+
|
|
243
|
+
```bash
|
|
244
|
+
# Spot G/VT quota
|
|
245
|
+
aws service-quotas get-service-quota \
|
|
246
|
+
--service-code ec2 \
|
|
247
|
+
--quota-code L-3819A6DF \
|
|
248
|
+
--region us-west-2
|
|
249
|
+
|
|
250
|
+
# On-Demand G/VT quota
|
|
251
|
+
aws service-quotas get-service-quota \
|
|
252
|
+
--service-code ec2 \
|
|
253
|
+
--quota-code L-DB2BBE81 \
|
|
254
|
+
--region us-west-2
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
Request increases:
|
|
258
|
+
|
|
259
|
+
```bash
|
|
260
|
+
# Spot — increase to 4 vCPUs
|
|
261
|
+
aws service-quotas request-service-quota-increase \
|
|
262
|
+
--service-code ec2 \
|
|
263
|
+
--quota-code L-3819A6DF \
|
|
264
|
+
--desired-value 4 \
|
|
265
|
+
--region us-west-2
|
|
266
|
+
|
|
267
|
+
# On-Demand — increase to 4 vCPUs
|
|
268
|
+
aws service-quotas request-service-quota-increase \
|
|
269
|
+
--service-code ec2 \
|
|
270
|
+
--quota-code L-DB2BBE81 \
|
|
271
|
+
--desired-value 4 \
|
|
272
|
+
--region us-west-2
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
Quota codes may vary by region or account type. To list the actual codes in your region:
|
|
276
|
+
|
|
277
|
+
```bash
|
|
278
|
+
# List all G/VT-related quotas
|
|
279
|
+
aws service-quotas list-service-quotas \
|
|
280
|
+
--service-code ec2 \
|
|
281
|
+
--region us-west-2 \
|
|
282
|
+
--query "Quotas[?contains(QuotaName, 'G and VT')].[QuotaCode,QuotaName,Value]" \
|
|
283
|
+
--output table
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
Common quota codes:
|
|
287
|
+
- `L-3819A6DF` — All G and VT **Spot** Instance Requests
|
|
288
|
+
- `L-DB2BBE81` — Running **On-Demand** G and VT instances
|
|
289
|
+
|
|
290
|
+
Small increases (4-8 vCPUs) are typically auto-approved within minutes. You can also request increases via the [Service Quotas console](https://console.aws.amazon.com/servicequotas/home). While waiting, you can test the full launch/poll/SSH flow with a non-GPU instance type:
|
|
291
|
+
|
|
292
|
+
```bash
|
|
293
|
+
aws-bootstrap launch --instance-type t3.medium --ami-filter "ubuntu/images/hvm-ssd-gp3/ubuntu-noble-24.04-amd64-server-*"
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
## Additional Resources
|
|
297
|
+
|
|
298
|
+
| Topic | Link |
|
|
299
|
+
|-------|------|
|
|
300
|
+
| GPU instance pricing | [instances.vantage.sh](https://instances.vantage.sh/aws/ec2/g4dn.xlarge) |
|
|
301
|
+
| Spot instance quotas | [AWS docs](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-spot-limits.html) |
|
|
302
|
+
| Deep Learning AMIs | [AWS docs](https://docs.aws.amazon.com/dlami/latest/devguide/what-is-dlami.html) |
|
|
303
|
+
| Nvidia Nsight remote debugging | [Nvidia docs](https://docs.nvidia.com/nsight-visual-studio-edition/3.2/Content/Setup_Remote_Debugging.htm) |
|
|
304
|
+
|
|
305
|
+
Tutorials on setting up a CUDA environment on EC2 GPU instances:
|
|
306
|
+
|
|
307
|
+
- [Provision an EC2 GPU Host on AWS](https://www.dolthub.com/blog/2025-03-12-provision-an-ec2-gpu-host-on-aws/) (DoltHub, 2025)
|
|
308
|
+
- [AWS EC2 Setup for GPU/CUDA Programming](https://techfortalk.co.uk/2025/10/11/aws-ec2-setup-for-gpu-cuda-programming/) (TechForTalk, 2025)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
aws_bootstrap/__init__.py,sha256=kl_jvrunGyIyizdRqAP6ROb5P1BBrXX5PTq5gq1ipU0,82
|
|
2
|
+
aws_bootstrap/cli.py,sha256=UVMCBOfTm1q7RxVDot5d3a8SSjLj9zevbvGgWamQDQQ,16134
|
|
3
|
+
aws_bootstrap/config.py,sha256=bOADtpujEacED0pu9m7D781UFlMhZrmtHQ7eqI6ySjk,834
|
|
4
|
+
aws_bootstrap/ec2.py,sha256=-yEyGMCycY4ccsmbgqHnLK2FRFWX2kr7nLfYWXSKeaY,12242
|
|
5
|
+
aws_bootstrap/ssh.py,sha256=-8F0PAkl7CCY1b9n46ZhWJ6faIMlSvA26BleeIp-rMA,17533
|
|
6
|
+
aws_bootstrap/resources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
aws_bootstrap/resources/gpu_benchmark.py,sha256=2uoss2bZGhg7c3D7Hg1-EJlOVDtzAH4co1ahSvF_lVU,29080
|
|
8
|
+
aws_bootstrap/resources/gpu_smoke_test.ipynb,sha256=XvAOEIPa5H9ri5mRZqOdknmwOwKNvCME6DzBGuhRYfg,10698
|
|
9
|
+
aws_bootstrap/resources/remote_setup.sh,sha256=FzpXEw-LvlXROi-PmO72yEyDWWi-3Tul6D7-vFDubXQ,5460
|
|
10
|
+
aws_bootstrap/resources/requirements.txt,sha256=gpYl1MFCfWXiAhbIUgAjuTHONz3MKci25msIyOkMmUk,75
|
|
11
|
+
aws_bootstrap/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
+
aws_bootstrap/tests/test_cli.py,sha256=4lvv1YqiaJ9_KnWqbDVWh2aH03by0liYvz6tx-hLj7k,18923
|
|
13
|
+
aws_bootstrap/tests/test_config.py,sha256=arvET6KNl4Vqsz0zFrSdhciXGU688bfsvCr3dSpziN0,1050
|
|
14
|
+
aws_bootstrap/tests/test_ec2.py,sha256=Jmqsjv973hxXbZWfGgECtm6aa2156Lzji227sYMBuMg,10547
|
|
15
|
+
aws_bootstrap/tests/test_ssh_config.py,sha256=Rt3e7B22d8kK0PzFgXB4gwpF4HvIseiqzcpouCwMo5M,10333
|
|
16
|
+
aws_bootstrap/tests/test_ssh_gpu.py,sha256=W6GQzILCy_qPrvWQlCC8Ris-vuTzTGiyNXEyMzwD1kM,5154
|
|
17
|
+
aws_bootstrap_g4dn-0.1.0.dist-info/licenses/LICENSE,sha256=Hen77Mt8sazSQJ9DgrmZuAvDwo2vc5JAkR_avuFV-CM,1067
|
|
18
|
+
aws_bootstrap_g4dn-0.1.0.dist-info/METADATA,sha256=FT4d1jH8ANZ_Kfc_5VXt6jMEFIHsIJ2PzeyU3UnXHA0,11596
|
|
19
|
+
aws_bootstrap_g4dn-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
20
|
+
aws_bootstrap_g4dn-0.1.0.dist-info/entry_points.txt,sha256=T8FXfOgmLEvFi8DHaFJ3tCzId9J3_d2Y6qT98OXxCjA,57
|
|
21
|
+
aws_bootstrap_g4dn-0.1.0.dist-info/top_level.txt,sha256=mix9gZRs8JUv0OMSB_rwdGcRnTKzsKgHrE5fyAn5zJw,14
|
|
22
|
+
aws_bootstrap_g4dn-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 PromptRomp
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
aws_bootstrap
|