aws-bootstrap-g4dn 0.5.0__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/CLAUDE.md +36 -5
  2. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/PKG-INFO +53 -7
  3. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/README.md +52 -6
  4. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/cli.py +172 -8
  5. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/config.py +2 -0
  6. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/ec2.py +128 -0
  7. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/remote_setup.sh +2 -2
  8. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/ssh.py +121 -0
  9. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_cli.py +372 -4
  10. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_config.py +18 -0
  11. aws_bootstrap_g4dn-0.6.0/aws_bootstrap/tests/test_ebs.py +245 -0
  12. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_ssh_config.py +76 -0
  13. aws_bootstrap_g4dn-0.6.0/aws_bootstrap/tests/test_ssh_ebs.py +76 -0
  14. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/PKG-INFO +53 -7
  15. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/SOURCES.txt +2 -0
  16. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  17. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  18. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/.github/workflows/ci.yml +0 -0
  19. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/.github/workflows/publish-to-pypi.yml +0 -0
  20. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/.gitignore +0 -0
  21. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/.pre-commit-config.yaml +0 -0
  22. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/CODE_OF_CONDUCT.md +0 -0
  23. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/CONTRIBUTING.md +0 -0
  24. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/LICENSE +0 -0
  25. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/SECURITY.md +0 -0
  26. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/__init__.py +0 -0
  27. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/gpu.py +0 -0
  28. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/__init__.py +0 -0
  29. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/gpu_benchmark.py +0 -0
  30. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/gpu_smoke_test.ipynb +0 -0
  31. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/launch.json +0 -0
  32. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/requirements.txt +0 -0
  33. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/saxpy.cu +0 -0
  34. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/tasks.json +0 -0
  35. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/__init__.py +0 -0
  36. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_ec2.py +0 -0
  37. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_gpu.py +0 -0
  38. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_ssh_gpu.py +0 -0
  39. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/dependency_links.txt +0 -0
  40. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/entry_points.txt +0 -0
  41. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/requires.txt +0 -0
  42. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/top_level.txt +0 -0
  43. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/docs/nsight-remote-profiling.md +0 -0
  44. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/pyproject.toml +0 -0
  45. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/setup.cfg +0 -0
  46. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/uv.lock +0 -0
@@ -32,9 +32,9 @@ aws_bootstrap/
32
32
  __init__.py # Package init
33
33
  cli.py # Click CLI entry point (launch, status, terminate commands)
34
34
  config.py # LaunchConfig dataclass with defaults
35
- ec2.py # AMI lookup, security group, instance launch/find/terminate, polling, spot pricing
35
+ ec2.py # AMI lookup, security group, instance launch/find/terminate, polling, spot pricing, EBS volume ops
36
36
  gpu.py # GPU architecture mapping and GpuInfo dataclass
37
- ssh.py # SSH key pair import, SSH readiness check, remote setup, ~/.ssh/config management, GPU queries
37
+ ssh.py # SSH key pair import, SSH readiness check, remote setup, ~/.ssh/config management, GPU queries, EBS mount
38
38
  resources/ # Non-Python artifacts SCP'd to remote instances
39
39
  __init__.py
40
40
  gpu_benchmark.py # GPU throughput benchmark (CNN + Transformer), copied to ~/gpu_benchmark.py on instance
@@ -51,6 +51,8 @@ aws_bootstrap/
51
51
  test_gpu.py
52
52
  test_ssh_config.py
53
53
  test_ssh_gpu.py
54
+ test_ebs.py
55
+ test_ssh_ebs.py
54
56
  docs/
55
57
  nsight-remote-profiling.md # Nsight Compute, Nsight Systems, and Nsight VSCE remote profiling guide
56
58
  spot-request-lifecycle.md # Research notes on spot request cleanup
@@ -60,9 +62,10 @@ Entry point: `aws-bootstrap = "aws_bootstrap.cli:main"` (installed via `uv sync`
60
62
 
61
63
  ## CLI Commands
62
64
 
63
- - **`launch`** — provisions an EC2 instance (spot by default, falls back to on-demand on capacity errors); adds SSH config alias (e.g. `aws-gpu1`) to `~/.ssh/config`; `--python-version` controls which Python `uv` installs in the remote venv; `--ssh-port` overrides the default SSH port (22) for security group ingress, connection checks, and SSH config
64
- - **`status`** — lists all non-terminated instances (including `shutting-down`) with type, IP, SSH alias, pricing (spot price/hr or on-demand), uptime, and estimated cost for running spot instances; `--gpu` flag queries GPU info via SSH, reporting both CUDA toolkit version (from `nvcc`) and driver-supported max (from `nvidia-smi`); `--instructions` (default: on) prints connection commands (SSH, Jupyter tunnel, VSCode Remote SSH, GPU benchmark) for each running instance; suppress with `--no-instructions`
65
- - **`terminate`** — terminates instances by ID or SSH alias (e.g. `aws-gpu1`, resolved via `~/.ssh/config`), or all aws-bootstrap instances in the region if no arguments given; removes SSH config aliases
65
+ - **`launch`** — provisions an EC2 instance (spot by default, falls back to on-demand on capacity errors); adds SSH config alias (e.g. `aws-gpu1`) to `~/.ssh/config`; `--python-version` controls which Python `uv` installs in the remote venv; `--ssh-port` overrides the default SSH port (22) for security group ingress, connection checks, and SSH config; `--ebs-storage SIZE` creates and attaches a new gp3 EBS data volume (mounted at `/data`); `--ebs-volume-id ID` attaches an existing EBS volume (mutually exclusive with `--ebs-storage`)
66
+ - **`status`** — lists all non-terminated instances (including `shutting-down`) with type, IP, SSH alias, EBS data volumes, pricing (spot price/hr or on-demand), uptime, and estimated cost for running spot instances; `--gpu` flag queries GPU info via SSH, reporting both CUDA toolkit version (from `nvcc`) and driver-supported max (from `nvidia-smi`); `--instructions` (default: on) prints connection commands (SSH, Jupyter tunnel, VSCode Remote SSH, GPU benchmark) for each running instance; suppress with `--no-instructions`
67
+ - **`terminate`** — terminates instances by ID or SSH alias (e.g. `aws-gpu1`, resolved via `~/.ssh/config`), or all aws-bootstrap instances in the region if no arguments given; removes SSH config aliases; deletes associated EBS data volumes by default; `--keep-ebs` preserves volumes and prints reattach commands
68
+ - **`cleanup`** — removes stale `~/.ssh/config` entries for terminated/non-existent instances; compares managed SSH config blocks against live EC2 instances; `--dry-run` previews removals without modifying config; `--yes` skips the confirmation prompt
66
69
  - **`list instance-types`** — lists EC2 instance types matching a family prefix (default: `g4dn`), showing vCPUs, memory, and GPU info
67
70
  - **`list amis`** — lists available AMIs matching a name pattern (default: Deep Learning Base OSS Nvidia Driver GPU AMIs), sorted newest-first
68
71
 
@@ -112,6 +115,34 @@ The `KNOWN_CUDA_TAGS` array in `remote_setup.sh` lists the CUDA wheel tags publi
112
115
 
113
116
  `resources/gpu_benchmark.py` is uploaded to `~/gpu_benchmark.py` on the remote instance during setup. It benchmarks GPU throughput with two modes: CNN on MNIST and a GPT-style Transformer on synthetic data. It reports samples/sec, batch times, and peak GPU memory. Supports `--precision` (fp32/fp16/bf16/tf32), `--diagnose` for CUDA smoke tests, and separate `--transformer-batch-size` (default 32, T4-safe). Dependencies (`torch`, `torchvision`, `tqdm`) are already installed by the setup script.
114
117
 
118
+ ## EBS Data Volumes
119
+
120
+ The `--ebs-storage` and `--ebs-volume-id` options on `launch` create or attach persistent gp3 EBS volumes mounted at `/data`. The implementation spans three modules:
121
+
122
+ - **`ec2.py`** — Volume lifecycle: `create_ebs_volume`, `validate_ebs_volume`, `attach_ebs_volume`, `detach_ebs_volume`, `delete_ebs_volume`, `find_ebs_volumes_for_instance`. Constants `EBS_DEVICE_NAME` (`/dev/sdf`) and `EBS_MOUNT_POINT` (`/data`).
123
+ - **`ssh.py`** — `mount_ebs_volume()` SSHs to the instance and runs a shell script that detects the device, optionally formats it, mounts it, and adds an fstab entry.
124
+ - **`cli.py`** — Orchestrates the flow: create/validate → attach → wait for SSH → mount. Mount failures are non-fatal (warn and continue).
125
+
126
+ ### Tagging strategy
127
+
128
+ Volumes are tagged for discovery by `status` and `terminate`:
129
+
130
+ | Tag | Value | Purpose |
131
+ |-----|-------|---------|
132
+ | `created-by` | `aws-bootstrap-g4dn` | Standard tool-managed resource tag |
133
+ | `Name` | `aws-bootstrap-data-{instance_id}` | Human-readable in AWS console |
134
+ | `aws-bootstrap-instance` | `i-xxxxxxxxx` | Links volume to instance for `find_ebs_volumes_for_instance` |
135
+
136
+ ### NVMe device detection
137
+
138
+ On Nitro instances (g4dn), `/dev/sdf` is remapped to `/dev/nvmeXn1`. The mount script detects the correct device by matching the volume ID serial number via `lsblk -o NAME,SERIAL -dpn`, with fallbacks to `/dev/nvme1n1`, `/dev/xvdf`, `/dev/sdf`.
139
+
140
+ ### Spot interruption and terminate cleanup
141
+
142
+ Non-root EBS volumes attached via API have `DeleteOnTermination=False` by default. This means data volumes **survive spot interruptions** — when AWS reclaims the instance, the volume detaches and becomes `available`, preserving all data. The user can reattach it to a new instance with `--ebs-volume-id`.
143
+
144
+ The `terminate` command discovers volumes via `find_ebs_volumes_for_instance`, waits for them to detach (become `available`), then deletes them. `--keep-ebs` skips deletion and prints the volume ID with a reattach command.
145
+
115
146
  ## Versioning & Publishing
116
147
 
117
148
  Version is derived automatically from git tags via **setuptools-scm** — no hardcoded version string in the codebase.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: aws-bootstrap-g4dn
3
- Version: 0.5.0
3
+ Version: 0.6.0
4
4
  Summary: Bootstrap AWS EC2 GPU instances for hybrid local-remote development
5
5
  Author: Adam Ever-Hadani
6
6
  License-Expression: MIT
@@ -44,7 +44,8 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
44
44
  | 📊 | **GPU benchmark included** | CNN (MNIST) + Transformer benchmarks with FP16/FP32/BF16 precision and tqdm progress |
45
45
  | 📓 | **Jupyter ready** | Lab server auto-starts as a systemd service on port 8888 — just SSH tunnel and open |
46
46
  | 🖥️ | **`status --gpu`** | Shows CUDA toolkit version, driver max, GPU architecture, spot pricing, uptime, and estimated cost |
47
- | 🗑️ | **Clean terminate** | Stops instances, removes SSH aliases, shows shutting-down state until fully gone |
47
+ | 💾 | **EBS data volumes** | Attach persistent storage at `/data` — survives spot interruptions and termination; reattach to new instances |
48
+ | 🗑️ | **Clean terminate** | Stops instances, removes SSH aliases, cleans up EBS volumes (or preserves with `--keep-ebs`) |
48
49
 
49
50
  ### 🎯 Target Workflows
50
51
 
@@ -132,16 +133,24 @@ aws-bootstrap launch --python-version 3.13
132
133
  # Use a non-default SSH port
133
134
  aws-bootstrap launch --ssh-port 2222
134
135
 
136
+ # Attach a persistent EBS data volume (96 GB gp3, mounted at /data)
137
+ aws-bootstrap launch --ebs-storage 96
138
+
139
+ # Reattach an existing EBS volume from a previous instance
140
+ aws-bootstrap launch --ebs-volume-id vol-0abc123def456
141
+
135
142
  # Use a specific AWS profile
136
143
  aws-bootstrap launch --profile my-aws-profile
137
144
  ```
138
145
 
139
146
  After launch, the CLI:
140
147
 
141
- 1. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
142
- 2. **Runs remote setup** installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
143
- 3. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
144
- 4. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate
148
+ 1. **Creates/attaches EBS volume** (if `--ebs-storage` or `--ebs-volume-id` was specified)
149
+ 2. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
150
+ 3. **Runs remote setup** — installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
151
+ 4. **Mounts EBS volume** at `/data` (if applicable — formats new volumes, mounts existing ones as-is)
152
+ 5. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
153
+ 6. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate
145
154
 
146
155
  ```bash
147
156
  ssh aws-gpu1 # venv auto-activates on login
@@ -154,7 +163,7 @@ The setup script runs automatically on the instance after SSH becomes available:
154
163
  | Step | What |
155
164
  |------|------|
156
165
  | **GPU verify** | Confirms `nvidia-smi` and `nvcc` are working |
157
- | **Utilities** | Installs `htop`, `tmux`, `tree`, `jq` |
166
+ | **Utilities** | Installs `htop`, `tmux`, `tree`, `jq`, `ffmpeg` |
158
167
  | **Python venv** | Creates `~/venv` with `uv`, auto-activates in `~/.bashrc`. Use `--python-version` to pin a specific Python (e.g. `3.13`) |
159
168
  | **CUDA-aware PyTorch** | Detects CUDA toolkit version → installs PyTorch from the matching `cu{TAG}` wheel index |
160
169
  | **CUDA smoke test** | Runs `torch.cuda.is_available()` + GPU matmul to verify the stack |
@@ -261,6 +270,9 @@ aws-bootstrap status --region us-east-1
261
270
  # Terminate all aws-bootstrap instances (with confirmation prompt)
262
271
  aws-bootstrap terminate
263
272
 
273
+ # Terminate but preserve EBS data volumes for reuse
274
+ aws-bootstrap terminate --keep-ebs
275
+
264
276
  # Terminate by SSH alias (resolved via ~/.ssh/config)
265
277
  aws-bootstrap terminate aws-gpu1
266
278
 
@@ -272,6 +284,15 @@ aws-bootstrap terminate aws-gpu1 i-def456
272
284
 
273
285
  # Skip confirmation prompt
274
286
  aws-bootstrap terminate --yes
287
+
288
+ # Remove stale SSH config entries for terminated instances
289
+ aws-bootstrap cleanup
290
+
291
+ # Preview what would be removed without modifying config
292
+ aws-bootstrap cleanup --dry-run
293
+
294
+ # Skip confirmation prompt
295
+ aws-bootstrap cleanup --yes
275
296
  ```
276
297
 
277
298
  `status --gpu` reports both the **installed CUDA toolkit** version (from `nvcc`) and the **maximum CUDA version supported by the driver** (from `nvidia-smi`), so you can see at a glance whether they match:
@@ -282,6 +303,31 @@ CUDA: 12.8 (driver supports up to 13.0)
282
303
 
283
304
  SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances. You can use aliases anywhere you'd use an instance ID, e.g. `aws-bootstrap terminate aws-gpu1`.
284
305
 
306
+ ## EBS Data Volumes
307
+
308
+ Attach persistent EBS storage to keep datasets and model checkpoints across instance lifecycles. Volumes are mounted at `/data` and persist independently of the instance.
309
+
310
+ ```bash
311
+ # Create a new 96 GB gp3 volume, formatted and mounted at /data
312
+ aws-bootstrap launch --ebs-storage 96
313
+
314
+ # After terminating with --keep-ebs, reattach the same volume to a new instance
315
+ aws-bootstrap terminate --keep-ebs
316
+ # Output: Preserving EBS volume: vol-0abc123...
317
+ # Reattach with: aws-bootstrap launch --ebs-volume-id vol-0abc123...
318
+
319
+ aws-bootstrap launch --ebs-volume-id vol-0abc123def456
320
+ ```
321
+
322
+ Key behaviors:
323
+ - `--ebs-storage` and `--ebs-volume-id` are mutually exclusive
324
+ - New volumes are formatted as ext4; existing volumes are mounted as-is
325
+ - Volumes are tagged for automatic discovery by `status` and `terminate`
326
+ - `terminate` deletes data volumes by default; use `--keep-ebs` to preserve them
327
+ - **Spot-safe** — data volumes survive spot interruptions. If AWS reclaims your instance, the volume detaches automatically and can be reattached to a new instance with `--ebs-volume-id`
328
+ - EBS volumes must be in the same availability zone as the instance
329
+ - Mount failures are non-fatal — the instance remains usable
330
+
285
331
  ## EC2 vCPU Quotas
286
332
 
287
333
  AWS accounts have [service quotas](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-resource-limits.html) that limit how many vCPUs you can run per instance family. New or lightly-used accounts often have a **default quota of 0 vCPUs** for GPU instance families (G and VT), which will cause errors on launch:
@@ -25,7 +25,8 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
25
25
  | 📊 | **GPU benchmark included** | CNN (MNIST) + Transformer benchmarks with FP16/FP32/BF16 precision and tqdm progress |
26
26
  | 📓 | **Jupyter ready** | Lab server auto-starts as a systemd service on port 8888 — just SSH tunnel and open |
27
27
  | 🖥️ | **`status --gpu`** | Shows CUDA toolkit version, driver max, GPU architecture, spot pricing, uptime, and estimated cost |
28
- | 🗑️ | **Clean terminate** | Stops instances, removes SSH aliases, shows shutting-down state until fully gone |
28
+ | 💾 | **EBS data volumes** | Attach persistent storage at `/data` — survives spot interruptions and termination; reattach to new instances |
29
+ | 🗑️ | **Clean terminate** | Stops instances, removes SSH aliases, cleans up EBS volumes (or preserves with `--keep-ebs`) |
29
30
 
30
31
  ### 🎯 Target Workflows
31
32
 
@@ -113,16 +114,24 @@ aws-bootstrap launch --python-version 3.13
113
114
  # Use a non-default SSH port
114
115
  aws-bootstrap launch --ssh-port 2222
115
116
 
117
+ # Attach a persistent EBS data volume (96 GB gp3, mounted at /data)
118
+ aws-bootstrap launch --ebs-storage 96
119
+
120
+ # Reattach an existing EBS volume from a previous instance
121
+ aws-bootstrap launch --ebs-volume-id vol-0abc123def456
122
+
116
123
  # Use a specific AWS profile
117
124
  aws-bootstrap launch --profile my-aws-profile
118
125
  ```
119
126
 
120
127
  After launch, the CLI:
121
128
 
122
- 1. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
123
- 2. **Runs remote setup** installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
124
- 3. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
125
- 4. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate
129
+ 1. **Creates/attaches EBS volume** (if `--ebs-storage` or `--ebs-volume-id` was specified)
130
+ 2. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
131
+ 3. **Runs remote setup** — installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
132
+ 4. **Mounts EBS volume** at `/data` (if applicable — formats new volumes, mounts existing ones as-is)
133
+ 5. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
134
+ 6. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate
126
135
 
127
136
  ```bash
128
137
  ssh aws-gpu1 # venv auto-activates on login
@@ -135,7 +144,7 @@ The setup script runs automatically on the instance after SSH becomes available:
135
144
  | Step | What |
136
145
  |------|------|
137
146
  | **GPU verify** | Confirms `nvidia-smi` and `nvcc` are working |
138
- | **Utilities** | Installs `htop`, `tmux`, `tree`, `jq` |
147
+ | **Utilities** | Installs `htop`, `tmux`, `tree`, `jq`, `ffmpeg` |
139
148
  | **Python venv** | Creates `~/venv` with `uv`, auto-activates in `~/.bashrc`. Use `--python-version` to pin a specific Python (e.g. `3.13`) |
140
149
  | **CUDA-aware PyTorch** | Detects CUDA toolkit version → installs PyTorch from the matching `cu{TAG}` wheel index |
141
150
  | **CUDA smoke test** | Runs `torch.cuda.is_available()` + GPU matmul to verify the stack |
@@ -242,6 +251,9 @@ aws-bootstrap status --region us-east-1
242
251
  # Terminate all aws-bootstrap instances (with confirmation prompt)
243
252
  aws-bootstrap terminate
244
253
 
254
+ # Terminate but preserve EBS data volumes for reuse
255
+ aws-bootstrap terminate --keep-ebs
256
+
245
257
  # Terminate by SSH alias (resolved via ~/.ssh/config)
246
258
  aws-bootstrap terminate aws-gpu1
247
259
 
@@ -253,6 +265,15 @@ aws-bootstrap terminate aws-gpu1 i-def456
253
265
 
254
266
  # Skip confirmation prompt
255
267
  aws-bootstrap terminate --yes
268
+
269
+ # Remove stale SSH config entries for terminated instances
270
+ aws-bootstrap cleanup
271
+
272
+ # Preview what would be removed without modifying config
273
+ aws-bootstrap cleanup --dry-run
274
+
275
+ # Skip confirmation prompt
276
+ aws-bootstrap cleanup --yes
256
277
  ```
257
278
 
258
279
  `status --gpu` reports both the **installed CUDA toolkit** version (from `nvcc`) and the **maximum CUDA version supported by the driver** (from `nvidia-smi`), so you can see at a glance whether they match:
@@ -263,6 +284,31 @@ CUDA: 12.8 (driver supports up to 13.0)
263
284
 
264
285
  SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances. You can use aliases anywhere you'd use an instance ID, e.g. `aws-bootstrap terminate aws-gpu1`.
265
286
 
287
+ ## EBS Data Volumes
288
+
289
+ Attach persistent EBS storage to keep datasets and model checkpoints across instance lifecycles. Volumes are mounted at `/data` and persist independently of the instance.
290
+
291
+ ```bash
292
+ # Create a new 96 GB gp3 volume, formatted and mounted at /data
293
+ aws-bootstrap launch --ebs-storage 96
294
+
295
+ # After terminating with --keep-ebs, reattach the same volume to a new instance
296
+ aws-bootstrap terminate --keep-ebs
297
+ # Output: Preserving EBS volume: vol-0abc123...
298
+ # Reattach with: aws-bootstrap launch --ebs-volume-id vol-0abc123...
299
+
300
+ aws-bootstrap launch --ebs-volume-id vol-0abc123def456
301
+ ```
302
+
303
+ Key behaviors:
304
+ - `--ebs-storage` and `--ebs-volume-id` are mutually exclusive
305
+ - New volumes are formatted as ext4; existing volumes are mounted as-is
306
+ - Volumes are tagged for automatic discovery by `status` and `terminate`
307
+ - `terminate` deletes data volumes by default; use `--keep-ebs` to preserve them
308
+ - **Spot-safe** — data volumes survive spot interruptions. If AWS reclaims your instance, the volume detaches automatically and can be reattached to a new instance with `--ebs-volume-id`
309
+ - EBS volumes must be in the same availability zone as the instance
310
+ - Mount failures are non-fatal — the instance remains usable
311
+
266
312
  ## EC2 vCPU Quotas
267
313
 
268
314
  AWS accounts have [service quotas](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-resource-limits.html) that limit how many vCPUs you can run per instance family. New or lightly-used accounts often have a **default quota of 0 vCPUs** for GPU instance families (G and VT), which will cause errors on launch:
@@ -10,8 +10,13 @@ import click
10
10
 
11
11
  from .config import LaunchConfig
12
12
  from .ec2 import (
13
+ EBS_MOUNT_POINT,
13
14
  CLIError,
15
+ attach_ebs_volume,
16
+ create_ebs_volume,
17
+ delete_ebs_volume,
14
18
  ensure_security_group,
19
+ find_ebs_volumes_for_instance,
15
20
  find_tagged_instances,
16
21
  get_latest_ami,
17
22
  get_spot_price,
@@ -19,13 +24,17 @@ from .ec2 import (
19
24
  list_amis,
20
25
  list_instance_types,
21
26
  terminate_tagged_instances,
27
+ validate_ebs_volume,
22
28
  wait_instance_ready,
23
29
  )
24
30
  from .ssh import (
25
31
  add_ssh_host,
32
+ cleanup_stale_ssh_hosts,
33
+ find_stale_ssh_hosts,
26
34
  get_ssh_host_details,
27
35
  import_key_pair,
28
36
  list_ssh_hosts,
37
+ mount_ebs_volume,
29
38
  private_key_path,
30
39
  query_gpu_info,
31
40
  remove_ssh_host,
@@ -120,6 +129,18 @@ def main():
120
129
  help="Python version for the remote venv (e.g. 3.13, 3.14.2). Passed to uv during setup.",
121
130
  )
122
131
  @click.option("--ssh-port", default=22, show_default=True, type=int, help="SSH port on the remote instance.")
132
+ @click.option(
133
+ "--ebs-storage",
134
+ default=None,
135
+ type=int,
136
+ help="Create and attach a new EBS data volume (size in GB, gp3). Mounted at /data.",
137
+ )
138
+ @click.option(
139
+ "--ebs-volume-id",
140
+ default=None,
141
+ type=str,
142
+ help="Attach an existing EBS volume by ID (e.g. vol-0abc123). Mounted at /data.",
143
+ )
123
144
  def launch(
124
145
  instance_type,
125
146
  ami_filter,
@@ -134,8 +155,13 @@ def launch(
134
155
  profile,
135
156
  python_version,
136
157
  ssh_port,
158
+ ebs_storage,
159
+ ebs_volume_id,
137
160
  ):
138
161
  """Launch a GPU-accelerated EC2 instance."""
162
+ if ebs_storage is not None and ebs_volume_id is not None:
163
+ raise CLIError("--ebs-storage and --ebs-volume-id are mutually exclusive.")
164
+
139
165
  config = LaunchConfig(
140
166
  instance_type=instance_type,
141
167
  spot=spot,
@@ -148,6 +174,8 @@ def launch(
148
174
  dry_run=dry_run,
149
175
  ssh_port=ssh_port,
150
176
  python_version=python_version,
177
+ ebs_storage=ebs_storage,
178
+ ebs_volume_id=ebs_volume_id,
151
179
  )
152
180
  if ami_filter:
153
181
  config.ami_filter = ami_filter
@@ -162,18 +190,21 @@ def launch(
162
190
  session = boto3.Session(profile_name=config.profile, region_name=config.region)
163
191
  ec2 = session.client("ec2")
164
192
 
193
+ has_ebs = config.ebs_storage is not None or config.ebs_volume_id is not None
194
+ total_steps = 7 if has_ebs else 6
195
+
165
196
  # Step 1: AMI lookup
166
- step(1, 6, "Looking up AMI...")
197
+ step(1, total_steps, "Looking up AMI...")
167
198
  ami = get_latest_ami(ec2, config.ami_filter)
168
199
  info(f"Found: {ami['Name']}")
169
200
  val("AMI ID", ami["ImageId"])
170
201
 
171
202
  # Step 2: SSH key pair
172
- step(2, 6, "Importing SSH key pair...")
203
+ step(2, total_steps, "Importing SSH key pair...")
173
204
  import_key_pair(ec2, config.key_name, config.key_path)
174
205
 
175
206
  # Step 3: Security group
176
- step(3, 6, "Ensuring security group...")
207
+ step(3, total_steps, "Ensuring security group...")
177
208
  sg_id = ensure_security_group(ec2, config.security_group, config.tag_value, ssh_port=config.ssh_port)
178
209
 
179
210
  pricing = "spot" if config.spot else "on-demand"
@@ -193,18 +224,22 @@ def launch(
193
224
  val("SSH port", str(config.ssh_port))
194
225
  if config.python_version:
195
226
  val("Python version", config.python_version)
227
+ if config.ebs_storage:
228
+ val("EBS data volume", f"{config.ebs_storage} GB gp3 (new, mounted at {EBS_MOUNT_POINT})")
229
+ if config.ebs_volume_id:
230
+ val("EBS data volume", f"{config.ebs_volume_id} (existing, mounted at {EBS_MOUNT_POINT})")
196
231
  click.echo()
197
232
  click.secho("No resources launched (dry-run mode).", fg="yellow")
198
233
  return
199
234
 
200
235
  # Step 4: Launch instance
201
- step(4, 6, f"Launching {config.instance_type} instance ({pricing})...")
236
+ step(4, total_steps, f"Launching {config.instance_type} instance ({pricing})...")
202
237
  instance = launch_instance(ec2, config, ami["ImageId"], sg_id)
203
238
  instance_id = instance["InstanceId"]
204
239
  val("Instance ID", instance_id)
205
240
 
206
241
  # Step 5: Wait for ready
207
- step(5, 6, "Waiting for instance to be ready...")
242
+ step(5, total_steps, "Waiting for instance to be ready...")
208
243
  instance = wait_instance_ready(ec2, instance_id)
209
244
  public_ip = instance.get("PublicIpAddress")
210
245
  if not public_ip:
@@ -213,9 +248,39 @@ def launch(
213
248
  return
214
249
 
215
250
  val("Public IP", public_ip)
251
+ az = instance["Placement"]["AvailabilityZone"]
252
+
253
+ # Step 5.5 (optional): EBS data volume
254
+ ebs_volume_attached = None
255
+ ebs_format = False
256
+ if has_ebs:
257
+ step(6, total_steps, "Setting up EBS data volume...")
258
+ if config.ebs_storage:
259
+ info(f"Creating {config.ebs_storage} GB gp3 volume in {az}...")
260
+ ebs_volume_attached = create_ebs_volume(ec2, config.ebs_storage, az, config.tag_value, instance_id)
261
+ val("Volume ID", ebs_volume_attached)
262
+ ebs_format = True
263
+ elif config.ebs_volume_id:
264
+ info(f"Validating volume {config.ebs_volume_id}...")
265
+ validate_ebs_volume(ec2, config.ebs_volume_id, az)
266
+ ebs_volume_attached = config.ebs_volume_id
267
+ # Tag the existing volume for discovery
268
+ ec2.create_tags(
269
+ Resources=[ebs_volume_attached],
270
+ Tags=[
271
+ {"Key": "aws-bootstrap-instance", "Value": instance_id},
272
+ {"Key": "created-by", "Value": config.tag_value},
273
+ ],
274
+ )
275
+ ebs_format = False
216
276
 
217
- # Step 6: SSH and remote setup
218
- step(6, 6, "Waiting for SSH access...")
277
+ info(f"Attaching {ebs_volume_attached} to {instance_id}...")
278
+ attach_ebs_volume(ec2, ebs_volume_attached, instance_id)
279
+ success("EBS volume attached.")
280
+
281
+ # SSH and remote setup step
282
+ ssh_step = 7 if has_ebs else 6
283
+ step(ssh_step, total_steps, "Waiting for SSH access...")
219
284
  private_key = private_key_path(config.key_path)
220
285
  if not wait_for_ssh(public_ip, config.ssh_user, config.key_path, port=config.ssh_port):
221
286
  warn("SSH did not become available within the timeout.")
@@ -238,6 +303,22 @@ def launch(
238
303
  else:
239
304
  warn("Remote setup failed. Instance is still running.")
240
305
 
306
+ # Mount EBS volume via SSH (after setup so the instance is fully ready)
307
+ if ebs_volume_attached:
308
+ info(f"Mounting EBS volume at {EBS_MOUNT_POINT}...")
309
+ if mount_ebs_volume(
310
+ public_ip,
311
+ config.ssh_user,
312
+ config.key_path,
313
+ ebs_volume_attached,
314
+ mount_point=EBS_MOUNT_POINT,
315
+ format_volume=ebs_format,
316
+ port=config.ssh_port,
317
+ ):
318
+ success(f"EBS volume mounted at {EBS_MOUNT_POINT}.")
319
+ else:
320
+ warn(f"Failed to mount EBS volume at {EBS_MOUNT_POINT}. You may need to mount it manually.")
321
+
241
322
  # Add SSH config alias
242
323
  alias = add_ssh_host(
243
324
  instance_id=instance_id,
@@ -260,6 +341,12 @@ def launch(
260
341
  val("Instance", config.instance_type)
261
342
  val("Pricing", pricing)
262
343
  val("SSH alias", alias)
344
+ if ebs_volume_attached:
345
+ if config.ebs_storage:
346
+ ebs_label = f"{ebs_volume_attached} ({config.ebs_storage} GB, {EBS_MOUNT_POINT})"
347
+ else:
348
+ ebs_label = f"{ebs_volume_attached} ({EBS_MOUNT_POINT})"
349
+ val("EBS data volume", ebs_label)
263
350
 
264
351
  port_flag = f" -p {config.ssh_port}" if config.ssh_port != 22 else ""
265
352
 
@@ -371,6 +458,12 @@ def status(region, profile, gpu, instructions):
371
458
  else:
372
459
  click.echo(" GPU: " + click.style("unavailable", dim=True))
373
460
 
461
+ # EBS data volumes
462
+ ebs_volumes = find_ebs_volumes_for_instance(ec2, inst["InstanceId"], "aws-bootstrap-g4dn")
463
+ for vol in ebs_volumes:
464
+ vol_state = f", {vol['State']}" if vol["State"] != "in-use" else ""
465
+ val(" EBS", f"{vol['VolumeId']} ({vol['Size']} GB, {EBS_MOUNT_POINT}{vol_state})")
466
+
374
467
  lifecycle = inst["Lifecycle"]
375
468
  is_spot = lifecycle == "spot"
376
469
 
@@ -429,8 +522,9 @@ def status(region, profile, gpu, instructions):
429
522
  @click.option("--region", default="us-west-2", show_default=True, help="AWS region.")
430
523
  @click.option("--profile", default=None, help="AWS profile override.")
431
524
  @click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.")
525
+ @click.option("--keep-ebs", is_flag=True, default=False, help="Preserve EBS data volumes instead of deleting them.")
432
526
  @click.argument("instance_ids", nargs=-1, metavar="[INSTANCE_ID_OR_ALIAS]...")
433
- def terminate(region, profile, yes, instance_ids):
527
+ def terminate(region, profile, yes, keep_ebs, instance_ids):
434
528
  """Terminate instances created by aws-bootstrap.
435
529
 
436
530
  Pass specific instance IDs or SSH aliases (e.g. aws-gpu1) to terminate,
@@ -468,6 +562,13 @@ def terminate(region, profile, yes, instance_ids):
468
562
  click.secho(" Cancelled.", fg="yellow")
469
563
  return
470
564
 
565
+ # Discover EBS volumes before termination (while instances still exist)
566
+ ebs_by_instance: dict[str, list[dict]] = {}
567
+ for target in targets:
568
+ volumes = find_ebs_volumes_for_instance(ec2, target, "aws-bootstrap-g4dn")
569
+ if volumes:
570
+ ebs_by_instance[target] = volumes
571
+
471
572
  changes = terminate_tagged_instances(ec2, targets)
472
573
  click.echo()
473
574
  for change in changes:
@@ -479,10 +580,73 @@ def terminate(region, profile, yes, instance_ids):
479
580
  removed_alias = remove_ssh_host(change["InstanceId"])
480
581
  if removed_alias:
481
582
  info(f"Removed SSH config alias: {removed_alias}")
583
+
584
+ # Handle EBS volume cleanup
585
+ for _iid, volumes in ebs_by_instance.items():
586
+ for vol in volumes:
587
+ vid = vol["VolumeId"]
588
+ if keep_ebs:
589
+ click.echo()
590
+ info(f"Preserving EBS volume: {vid} ({vol['Size']} GB)")
591
+ info(f"Reattach with: aws-bootstrap launch --ebs-volume-id {vid}")
592
+ else:
593
+ click.echo()
594
+ info(f"Waiting for EBS volume {vid} to detach...")
595
+ try:
596
+ waiter = ec2.get_waiter("volume_available")
597
+ waiter.wait(VolumeIds=[vid], WaiterConfig={"Delay": 10, "MaxAttempts": 30})
598
+ delete_ebs_volume(ec2, vid)
599
+ success(f"Deleted EBS volume: {vid}")
600
+ except Exception as e:
601
+ warn(f"Failed to delete EBS volume {vid}: {e}")
602
+
482
603
  click.echo()
483
604
  success(f"Terminated {len(changes)} instance(s).")
484
605
 
485
606
 
607
+ @main.command()
608
+ @click.option("--dry-run", is_flag=True, default=False, help="Show what would be removed without removing.")
609
+ @click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.")
610
+ @click.option("--region", default="us-west-2", show_default=True, help="AWS region.")
611
+ @click.option("--profile", default=None, help="AWS profile override.")
612
+ def cleanup(dry_run, yes, region, profile):
613
+ """Remove stale SSH config entries for terminated instances."""
614
+ session = boto3.Session(profile_name=profile, region_name=region)
615
+ ec2 = session.client("ec2")
616
+
617
+ live_instances = find_tagged_instances(ec2, "aws-bootstrap-g4dn")
618
+ live_ids = {inst["InstanceId"] for inst in live_instances}
619
+
620
+ stale = find_stale_ssh_hosts(live_ids)
621
+ if not stale:
622
+ click.secho("No stale SSH config entries found.", fg="green")
623
+ return
624
+
625
+ click.secho(f"\n Found {len(stale)} stale SSH config entry(ies):\n", bold=True, fg="cyan")
626
+ for iid, alias in stale:
627
+ click.echo(" " + click.style(alias, fg="bright_white") + f" ({iid})")
628
+
629
+ if dry_run:
630
+ click.echo()
631
+ for iid, alias in stale:
632
+ info(f"Would remove {alias} ({iid})")
633
+ return
634
+
635
+ if not yes:
636
+ click.echo()
637
+ if not click.confirm(f" Remove {len(stale)} stale entry(ies)?"):
638
+ click.secho(" Cancelled.", fg="yellow")
639
+ return
640
+
641
+ results = cleanup_stale_ssh_hosts(live_ids)
642
+ click.echo()
643
+ for r in results:
644
+ success(f"Removed {r.alias} ({r.instance_id})")
645
+
646
+ click.echo()
647
+ success(f"Cleaned up {len(results)} stale entry(ies).")
648
+
649
+
486
650
  # ---------------------------------------------------------------------------
487
651
  # list command group
488
652
  # ---------------------------------------------------------------------------
@@ -24,3 +24,5 @@ class LaunchConfig:
24
24
  alias_prefix: str = "aws-gpu"
25
25
  ssh_port: int = 22
26
26
  python_version: str | None = None
27
+ ebs_storage: int | None = None
28
+ ebs_volume_id: str | None = None