aws-bootstrap-g4dn 0.4.0__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/CLAUDE.md +36 -5
  2. {aws_bootstrap_g4dn-0.4.0/aws_bootstrap_g4dn.egg-info → aws_bootstrap_g4dn-0.6.0}/PKG-INFO +62 -10
  3. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/README.md +61 -9
  4. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/cli.py +190 -14
  5. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/config.py +2 -0
  6. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/ec2.py +128 -0
  7. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/remote_setup.sh +2 -2
  8. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/ssh.py +149 -0
  9. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_cli.py +424 -4
  10. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_config.py +18 -0
  11. aws_bootstrap_g4dn-0.6.0/aws_bootstrap/tests/test_ebs.py +245 -0
  12. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_ssh_config.py +152 -0
  13. aws_bootstrap_g4dn-0.6.0/aws_bootstrap/tests/test_ssh_ebs.py +76 -0
  14. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0/aws_bootstrap_g4dn.egg-info}/PKG-INFO +62 -10
  15. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/SOURCES.txt +2 -0
  16. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  17. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  18. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/.github/workflows/ci.yml +0 -0
  19. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/.github/workflows/publish-to-pypi.yml +0 -0
  20. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/.gitignore +0 -0
  21. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/.pre-commit-config.yaml +0 -0
  22. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/CODE_OF_CONDUCT.md +0 -0
  23. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/CONTRIBUTING.md +0 -0
  24. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/LICENSE +0 -0
  25. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/SECURITY.md +0 -0
  26. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/__init__.py +0 -0
  27. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/gpu.py +0 -0
  28. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/__init__.py +0 -0
  29. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/gpu_benchmark.py +0 -0
  30. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/gpu_smoke_test.ipynb +0 -0
  31. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/launch.json +0 -0
  32. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/requirements.txt +0 -0
  33. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/saxpy.cu +0 -0
  34. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/tasks.json +0 -0
  35. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/__init__.py +0 -0
  36. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_ec2.py +0 -0
  37. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_gpu.py +0 -0
  38. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_ssh_gpu.py +0 -0
  39. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/dependency_links.txt +0 -0
  40. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/entry_points.txt +0 -0
  41. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/requires.txt +0 -0
  42. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/top_level.txt +0 -0
  43. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/docs/nsight-remote-profiling.md +0 -0
  44. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/pyproject.toml +0 -0
  45. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/setup.cfg +0 -0
  46. {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/uv.lock +0 -0
@@ -32,9 +32,9 @@ aws_bootstrap/
32
32
  __init__.py # Package init
33
33
  cli.py # Click CLI entry point (launch, status, terminate commands)
34
34
  config.py # LaunchConfig dataclass with defaults
35
- ec2.py # AMI lookup, security group, instance launch/find/terminate, polling, spot pricing
35
+ ec2.py # AMI lookup, security group, instance launch/find/terminate, polling, spot pricing, EBS volume ops
36
36
  gpu.py # GPU architecture mapping and GpuInfo dataclass
37
- ssh.py # SSH key pair import, SSH readiness check, remote setup, ~/.ssh/config management, GPU queries
37
+ ssh.py # SSH key pair import, SSH readiness check, remote setup, ~/.ssh/config management, GPU queries, EBS mount
38
38
  resources/ # Non-Python artifacts SCP'd to remote instances
39
39
  __init__.py
40
40
  gpu_benchmark.py # GPU throughput benchmark (CNN + Transformer), copied to ~/gpu_benchmark.py on instance
@@ -51,6 +51,8 @@ aws_bootstrap/
51
51
  test_gpu.py
52
52
  test_ssh_config.py
53
53
  test_ssh_gpu.py
54
+ test_ebs.py
55
+ test_ssh_ebs.py
54
56
  docs/
55
57
  nsight-remote-profiling.md # Nsight Compute, Nsight Systems, and Nsight VSCE remote profiling guide
56
58
  spot-request-lifecycle.md # Research notes on spot request cleanup
@@ -60,9 +62,10 @@ Entry point: `aws-bootstrap = "aws_bootstrap.cli:main"` (installed via `uv sync`
60
62
 
61
63
  ## CLI Commands
62
64
 
63
- - **`launch`** — provisions an EC2 instance (spot by default, falls back to on-demand on capacity errors); adds SSH config alias (e.g. `aws-gpu1`) to `~/.ssh/config`; `--python-version` controls which Python `uv` installs in the remote venv; `--ssh-port` overrides the default SSH port (22) for security group ingress, connection checks, and SSH config
64
- - **`status`** — lists all non-terminated instances (including `shutting-down`) with type, IP, SSH alias, pricing (spot price/hr or on-demand), uptime, and estimated cost for running spot instances; `--gpu` flag queries GPU info via SSH, reporting both CUDA toolkit version (from `nvcc`) and driver-supported max (from `nvidia-smi`); `--instructions` (default: on) prints connection commands (SSH, Jupyter tunnel, VSCode Remote SSH, GPU benchmark) for each running instance; suppress with `--no-instructions`
65
- - **`terminate`** — terminates instances by ID or all aws-bootstrap instances in the region; removes SSH config aliases
65
+ - **`launch`** — provisions an EC2 instance (spot by default, falls back to on-demand on capacity errors); adds SSH config alias (e.g. `aws-gpu1`) to `~/.ssh/config`; `--python-version` controls which Python `uv` installs in the remote venv; `--ssh-port` overrides the default SSH port (22) for security group ingress, connection checks, and SSH config; `--ebs-storage SIZE` creates and attaches a new gp3 EBS data volume (mounted at `/data`); `--ebs-volume-id ID` attaches an existing EBS volume (mutually exclusive with `--ebs-storage`)
66
+ - **`status`** — lists all non-terminated instances (including `shutting-down`) with type, IP, SSH alias, EBS data volumes, pricing (spot price/hr or on-demand), uptime, and estimated cost for running spot instances; `--gpu` flag queries GPU info via SSH, reporting both CUDA toolkit version (from `nvcc`) and driver-supported max (from `nvidia-smi`); `--instructions` (default: on) prints connection commands (SSH, Jupyter tunnel, VSCode Remote SSH, GPU benchmark) for each running instance; suppress with `--no-instructions`
67
+ - **`terminate`** — terminates instances by ID or SSH alias (e.g. `aws-gpu1`, resolved via `~/.ssh/config`), or all aws-bootstrap instances in the region if no arguments given; removes SSH config aliases; deletes associated EBS data volumes by default; `--keep-ebs` preserves volumes and prints reattach commands
68
+ - **`cleanup`** — removes stale `~/.ssh/config` entries for terminated/non-existent instances; compares managed SSH config blocks against live EC2 instances; `--dry-run` previews removals without modifying config; `--yes` skips the confirmation prompt
66
69
  - **`list instance-types`** — lists EC2 instance types matching a family prefix (default: `g4dn`), showing vCPUs, memory, and GPU info
67
70
  - **`list amis`** — lists available AMIs matching a name pattern (default: Deep Learning Base OSS Nvidia Driver GPU AMIs), sorted newest-first
68
71
 
@@ -112,6 +115,34 @@ The `KNOWN_CUDA_TAGS` array in `remote_setup.sh` lists the CUDA wheel tags publi
112
115
 
113
116
  `resources/gpu_benchmark.py` is uploaded to `~/gpu_benchmark.py` on the remote instance during setup. It benchmarks GPU throughput with two modes: CNN on MNIST and a GPT-style Transformer on synthetic data. It reports samples/sec, batch times, and peak GPU memory. Supports `--precision` (fp32/fp16/bf16/tf32), `--diagnose` for CUDA smoke tests, and separate `--transformer-batch-size` (default 32, T4-safe). Dependencies (`torch`, `torchvision`, `tqdm`) are already installed by the setup script.
114
117
 
118
+ ## EBS Data Volumes
119
+
120
+ The `--ebs-storage` and `--ebs-volume-id` options on `launch` create or attach persistent gp3 EBS volumes mounted at `/data`. The implementation spans three modules:
121
+
122
+ - **`ec2.py`** — Volume lifecycle: `create_ebs_volume`, `validate_ebs_volume`, `attach_ebs_volume`, `detach_ebs_volume`, `delete_ebs_volume`, `find_ebs_volumes_for_instance`. Constants `EBS_DEVICE_NAME` (`/dev/sdf`) and `EBS_MOUNT_POINT` (`/data`).
123
+ - **`ssh.py`** — `mount_ebs_volume()` SSHs to the instance and runs a shell script that detects the device, optionally formats it, mounts it, and adds an fstab entry.
124
+ - **`cli.py`** — Orchestrates the flow: create/validate → attach → wait for SSH → mount. Mount failures are non-fatal (warn and continue).
125
+
126
+ ### Tagging strategy
127
+
128
+ Volumes are tagged for discovery by `status` and `terminate`:
129
+
130
+ | Tag | Value | Purpose |
131
+ |-----|-------|---------|
132
+ | `created-by` | `aws-bootstrap-g4dn` | Standard tool-managed resource tag |
133
+ | `Name` | `aws-bootstrap-data-{instance_id}` | Human-readable in AWS console |
134
+ | `aws-bootstrap-instance` | `i-xxxxxxxxx` | Links volume to instance for `find_ebs_volumes_for_instance` |
135
+
136
+ ### NVMe device detection
137
+
138
+ On Nitro instances (g4dn), `/dev/sdf` is remapped to `/dev/nvmeXn1`. The mount script detects the correct device by matching the volume ID serial number via `lsblk -o NAME,SERIAL -dpn`, with fallbacks to `/dev/nvme1n1`, `/dev/xvdf`, `/dev/sdf`.
139
+
140
+ ### Spot interruption and terminate cleanup
141
+
142
+ Non-root EBS volumes attached via API have `DeleteOnTermination=False` by default. This means data volumes **survive spot interruptions** — when AWS reclaims the instance, the volume detaches and becomes `available`, preserving all data. The user can reattach it to a new instance with `--ebs-volume-id`.
143
+
144
+ The `terminate` command discovers volumes via `find_ebs_volumes_for_instance`, waits for them to detach (become `available`), then deletes them. `--keep-ebs` skips deletion and prints the volume ID with a reattach command.
145
+
115
146
  ## Versioning & Publishing
116
147
 
117
148
  Version is derived automatically from git tags via **setuptools-scm** — no hardcoded version string in the codebase.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: aws-bootstrap-g4dn
3
- Version: 0.4.0
3
+ Version: 0.6.0
4
4
  Summary: Bootstrap AWS EC2 GPU instances for hybrid local-remote development
5
5
  Author: Adam Ever-Hadani
6
6
  License-Expression: MIT
@@ -44,7 +44,8 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
44
44
  | 📊 | **GPU benchmark included** | CNN (MNIST) + Transformer benchmarks with FP16/FP32/BF16 precision and tqdm progress |
45
45
  | 📓 | **Jupyter ready** | Lab server auto-starts as a systemd service on port 8888 — just SSH tunnel and open |
46
46
  | 🖥️ | **`status --gpu`** | Shows CUDA toolkit version, driver max, GPU architecture, spot pricing, uptime, and estimated cost |
47
- | 🗑️ | **Clean terminate** | Stops instances, removes SSH aliases, shows shutting-down state until fully gone |
47
+ | 💾 | **EBS data volumes** | Attach persistent storage at `/data` — survives spot interruptions and termination; reattach to new instances |
48
+ | 🗑️ | **Clean terminate** | Stops instances, removes SSH aliases, cleans up EBS volumes (or preserves with `--keep-ebs`) |
48
49
 
49
50
  ### 🎯 Target Workflows
50
51
 
@@ -132,16 +133,24 @@ aws-bootstrap launch --python-version 3.13
132
133
  # Use a non-default SSH port
133
134
  aws-bootstrap launch --ssh-port 2222
134
135
 
136
+ # Attach a persistent EBS data volume (96 GB gp3, mounted at /data)
137
+ aws-bootstrap launch --ebs-storage 96
138
+
139
+ # Reattach an existing EBS volume from a previous instance
140
+ aws-bootstrap launch --ebs-volume-id vol-0abc123def456
141
+
135
142
  # Use a specific AWS profile
136
143
  aws-bootstrap launch --profile my-aws-profile
137
144
  ```
138
145
 
139
146
  After launch, the CLI:
140
147
 
141
- 1. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
142
- 2. **Runs remote setup** installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
143
- 3. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
144
- 4. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate
148
+ 1. **Creates/attaches EBS volume** (if `--ebs-storage` or `--ebs-volume-id` was specified)
149
+ 2. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
150
+ 3. **Runs remote setup** — installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
151
+ 4. **Mounts EBS volume** at `/data` (if applicable — formats new volumes, mounts existing ones as-is)
152
+ 5. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
153
+ 6. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate
145
154
 
146
155
  ```bash
147
156
  ssh aws-gpu1 # venv auto-activates on login
@@ -154,7 +163,7 @@ The setup script runs automatically on the instance after SSH becomes available:
154
163
  | Step | What |
155
164
  |------|------|
156
165
  | **GPU verify** | Confirms `nvidia-smi` and `nvcc` are working |
157
- | **Utilities** | Installs `htop`, `tmux`, `tree`, `jq` |
166
+ | **Utilities** | Installs `htop`, `tmux`, `tree`, `jq`, `ffmpeg` |
158
167
  | **Python venv** | Creates `~/venv` with `uv`, auto-activates in `~/.bashrc`. Use `--python-version` to pin a specific Python (e.g. `3.13`) |
159
168
  | **CUDA-aware PyTorch** | Detects CUDA toolkit version → installs PyTorch from the matching `cu{TAG}` wheel index |
160
169
  | **CUDA smoke test** | Runs `torch.cuda.is_available()` + GPU matmul to verify the stack |
@@ -261,11 +270,29 @@ aws-bootstrap status --region us-east-1
261
270
  # Terminate all aws-bootstrap instances (with confirmation prompt)
262
271
  aws-bootstrap terminate
263
272
 
264
- # Terminate specific instances
265
- aws-bootstrap terminate i-abc123 i-def456
273
+ # Terminate but preserve EBS data volumes for reuse
274
+ aws-bootstrap terminate --keep-ebs
275
+
276
+ # Terminate by SSH alias (resolved via ~/.ssh/config)
277
+ aws-bootstrap terminate aws-gpu1
278
+
279
+ # Terminate by instance ID
280
+ aws-bootstrap terminate i-abc123
281
+
282
+ # Mix aliases and instance IDs
283
+ aws-bootstrap terminate aws-gpu1 i-def456
266
284
 
267
285
  # Skip confirmation prompt
268
286
  aws-bootstrap terminate --yes
287
+
288
+ # Remove stale SSH config entries for terminated instances
289
+ aws-bootstrap cleanup
290
+
291
+ # Preview what would be removed without modifying config
292
+ aws-bootstrap cleanup --dry-run
293
+
294
+ # Skip confirmation prompt
295
+ aws-bootstrap cleanup --yes
269
296
  ```
270
297
 
271
298
  `status --gpu` reports both the **installed CUDA toolkit** version (from `nvcc`) and the **maximum CUDA version supported by the driver** (from `nvidia-smi`), so you can see at a glance whether they match:
@@ -274,7 +301,32 @@ aws-bootstrap terminate --yes
274
301
  CUDA: 12.8 (driver supports up to 13.0)
275
302
  ```
276
303
 
277
- SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances.
304
+ SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances. You can use aliases anywhere you'd use an instance ID, e.g. `aws-bootstrap terminate aws-gpu1`.
305
+
306
+ ## EBS Data Volumes
307
+
308
+ Attach persistent EBS storage to keep datasets and model checkpoints across instance lifecycles. Volumes are mounted at `/data` and persist independently of the instance.
309
+
310
+ ```bash
311
+ # Create a new 96 GB gp3 volume, formatted and mounted at /data
312
+ aws-bootstrap launch --ebs-storage 96
313
+
314
+ # After terminating with --keep-ebs, reattach the same volume to a new instance
315
+ aws-bootstrap terminate --keep-ebs
316
+ # Output: Preserving EBS volume: vol-0abc123...
317
+ # Reattach with: aws-bootstrap launch --ebs-volume-id vol-0abc123...
318
+
319
+ aws-bootstrap launch --ebs-volume-id vol-0abc123def456
320
+ ```
321
+
322
+ Key behaviors:
323
+ - `--ebs-storage` and `--ebs-volume-id` are mutually exclusive
324
+ - New volumes are formatted as ext4; existing volumes are mounted as-is
325
+ - Volumes are tagged for automatic discovery by `status` and `terminate`
326
+ - `terminate` deletes data volumes by default; use `--keep-ebs` to preserve them
327
+ - **Spot-safe** — data volumes survive spot interruptions. If AWS reclaims your instance, the volume detaches automatically and can be reattached to a new instance with `--ebs-volume-id`
328
+ - EBS volumes must be in the same availability zone as the instance
329
+ - Mount failures are non-fatal — the instance remains usable
278
330
 
279
331
  ## EC2 vCPU Quotas
280
332
 
@@ -25,7 +25,8 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
25
25
  | 📊 | **GPU benchmark included** | CNN (MNIST) + Transformer benchmarks with FP16/FP32/BF16 precision and tqdm progress |
26
26
  | 📓 | **Jupyter ready** | Lab server auto-starts as a systemd service on port 8888 — just SSH tunnel and open |
27
27
  | 🖥️ | **`status --gpu`** | Shows CUDA toolkit version, driver max, GPU architecture, spot pricing, uptime, and estimated cost |
28
- | 🗑️ | **Clean terminate** | Stops instances, removes SSH aliases, shows shutting-down state until fully gone |
28
+ | 💾 | **EBS data volumes** | Attach persistent storage at `/data` — survives spot interruptions and termination; reattach to new instances |
29
+ | 🗑️ | **Clean terminate** | Stops instances, removes SSH aliases, cleans up EBS volumes (or preserves with `--keep-ebs`) |
29
30
 
30
31
  ### 🎯 Target Workflows
31
32
 
@@ -113,16 +114,24 @@ aws-bootstrap launch --python-version 3.13
113
114
  # Use a non-default SSH port
114
115
  aws-bootstrap launch --ssh-port 2222
115
116
 
117
+ # Attach a persistent EBS data volume (96 GB gp3, mounted at /data)
118
+ aws-bootstrap launch --ebs-storage 96
119
+
120
+ # Reattach an existing EBS volume from a previous instance
121
+ aws-bootstrap launch --ebs-volume-id vol-0abc123def456
122
+
116
123
  # Use a specific AWS profile
117
124
  aws-bootstrap launch --profile my-aws-profile
118
125
  ```
119
126
 
120
127
  After launch, the CLI:
121
128
 
122
- 1. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
123
- 2. **Runs remote setup** installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
124
- 3. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
125
- 4. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate
129
+ 1. **Creates/attaches EBS volume** (if `--ebs-storage` or `--ebs-volume-id` was specified)
130
+ 2. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
131
+ 3. **Runs remote setup** — installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
132
+ 4. **Mounts EBS volume** at `/data` (if applicable — formats new volumes, mounts existing ones as-is)
133
+ 5. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
134
+ 6. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate
126
135
 
127
136
  ```bash
128
137
  ssh aws-gpu1 # venv auto-activates on login
@@ -135,7 +144,7 @@ The setup script runs automatically on the instance after SSH becomes available:
135
144
  | Step | What |
136
145
  |------|------|
137
146
  | **GPU verify** | Confirms `nvidia-smi` and `nvcc` are working |
138
- | **Utilities** | Installs `htop`, `tmux`, `tree`, `jq` |
147
+ | **Utilities** | Installs `htop`, `tmux`, `tree`, `jq`, `ffmpeg` |
139
148
  | **Python venv** | Creates `~/venv` with `uv`, auto-activates in `~/.bashrc`. Use `--python-version` to pin a specific Python (e.g. `3.13`) |
140
149
  | **CUDA-aware PyTorch** | Detects CUDA toolkit version → installs PyTorch from the matching `cu{TAG}` wheel index |
141
150
  | **CUDA smoke test** | Runs `torch.cuda.is_available()` + GPU matmul to verify the stack |
@@ -242,11 +251,29 @@ aws-bootstrap status --region us-east-1
242
251
  # Terminate all aws-bootstrap instances (with confirmation prompt)
243
252
  aws-bootstrap terminate
244
253
 
245
- # Terminate specific instances
246
- aws-bootstrap terminate i-abc123 i-def456
254
+ # Terminate but preserve EBS data volumes for reuse
255
+ aws-bootstrap terminate --keep-ebs
256
+
257
+ # Terminate by SSH alias (resolved via ~/.ssh/config)
258
+ aws-bootstrap terminate aws-gpu1
259
+
260
+ # Terminate by instance ID
261
+ aws-bootstrap terminate i-abc123
262
+
263
+ # Mix aliases and instance IDs
264
+ aws-bootstrap terminate aws-gpu1 i-def456
247
265
 
248
266
  # Skip confirmation prompt
249
267
  aws-bootstrap terminate --yes
268
+
269
+ # Remove stale SSH config entries for terminated instances
270
+ aws-bootstrap cleanup
271
+
272
+ # Preview what would be removed without modifying config
273
+ aws-bootstrap cleanup --dry-run
274
+
275
+ # Skip confirmation prompt
276
+ aws-bootstrap cleanup --yes
250
277
  ```
251
278
 
252
279
  `status --gpu` reports both the **installed CUDA toolkit** version (from `nvcc`) and the **maximum CUDA version supported by the driver** (from `nvidia-smi`), so you can see at a glance whether they match:
@@ -255,7 +282,32 @@ aws-bootstrap terminate --yes
255
282
  CUDA: 12.8 (driver supports up to 13.0)
256
283
  ```
257
284
 
258
- SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances.
285
+ SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances. You can use aliases anywhere you'd use an instance ID, e.g. `aws-bootstrap terminate aws-gpu1`.
286
+
287
+ ## EBS Data Volumes
288
+
289
+ Attach persistent EBS storage to keep datasets and model checkpoints across instance lifecycles. Volumes are mounted at `/data` and persist independently of the instance.
290
+
291
+ ```bash
292
+ # Create a new 96 GB gp3 volume, formatted and mounted at /data
293
+ aws-bootstrap launch --ebs-storage 96
294
+
295
+ # After terminating with --keep-ebs, reattach the same volume to a new instance
296
+ aws-bootstrap terminate --keep-ebs
297
+ # Output: Preserving EBS volume: vol-0abc123...
298
+ # Reattach with: aws-bootstrap launch --ebs-volume-id vol-0abc123...
299
+
300
+ aws-bootstrap launch --ebs-volume-id vol-0abc123def456
301
+ ```
302
+
303
+ Key behaviors:
304
+ - `--ebs-storage` and `--ebs-volume-id` are mutually exclusive
305
+ - New volumes are formatted as ext4; existing volumes are mounted as-is
306
+ - Volumes are tagged for automatic discovery by `status` and `terminate`
307
+ - `terminate` deletes data volumes by default; use `--keep-ebs` to preserve them
308
+ - **Spot-safe** — data volumes survive spot interruptions. If AWS reclaims your instance, the volume detaches automatically and can be reattached to a new instance with `--ebs-volume-id`
309
+ - EBS volumes must be in the same availability zone as the instance
310
+ - Mount failures are non-fatal — the instance remains usable
259
311
 
260
312
  ## EC2 vCPU Quotas
261
313
 
@@ -10,8 +10,13 @@ import click
10
10
 
11
11
  from .config import LaunchConfig
12
12
  from .ec2 import (
13
+ EBS_MOUNT_POINT,
13
14
  CLIError,
15
+ attach_ebs_volume,
16
+ create_ebs_volume,
17
+ delete_ebs_volume,
14
18
  ensure_security_group,
19
+ find_ebs_volumes_for_instance,
15
20
  find_tagged_instances,
16
21
  get_latest_ami,
17
22
  get_spot_price,
@@ -19,16 +24,21 @@ from .ec2 import (
19
24
  list_amis,
20
25
  list_instance_types,
21
26
  terminate_tagged_instances,
27
+ validate_ebs_volume,
22
28
  wait_instance_ready,
23
29
  )
24
30
  from .ssh import (
25
31
  add_ssh_host,
32
+ cleanup_stale_ssh_hosts,
33
+ find_stale_ssh_hosts,
26
34
  get_ssh_host_details,
27
35
  import_key_pair,
28
36
  list_ssh_hosts,
37
+ mount_ebs_volume,
29
38
  private_key_path,
30
39
  query_gpu_info,
31
40
  remove_ssh_host,
41
+ resolve_instance_id,
32
42
  run_remote_setup,
33
43
  wait_for_ssh,
34
44
  )
@@ -119,6 +129,18 @@ def main():
119
129
  help="Python version for the remote venv (e.g. 3.13, 3.14.2). Passed to uv during setup.",
120
130
  )
121
131
  @click.option("--ssh-port", default=22, show_default=True, type=int, help="SSH port on the remote instance.")
132
+ @click.option(
133
+ "--ebs-storage",
134
+ default=None,
135
+ type=int,
136
+ help="Create and attach a new EBS data volume (size in GB, gp3). Mounted at /data.",
137
+ )
138
+ @click.option(
139
+ "--ebs-volume-id",
140
+ default=None,
141
+ type=str,
142
+ help="Attach an existing EBS volume by ID (e.g. vol-0abc123). Mounted at /data.",
143
+ )
122
144
  def launch(
123
145
  instance_type,
124
146
  ami_filter,
@@ -133,8 +155,13 @@ def launch(
133
155
  profile,
134
156
  python_version,
135
157
  ssh_port,
158
+ ebs_storage,
159
+ ebs_volume_id,
136
160
  ):
137
161
  """Launch a GPU-accelerated EC2 instance."""
162
+ if ebs_storage is not None and ebs_volume_id is not None:
163
+ raise CLIError("--ebs-storage and --ebs-volume-id are mutually exclusive.")
164
+
138
165
  config = LaunchConfig(
139
166
  instance_type=instance_type,
140
167
  spot=spot,
@@ -147,6 +174,8 @@ def launch(
147
174
  dry_run=dry_run,
148
175
  ssh_port=ssh_port,
149
176
  python_version=python_version,
177
+ ebs_storage=ebs_storage,
178
+ ebs_volume_id=ebs_volume_id,
150
179
  )
151
180
  if ami_filter:
152
181
  config.ami_filter = ami_filter
@@ -161,18 +190,21 @@ def launch(
161
190
  session = boto3.Session(profile_name=config.profile, region_name=config.region)
162
191
  ec2 = session.client("ec2")
163
192
 
193
+ has_ebs = config.ebs_storage is not None or config.ebs_volume_id is not None
194
+ total_steps = 7 if has_ebs else 6
195
+
164
196
  # Step 1: AMI lookup
165
- step(1, 6, "Looking up AMI...")
197
+ step(1, total_steps, "Looking up AMI...")
166
198
  ami = get_latest_ami(ec2, config.ami_filter)
167
199
  info(f"Found: {ami['Name']}")
168
200
  val("AMI ID", ami["ImageId"])
169
201
 
170
202
  # Step 2: SSH key pair
171
- step(2, 6, "Importing SSH key pair...")
203
+ step(2, total_steps, "Importing SSH key pair...")
172
204
  import_key_pair(ec2, config.key_name, config.key_path)
173
205
 
174
206
  # Step 3: Security group
175
- step(3, 6, "Ensuring security group...")
207
+ step(3, total_steps, "Ensuring security group...")
176
208
  sg_id = ensure_security_group(ec2, config.security_group, config.tag_value, ssh_port=config.ssh_port)
177
209
 
178
210
  pricing = "spot" if config.spot else "on-demand"
@@ -192,18 +224,22 @@ def launch(
192
224
  val("SSH port", str(config.ssh_port))
193
225
  if config.python_version:
194
226
  val("Python version", config.python_version)
227
+ if config.ebs_storage:
228
+ val("EBS data volume", f"{config.ebs_storage} GB gp3 (new, mounted at {EBS_MOUNT_POINT})")
229
+ if config.ebs_volume_id:
230
+ val("EBS data volume", f"{config.ebs_volume_id} (existing, mounted at {EBS_MOUNT_POINT})")
195
231
  click.echo()
196
232
  click.secho("No resources launched (dry-run mode).", fg="yellow")
197
233
  return
198
234
 
199
235
  # Step 4: Launch instance
200
- step(4, 6, f"Launching {config.instance_type} instance ({pricing})...")
236
+ step(4, total_steps, f"Launching {config.instance_type} instance ({pricing})...")
201
237
  instance = launch_instance(ec2, config, ami["ImageId"], sg_id)
202
238
  instance_id = instance["InstanceId"]
203
239
  val("Instance ID", instance_id)
204
240
 
205
241
  # Step 5: Wait for ready
206
- step(5, 6, "Waiting for instance to be ready...")
242
+ step(5, total_steps, "Waiting for instance to be ready...")
207
243
  instance = wait_instance_ready(ec2, instance_id)
208
244
  public_ip = instance.get("PublicIpAddress")
209
245
  if not public_ip:
@@ -212,9 +248,39 @@ def launch(
212
248
  return
213
249
 
214
250
  val("Public IP", public_ip)
251
+ az = instance["Placement"]["AvailabilityZone"]
252
+
253
+ # Step 5.5 (optional): EBS data volume
254
+ ebs_volume_attached = None
255
+ ebs_format = False
256
+ if has_ebs:
257
+ step(6, total_steps, "Setting up EBS data volume...")
258
+ if config.ebs_storage:
259
+ info(f"Creating {config.ebs_storage} GB gp3 volume in {az}...")
260
+ ebs_volume_attached = create_ebs_volume(ec2, config.ebs_storage, az, config.tag_value, instance_id)
261
+ val("Volume ID", ebs_volume_attached)
262
+ ebs_format = True
263
+ elif config.ebs_volume_id:
264
+ info(f"Validating volume {config.ebs_volume_id}...")
265
+ validate_ebs_volume(ec2, config.ebs_volume_id, az)
266
+ ebs_volume_attached = config.ebs_volume_id
267
+ # Tag the existing volume for discovery
268
+ ec2.create_tags(
269
+ Resources=[ebs_volume_attached],
270
+ Tags=[
271
+ {"Key": "aws-bootstrap-instance", "Value": instance_id},
272
+ {"Key": "created-by", "Value": config.tag_value},
273
+ ],
274
+ )
275
+ ebs_format = False
215
276
 
216
- # Step 6: SSH and remote setup
217
- step(6, 6, "Waiting for SSH access...")
277
+ info(f"Attaching {ebs_volume_attached} to {instance_id}...")
278
+ attach_ebs_volume(ec2, ebs_volume_attached, instance_id)
279
+ success("EBS volume attached.")
280
+
281
+ # SSH and remote setup step
282
+ ssh_step = 7 if has_ebs else 6
283
+ step(ssh_step, total_steps, "Waiting for SSH access...")
218
284
  private_key = private_key_path(config.key_path)
219
285
  if not wait_for_ssh(public_ip, config.ssh_user, config.key_path, port=config.ssh_port):
220
286
  warn("SSH did not become available within the timeout.")
@@ -237,6 +303,22 @@ def launch(
237
303
  else:
238
304
  warn("Remote setup failed. Instance is still running.")
239
305
 
306
+ # Mount EBS volume via SSH (after setup so the instance is fully ready)
307
+ if ebs_volume_attached:
308
+ info(f"Mounting EBS volume at {EBS_MOUNT_POINT}...")
309
+ if mount_ebs_volume(
310
+ public_ip,
311
+ config.ssh_user,
312
+ config.key_path,
313
+ ebs_volume_attached,
314
+ mount_point=EBS_MOUNT_POINT,
315
+ format_volume=ebs_format,
316
+ port=config.ssh_port,
317
+ ):
318
+ success(f"EBS volume mounted at {EBS_MOUNT_POINT}.")
319
+ else:
320
+ warn(f"Failed to mount EBS volume at {EBS_MOUNT_POINT}. You may need to mount it manually.")
321
+
240
322
  # Add SSH config alias
241
323
  alias = add_ssh_host(
242
324
  instance_id=instance_id,
@@ -259,6 +341,12 @@ def launch(
259
341
  val("Instance", config.instance_type)
260
342
  val("Pricing", pricing)
261
343
  val("SSH alias", alias)
344
+ if ebs_volume_attached:
345
+ if config.ebs_storage:
346
+ ebs_label = f"{ebs_volume_attached} ({config.ebs_storage} GB, {EBS_MOUNT_POINT})"
347
+ else:
348
+ ebs_label = f"{ebs_volume_attached} ({EBS_MOUNT_POINT})"
349
+ val("EBS data volume", ebs_label)
262
350
 
263
351
  port_flag = f" -p {config.ssh_port}" if config.ssh_port != 22 else ""
264
352
 
@@ -288,7 +376,7 @@ def launch(
288
376
 
289
377
  click.echo()
290
378
  click.secho(" Terminate:", fg="cyan")
291
- click.secho(f" aws-bootstrap terminate {instance_id} --region {config.region}", bold=True)
379
+ click.secho(f" aws-bootstrap terminate {alias} --region {config.region}", bold=True)
292
380
  click.echo()
293
381
 
294
382
 
@@ -370,6 +458,12 @@ def status(region, profile, gpu, instructions):
370
458
  else:
371
459
  click.echo(" GPU: " + click.style("unavailable", dim=True))
372
460
 
461
+ # EBS data volumes
462
+ ebs_volumes = find_ebs_volumes_for_instance(ec2, inst["InstanceId"], "aws-bootstrap-g4dn")
463
+ for vol in ebs_volumes:
464
+ vol_state = f", {vol['State']}" if vol["State"] != "in-use" else ""
465
+ val(" EBS", f"{vol['VolumeId']} ({vol['Size']} GB, {EBS_MOUNT_POINT}{vol_state})")
466
+
373
467
  lifecycle = inst["Lifecycle"]
374
468
  is_spot = lifecycle == "spot"
375
469
 
@@ -419,7 +513,8 @@ def status(region, profile, gpu, instructions):
419
513
 
420
514
  click.echo()
421
515
  first_id = instances[0]["InstanceId"]
422
- click.echo(" To terminate: " + click.style(f"aws-bootstrap terminate {first_id}", bold=True))
516
+ first_ref = ssh_hosts.get(first_id, first_id)
517
+ click.echo(" To terminate: " + click.style(f"aws-bootstrap terminate {first_ref}", bold=True))
423
518
  click.echo()
424
519
 
425
520
 
@@ -427,18 +522,29 @@ def status(region, profile, gpu, instructions):
427
522
  @click.option("--region", default="us-west-2", show_default=True, help="AWS region.")
428
523
  @click.option("--profile", default=None, help="AWS profile override.")
429
524
  @click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.")
430
- @click.argument("instance_ids", nargs=-1)
431
- def terminate(region, profile, yes, instance_ids):
525
+ @click.option("--keep-ebs", is_flag=True, default=False, help="Preserve EBS data volumes instead of deleting them.")
526
+ @click.argument("instance_ids", nargs=-1, metavar="[INSTANCE_ID_OR_ALIAS]...")
527
+ def terminate(region, profile, yes, keep_ebs, instance_ids):
432
528
  """Terminate instances created by aws-bootstrap.
433
529
 
434
- Pass specific instance IDs to terminate, or omit to terminate all
435
- aws-bootstrap instances in the region.
530
+ Pass specific instance IDs or SSH aliases (e.g. aws-gpu1) to terminate,
531
+ or omit to terminate all aws-bootstrap instances in the region.
436
532
  """
437
533
  session = boto3.Session(profile_name=profile, region_name=region)
438
534
  ec2 = session.client("ec2")
439
535
 
440
536
  if instance_ids:
441
- targets = list(instance_ids)
537
+ targets = []
538
+ for value in instance_ids:
539
+ resolved = resolve_instance_id(value)
540
+ if resolved is None:
541
+ raise CLIError(
542
+ f"Could not resolve '{value}' to an instance ID.\n\n"
543
+ " It is not a valid instance ID or a known SSH alias."
544
+ )
545
+ if resolved != value:
546
+ info(f"Resolved alias '{value}' -> {resolved}")
547
+ targets.append(resolved)
442
548
  else:
443
549
  instances = find_tagged_instances(ec2, "aws-bootstrap-g4dn")
444
550
  if not instances:
@@ -456,6 +562,13 @@ def terminate(region, profile, yes, instance_ids):
456
562
  click.secho(" Cancelled.", fg="yellow")
457
563
  return
458
564
 
565
+ # Discover EBS volumes before termination (while instances still exist)
566
+ ebs_by_instance: dict[str, list[dict]] = {}
567
+ for target in targets:
568
+ volumes = find_ebs_volumes_for_instance(ec2, target, "aws-bootstrap-g4dn")
569
+ if volumes:
570
+ ebs_by_instance[target] = volumes
571
+
459
572
  changes = terminate_tagged_instances(ec2, targets)
460
573
  click.echo()
461
574
  for change in changes:
@@ -467,10 +580,73 @@ def terminate(region, profile, yes, instance_ids):
467
580
  removed_alias = remove_ssh_host(change["InstanceId"])
468
581
  if removed_alias:
469
582
  info(f"Removed SSH config alias: {removed_alias}")
583
+
584
+ # Handle EBS volume cleanup
585
+ for _iid, volumes in ebs_by_instance.items():
586
+ for vol in volumes:
587
+ vid = vol["VolumeId"]
588
+ if keep_ebs:
589
+ click.echo()
590
+ info(f"Preserving EBS volume: {vid} ({vol['Size']} GB)")
591
+ info(f"Reattach with: aws-bootstrap launch --ebs-volume-id {vid}")
592
+ else:
593
+ click.echo()
594
+ info(f"Waiting for EBS volume {vid} to detach...")
595
+ try:
596
+ waiter = ec2.get_waiter("volume_available")
597
+ waiter.wait(VolumeIds=[vid], WaiterConfig={"Delay": 10, "MaxAttempts": 30})
598
+ delete_ebs_volume(ec2, vid)
599
+ success(f"Deleted EBS volume: {vid}")
600
+ except Exception as e:
601
+ warn(f"Failed to delete EBS volume {vid}: {e}")
602
+
470
603
  click.echo()
471
604
  success(f"Terminated {len(changes)} instance(s).")
472
605
 
473
606
 
607
+ @main.command()
608
+ @click.option("--dry-run", is_flag=True, default=False, help="Show what would be removed without removing.")
609
+ @click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.")
610
+ @click.option("--region", default="us-west-2", show_default=True, help="AWS region.")
611
+ @click.option("--profile", default=None, help="AWS profile override.")
612
+ def cleanup(dry_run, yes, region, profile):
613
+ """Remove stale SSH config entries for terminated instances."""
614
+ session = boto3.Session(profile_name=profile, region_name=region)
615
+ ec2 = session.client("ec2")
616
+
617
+ live_instances = find_tagged_instances(ec2, "aws-bootstrap-g4dn")
618
+ live_ids = {inst["InstanceId"] for inst in live_instances}
619
+
620
+ stale = find_stale_ssh_hosts(live_ids)
621
+ if not stale:
622
+ click.secho("No stale SSH config entries found.", fg="green")
623
+ return
624
+
625
+ click.secho(f"\n Found {len(stale)} stale SSH config entry(ies):\n", bold=True, fg="cyan")
626
+ for iid, alias in stale:
627
+ click.echo(" " + click.style(alias, fg="bright_white") + f" ({iid})")
628
+
629
+ if dry_run:
630
+ click.echo()
631
+ for iid, alias in stale:
632
+ info(f"Would remove {alias} ({iid})")
633
+ return
634
+
635
+ if not yes:
636
+ click.echo()
637
+ if not click.confirm(f" Remove {len(stale)} stale entry(ies)?"):
638
+ click.secho(" Cancelled.", fg="yellow")
639
+ return
640
+
641
+ results = cleanup_stale_ssh_hosts(live_ids)
642
+ click.echo()
643
+ for r in results:
644
+ success(f"Removed {r.alias} ({r.instance_id})")
645
+
646
+ click.echo()
647
+ success(f"Cleaned up {len(results)} stale entry(ies).")
648
+
649
+
474
650
  # ---------------------------------------------------------------------------
475
651
  # list command group
476
652
  # ---------------------------------------------------------------------------