aws-bootstrap-g4dn 0.4.0__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/CLAUDE.md +36 -5
- {aws_bootstrap_g4dn-0.4.0/aws_bootstrap_g4dn.egg-info → aws_bootstrap_g4dn-0.6.0}/PKG-INFO +62 -10
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/README.md +61 -9
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/cli.py +190 -14
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/config.py +2 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/ec2.py +128 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/remote_setup.sh +2 -2
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/ssh.py +149 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_cli.py +424 -4
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_config.py +18 -0
- aws_bootstrap_g4dn-0.6.0/aws_bootstrap/tests/test_ebs.py +245 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_ssh_config.py +152 -0
- aws_bootstrap_g4dn-0.6.0/aws_bootstrap/tests/test_ssh_ebs.py +76 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0/aws_bootstrap_g4dn.egg-info}/PKG-INFO +62 -10
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/SOURCES.txt +2 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/.github/workflows/ci.yml +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/.github/workflows/publish-to-pypi.yml +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/.gitignore +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/.pre-commit-config.yaml +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/CODE_OF_CONDUCT.md +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/CONTRIBUTING.md +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/LICENSE +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/SECURITY.md +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/__init__.py +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/gpu.py +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/__init__.py +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/gpu_benchmark.py +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/gpu_smoke_test.ipynb +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/launch.json +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/requirements.txt +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/saxpy.cu +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/tasks.json +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/__init__.py +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_ec2.py +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_gpu.py +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_ssh_gpu.py +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/dependency_links.txt +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/entry_points.txt +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/requires.txt +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/top_level.txt +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/docs/nsight-remote-profiling.md +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/pyproject.toml +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/setup.cfg +0 -0
- {aws_bootstrap_g4dn-0.4.0 → aws_bootstrap_g4dn-0.6.0}/uv.lock +0 -0
|
@@ -32,9 +32,9 @@ aws_bootstrap/
|
|
|
32
32
|
__init__.py # Package init
|
|
33
33
|
cli.py # Click CLI entry point (launch, status, terminate commands)
|
|
34
34
|
config.py # LaunchConfig dataclass with defaults
|
|
35
|
-
ec2.py # AMI lookup, security group, instance launch/find/terminate, polling, spot pricing
|
|
35
|
+
ec2.py # AMI lookup, security group, instance launch/find/terminate, polling, spot pricing, EBS volume ops
|
|
36
36
|
gpu.py # GPU architecture mapping and GpuInfo dataclass
|
|
37
|
-
ssh.py # SSH key pair import, SSH readiness check, remote setup, ~/.ssh/config management, GPU queries
|
|
37
|
+
ssh.py # SSH key pair import, SSH readiness check, remote setup, ~/.ssh/config management, GPU queries, EBS mount
|
|
38
38
|
resources/ # Non-Python artifacts SCP'd to remote instances
|
|
39
39
|
__init__.py
|
|
40
40
|
gpu_benchmark.py # GPU throughput benchmark (CNN + Transformer), copied to ~/gpu_benchmark.py on instance
|
|
@@ -51,6 +51,8 @@ aws_bootstrap/
|
|
|
51
51
|
test_gpu.py
|
|
52
52
|
test_ssh_config.py
|
|
53
53
|
test_ssh_gpu.py
|
|
54
|
+
test_ebs.py
|
|
55
|
+
test_ssh_ebs.py
|
|
54
56
|
docs/
|
|
55
57
|
nsight-remote-profiling.md # Nsight Compute, Nsight Systems, and Nsight VSCE remote profiling guide
|
|
56
58
|
spot-request-lifecycle.md # Research notes on spot request cleanup
|
|
@@ -60,9 +62,10 @@ Entry point: `aws-bootstrap = "aws_bootstrap.cli:main"` (installed via `uv sync`
|
|
|
60
62
|
|
|
61
63
|
## CLI Commands
|
|
62
64
|
|
|
63
|
-
- **`launch`** — provisions an EC2 instance (spot by default, falls back to on-demand on capacity errors); adds SSH config alias (e.g. `aws-gpu1`) to `~/.ssh/config`; `--python-version` controls which Python `uv` installs in the remote venv; `--ssh-port` overrides the default SSH port (22) for security group ingress, connection checks, and SSH config
|
|
64
|
-
- **`status`** — lists all non-terminated instances (including `shutting-down`) with type, IP, SSH alias, pricing (spot price/hr or on-demand), uptime, and estimated cost for running spot instances; `--gpu` flag queries GPU info via SSH, reporting both CUDA toolkit version (from `nvcc`) and driver-supported max (from `nvidia-smi`); `--instructions` (default: on) prints connection commands (SSH, Jupyter tunnel, VSCode Remote SSH, GPU benchmark) for each running instance; suppress with `--no-instructions`
|
|
65
|
-
- **`terminate`** — terminates instances by ID or all aws-bootstrap instances in the region; removes SSH config aliases
|
|
65
|
+
- **`launch`** — provisions an EC2 instance (spot by default, falls back to on-demand on capacity errors); adds SSH config alias (e.g. `aws-gpu1`) to `~/.ssh/config`; `--python-version` controls which Python `uv` installs in the remote venv; `--ssh-port` overrides the default SSH port (22) for security group ingress, connection checks, and SSH config; `--ebs-storage SIZE` creates and attaches a new gp3 EBS data volume (mounted at `/data`); `--ebs-volume-id ID` attaches an existing EBS volume (mutually exclusive with `--ebs-storage`)
|
|
66
|
+
- **`status`** — lists all non-terminated instances (including `shutting-down`) with type, IP, SSH alias, EBS data volumes, pricing (spot price/hr or on-demand), uptime, and estimated cost for running spot instances; `--gpu` flag queries GPU info via SSH, reporting both CUDA toolkit version (from `nvcc`) and driver-supported max (from `nvidia-smi`); `--instructions` (default: on) prints connection commands (SSH, Jupyter tunnel, VSCode Remote SSH, GPU benchmark) for each running instance; suppress with `--no-instructions`
|
|
67
|
+
- **`terminate`** — terminates instances by ID or SSH alias (e.g. `aws-gpu1`, resolved via `~/.ssh/config`), or all aws-bootstrap instances in the region if no arguments given; removes SSH config aliases; deletes associated EBS data volumes by default; `--keep-ebs` preserves volumes and prints reattach commands
|
|
68
|
+
- **`cleanup`** — removes stale `~/.ssh/config` entries for terminated/non-existent instances; compares managed SSH config blocks against live EC2 instances; `--dry-run` previews removals without modifying config; `--yes` skips the confirmation prompt
|
|
66
69
|
- **`list instance-types`** — lists EC2 instance types matching a family prefix (default: `g4dn`), showing vCPUs, memory, and GPU info
|
|
67
70
|
- **`list amis`** — lists available AMIs matching a name pattern (default: Deep Learning Base OSS Nvidia Driver GPU AMIs), sorted newest-first
|
|
68
71
|
|
|
@@ -112,6 +115,34 @@ The `KNOWN_CUDA_TAGS` array in `remote_setup.sh` lists the CUDA wheel tags publi
|
|
|
112
115
|
|
|
113
116
|
`resources/gpu_benchmark.py` is uploaded to `~/gpu_benchmark.py` on the remote instance during setup. It benchmarks GPU throughput with two modes: CNN on MNIST and a GPT-style Transformer on synthetic data. It reports samples/sec, batch times, and peak GPU memory. Supports `--precision` (fp32/fp16/bf16/tf32), `--diagnose` for CUDA smoke tests, and separate `--transformer-batch-size` (default 32, T4-safe). Dependencies (`torch`, `torchvision`, `tqdm`) are already installed by the setup script.
|
|
114
117
|
|
|
118
|
+
## EBS Data Volumes
|
|
119
|
+
|
|
120
|
+
The `--ebs-storage` and `--ebs-volume-id` options on `launch` create or attach persistent gp3 EBS volumes mounted at `/data`. The implementation spans three modules:
|
|
121
|
+
|
|
122
|
+
- **`ec2.py`** — Volume lifecycle: `create_ebs_volume`, `validate_ebs_volume`, `attach_ebs_volume`, `detach_ebs_volume`, `delete_ebs_volume`, `find_ebs_volumes_for_instance`. Constants `EBS_DEVICE_NAME` (`/dev/sdf`) and `EBS_MOUNT_POINT` (`/data`).
|
|
123
|
+
- **`ssh.py`** — `mount_ebs_volume()` SSHs to the instance and runs a shell script that detects the device, optionally formats it, mounts it, and adds an fstab entry.
|
|
124
|
+
- **`cli.py`** — Orchestrates the flow: create/validate → attach → wait for SSH → mount. Mount failures are non-fatal (warn and continue).
|
|
125
|
+
|
|
126
|
+
### Tagging strategy
|
|
127
|
+
|
|
128
|
+
Volumes are tagged for discovery by `status` and `terminate`:
|
|
129
|
+
|
|
130
|
+
| Tag | Value | Purpose |
|
|
131
|
+
|-----|-------|---------|
|
|
132
|
+
| `created-by` | `aws-bootstrap-g4dn` | Standard tool-managed resource tag |
|
|
133
|
+
| `Name` | `aws-bootstrap-data-{instance_id}` | Human-readable in AWS console |
|
|
134
|
+
| `aws-bootstrap-instance` | `i-xxxxxxxxx` | Links volume to instance for `find_ebs_volumes_for_instance` |
|
|
135
|
+
|
|
136
|
+
### NVMe device detection
|
|
137
|
+
|
|
138
|
+
On Nitro instances (g4dn), `/dev/sdf` is remapped to `/dev/nvmeXn1`. The mount script detects the correct device by matching the volume ID against the device serial number reported by `lsblk -o NAME,SERIAL -dpn`, with fallbacks to `/dev/nvme1n1`, `/dev/xvdf`, `/dev/sdf`.
|
|
139
|
+
|
|
140
|
+
### Spot interruption and terminate cleanup
|
|
141
|
+
|
|
142
|
+
Non-root EBS volumes attached via API have `DeleteOnTermination=False` by default. This means data volumes **survive spot interruptions** — when AWS reclaims the instance, the volume detaches and becomes `available`, preserving all data. The user can reattach it to a new instance with `--ebs-volume-id`.
|
|
143
|
+
|
|
144
|
+
The `terminate` command discovers volumes via `find_ebs_volumes_for_instance`, waits for them to detach (become `available`), then deletes them. `--keep-ebs` skips deletion and prints the volume ID with a reattach command.
|
|
145
|
+
|
|
115
146
|
## Versioning & Publishing
|
|
116
147
|
|
|
117
148
|
Version is derived automatically from git tags via **setuptools-scm** — no hardcoded version string in the codebase.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: aws-bootstrap-g4dn
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Bootstrap AWS EC2 GPU instances for hybrid local-remote development
|
|
5
5
|
Author: Adam Ever-Hadani
|
|
6
6
|
License-Expression: MIT
|
|
@@ -44,7 +44,8 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
|
|
|
44
44
|
| 📊 | **GPU benchmark included** | CNN (MNIST) + Transformer benchmarks with FP16/FP32/BF16 precision and tqdm progress |
|
|
45
45
|
| 📓 | **Jupyter ready** | Lab server auto-starts as a systemd service on port 8888 — just SSH tunnel and open |
|
|
46
46
|
| 🖥️ | **`status --gpu`** | Shows CUDA toolkit version, driver max, GPU architecture, spot pricing, uptime, and estimated cost |
|
|
47
|
-
|
|
|
47
|
+
| 💾 | **EBS data volumes** | Attach persistent storage at `/data` — survives spot interruptions (and `terminate --keep-ebs`), and can be reattached to new instances |
|
|
48
|
+
| 🗑️ | **Clean terminate** | Terminates instances, removes SSH aliases, deletes EBS data volumes (or preserves them with `--keep-ebs`) |
|
|
48
49
|
|
|
49
50
|
### 🎯 Target Workflows
|
|
50
51
|
|
|
@@ -132,16 +133,24 @@ aws-bootstrap launch --python-version 3.13
|
|
|
132
133
|
# Use a non-default SSH port
|
|
133
134
|
aws-bootstrap launch --ssh-port 2222
|
|
134
135
|
|
|
136
|
+
# Attach a persistent EBS data volume (96 GB gp3, mounted at /data)
|
|
137
|
+
aws-bootstrap launch --ebs-storage 96
|
|
138
|
+
|
|
139
|
+
# Reattach an existing EBS volume from a previous instance
|
|
140
|
+
aws-bootstrap launch --ebs-volume-id vol-0abc123def456
|
|
141
|
+
|
|
135
142
|
# Use a specific AWS profile
|
|
136
143
|
aws-bootstrap launch --profile my-aws-profile
|
|
137
144
|
```
|
|
138
145
|
|
|
139
146
|
After launch, the CLI:
|
|
140
147
|
|
|
141
|
-
1. **
|
|
142
|
-
2. **
|
|
143
|
-
3. **Runs
|
|
144
|
-
4. **
|
|
148
|
+
1. **Creates/attaches EBS volume** (if `--ebs-storage` or `--ebs-volume-id` was specified)
|
|
149
|
+
2. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
|
|
150
|
+
3. **Runs remote setup** — installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
|
|
151
|
+
4. **Mounts EBS volume** at `/data` (if applicable — formats new volumes, mounts existing ones as-is)
|
|
152
|
+
5. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
|
|
153
|
+
6. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate
|
|
145
154
|
|
|
146
155
|
```bash
|
|
147
156
|
ssh aws-gpu1 # venv auto-activates on login
|
|
@@ -154,7 +163,7 @@ The setup script runs automatically on the instance after SSH becomes available:
|
|
|
154
163
|
| Step | What |
|
|
155
164
|
|------|------|
|
|
156
165
|
| **GPU verify** | Confirms `nvidia-smi` and `nvcc` are working |
|
|
157
|
-
| **Utilities** | Installs `htop`, `tmux`, `tree`, `jq` |
|
|
166
|
+
| **Utilities** | Installs `htop`, `tmux`, `tree`, `jq`, `ffmpeg` |
|
|
158
167
|
| **Python venv** | Creates `~/venv` with `uv`, auto-activates in `~/.bashrc`. Use `--python-version` to pin a specific Python (e.g. `3.13`) |
|
|
159
168
|
| **CUDA-aware PyTorch** | Detects CUDA toolkit version → installs PyTorch from the matching `cu{TAG}` wheel index |
|
|
160
169
|
| **CUDA smoke test** | Runs `torch.cuda.is_available()` + GPU matmul to verify the stack |
|
|
@@ -261,11 +270,29 @@ aws-bootstrap status --region us-east-1
|
|
|
261
270
|
# Terminate all aws-bootstrap instances (with confirmation prompt)
|
|
262
271
|
aws-bootstrap terminate
|
|
263
272
|
|
|
264
|
-
# Terminate
|
|
265
|
-
aws-bootstrap terminate
|
|
273
|
+
# Terminate but preserve EBS data volumes for reuse
|
|
274
|
+
aws-bootstrap terminate --keep-ebs
|
|
275
|
+
|
|
276
|
+
# Terminate by SSH alias (resolved via ~/.ssh/config)
|
|
277
|
+
aws-bootstrap terminate aws-gpu1
|
|
278
|
+
|
|
279
|
+
# Terminate by instance ID
|
|
280
|
+
aws-bootstrap terminate i-abc123
|
|
281
|
+
|
|
282
|
+
# Mix aliases and instance IDs
|
|
283
|
+
aws-bootstrap terminate aws-gpu1 i-def456
|
|
266
284
|
|
|
267
285
|
# Skip confirmation prompt
|
|
268
286
|
aws-bootstrap terminate --yes
|
|
287
|
+
|
|
288
|
+
# Remove stale SSH config entries for terminated instances
|
|
289
|
+
aws-bootstrap cleanup
|
|
290
|
+
|
|
291
|
+
# Preview what would be removed without modifying config
|
|
292
|
+
aws-bootstrap cleanup --dry-run
|
|
293
|
+
|
|
294
|
+
# Skip confirmation prompt
|
|
295
|
+
aws-bootstrap cleanup --yes
|
|
269
296
|
```
|
|
270
297
|
|
|
271
298
|
`status --gpu` reports both the **installed CUDA toolkit** version (from `nvcc`) and the **maximum CUDA version supported by the driver** (from `nvidia-smi`), so you can see at a glance whether they match:
|
|
@@ -274,7 +301,32 @@ aws-bootstrap terminate --yes
|
|
|
274
301
|
CUDA: 12.8 (driver supports up to 13.0)
|
|
275
302
|
```
|
|
276
303
|
|
|
277
|
-
SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances.
|
|
304
|
+
SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances. You can use aliases anywhere you'd use an instance ID, e.g. `aws-bootstrap terminate aws-gpu1`.
|
|
305
|
+
|
|
306
|
+
## EBS Data Volumes
|
|
307
|
+
|
|
308
|
+
Attach persistent EBS storage to keep datasets and model checkpoints across instance lifecycles. Volumes are mounted at `/data` and persist independently of the instance.
|
|
309
|
+
|
|
310
|
+
```bash
|
|
311
|
+
# Create a new 96 GB gp3 volume, formatted and mounted at /data
|
|
312
|
+
aws-bootstrap launch --ebs-storage 96
|
|
313
|
+
|
|
314
|
+
# After terminating with --keep-ebs, reattach the same volume to a new instance
|
|
315
|
+
aws-bootstrap terminate --keep-ebs
|
|
316
|
+
# Output: Preserving EBS volume: vol-0abc123...
|
|
317
|
+
# Reattach with: aws-bootstrap launch --ebs-volume-id vol-0abc123...
|
|
318
|
+
|
|
319
|
+
aws-bootstrap launch --ebs-volume-id vol-0abc123def456
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
Key behaviors:
|
|
323
|
+
- `--ebs-storage` and `--ebs-volume-id` are mutually exclusive
|
|
324
|
+
- New volumes are formatted as ext4; existing volumes are mounted as-is
|
|
325
|
+
- Volumes are tagged for automatic discovery by `status` and `terminate`
|
|
326
|
+
- `terminate` deletes data volumes by default; use `--keep-ebs` to preserve them
|
|
327
|
+
- **Spot-safe** — data volumes survive spot interruptions. If AWS reclaims your instance, the volume detaches automatically and can be reattached to a new instance with `--ebs-volume-id`
|
|
328
|
+
- EBS volumes must be in the same availability zone as the instance
|
|
329
|
+
- Mount failures are non-fatal — the instance remains usable
|
|
278
330
|
|
|
279
331
|
## EC2 vCPU Quotas
|
|
280
332
|
|
|
@@ -25,7 +25,8 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
|
|
|
25
25
|
| 📊 | **GPU benchmark included** | CNN (MNIST) + Transformer benchmarks with FP16/FP32/BF16 precision and tqdm progress |
|
|
26
26
|
| 📓 | **Jupyter ready** | Lab server auto-starts as a systemd service on port 8888 — just SSH tunnel and open |
|
|
27
27
|
| 🖥️ | **`status --gpu`** | Shows CUDA toolkit version, driver max, GPU architecture, spot pricing, uptime, and estimated cost |
|
|
28
|
-
|
|
|
28
|
+
| 💾 | **EBS data volumes** | Attach persistent storage at `/data` — survives spot interruptions (and `terminate --keep-ebs`), and can be reattached to new instances |
|
|
29
|
+
| 🗑️ | **Clean terminate** | Terminates instances, removes SSH aliases, deletes EBS data volumes (or preserves them with `--keep-ebs`) |
|
|
29
30
|
|
|
30
31
|
### 🎯 Target Workflows
|
|
31
32
|
|
|
@@ -113,16 +114,24 @@ aws-bootstrap launch --python-version 3.13
|
|
|
113
114
|
# Use a non-default SSH port
|
|
114
115
|
aws-bootstrap launch --ssh-port 2222
|
|
115
116
|
|
|
117
|
+
# Attach a persistent EBS data volume (96 GB gp3, mounted at /data)
|
|
118
|
+
aws-bootstrap launch --ebs-storage 96
|
|
119
|
+
|
|
120
|
+
# Reattach an existing EBS volume from a previous instance
|
|
121
|
+
aws-bootstrap launch --ebs-volume-id vol-0abc123def456
|
|
122
|
+
|
|
116
123
|
# Use a specific AWS profile
|
|
117
124
|
aws-bootstrap launch --profile my-aws-profile
|
|
118
125
|
```
|
|
119
126
|
|
|
120
127
|
After launch, the CLI:
|
|
121
128
|
|
|
122
|
-
1. **
|
|
123
|
-
2. **
|
|
124
|
-
3. **Runs
|
|
125
|
-
4. **
|
|
129
|
+
1. **Creates/attaches EBS volume** (if `--ebs-storage` or `--ebs-volume-id` was specified)
|
|
130
|
+
2. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
|
|
131
|
+
3. **Runs remote setup** — installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
|
|
132
|
+
4. **Mounts EBS volume** at `/data` (if applicable — formats new volumes, mounts existing ones as-is)
|
|
133
|
+
5. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
|
|
134
|
+
6. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate
|
|
126
135
|
|
|
127
136
|
```bash
|
|
128
137
|
ssh aws-gpu1 # venv auto-activates on login
|
|
@@ -135,7 +144,7 @@ The setup script runs automatically on the instance after SSH becomes available:
|
|
|
135
144
|
| Step | What |
|
|
136
145
|
|------|------|
|
|
137
146
|
| **GPU verify** | Confirms `nvidia-smi` and `nvcc` are working |
|
|
138
|
-
| **Utilities** | Installs `htop`, `tmux`, `tree`, `jq` |
|
|
147
|
+
| **Utilities** | Installs `htop`, `tmux`, `tree`, `jq`, `ffmpeg` |
|
|
139
148
|
| **Python venv** | Creates `~/venv` with `uv`, auto-activates in `~/.bashrc`. Use `--python-version` to pin a specific Python (e.g. `3.13`) |
|
|
140
149
|
| **CUDA-aware PyTorch** | Detects CUDA toolkit version → installs PyTorch from the matching `cu{TAG}` wheel index |
|
|
141
150
|
| **CUDA smoke test** | Runs `torch.cuda.is_available()` + GPU matmul to verify the stack |
|
|
@@ -242,11 +251,29 @@ aws-bootstrap status --region us-east-1
|
|
|
242
251
|
# Terminate all aws-bootstrap instances (with confirmation prompt)
|
|
243
252
|
aws-bootstrap terminate
|
|
244
253
|
|
|
245
|
-
# Terminate
|
|
246
|
-
aws-bootstrap terminate
|
|
254
|
+
# Terminate but preserve EBS data volumes for reuse
|
|
255
|
+
aws-bootstrap terminate --keep-ebs
|
|
256
|
+
|
|
257
|
+
# Terminate by SSH alias (resolved via ~/.ssh/config)
|
|
258
|
+
aws-bootstrap terminate aws-gpu1
|
|
259
|
+
|
|
260
|
+
# Terminate by instance ID
|
|
261
|
+
aws-bootstrap terminate i-abc123
|
|
262
|
+
|
|
263
|
+
# Mix aliases and instance IDs
|
|
264
|
+
aws-bootstrap terminate aws-gpu1 i-def456
|
|
247
265
|
|
|
248
266
|
# Skip confirmation prompt
|
|
249
267
|
aws-bootstrap terminate --yes
|
|
268
|
+
|
|
269
|
+
# Remove stale SSH config entries for terminated instances
|
|
270
|
+
aws-bootstrap cleanup
|
|
271
|
+
|
|
272
|
+
# Preview what would be removed without modifying config
|
|
273
|
+
aws-bootstrap cleanup --dry-run
|
|
274
|
+
|
|
275
|
+
# Skip confirmation prompt
|
|
276
|
+
aws-bootstrap cleanup --yes
|
|
250
277
|
```
|
|
251
278
|
|
|
252
279
|
`status --gpu` reports both the **installed CUDA toolkit** version (from `nvcc`) and the **maximum CUDA version supported by the driver** (from `nvidia-smi`), so you can see at a glance whether they match:
|
|
@@ -255,7 +282,32 @@ aws-bootstrap terminate --yes
|
|
|
255
282
|
CUDA: 12.8 (driver supports up to 13.0)
|
|
256
283
|
```
|
|
257
284
|
|
|
258
|
-
SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances.
|
|
285
|
+
SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances. You can use aliases anywhere you'd use an instance ID, e.g. `aws-bootstrap terminate aws-gpu1`.
|
|
286
|
+
|
|
287
|
+
## EBS Data Volumes
|
|
288
|
+
|
|
289
|
+
Attach persistent EBS storage to keep datasets and model checkpoints across instance lifecycles. Volumes are mounted at `/data` and persist independently of the instance.
|
|
290
|
+
|
|
291
|
+
```bash
|
|
292
|
+
# Create a new 96 GB gp3 volume, formatted and mounted at /data
|
|
293
|
+
aws-bootstrap launch --ebs-storage 96
|
|
294
|
+
|
|
295
|
+
# After terminating with --keep-ebs, reattach the same volume to a new instance
|
|
296
|
+
aws-bootstrap terminate --keep-ebs
|
|
297
|
+
# Output: Preserving EBS volume: vol-0abc123...
|
|
298
|
+
# Reattach with: aws-bootstrap launch --ebs-volume-id vol-0abc123...
|
|
299
|
+
|
|
300
|
+
aws-bootstrap launch --ebs-volume-id vol-0abc123def456
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
Key behaviors:
|
|
304
|
+
- `--ebs-storage` and `--ebs-volume-id` are mutually exclusive
|
|
305
|
+
- New volumes are formatted as ext4; existing volumes are mounted as-is
|
|
306
|
+
- Volumes are tagged for automatic discovery by `status` and `terminate`
|
|
307
|
+
- `terminate` deletes data volumes by default; use `--keep-ebs` to preserve them
|
|
308
|
+
- **Spot-safe** — data volumes survive spot interruptions. If AWS reclaims your instance, the volume detaches automatically and can be reattached to a new instance with `--ebs-volume-id`
|
|
309
|
+
- EBS volumes must be in the same availability zone as the instance
|
|
310
|
+
- Mount failures are non-fatal — the instance remains usable
|
|
259
311
|
|
|
260
312
|
## EC2 vCPU Quotas
|
|
261
313
|
|
|
@@ -10,8 +10,13 @@ import click
|
|
|
10
10
|
|
|
11
11
|
from .config import LaunchConfig
|
|
12
12
|
from .ec2 import (
|
|
13
|
+
EBS_MOUNT_POINT,
|
|
13
14
|
CLIError,
|
|
15
|
+
attach_ebs_volume,
|
|
16
|
+
create_ebs_volume,
|
|
17
|
+
delete_ebs_volume,
|
|
14
18
|
ensure_security_group,
|
|
19
|
+
find_ebs_volumes_for_instance,
|
|
15
20
|
find_tagged_instances,
|
|
16
21
|
get_latest_ami,
|
|
17
22
|
get_spot_price,
|
|
@@ -19,16 +24,21 @@ from .ec2 import (
|
|
|
19
24
|
list_amis,
|
|
20
25
|
list_instance_types,
|
|
21
26
|
terminate_tagged_instances,
|
|
27
|
+
validate_ebs_volume,
|
|
22
28
|
wait_instance_ready,
|
|
23
29
|
)
|
|
24
30
|
from .ssh import (
|
|
25
31
|
add_ssh_host,
|
|
32
|
+
cleanup_stale_ssh_hosts,
|
|
33
|
+
find_stale_ssh_hosts,
|
|
26
34
|
get_ssh_host_details,
|
|
27
35
|
import_key_pair,
|
|
28
36
|
list_ssh_hosts,
|
|
37
|
+
mount_ebs_volume,
|
|
29
38
|
private_key_path,
|
|
30
39
|
query_gpu_info,
|
|
31
40
|
remove_ssh_host,
|
|
41
|
+
resolve_instance_id,
|
|
32
42
|
run_remote_setup,
|
|
33
43
|
wait_for_ssh,
|
|
34
44
|
)
|
|
@@ -119,6 +129,18 @@ def main():
|
|
|
119
129
|
help="Python version for the remote venv (e.g. 3.13, 3.14.2). Passed to uv during setup.",
|
|
120
130
|
)
|
|
121
131
|
@click.option("--ssh-port", default=22, show_default=True, type=int, help="SSH port on the remote instance.")
|
|
132
|
+
@click.option(
|
|
133
|
+
"--ebs-storage",
|
|
134
|
+
default=None,
|
|
135
|
+
type=int,
|
|
136
|
+
help="Create and attach a new EBS data volume (size in GB, gp3). Mounted at /data.",
|
|
137
|
+
)
|
|
138
|
+
@click.option(
|
|
139
|
+
"--ebs-volume-id",
|
|
140
|
+
default=None,
|
|
141
|
+
type=str,
|
|
142
|
+
help="Attach an existing EBS volume by ID (e.g. vol-0abc123). Mounted at /data.",
|
|
143
|
+
)
|
|
122
144
|
def launch(
|
|
123
145
|
instance_type,
|
|
124
146
|
ami_filter,
|
|
@@ -133,8 +155,13 @@ def launch(
|
|
|
133
155
|
profile,
|
|
134
156
|
python_version,
|
|
135
157
|
ssh_port,
|
|
158
|
+
ebs_storage,
|
|
159
|
+
ebs_volume_id,
|
|
136
160
|
):
|
|
137
161
|
"""Launch a GPU-accelerated EC2 instance."""
|
|
162
|
+
if ebs_storage is not None and ebs_volume_id is not None:
|
|
163
|
+
raise CLIError("--ebs-storage and --ebs-volume-id are mutually exclusive.")
|
|
164
|
+
|
|
138
165
|
config = LaunchConfig(
|
|
139
166
|
instance_type=instance_type,
|
|
140
167
|
spot=spot,
|
|
@@ -147,6 +174,8 @@ def launch(
|
|
|
147
174
|
dry_run=dry_run,
|
|
148
175
|
ssh_port=ssh_port,
|
|
149
176
|
python_version=python_version,
|
|
177
|
+
ebs_storage=ebs_storage,
|
|
178
|
+
ebs_volume_id=ebs_volume_id,
|
|
150
179
|
)
|
|
151
180
|
if ami_filter:
|
|
152
181
|
config.ami_filter = ami_filter
|
|
@@ -161,18 +190,21 @@ def launch(
|
|
|
161
190
|
session = boto3.Session(profile_name=config.profile, region_name=config.region)
|
|
162
191
|
ec2 = session.client("ec2")
|
|
163
192
|
|
|
193
|
+
has_ebs = config.ebs_storage is not None or config.ebs_volume_id is not None
|
|
194
|
+
total_steps = 7 if has_ebs else 6
|
|
195
|
+
|
|
164
196
|
# Step 1: AMI lookup
|
|
165
|
-
step(1,
|
|
197
|
+
step(1, total_steps, "Looking up AMI...")
|
|
166
198
|
ami = get_latest_ami(ec2, config.ami_filter)
|
|
167
199
|
info(f"Found: {ami['Name']}")
|
|
168
200
|
val("AMI ID", ami["ImageId"])
|
|
169
201
|
|
|
170
202
|
# Step 2: SSH key pair
|
|
171
|
-
step(2,
|
|
203
|
+
step(2, total_steps, "Importing SSH key pair...")
|
|
172
204
|
import_key_pair(ec2, config.key_name, config.key_path)
|
|
173
205
|
|
|
174
206
|
# Step 3: Security group
|
|
175
|
-
step(3,
|
|
207
|
+
step(3, total_steps, "Ensuring security group...")
|
|
176
208
|
sg_id = ensure_security_group(ec2, config.security_group, config.tag_value, ssh_port=config.ssh_port)
|
|
177
209
|
|
|
178
210
|
pricing = "spot" if config.spot else "on-demand"
|
|
@@ -192,18 +224,22 @@ def launch(
|
|
|
192
224
|
val("SSH port", str(config.ssh_port))
|
|
193
225
|
if config.python_version:
|
|
194
226
|
val("Python version", config.python_version)
|
|
227
|
+
if config.ebs_storage:
|
|
228
|
+
val("EBS data volume", f"{config.ebs_storage} GB gp3 (new, mounted at {EBS_MOUNT_POINT})")
|
|
229
|
+
if config.ebs_volume_id:
|
|
230
|
+
val("EBS data volume", f"{config.ebs_volume_id} (existing, mounted at {EBS_MOUNT_POINT})")
|
|
195
231
|
click.echo()
|
|
196
232
|
click.secho("No resources launched (dry-run mode).", fg="yellow")
|
|
197
233
|
return
|
|
198
234
|
|
|
199
235
|
# Step 4: Launch instance
|
|
200
|
-
step(4,
|
|
236
|
+
step(4, total_steps, f"Launching {config.instance_type} instance ({pricing})...")
|
|
201
237
|
instance = launch_instance(ec2, config, ami["ImageId"], sg_id)
|
|
202
238
|
instance_id = instance["InstanceId"]
|
|
203
239
|
val("Instance ID", instance_id)
|
|
204
240
|
|
|
205
241
|
# Step 5: Wait for ready
|
|
206
|
-
step(5,
|
|
242
|
+
step(5, total_steps, "Waiting for instance to be ready...")
|
|
207
243
|
instance = wait_instance_ready(ec2, instance_id)
|
|
208
244
|
public_ip = instance.get("PublicIpAddress")
|
|
209
245
|
if not public_ip:
|
|
@@ -212,9 +248,39 @@ def launch(
|
|
|
212
248
|
return
|
|
213
249
|
|
|
214
250
|
val("Public IP", public_ip)
|
|
251
|
+
az = instance["Placement"]["AvailabilityZone"]
|
|
252
|
+
|
|
253
|
+
# Step 5.5 (optional): EBS data volume
|
|
254
|
+
ebs_volume_attached = None
|
|
255
|
+
ebs_format = False
|
|
256
|
+
if has_ebs:
|
|
257
|
+
step(6, total_steps, "Setting up EBS data volume...")
|
|
258
|
+
if config.ebs_storage:
|
|
259
|
+
info(f"Creating {config.ebs_storage} GB gp3 volume in {az}...")
|
|
260
|
+
ebs_volume_attached = create_ebs_volume(ec2, config.ebs_storage, az, config.tag_value, instance_id)
|
|
261
|
+
val("Volume ID", ebs_volume_attached)
|
|
262
|
+
ebs_format = True
|
|
263
|
+
elif config.ebs_volume_id:
|
|
264
|
+
info(f"Validating volume {config.ebs_volume_id}...")
|
|
265
|
+
validate_ebs_volume(ec2, config.ebs_volume_id, az)
|
|
266
|
+
ebs_volume_attached = config.ebs_volume_id
|
|
267
|
+
# Tag the existing volume for discovery
|
|
268
|
+
ec2.create_tags(
|
|
269
|
+
Resources=[ebs_volume_attached],
|
|
270
|
+
Tags=[
|
|
271
|
+
{"Key": "aws-bootstrap-instance", "Value": instance_id},
|
|
272
|
+
{"Key": "created-by", "Value": config.tag_value},
|
|
273
|
+
],
|
|
274
|
+
)
|
|
275
|
+
ebs_format = False
|
|
215
276
|
|
|
216
|
-
|
|
217
|
-
|
|
277
|
+
info(f"Attaching {ebs_volume_attached} to {instance_id}...")
|
|
278
|
+
attach_ebs_volume(ec2, ebs_volume_attached, instance_id)
|
|
279
|
+
success("EBS volume attached.")
|
|
280
|
+
|
|
281
|
+
# SSH and remote setup step
|
|
282
|
+
ssh_step = 7 if has_ebs else 6
|
|
283
|
+
step(ssh_step, total_steps, "Waiting for SSH access...")
|
|
218
284
|
private_key = private_key_path(config.key_path)
|
|
219
285
|
if not wait_for_ssh(public_ip, config.ssh_user, config.key_path, port=config.ssh_port):
|
|
220
286
|
warn("SSH did not become available within the timeout.")
|
|
@@ -237,6 +303,22 @@ def launch(
|
|
|
237
303
|
else:
|
|
238
304
|
warn("Remote setup failed. Instance is still running.")
|
|
239
305
|
|
|
306
|
+
# Mount EBS volume via SSH (after setup so the instance is fully ready)
|
|
307
|
+
if ebs_volume_attached:
|
|
308
|
+
info(f"Mounting EBS volume at {EBS_MOUNT_POINT}...")
|
|
309
|
+
if mount_ebs_volume(
|
|
310
|
+
public_ip,
|
|
311
|
+
config.ssh_user,
|
|
312
|
+
config.key_path,
|
|
313
|
+
ebs_volume_attached,
|
|
314
|
+
mount_point=EBS_MOUNT_POINT,
|
|
315
|
+
format_volume=ebs_format,
|
|
316
|
+
port=config.ssh_port,
|
|
317
|
+
):
|
|
318
|
+
success(f"EBS volume mounted at {EBS_MOUNT_POINT}.")
|
|
319
|
+
else:
|
|
320
|
+
warn(f"Failed to mount EBS volume at {EBS_MOUNT_POINT}. You may need to mount it manually.")
|
|
321
|
+
|
|
240
322
|
# Add SSH config alias
|
|
241
323
|
alias = add_ssh_host(
|
|
242
324
|
instance_id=instance_id,
|
|
@@ -259,6 +341,12 @@ def launch(
|
|
|
259
341
|
val("Instance", config.instance_type)
|
|
260
342
|
val("Pricing", pricing)
|
|
261
343
|
val("SSH alias", alias)
|
|
344
|
+
if ebs_volume_attached:
|
|
345
|
+
if config.ebs_storage:
|
|
346
|
+
ebs_label = f"{ebs_volume_attached} ({config.ebs_storage} GB, {EBS_MOUNT_POINT})"
|
|
347
|
+
else:
|
|
348
|
+
ebs_label = f"{ebs_volume_attached} ({EBS_MOUNT_POINT})"
|
|
349
|
+
val("EBS data volume", ebs_label)
|
|
262
350
|
|
|
263
351
|
port_flag = f" -p {config.ssh_port}" if config.ssh_port != 22 else ""
|
|
264
352
|
|
|
@@ -288,7 +376,7 @@ def launch(
|
|
|
288
376
|
|
|
289
377
|
click.echo()
|
|
290
378
|
click.secho(" Terminate:", fg="cyan")
|
|
291
|
-
click.secho(f" aws-bootstrap terminate {
|
|
379
|
+
click.secho(f" aws-bootstrap terminate {alias} --region {config.region}", bold=True)
|
|
292
380
|
click.echo()
|
|
293
381
|
|
|
294
382
|
|
|
@@ -370,6 +458,12 @@ def status(region, profile, gpu, instructions):
|
|
|
370
458
|
else:
|
|
371
459
|
click.echo(" GPU: " + click.style("unavailable", dim=True))
|
|
372
460
|
|
|
461
|
+
# EBS data volumes
|
|
462
|
+
ebs_volumes = find_ebs_volumes_for_instance(ec2, inst["InstanceId"], "aws-bootstrap-g4dn")
|
|
463
|
+
for vol in ebs_volumes:
|
|
464
|
+
vol_state = f", {vol['State']}" if vol["State"] != "in-use" else ""
|
|
465
|
+
val(" EBS", f"{vol['VolumeId']} ({vol['Size']} GB, {EBS_MOUNT_POINT}{vol_state})")
|
|
466
|
+
|
|
373
467
|
lifecycle = inst["Lifecycle"]
|
|
374
468
|
is_spot = lifecycle == "spot"
|
|
375
469
|
|
|
@@ -419,7 +513,8 @@ def status(region, profile, gpu, instructions):
|
|
|
419
513
|
|
|
420
514
|
click.echo()
|
|
421
515
|
first_id = instances[0]["InstanceId"]
|
|
422
|
-
|
|
516
|
+
first_ref = ssh_hosts.get(first_id, first_id)
|
|
517
|
+
click.echo(" To terminate: " + click.style(f"aws-bootstrap terminate {first_ref}", bold=True))
|
|
423
518
|
click.echo()
|
|
424
519
|
|
|
425
520
|
|
|
@@ -427,18 +522,29 @@ def status(region, profile, gpu, instructions):
|
|
|
427
522
|
@click.option("--region", default="us-west-2", show_default=True, help="AWS region.")
|
|
428
523
|
@click.option("--profile", default=None, help="AWS profile override.")
|
|
429
524
|
@click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.")
|
|
430
|
-
@click.
|
|
431
|
-
|
|
525
|
+
@click.option("--keep-ebs", is_flag=True, default=False, help="Preserve EBS data volumes instead of deleting them.")
|
|
526
|
+
@click.argument("instance_ids", nargs=-1, metavar="[INSTANCE_ID_OR_ALIAS]...")
|
|
527
|
+
def terminate(region, profile, yes, keep_ebs, instance_ids):
|
|
432
528
|
"""Terminate instances created by aws-bootstrap.
|
|
433
529
|
|
|
434
|
-
Pass specific instance IDs
|
|
435
|
-
aws-bootstrap instances in the region.
|
|
530
|
+
Pass specific instance IDs or SSH aliases (e.g. aws-gpu1) to terminate,
|
|
531
|
+
or omit to terminate all aws-bootstrap instances in the region.
|
|
436
532
|
"""
|
|
437
533
|
session = boto3.Session(profile_name=profile, region_name=region)
|
|
438
534
|
ec2 = session.client("ec2")
|
|
439
535
|
|
|
440
536
|
if instance_ids:
|
|
441
|
-
targets =
|
|
537
|
+
targets = []
|
|
538
|
+
for value in instance_ids:
|
|
539
|
+
resolved = resolve_instance_id(value)
|
|
540
|
+
if resolved is None:
|
|
541
|
+
raise CLIError(
|
|
542
|
+
f"Could not resolve '{value}' to an instance ID.\n\n"
|
|
543
|
+
" It is not a valid instance ID or a known SSH alias."
|
|
544
|
+
)
|
|
545
|
+
if resolved != value:
|
|
546
|
+
info(f"Resolved alias '{value}' -> {resolved}")
|
|
547
|
+
targets.append(resolved)
|
|
442
548
|
else:
|
|
443
549
|
instances = find_tagged_instances(ec2, "aws-bootstrap-g4dn")
|
|
444
550
|
if not instances:
|
|
@@ -456,6 +562,13 @@ def terminate(region, profile, yes, instance_ids):
|
|
|
456
562
|
click.secho(" Cancelled.", fg="yellow")
|
|
457
563
|
return
|
|
458
564
|
|
|
565
|
+
# Discover EBS volumes before termination (while instances still exist)
|
|
566
|
+
ebs_by_instance: dict[str, list[dict]] = {}
|
|
567
|
+
for target in targets:
|
|
568
|
+
volumes = find_ebs_volumes_for_instance(ec2, target, "aws-bootstrap-g4dn")
|
|
569
|
+
if volumes:
|
|
570
|
+
ebs_by_instance[target] = volumes
|
|
571
|
+
|
|
459
572
|
changes = terminate_tagged_instances(ec2, targets)
|
|
460
573
|
click.echo()
|
|
461
574
|
for change in changes:
|
|
@@ -467,10 +580,73 @@ def terminate(region, profile, yes, instance_ids):
|
|
|
467
580
|
removed_alias = remove_ssh_host(change["InstanceId"])
|
|
468
581
|
if removed_alias:
|
|
469
582
|
info(f"Removed SSH config alias: {removed_alias}")
|
|
583
|
+
|
|
584
|
+
# Handle EBS volume cleanup
|
|
585
|
+
for _iid, volumes in ebs_by_instance.items():
|
|
586
|
+
for vol in volumes:
|
|
587
|
+
vid = vol["VolumeId"]
|
|
588
|
+
if keep_ebs:
|
|
589
|
+
click.echo()
|
|
590
|
+
info(f"Preserving EBS volume: {vid} ({vol['Size']} GB)")
|
|
591
|
+
info(f"Reattach with: aws-bootstrap launch --ebs-volume-id {vid}")
|
|
592
|
+
else:
|
|
593
|
+
click.echo()
|
|
594
|
+
info(f"Waiting for EBS volume {vid} to detach...")
|
|
595
|
+
try:
|
|
596
|
+
waiter = ec2.get_waiter("volume_available")
|
|
597
|
+
waiter.wait(VolumeIds=[vid], WaiterConfig={"Delay": 10, "MaxAttempts": 30})
|
|
598
|
+
delete_ebs_volume(ec2, vid)
|
|
599
|
+
success(f"Deleted EBS volume: {vid}")
|
|
600
|
+
except Exception as e:
|
|
601
|
+
warn(f"Failed to delete EBS volume {vid}: {e}")
|
|
602
|
+
|
|
470
603
|
click.echo()
|
|
471
604
|
success(f"Terminated {len(changes)} instance(s).")
|
|
472
605
|
|
|
473
606
|
|
|
607
|
+
@main.command()
|
|
608
|
+
@click.option("--dry-run", is_flag=True, default=False, help="Show what would be removed without removing.")
|
|
609
|
+
@click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.")
|
|
610
|
+
@click.option("--region", default="us-west-2", show_default=True, help="AWS region.")
|
|
611
|
+
@click.option("--profile", default=None, help="AWS profile override.")
|
|
612
|
+
def cleanup(dry_run, yes, region, profile):
|
|
613
|
+
"""Remove stale SSH config entries for terminated instances."""
|
|
614
|
+
session = boto3.Session(profile_name=profile, region_name=region)
|
|
615
|
+
ec2 = session.client("ec2")
|
|
616
|
+
|
|
617
|
+
live_instances = find_tagged_instances(ec2, "aws-bootstrap-g4dn")
|
|
618
|
+
live_ids = {inst["InstanceId"] for inst in live_instances}
|
|
619
|
+
|
|
620
|
+
stale = find_stale_ssh_hosts(live_ids)
|
|
621
|
+
if not stale:
|
|
622
|
+
click.secho("No stale SSH config entries found.", fg="green")
|
|
623
|
+
return
|
|
624
|
+
|
|
625
|
+
click.secho(f"\n Found {len(stale)} stale SSH config entry(ies):\n", bold=True, fg="cyan")
|
|
626
|
+
for iid, alias in stale:
|
|
627
|
+
click.echo(" " + click.style(alias, fg="bright_white") + f" ({iid})")
|
|
628
|
+
|
|
629
|
+
if dry_run:
|
|
630
|
+
click.echo()
|
|
631
|
+
for iid, alias in stale:
|
|
632
|
+
info(f"Would remove {alias} ({iid})")
|
|
633
|
+
return
|
|
634
|
+
|
|
635
|
+
if not yes:
|
|
636
|
+
click.echo()
|
|
637
|
+
if not click.confirm(f" Remove {len(stale)} stale entry(ies)?"):
|
|
638
|
+
click.secho(" Cancelled.", fg="yellow")
|
|
639
|
+
return
|
|
640
|
+
|
|
641
|
+
results = cleanup_stale_ssh_hosts(live_ids)
|
|
642
|
+
click.echo()
|
|
643
|
+
for r in results:
|
|
644
|
+
success(f"Removed {r.alias} ({r.instance_id})")
|
|
645
|
+
|
|
646
|
+
click.echo()
|
|
647
|
+
success(f"Cleaned up {len(results)} stale entry(ies).")
|
|
648
|
+
|
|
649
|
+
|
|
474
650
|
# ---------------------------------------------------------------------------
|
|
475
651
|
# list command group
|
|
476
652
|
# ---------------------------------------------------------------------------
|