aws-bootstrap-g4dn 0.5.0__tar.gz → 0.6.0__tar.gz
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/CLAUDE.md +36 -5
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/PKG-INFO +53 -7
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/README.md +52 -6
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/cli.py +172 -8
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/config.py +2 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/ec2.py +128 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/remote_setup.sh +2 -2
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/ssh.py +121 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_cli.py +372 -4
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_config.py +18 -0
- aws_bootstrap_g4dn-0.6.0/aws_bootstrap/tests/test_ebs.py +245 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_ssh_config.py +76 -0
- aws_bootstrap_g4dn-0.6.0/aws_bootstrap/tests/test_ssh_ebs.py +76 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/PKG-INFO +53 -7
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/SOURCES.txt +2 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/.github/workflows/ci.yml +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/.github/workflows/publish-to-pypi.yml +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/.gitignore +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/.pre-commit-config.yaml +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/CODE_OF_CONDUCT.md +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/CONTRIBUTING.md +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/LICENSE +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/SECURITY.md +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/__init__.py +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/gpu.py +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/__init__.py +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/gpu_benchmark.py +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/gpu_smoke_test.ipynb +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/launch.json +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/requirements.txt +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/saxpy.cu +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/resources/tasks.json +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/__init__.py +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_ec2.py +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_gpu.py +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/tests/test_ssh_gpu.py +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/dependency_links.txt +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/entry_points.txt +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/requires.txt +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap_g4dn.egg-info/top_level.txt +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/docs/nsight-remote-profiling.md +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/pyproject.toml +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/setup.cfg +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/uv.lock +0 -0
{aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/CLAUDE.md

@@ -32,9 +32,9 @@ aws_bootstrap/
     __init__.py          # Package init
     cli.py               # Click CLI entry point (launch, status, terminate commands)
     config.py            # LaunchConfig dataclass with defaults
-    ec2.py               # AMI lookup, security group, instance launch/find/terminate, polling, spot pricing
+    ec2.py               # AMI lookup, security group, instance launch/find/terminate, polling, spot pricing, EBS volume ops
     gpu.py               # GPU architecture mapping and GpuInfo dataclass
-    ssh.py               # SSH key pair import, SSH readiness check, remote setup, ~/.ssh/config management, GPU queries
+    ssh.py               # SSH key pair import, SSH readiness check, remote setup, ~/.ssh/config management, GPU queries, EBS mount
     resources/           # Non-Python artifacts SCP'd to remote instances
         __init__.py
         gpu_benchmark.py # GPU throughput benchmark (CNN + Transformer), copied to ~/gpu_benchmark.py on instance
@@ -51,6 +51,8 @@ aws_bootstrap/
         test_gpu.py
         test_ssh_config.py
         test_ssh_gpu.py
+        test_ebs.py
+        test_ssh_ebs.py
 docs/
     nsight-remote-profiling.md   # Nsight Compute, Nsight Systems, and Nsight VSCE remote profiling guide
     spot-request-lifecycle.md    # Research notes on spot request cleanup
@@ -60,9 +62,10 @@ Entry point: `aws-bootstrap = "aws_bootstrap.cli:main"` (installed via `uv sync`

 ## CLI Commands

-- **`launch`** — provisions an EC2 instance (spot by default, falls back to on-demand on capacity errors); adds SSH config alias (e.g. `aws-gpu1`) to `~/.ssh/config`; `--python-version` controls which Python `uv` installs in the remote venv; `--ssh-port` overrides the default SSH port (22) for security group ingress, connection checks, and SSH config
-- **`status`** — lists all non-terminated instances (including `shutting-down`) with type, IP, SSH alias, pricing (spot price/hr or on-demand), uptime, and estimated cost for running spot instances; `--gpu` flag queries GPU info via SSH, reporting both CUDA toolkit version (from `nvcc`) and driver-supported max (from `nvidia-smi`); `--instructions` (default: on) prints connection commands (SSH, Jupyter tunnel, VSCode Remote SSH, GPU benchmark) for each running instance; suppress with `--no-instructions`
-- **`terminate`** — terminates instances by ID or SSH alias (e.g. `aws-gpu1`, resolved via `~/.ssh/config`), or all aws-bootstrap instances in the region if no arguments given; removes SSH config aliases
+- **`launch`** — provisions an EC2 instance (spot by default, falls back to on-demand on capacity errors); adds SSH config alias (e.g. `aws-gpu1`) to `~/.ssh/config`; `--python-version` controls which Python `uv` installs in the remote venv; `--ssh-port` overrides the default SSH port (22) for security group ingress, connection checks, and SSH config; `--ebs-storage SIZE` creates and attaches a new gp3 EBS data volume (mounted at `/data`); `--ebs-volume-id ID` attaches an existing EBS volume (mutually exclusive with `--ebs-storage`)
+- **`status`** — lists all non-terminated instances (including `shutting-down`) with type, IP, SSH alias, EBS data volumes, pricing (spot price/hr or on-demand), uptime, and estimated cost for running spot instances; `--gpu` flag queries GPU info via SSH, reporting both CUDA toolkit version (from `nvcc`) and driver-supported max (from `nvidia-smi`); `--instructions` (default: on) prints connection commands (SSH, Jupyter tunnel, VSCode Remote SSH, GPU benchmark) for each running instance; suppress with `--no-instructions`
+- **`terminate`** — terminates instances by ID or SSH alias (e.g. `aws-gpu1`, resolved via `~/.ssh/config`), or all aws-bootstrap instances in the region if no arguments given; removes SSH config aliases; deletes associated EBS data volumes by default; `--keep-ebs` preserves volumes and prints reattach commands
+- **`cleanup`** — removes stale `~/.ssh/config` entries for terminated/non-existent instances; compares managed SSH config blocks against live EC2 instances; `--dry-run` previews removals without modifying config; `--yes` skips the confirmation prompt
 - **`list instance-types`** — lists EC2 instance types matching a family prefix (default: `g4dn`), showing vCPUs, memory, and GPU info
 - **`list amis`** — lists available AMIs matching a name pattern (default: Deep Learning Base OSS Nvidia Driver GPU AMIs), sorted newest-first

@@ -112,6 +115,34 @@ The `KNOWN_CUDA_TAGS` array in `remote_setup.sh` lists the CUDA wheel tags publi

 `resources/gpu_benchmark.py` is uploaded to `~/gpu_benchmark.py` on the remote instance during setup. It benchmarks GPU throughput with two modes: CNN on MNIST and a GPT-style Transformer on synthetic data. It reports samples/sec, batch times, and peak GPU memory. Supports `--precision` (fp32/fp16/bf16/tf32), `--diagnose` for CUDA smoke tests, and separate `--transformer-batch-size` (default 32, T4-safe). Dependencies (`torch`, `torchvision`, `tqdm`) are already installed by the setup script.

+## EBS Data Volumes
+
+The `--ebs-storage` and `--ebs-volume-id` options on `launch` create or attach persistent gp3 EBS volumes mounted at `/data`. The implementation spans three modules:
+
+- **`ec2.py`** — Volume lifecycle: `create_ebs_volume`, `validate_ebs_volume`, `attach_ebs_volume`, `detach_ebs_volume`, `delete_ebs_volume`, `find_ebs_volumes_for_instance`. Constants `EBS_DEVICE_NAME` (`/dev/sdf`) and `EBS_MOUNT_POINT` (`/data`).
+- **`ssh.py`** — `mount_ebs_volume()` SSHs to the instance and runs a shell script that detects the device, optionally formats it, mounts it, and adds an fstab entry.
+- **`cli.py`** — Orchestrates the flow: create/validate → attach → wait for SSH → mount. Mount failures are non-fatal (warn and continue).
+
+### Tagging strategy
+
+Volumes are tagged for discovery by `status` and `terminate`:
+
+| Tag | Value | Purpose |
+|-----|-------|---------|
+| `created-by` | `aws-bootstrap-g4dn` | Standard tool-managed resource tag |
+| `Name` | `aws-bootstrap-data-{instance_id}` | Human-readable in AWS console |
+| `aws-bootstrap-instance` | `i-xxxxxxxxx` | Links volume to instance for `find_ebs_volumes_for_instance` |
+
+### NVMe device detection
+
+On Nitro instances (g4dn), `/dev/sdf` is remapped to `/dev/nvmeXn1`. The mount script detects the correct device by matching the volume ID serial number via `lsblk -o NAME,SERIAL -dpn`, with fallbacks to `/dev/nvme1n1`, `/dev/xvdf`, `/dev/sdf`.
+
+### Spot interruption and terminate cleanup
+
+Non-root EBS volumes attached via API have `DeleteOnTermination=False` by default. This means data volumes **survive spot interruptions** — when AWS reclaims the instance, the volume detaches and becomes `available`, preserving all data. The user can reattach it to a new instance with `--ebs-volume-id`.
+
+The `terminate` command discovers volumes via `find_ebs_volumes_for_instance`, waits for them to detach (becomes `available`), then deletes them. `--keep-ebs` skips deletion and prints the volume ID with a reattach command.
+
 ## Versioning & Publishing

 Version is derived automatically from git tags via **setuptools-scm** — no hardcoded version string in the codebase.
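The CLAUDE.md section added above names the `ec2.py` volume helpers and the tags they rely on for discovery. As a rough illustration of that create → tag → attach → discover flow (a sketch only — the package's actual helper signatures and error handling may differ), the boto3 calls could look like this:

```python
import boto3

# Constants named in the CLAUDE.md section above.
EBS_DEVICE_NAME = "/dev/sdf"
TAG_VALUE = "aws-bootstrap-g4dn"


def create_ebs_volume_sketch(ec2, size_gb: int, az: str, instance_id: str) -> str:
    """Create a gp3 data volume carrying the three discovery tags from the table above."""
    resp = ec2.create_volume(
        Size=size_gb,
        VolumeType="gp3",
        AvailabilityZone=az,
        TagSpecifications=[
            {
                "ResourceType": "volume",
                "Tags": [
                    {"Key": "created-by", "Value": TAG_VALUE},
                    {"Key": "Name", "Value": f"aws-bootstrap-data-{instance_id}"},
                    {"Key": "aws-bootstrap-instance", "Value": instance_id},
                ],
            }
        ],
    )
    volume_id = resp["VolumeId"]
    ec2.get_waiter("volume_available").wait(VolumeIds=[volume_id])
    return volume_id


def attach_ebs_volume_sketch(ec2, volume_id: str, instance_id: str) -> None:
    """Attach at /dev/sdf; on Nitro instances the OS exposes it as an NVMe device."""
    ec2.attach_volume(VolumeId=volume_id, InstanceId=instance_id, Device=EBS_DEVICE_NAME)
    ec2.get_waiter("volume_in_use").wait(VolumeIds=[volume_id])


def find_ebs_volumes_for_instance_sketch(ec2, instance_id: str) -> list[dict]:
    """Discover data volumes by the aws-bootstrap-instance tag, as status/terminate do."""
    resp = ec2.describe_volumes(
        Filters=[
            {"Name": "tag:aws-bootstrap-instance", "Values": [instance_id]},
            {"Name": "tag:created-by", "Values": [TAG_VALUE]},
        ]
    )
    return resp["Volumes"]
```

Tagging at creation time via `TagSpecifications` keeps the volume discoverable by `status` and `terminate` even if a later attach or mount step fails.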
{aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: aws-bootstrap-g4dn
-Version: 0.5.0
+Version: 0.6.0
 Summary: Bootstrap AWS EC2 GPU instances for hybrid local-remote development
 Author: Adam Ever-Hadani
 License-Expression: MIT
@@ -44,7 +44,8 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
 | 📊 | **GPU benchmark included** | CNN (MNIST) + Transformer benchmarks with FP16/FP32/BF16 precision and tqdm progress |
 | 📓 | **Jupyter ready** | Lab server auto-starts as a systemd service on port 8888 — just SSH tunnel and open |
 | 🖥️ | **`status --gpu`** | Shows CUDA toolkit version, driver max, GPU architecture, spot pricing, uptime, and estimated cost |
-
+| 💾 | **EBS data volumes** | Attach persistent storage at `/data` — survives spot interruptions and termination, reattach to new instances |
+| 🗑️ | **Clean terminate** | Stops instances, removes SSH aliases, cleans up EBS volumes (or preserves with `--keep-ebs`) |

 ### 🎯 Target Workflows

@@ -132,16 +133,24 @@ aws-bootstrap launch --python-version 3.13
 # Use a non-default SSH port
 aws-bootstrap launch --ssh-port 2222

+# Attach a persistent EBS data volume (96 GB gp3, mounted at /data)
+aws-bootstrap launch --ebs-storage 96
+
+# Reattach an existing EBS volume from a previous instance
+aws-bootstrap launch --ebs-volume-id vol-0abc123def456
+
 # Use a specific AWS profile
 aws-bootstrap launch --profile my-aws-profile
 ```

 After launch, the CLI:

-1. **
-2. **
-3. **Runs
-4. **
+1. **Creates/attaches EBS volume** (if `--ebs-storage` or `--ebs-volume-id` was specified)
+2. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
+3. **Runs remote setup** — installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
+4. **Mounts EBS volume** at `/data` (if applicable — formats new volumes, mounts existing ones as-is)
+5. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
+6. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate

 ```bash
 ssh aws-gpu1 # venv auto-activates on login
@@ -154,7 +163,7 @@ The setup script runs automatically on the instance after SSH becomes available:
 | Step | What |
 |------|------|
 | **GPU verify** | Confirms `nvidia-smi` and `nvcc` are working |
-| **Utilities** | Installs `htop`, `tmux`, `tree`, `jq` |
+| **Utilities** | Installs `htop`, `tmux`, `tree`, `jq`, `ffmpeg` |
 | **Python venv** | Creates `~/venv` with `uv`, auto-activates in `~/.bashrc`. Use `--python-version` to pin a specific Python (e.g. `3.13`) |
 | **CUDA-aware PyTorch** | Detects CUDA toolkit version → installs PyTorch from the matching `cu{TAG}` wheel index |
 | **CUDA smoke test** | Runs `torch.cuda.is_available()` + GPU matmul to verify the stack |
@@ -261,6 +270,9 @@ aws-bootstrap status --region us-east-1
 # Terminate all aws-bootstrap instances (with confirmation prompt)
 aws-bootstrap terminate

+# Terminate but preserve EBS data volumes for reuse
+aws-bootstrap terminate --keep-ebs
+
 # Terminate by SSH alias (resolved via ~/.ssh/config)
 aws-bootstrap terminate aws-gpu1

@@ -272,6 +284,15 @@ aws-bootstrap terminate aws-gpu1 i-def456

 # Skip confirmation prompt
 aws-bootstrap terminate --yes
+
+# Remove stale SSH config entries for terminated instances
+aws-bootstrap cleanup
+
+# Preview what would be removed without modifying config
+aws-bootstrap cleanup --dry-run
+
+# Skip confirmation prompt
+aws-bootstrap cleanup --yes
 ```

 `status --gpu` reports both the **installed CUDA toolkit** version (from `nvcc`) and the **maximum CUDA version supported by the driver** (from `nvidia-smi`), so you can see at a glance whether they match:
@@ -282,6 +303,31 @@ CUDA: 12.8 (driver supports up to 13.0)

 SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances. You can use aliases anywhere you'd use an instance ID, e.g. `aws-bootstrap terminate aws-gpu1`.

+## EBS Data Volumes
+
+Attach persistent EBS storage to keep datasets and model checkpoints across instance lifecycles. Volumes are mounted at `/data` and persist independently of the instance.
+
+```bash
+# Create a new 96 GB gp3 volume, formatted and mounted at /data
+aws-bootstrap launch --ebs-storage 96
+
+# After terminating with --keep-ebs, reattach the same volume to a new instance
+aws-bootstrap terminate --keep-ebs
+# Output: Preserving EBS volume: vol-0abc123...
+# Reattach with: aws-bootstrap launch --ebs-volume-id vol-0abc123...
+
+aws-bootstrap launch --ebs-volume-id vol-0abc123def456
+```
+
+Key behaviors:
+- `--ebs-storage` and `--ebs-volume-id` are mutually exclusive
+- New volumes are formatted as ext4; existing volumes are mounted as-is
+- Volumes are tagged for automatic discovery by `status` and `terminate`
+- `terminate` deletes data volumes by default; use `--keep-ebs` to preserve them
+- **Spot-safe** — data volumes survive spot interruptions. If AWS reclaims your instance, the volume detaches automatically and can be reattached to a new instance with `--ebs-volume-id`
+- EBS volumes must be in the same availability zone as the instance
+- Mount failures are non-fatal — the instance remains usable
+
 ## EC2 vCPU Quotas

 AWS accounts have [service quotas](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-resource-limits.html) that limit how many vCPUs you can run per instance family. New or lightly-used accounts often have a **default quota of 0 vCPUs** for GPU instance families (G and VT), which will cause errors on launch:
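The "Key behaviors" list added in the hunk above notes that an existing volume passed via `--ebs-volume-id` must be detached and must sit in the instance's availability zone. A small sketch of that pre-flight check (illustrative only — the package's own `validate_ebs_volume` may behave differently; the region is the CLI default and the AZ below is made up):

```python
import boto3


def check_volume_reusable(ec2, volume_id: str, instance_az: str) -> None:
    """Pre-flight check before `launch --ebs-volume-id`: the volume must exist,
    be 'available' (i.e. detached), and live in the instance's availability zone."""
    vol = ec2.describe_volumes(VolumeIds=[volume_id])["Volumes"][0]
    if vol["State"] != "available":
        raise RuntimeError(f"{volume_id} is '{vol['State']}', not available for attachment")
    if vol["AvailabilityZone"] != instance_az:
        raise RuntimeError(
            f"{volume_id} lives in {vol['AvailabilityZone']} but the instance is in "
            f"{instance_az}; EBS volumes can only attach within their own AZ"
        )


# Example: verify a preserved volume before reattaching it.
ec2 = boto3.Session(region_name="us-west-2").client("ec2")
check_volume_reusable(ec2, "vol-0abc123def456", "us-west-2a")
```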
{aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/README.md

@@ -25,7 +25,8 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
 | 📊 | **GPU benchmark included** | CNN (MNIST) + Transformer benchmarks with FP16/FP32/BF16 precision and tqdm progress |
 | 📓 | **Jupyter ready** | Lab server auto-starts as a systemd service on port 8888 — just SSH tunnel and open |
 | 🖥️ | **`status --gpu`** | Shows CUDA toolkit version, driver max, GPU architecture, spot pricing, uptime, and estimated cost |
-
+| 💾 | **EBS data volumes** | Attach persistent storage at `/data` — survives spot interruptions and termination, reattach to new instances |
+| 🗑️ | **Clean terminate** | Stops instances, removes SSH aliases, cleans up EBS volumes (or preserves with `--keep-ebs`) |

 ### 🎯 Target Workflows

@@ -113,16 +114,24 @@ aws-bootstrap launch --python-version 3.13
 # Use a non-default SSH port
 aws-bootstrap launch --ssh-port 2222

+# Attach a persistent EBS data volume (96 GB gp3, mounted at /data)
+aws-bootstrap launch --ebs-storage 96
+
+# Reattach an existing EBS volume from a previous instance
+aws-bootstrap launch --ebs-volume-id vol-0abc123def456
+
 # Use a specific AWS profile
 aws-bootstrap launch --profile my-aws-profile
 ```

 After launch, the CLI:

-1. **
-2. **
-3. **Runs
-4. **
+1. **Creates/attaches EBS volume** (if `--ebs-storage` or `--ebs-volume-id` was specified)
+2. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
+3. **Runs remote setup** — installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
+4. **Mounts EBS volume** at `/data` (if applicable — formats new volumes, mounts existing ones as-is)
+5. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
+6. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate

 ```bash
 ssh aws-gpu1 # venv auto-activates on login
@@ -135,7 +144,7 @@ The setup script runs automatically on the instance after SSH becomes available:
 | Step | What |
 |------|------|
 | **GPU verify** | Confirms `nvidia-smi` and `nvcc` are working |
-| **Utilities** | Installs `htop`, `tmux`, `tree`, `jq` |
+| **Utilities** | Installs `htop`, `tmux`, `tree`, `jq`, `ffmpeg` |
 | **Python venv** | Creates `~/venv` with `uv`, auto-activates in `~/.bashrc`. Use `--python-version` to pin a specific Python (e.g. `3.13`) |
 | **CUDA-aware PyTorch** | Detects CUDA toolkit version → installs PyTorch from the matching `cu{TAG}` wheel index |
 | **CUDA smoke test** | Runs `torch.cuda.is_available()` + GPU matmul to verify the stack |
@@ -242,6 +251,9 @@ aws-bootstrap status --region us-east-1
 # Terminate all aws-bootstrap instances (with confirmation prompt)
 aws-bootstrap terminate

+# Terminate but preserve EBS data volumes for reuse
+aws-bootstrap terminate --keep-ebs
+
 # Terminate by SSH alias (resolved via ~/.ssh/config)
 aws-bootstrap terminate aws-gpu1

@@ -253,6 +265,15 @@ aws-bootstrap terminate aws-gpu1 i-def456

 # Skip confirmation prompt
 aws-bootstrap terminate --yes
+
+# Remove stale SSH config entries for terminated instances
+aws-bootstrap cleanup
+
+# Preview what would be removed without modifying config
+aws-bootstrap cleanup --dry-run
+
+# Skip confirmation prompt
+aws-bootstrap cleanup --yes
 ```

 `status --gpu` reports both the **installed CUDA toolkit** version (from `nvcc`) and the **maximum CUDA version supported by the driver** (from `nvidia-smi`), so you can see at a glance whether they match:
@@ -263,6 +284,31 @@ CUDA: 12.8 (driver supports up to 13.0)

 SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances. You can use aliases anywhere you'd use an instance ID, e.g. `aws-bootstrap terminate aws-gpu1`.

+## EBS Data Volumes
+
+Attach persistent EBS storage to keep datasets and model checkpoints across instance lifecycles. Volumes are mounted at `/data` and persist independently of the instance.
+
+```bash
+# Create a new 96 GB gp3 volume, formatted and mounted at /data
+aws-bootstrap launch --ebs-storage 96
+
+# After terminating with --keep-ebs, reattach the same volume to a new instance
+aws-bootstrap terminate --keep-ebs
+# Output: Preserving EBS volume: vol-0abc123...
+# Reattach with: aws-bootstrap launch --ebs-volume-id vol-0abc123...
+
+aws-bootstrap launch --ebs-volume-id vol-0abc123def456
+```
+
+Key behaviors:
+- `--ebs-storage` and `--ebs-volume-id` are mutually exclusive
+- New volumes are formatted as ext4; existing volumes are mounted as-is
+- Volumes are tagged for automatic discovery by `status` and `terminate`
+- `terminate` deletes data volumes by default; use `--keep-ebs` to preserve them
+- **Spot-safe** — data volumes survive spot interruptions. If AWS reclaims your instance, the volume detaches automatically and can be reattached to a new instance with `--ebs-volume-id`
+- EBS volumes must be in the same availability zone as the instance
+- Mount failures are non-fatal — the instance remains usable
+
 ## EC2 vCPU Quotas

 AWS accounts have [service quotas](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-resource-limits.html) that limit how many vCPUs you can run per instance family. New or lightly-used accounts often have a **default quota of 0 vCPUs** for GPU instance families (G and VT), which will cause errors on launch:
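The cli.py diff below calls `mount_ebs_volume(...)` after remote setup. The docs above describe what that step does: detect the NVMe device by matching the volume-ID serial, optionally format new volumes as ext4, mount at `/data`, and add an fstab entry. The following is a loose sketch of such a mount step, assuming a plain `ssh` subprocess and the device-detection approach described in CLAUDE.md — it is not the package's ssh.py code:

```python
import subprocess


def mount_ebs_volume_sketch(
    host: str,
    user: str,
    key_path: str,
    volume_id: str,
    mount_point: str = "/data",
    format_volume: bool = False,
    port: int = 22,
) -> bool:
    """Sketch of an SSH-driven mount step (not the package's implementation).

    On Nitro instances the attached /dev/sdf shows up as /dev/nvmeXn1, so the
    device is found by matching the volume ID (dash stripped) against lsblk's
    SERIAL column, with the fallbacks named in CLAUDE.md.
    """
    serial = volume_id.replace("-", "")  # e.g. vol-0abc123 -> vol0abc123
    format_cmd = 'sudo mkfs.ext4 -L data "$DEV"' if format_volume else "true  # existing volume, keep data"
    remote_script = f"""
set -e
DEV=$(lsblk -o NAME,SERIAL -dpn | awk -v s="{serial}" '$2 == s {{print $1}}')
for fallback in /dev/nvme1n1 /dev/xvdf /dev/sdf; do
    [ -n "$DEV" ] && break
    [ -b "$fallback" ] && DEV="$fallback"
done
{format_cmd}
sudo mkdir -p {mount_point}
sudo mount "$DEV" {mount_point}
sudo chown {user}:{user} {mount_point}
grep -q " {mount_point} " /etc/fstab || \\
    echo "$DEV {mount_point} ext4 defaults,nofail 0 2" | sudo tee -a /etc/fstab
"""
    result = subprocess.run(
        ["ssh", "-i", key_path, "-p", str(port), f"{user}@{host}", remote_script],
        capture_output=True,
        text=True,
    )
    return result.returncode == 0
```

The `nofail` option in the sketch's fstab entry keeps the instance bootable even if the data volume is ever absent at boot, which matters for spot instances that may come back without it.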
{aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.6.0}/aws_bootstrap/cli.py

@@ -10,8 +10,13 @@ import click

 from .config import LaunchConfig
 from .ec2 import (
+    EBS_MOUNT_POINT,
     CLIError,
+    attach_ebs_volume,
+    create_ebs_volume,
+    delete_ebs_volume,
     ensure_security_group,
+    find_ebs_volumes_for_instance,
     find_tagged_instances,
     get_latest_ami,
     get_spot_price,
@@ -19,13 +24,17 @@ from .ec2 import (
     list_amis,
     list_instance_types,
     terminate_tagged_instances,
+    validate_ebs_volume,
     wait_instance_ready,
 )
 from .ssh import (
     add_ssh_host,
+    cleanup_stale_ssh_hosts,
+    find_stale_ssh_hosts,
     get_ssh_host_details,
     import_key_pair,
     list_ssh_hosts,
+    mount_ebs_volume,
     private_key_path,
     query_gpu_info,
     remove_ssh_host,
@@ -120,6 +129,18 @@ def main():
     help="Python version for the remote venv (e.g. 3.13, 3.14.2). Passed to uv during setup.",
 )
 @click.option("--ssh-port", default=22, show_default=True, type=int, help="SSH port on the remote instance.")
+@click.option(
+    "--ebs-storage",
+    default=None,
+    type=int,
+    help="Create and attach a new EBS data volume (size in GB, gp3). Mounted at /data.",
+)
+@click.option(
+    "--ebs-volume-id",
+    default=None,
+    type=str,
+    help="Attach an existing EBS volume by ID (e.g. vol-0abc123). Mounted at /data.",
+)
 def launch(
     instance_type,
     ami_filter,
@@ -134,8 +155,13 @@ def launch(
     profile,
     python_version,
     ssh_port,
+    ebs_storage,
+    ebs_volume_id,
 ):
     """Launch a GPU-accelerated EC2 instance."""
+    if ebs_storage is not None and ebs_volume_id is not None:
+        raise CLIError("--ebs-storage and --ebs-volume-id are mutually exclusive.")
+
     config = LaunchConfig(
         instance_type=instance_type,
         spot=spot,
@@ -148,6 +174,8 @@ def launch(
         dry_run=dry_run,
         ssh_port=ssh_port,
         python_version=python_version,
+        ebs_storage=ebs_storage,
+        ebs_volume_id=ebs_volume_id,
     )
     if ami_filter:
         config.ami_filter = ami_filter
@@ -162,18 +190,21 @@ def launch(
     session = boto3.Session(profile_name=config.profile, region_name=config.region)
     ec2 = session.client("ec2")

+    has_ebs = config.ebs_storage is not None or config.ebs_volume_id is not None
+    total_steps = 7 if has_ebs else 6
+
     # Step 1: AMI lookup
-    step(1,
+    step(1, total_steps, "Looking up AMI...")
     ami = get_latest_ami(ec2, config.ami_filter)
     info(f"Found: {ami['Name']}")
     val("AMI ID", ami["ImageId"])

     # Step 2: SSH key pair
-    step(2,
+    step(2, total_steps, "Importing SSH key pair...")
     import_key_pair(ec2, config.key_name, config.key_path)

     # Step 3: Security group
-    step(3,
+    step(3, total_steps, "Ensuring security group...")
     sg_id = ensure_security_group(ec2, config.security_group, config.tag_value, ssh_port=config.ssh_port)

     pricing = "spot" if config.spot else "on-demand"
@@ -193,18 +224,22 @@ def launch(
         val("SSH port", str(config.ssh_port))
         if config.python_version:
             val("Python version", config.python_version)
+        if config.ebs_storage:
+            val("EBS data volume", f"{config.ebs_storage} GB gp3 (new, mounted at {EBS_MOUNT_POINT})")
+        if config.ebs_volume_id:
+            val("EBS data volume", f"{config.ebs_volume_id} (existing, mounted at {EBS_MOUNT_POINT})")
         click.echo()
         click.secho("No resources launched (dry-run mode).", fg="yellow")
         return

     # Step 4: Launch instance
-    step(4,
+    step(4, total_steps, f"Launching {config.instance_type} instance ({pricing})...")
     instance = launch_instance(ec2, config, ami["ImageId"], sg_id)
     instance_id = instance["InstanceId"]
     val("Instance ID", instance_id)

     # Step 5: Wait for ready
-    step(5,
+    step(5, total_steps, "Waiting for instance to be ready...")
     instance = wait_instance_ready(ec2, instance_id)
     public_ip = instance.get("PublicIpAddress")
     if not public_ip:
@@ -213,9 +248,39 @@ def launch(
         return

     val("Public IP", public_ip)
+    az = instance["Placement"]["AvailabilityZone"]
+
+    # Step 5.5 (optional): EBS data volume
+    ebs_volume_attached = None
+    ebs_format = False
+    if has_ebs:
+        step(6, total_steps, "Setting up EBS data volume...")
+        if config.ebs_storage:
+            info(f"Creating {config.ebs_storage} GB gp3 volume in {az}...")
+            ebs_volume_attached = create_ebs_volume(ec2, config.ebs_storage, az, config.tag_value, instance_id)
+            val("Volume ID", ebs_volume_attached)
+            ebs_format = True
+        elif config.ebs_volume_id:
+            info(f"Validating volume {config.ebs_volume_id}...")
+            validate_ebs_volume(ec2, config.ebs_volume_id, az)
+            ebs_volume_attached = config.ebs_volume_id
+            # Tag the existing volume for discovery
+            ec2.create_tags(
+                Resources=[ebs_volume_attached],
+                Tags=[
+                    {"Key": "aws-bootstrap-instance", "Value": instance_id},
+                    {"Key": "created-by", "Value": config.tag_value},
+                ],
+            )
+            ebs_format = False

-
-
+        info(f"Attaching {ebs_volume_attached} to {instance_id}...")
+        attach_ebs_volume(ec2, ebs_volume_attached, instance_id)
+        success("EBS volume attached.")
+
+    # SSH and remote setup step
+    ssh_step = 7 if has_ebs else 6
+    step(ssh_step, total_steps, "Waiting for SSH access...")
     private_key = private_key_path(config.key_path)
     if not wait_for_ssh(public_ip, config.ssh_user, config.key_path, port=config.ssh_port):
         warn("SSH did not become available within the timeout.")
@@ -238,6 +303,22 @@ def launch(
     else:
         warn("Remote setup failed. Instance is still running.")

+    # Mount EBS volume via SSH (after setup so the instance is fully ready)
+    if ebs_volume_attached:
+        info(f"Mounting EBS volume at {EBS_MOUNT_POINT}...")
+        if mount_ebs_volume(
+            public_ip,
+            config.ssh_user,
+            config.key_path,
+            ebs_volume_attached,
+            mount_point=EBS_MOUNT_POINT,
+            format_volume=ebs_format,
+            port=config.ssh_port,
+        ):
+            success(f"EBS volume mounted at {EBS_MOUNT_POINT}.")
+        else:
+            warn(f"Failed to mount EBS volume at {EBS_MOUNT_POINT}. You may need to mount it manually.")
+
     # Add SSH config alias
     alias = add_ssh_host(
         instance_id=instance_id,
@@ -260,6 +341,12 @@ def launch(
     val("Instance", config.instance_type)
     val("Pricing", pricing)
     val("SSH alias", alias)
+    if ebs_volume_attached:
+        if config.ebs_storage:
+            ebs_label = f"{ebs_volume_attached} ({config.ebs_storage} GB, {EBS_MOUNT_POINT})"
+        else:
+            ebs_label = f"{ebs_volume_attached} ({EBS_MOUNT_POINT})"
+        val("EBS data volume", ebs_label)

     port_flag = f" -p {config.ssh_port}" if config.ssh_port != 22 else ""

@@ -371,6 +458,12 @@ def status(region, profile, gpu, instructions):
         else:
             click.echo(" GPU: " + click.style("unavailable", dim=True))

+        # EBS data volumes
+        ebs_volumes = find_ebs_volumes_for_instance(ec2, inst["InstanceId"], "aws-bootstrap-g4dn")
+        for vol in ebs_volumes:
+            vol_state = f", {vol['State']}" if vol["State"] != "in-use" else ""
+            val(" EBS", f"{vol['VolumeId']} ({vol['Size']} GB, {EBS_MOUNT_POINT}{vol_state})")
+
         lifecycle = inst["Lifecycle"]
         is_spot = lifecycle == "spot"

@@ -429,8 +522,9 @@ def status(region, profile, gpu, instructions):
 @click.option("--region", default="us-west-2", show_default=True, help="AWS region.")
 @click.option("--profile", default=None, help="AWS profile override.")
 @click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.")
+@click.option("--keep-ebs", is_flag=True, default=False, help="Preserve EBS data volumes instead of deleting them.")
 @click.argument("instance_ids", nargs=-1, metavar="[INSTANCE_ID_OR_ALIAS]...")
-def terminate(region, profile, yes, instance_ids):
+def terminate(region, profile, yes, keep_ebs, instance_ids):
     """Terminate instances created by aws-bootstrap.

     Pass specific instance IDs or SSH aliases (e.g. aws-gpu1) to terminate,
@@ -468,6 +562,13 @@ def terminate(region, profile, yes, instance_ids):
             click.secho(" Cancelled.", fg="yellow")
             return

+    # Discover EBS volumes before termination (while instances still exist)
+    ebs_by_instance: dict[str, list[dict]] = {}
+    for target in targets:
+        volumes = find_ebs_volumes_for_instance(ec2, target, "aws-bootstrap-g4dn")
+        if volumes:
+            ebs_by_instance[target] = volumes
+
     changes = terminate_tagged_instances(ec2, targets)
     click.echo()
     for change in changes:
@@ -479,10 +580,73 @@ def terminate(region, profile, yes, instance_ids):
         removed_alias = remove_ssh_host(change["InstanceId"])
         if removed_alias:
             info(f"Removed SSH config alias: {removed_alias}")
+
+    # Handle EBS volume cleanup
+    for _iid, volumes in ebs_by_instance.items():
+        for vol in volumes:
+            vid = vol["VolumeId"]
+            if keep_ebs:
+                click.echo()
+                info(f"Preserving EBS volume: {vid} ({vol['Size']} GB)")
+                info(f"Reattach with: aws-bootstrap launch --ebs-volume-id {vid}")
+            else:
+                click.echo()
+                info(f"Waiting for EBS volume {vid} to detach...")
+                try:
+                    waiter = ec2.get_waiter("volume_available")
+                    waiter.wait(VolumeIds=[vid], WaiterConfig={"Delay": 10, "MaxAttempts": 30})
+                    delete_ebs_volume(ec2, vid)
+                    success(f"Deleted EBS volume: {vid}")
+                except Exception as e:
+                    warn(f"Failed to delete EBS volume {vid}: {e}")
+
     click.echo()
     success(f"Terminated {len(changes)} instance(s).")


+@main.command()
+@click.option("--dry-run", is_flag=True, default=False, help="Show what would be removed without removing.")
+@click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.")
+@click.option("--region", default="us-west-2", show_default=True, help="AWS region.")
+@click.option("--profile", default=None, help="AWS profile override.")
+def cleanup(dry_run, yes, region, profile):
+    """Remove stale SSH config entries for terminated instances."""
+    session = boto3.Session(profile_name=profile, region_name=region)
+    ec2 = session.client("ec2")
+
+    live_instances = find_tagged_instances(ec2, "aws-bootstrap-g4dn")
+    live_ids = {inst["InstanceId"] for inst in live_instances}
+
+    stale = find_stale_ssh_hosts(live_ids)
+    if not stale:
+        click.secho("No stale SSH config entries found.", fg="green")
+        return
+
+    click.secho(f"\n Found {len(stale)} stale SSH config entry(ies):\n", bold=True, fg="cyan")
+    for iid, alias in stale:
+        click.echo(" " + click.style(alias, fg="bright_white") + f" ({iid})")
+
+    if dry_run:
+        click.echo()
+        for iid, alias in stale:
+            info(f"Would remove {alias} ({iid})")
+        return
+
+    if not yes:
+        click.echo()
+        if not click.confirm(f" Remove {len(stale)} stale entry(ies)?"):
+            click.secho(" Cancelled.", fg="yellow")
+            return
+
+    results = cleanup_stale_ssh_hosts(live_ids)
+    click.echo()
+    for r in results:
+        success(f"Removed {r.alias} ({r.instance_id})")
+
+    click.echo()
+    success(f"Cleaned up {len(results)} stale entry(ies).")
+
+
 # ---------------------------------------------------------------------------
 # list command group
 # ---------------------------------------------------------------------------