aws-bootstrap-g4dn 0.5.0.tar.gz → 0.7.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/.pre-commit-config.yaml +2 -2
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/CLAUDE.md +54 -5
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/PKG-INFO +79 -7
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/README.md +76 -6
- aws_bootstrap_g4dn-0.7.0/aws_bootstrap/cli.py +963 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/config.py +2 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/ec2.py +137 -8
- aws_bootstrap_g4dn-0.7.0/aws_bootstrap/output.py +106 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/resources/remote_setup.sh +2 -2
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/ssh.py +142 -20
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/tests/test_cli.py +652 -4
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/tests/test_config.py +18 -0
- aws_bootstrap_g4dn-0.7.0/aws_bootstrap/tests/test_ebs.py +245 -0
- aws_bootstrap_g4dn-0.7.0/aws_bootstrap/tests/test_output.py +192 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/tests/test_ssh_config.py +76 -0
- aws_bootstrap_g4dn-0.7.0/aws_bootstrap/tests/test_ssh_ebs.py +76 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap_g4dn.egg-info/PKG-INFO +79 -7
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap_g4dn.egg-info/SOURCES.txt +4 -0
- aws_bootstrap_g4dn-0.7.0/aws_bootstrap_g4dn.egg-info/requires.txt +4 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/pyproject.toml +5 -1
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/uv.lock +101 -21
- aws_bootstrap_g4dn-0.5.0/aws_bootstrap/cli.py +0 -547
- aws_bootstrap_g4dn-0.5.0/aws_bootstrap_g4dn.egg-info/requires.txt +0 -2
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/.github/workflows/ci.yml +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/.github/workflows/publish-to-pypi.yml +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/.gitignore +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/CODE_OF_CONDUCT.md +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/CONTRIBUTING.md +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/LICENSE +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/SECURITY.md +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/__init__.py +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/gpu.py +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/resources/__init__.py +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/resources/gpu_benchmark.py +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/resources/gpu_smoke_test.ipynb +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/resources/launch.json +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/resources/requirements.txt +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/resources/saxpy.cu +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/resources/tasks.json +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/tests/__init__.py +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/tests/test_ec2.py +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/tests/test_gpu.py +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/tests/test_ssh_gpu.py +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap_g4dn.egg-info/dependency_links.txt +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap_g4dn.egg-info/entry_points.txt +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap_g4dn.egg-info/top_level.txt +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/docs/nsight-remote-profiling.md +0 -0
- {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/setup.cfg +0 -0
**.pre-commit-config.yaml**

````diff
@@ -17,7 +17,7 @@ repos:
       - id: end-of-file-fixer
       - id: trailing-whitespace
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.
+    rev: v0.15.0
     hooks:
       # Run the linter.
       - id: ruff-check
@@ -28,7 +28,7 @@ repos:
     rev: v1.19.0
     hooks:
       - id: mypy
-        additional_dependencies: [types-pyyaml>=6.0.12]
+        additional_dependencies: [types-pyyaml>=6.0.12, types-tabulate>=0.9]
   - repo: local
     hooks:
       - id: pytest
````
**CLAUDE.md**

````diff
@@ -13,6 +13,8 @@ Target workflows: Jupyter server-client, VSCode Remote SSH, and NVIDIA Nsight re
 - **Python 3.12+** with **uv** package manager (astral-sh/uv) — used for venv creation, dependency management, and running the project
 - **boto3** — AWS SDK for EC2 provisioning (AMI lookup, security groups, instance launch, waiters)
 - **click** — CLI framework with built-in color support (`click.secho`, `click.style`)
+- **pyyaml** — YAML serialization for `--output yaml`
+- **tabulate** — Table formatting for `--output table`
 - **setuptools + setuptools-scm** — build backend with git-tag-based versioning (configured in pyproject.toml)
 - **AWS CLI v2** with a configured AWS profile (`AWS_PROFILE` env var or `--profile` flag)
 - **direnv** for automatic venv activation (`.envrc` sources `.venv/bin/activate`)
@@ -32,9 +34,10 @@ aws_bootstrap/
     __init__.py        # Package init
     cli.py             # Click CLI entry point (launch, status, terminate commands)
     config.py          # LaunchConfig dataclass with defaults
-    ec2.py             # AMI lookup, security group, instance launch/find/terminate, polling, spot pricing
+    ec2.py             # AMI lookup, security group, instance launch/find/terminate, polling, spot pricing, EBS volume ops
     gpu.py             # GPU architecture mapping and GpuInfo dataclass
-
+    output.py          # Output formatting: OutputFormat enum, emit(), echo/secho wrappers for structured output
+    ssh.py             # SSH key pair import, SSH readiness check, remote setup, ~/.ssh/config management, GPU queries, EBS mount
     resources/         # Non-Python artifacts SCP'd to remote instances
         __init__.py
         gpu_benchmark.py  # GPU throughput benchmark (CNN + Transformer), copied to ~/gpu_benchmark.py on instance
@@ -48,9 +51,12 @@ aws_bootstrap/
         test_config.py
         test_cli.py
         test_ec2.py
+        test_output.py
         test_gpu.py
         test_ssh_config.py
         test_ssh_gpu.py
+        test_ebs.py
+        test_ssh_ebs.py
 docs/
     nsight-remote-profiling.md   # Nsight Compute, Nsight Systems, and Nsight VSCE remote profiling guide
     spot-request-lifecycle.md    # Research notes on spot request cleanup
@@ -60,9 +66,12 @@ Entry point: `aws-bootstrap = "aws_bootstrap.cli:main"` (installed via `uv sync`

 ## CLI Commands

-
-
-- **`
+**Global option:** `--output` / `-o` controls output format: `text` (default, human-readable with color), `json`, `yaml`, `table`. Structured formats (json/yaml/table) suppress all progress messages and emit machine-readable output. Commands requiring confirmation (`terminate`, `cleanup`) require `--yes` in structured modes.
+
+- **`launch`** — provisions an EC2 instance (spot by default, falls back to on-demand on capacity errors); adds SSH config alias (e.g. `aws-gpu1`) to `~/.ssh/config`; `--python-version` controls which Python `uv` installs in the remote venv; `--ssh-port` overrides the default SSH port (22) for security group ingress, connection checks, and SSH config; `--ebs-storage SIZE` creates and attaches a new gp3 EBS data volume (mounted at `/data`); `--ebs-volume-id ID` attaches an existing EBS volume (mutually exclusive with `--ebs-storage`)
+- **`status`** — lists all non-terminated instances (including `shutting-down`) with type, IP, SSH alias, EBS data volumes, pricing (spot price/hr or on-demand), uptime, and estimated cost for running spot instances; `--gpu` flag queries GPU info via SSH, reporting both CUDA toolkit version (from `nvcc`) and driver-supported max (from `nvidia-smi`); `--instructions` (default: on) prints connection commands (SSH, Jupyter tunnel, VSCode Remote SSH, GPU benchmark) for each running instance; suppress with `--no-instructions`
+- **`terminate`** — terminates instances by ID or SSH alias (e.g. `aws-gpu1`, resolved via `~/.ssh/config`), or all aws-bootstrap instances in the region if no arguments given; removes SSH config aliases; deletes associated EBS data volumes by default; `--keep-ebs` preserves volumes and prints reattach commands
+- **`cleanup`** — removes stale `~/.ssh/config` entries for terminated/non-existent instances; compares managed SSH config blocks against live EC2 instances; `--dry-run` previews removals without modifying config; `--yes` skips the confirmation prompt
 - **`list instance-types`** — lists EC2 instance types matching a family prefix (default: `g4dn`), showing vCPUs, memory, and GPU info
 - **`list amis`** — lists available AMIs matching a name pattern (default: Deep Learning Base OSS Nvidia Driver GPU AMIs), sorted newest-first

````
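
The global `--output` option described in the hunk above is plain Click wiring: the group parses the format once, stores it on the context, and subcommands consult it before printing or prompting. The sketch below is a minimal illustration of that pattern, not the package's actual `cli.py`; the `cli` and `terminate` names, the `ctx.obj["output"]` slot, and the prompt text are assumptions.

```python
from enum import Enum

import click


class OutputFormat(str, Enum):
    TEXT = "text"
    JSON = "json"
    YAML = "yaml"
    TABLE = "table"


@click.group()
@click.option(
    "-o", "--output", "output_format",
    type=click.Choice([f.value for f in OutputFormat]),
    default=OutputFormat.TEXT.value,
    help="Output format; structured formats suppress progress messages.",
)
@click.pass_context
def cli(ctx: click.Context, output_format: str) -> None:
    """Stash the chosen format on the context so every subcommand can read it."""
    ctx.ensure_object(dict)
    ctx.obj["output"] = OutputFormat(output_format)


@cli.command()
@click.option("--yes", is_flag=True, help="Skip the confirmation prompt.")
@click.pass_context
def terminate(ctx: click.Context, yes: bool) -> None:
    fmt = ctx.obj["output"]
    if fmt is not OutputFormat.TEXT and not yes:
        # An interactive prompt would corrupt JSON/YAML/table output, so require --yes.
        raise click.UsageError("--yes is required with structured output formats")
    if fmt is OutputFormat.TEXT and not yes:
        click.confirm("Terminate all aws-bootstrap instances?", abort=True)
    # ... termination logic would follow here ...


if __name__ == "__main__":
    cli()
```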

````diff
@@ -91,6 +100,18 @@ uv run pytest

 Use `uv add <package>` to add dependencies and `uv add --group dev <package>` for dev dependencies.

+## Structured Output Architecture
+
+The `--output` option uses a context-aware suppression pattern via `aws_bootstrap/output.py`:
+
+- **`output.echo()` / `output.secho()`** — wrap `click.echo`/`click.secho`; silent in non-text modes. Used in `ec2.py` and `ssh.py` for progress messages.
+- **`is_text(ctx)`** — checks if the current output format is text. Used in `cli.py` to guard text-only blocks.
+- **`emit(data, headers=..., ctx=...)`** — dispatches structured data to JSON/YAML/table renderers. No-op in text mode.
+- **CLI helper guards** — `step()`, `info()`, `val()`, `success()`, `warn()` in `cli.py` check `is_text()` and return early in structured modes.
+- Each CLI command builds a result dict alongside existing logic, emits it via `emit()` for non-text formats, and falls through to text output for text mode.
+- **Confirmation prompts** (`terminate`, `cleanup`) require `--yes` in structured modes to avoid corrupting output.
+- The spot-fallback `click.confirm()` in `ec2.py` auto-confirms in structured modes.
+
 ## CUDA-Aware PyTorch Installation

 `remote_setup.sh` detects the CUDA toolkit version on the instance (via `nvcc`, falling back to `nvidia-smi`) and installs PyTorch from the matching CUDA wheel index (`https://download.pytorch.org/whl/cu{TAG}`). This ensures `torch.version.cuda` matches the system's CUDA toolkit, which is required for compiling custom CUDA extensions with `nvcc`.
````
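
As a companion to the architecture notes in the hunk above, here is a minimal sketch of the suppression pattern, assuming the CLI group stores the parsed format under `ctx.obj["output"]`. The names `OutputFormat`, `is_text`, `echo`, `secho`, and `emit` come from the diff; the exact signatures and the context lookup are assumptions rather than the real `aws_bootstrap/output.py`.

```python
import json
from enum import Enum
from typing import Any

import click
import yaml
from tabulate import tabulate


class OutputFormat(str, Enum):
    TEXT = "text"
    JSON = "json"
    YAML = "yaml"
    TABLE = "table"


def _current_format(ctx: click.Context | None = None) -> OutputFormat:
    # Assumption: the CLI group stored the parsed format under ctx.obj["output"].
    ctx = ctx or click.get_current_context(silent=True)
    obj = getattr(ctx, "obj", None) or {}
    return obj.get("output", OutputFormat.TEXT)


def is_text(ctx: click.Context | None = None) -> bool:
    return _current_format(ctx) is OutputFormat.TEXT


def echo(message: str = "", **kwargs: Any) -> None:
    """Progress message: printed in text mode, silent in structured modes."""
    if is_text():
        click.echo(message, **kwargs)


def secho(message: str = "", **kwargs: Any) -> None:
    if is_text():
        click.secho(message, **kwargs)


def emit(data: Any, headers: list[str] | None = None, ctx: click.Context | None = None) -> None:
    """Render structured data in the selected format; no-op in text mode."""
    fmt = _current_format(ctx)
    if fmt is OutputFormat.JSON:
        click.echo(json.dumps(data, indent=2, default=str))
    elif fmt is OutputFormat.YAML:
        click.echo(yaml.safe_dump(data, sort_keys=False))
    elif fmt is OutputFormat.TABLE:
        rows = data if isinstance(data, list) else [data]  # rows assumed to be dicts
        cols = headers or sorted({key for row in rows for key in row})
        click.echo(tabulate([[row.get(c, "") for c in cols] for row in rows], headers=cols))
```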

````diff
@@ -112,6 +133,34 @@ The `KNOWN_CUDA_TAGS` array in `remote_setup.sh` lists the CUDA wheel tags publi

 `resources/gpu_benchmark.py` is uploaded to `~/gpu_benchmark.py` on the remote instance during setup. It benchmarks GPU throughput with two modes: CNN on MNIST and a GPT-style Transformer on synthetic data. It reports samples/sec, batch times, and peak GPU memory. Supports `--precision` (fp32/fp16/bf16/tf32), `--diagnose` for CUDA smoke tests, and separate `--transformer-batch-size` (default 32, T4-safe). Dependencies (`torch`, `torchvision`, `tqdm`) are already installed by the setup script.

+## EBS Data Volumes
+
+The `--ebs-storage` and `--ebs-volume-id` options on `launch` create or attach persistent gp3 EBS volumes mounted at `/data`. The implementation spans three modules:
+
+- **`ec2.py`** — Volume lifecycle: `create_ebs_volume`, `validate_ebs_volume`, `attach_ebs_volume`, `detach_ebs_volume`, `delete_ebs_volume`, `find_ebs_volumes_for_instance`. Constants `EBS_DEVICE_NAME` (`/dev/sdf`) and `EBS_MOUNT_POINT` (`/data`).
+- **`ssh.py`** — `mount_ebs_volume()` SSHs to the instance and runs a shell script that detects the device, optionally formats it, mounts it, and adds an fstab entry.
+- **`cli.py`** — Orchestrates the flow: create/validate → attach → wait for SSH → mount. Mount failures are non-fatal (warn and continue).
+
+### Tagging strategy
+
+Volumes are tagged for discovery by `status` and `terminate`:
+
+| Tag | Value | Purpose |
+|-----|-------|---------|
+| `created-by` | `aws-bootstrap-g4dn` | Standard tool-managed resource tag |
+| `Name` | `aws-bootstrap-data-{instance_id}` | Human-readable in AWS console |
+| `aws-bootstrap-instance` | `i-xxxxxxxxx` | Links volume to instance for `find_ebs_volumes_for_instance` |
+
+### NVMe device detection
+
+On Nitro instances (g4dn), `/dev/sdf` is remapped to `/dev/nvmeXn1`. The mount script detects the correct device by matching the volume ID serial number via `lsblk -o NAME,SERIAL -dpn`, with fallbacks to `/dev/nvme1n1`, `/dev/xvdf`, `/dev/sdf`.
+
+### Spot interruption and terminate cleanup
+
+Non-root EBS volumes attached via API have `DeleteOnTermination=False` by default. This means data volumes **survive spot interruptions** — when AWS reclaims the instance, the volume detaches and becomes `available`, preserving all data. The user can reattach it to a new instance with `--ebs-volume-id`.
+
+The `terminate` command discovers volumes via `find_ebs_volumes_for_instance`, waits for them to detach (becomes `available`), then deletes them. `--keep-ebs` skips deletion and prints the volume ID with a reattach command.
+
 ## Versioning & Publishing

 Version is derived automatically from git tags via **setuptools-scm** — no hardcoded version string in the codebase.
````
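
To make the volume lifecycle and tagging described above concrete, here is a hedged boto3 sketch of the create, tag, and attach path. The function name, parameters, and waiter usage are illustrative; the package's actual `create_ebs_volume`/`attach_ebs_volume` helpers in `ec2.py` may be structured differently.

```python
import boto3

EBS_DEVICE_NAME = "/dev/sdf"  # remapped to /dev/nvmeXn1 on Nitro instances such as g4dn


def create_and_attach_data_volume(ec2, instance_id: str, az: str, size_gb: int) -> str:
    """Create a gp3 data volume in the instance's AZ, tag it, and attach it at /dev/sdf."""
    volume = ec2.create_volume(
        AvailabilityZone=az,  # must match the instance's availability zone
        Size=size_gb,
        VolumeType="gp3",
        TagSpecifications=[{
            "ResourceType": "volume",
            "Tags": [
                {"Key": "created-by", "Value": "aws-bootstrap-g4dn"},
                {"Key": "Name", "Value": f"aws-bootstrap-data-{instance_id}"},
                {"Key": "aws-bootstrap-instance", "Value": instance_id},
            ],
        }],
    )
    volume_id = volume["VolumeId"]
    ec2.get_waiter("volume_available").wait(VolumeIds=[volume_id])
    # Attached via the API, so DeleteOnTermination stays False and the volume
    # survives spot interruptions and instance termination.
    ec2.attach_volume(VolumeId=volume_id, InstanceId=instance_id, Device=EBS_DEVICE_NAME)
    ec2.get_waiter("volume_in_use").wait(VolumeIds=[volume_id])
    return volume_id


if __name__ == "__main__":
    client = boto3.client("ec2", region_name="us-west-2")  # example region
    # create_and_attach_data_volume(client, "i-0123456789abcdef0", "us-west-2a", 96)
```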

**PKG-INFO**

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: aws-bootstrap-g4dn
-Version: 0.5.0
+Version: 0.7.0
 Summary: Bootstrap AWS EC2 GPU instances for hybrid local-remote development
 Author: Adam Ever-Hadani
 License-Expression: MIT
@@ -15,6 +15,8 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: boto3>=1.35
 Requires-Dist: click>=8.1
+Requires-Dist: pyyaml>=6.0.3
+Requires-Dist: tabulate>=0.9.0
 Dynamic: license-file

 # aws-bootstrap-g4dn
@@ -44,7 +46,8 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
 | 📊 | **GPU benchmark included** | CNN (MNIST) + Transformer benchmarks with FP16/FP32/BF16 precision and tqdm progress |
 | 📓 | **Jupyter ready** | Lab server auto-starts as a systemd service on port 8888 — just SSH tunnel and open |
 | 🖥️ | **`status --gpu`** | Shows CUDA toolkit version, driver max, GPU architecture, spot pricing, uptime, and estimated cost |
-
+| 💾 | **EBS data volumes** | Attach persistent storage at `/data` — survives spot interruptions and termination, reattach to new instances |
+| 🗑️ | **Clean terminate** | Stops instances, removes SSH aliases, cleans up EBS volumes (or preserves with `--keep-ebs`) |

 ### 🎯 Target Workflows

@@ -132,16 +135,24 @@ aws-bootstrap launch --python-version 3.13
 # Use a non-default SSH port
 aws-bootstrap launch --ssh-port 2222

+# Attach a persistent EBS data volume (96 GB gp3, mounted at /data)
+aws-bootstrap launch --ebs-storage 96
+
+# Reattach an existing EBS volume from a previous instance
+aws-bootstrap launch --ebs-volume-id vol-0abc123def456
+
 # Use a specific AWS profile
 aws-bootstrap launch --profile my-aws-profile
 ```

 After launch, the CLI:

-1. **
-2. **
-3. **Runs
-4. **
+1. **Creates/attaches EBS volume** (if `--ebs-storage` or `--ebs-volume-id` was specified)
+2. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
+3. **Runs remote setup** — installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
+4. **Mounts EBS volume** at `/data` (if applicable — formats new volumes, mounts existing ones as-is)
+5. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
+6. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate

 ```bash
 ssh aws-gpu1 # venv auto-activates on login
@@ -154,7 +165,7 @@ The setup script runs automatically on the instance after SSH becomes available:
 | Step | What |
 |------|------|
 | **GPU verify** | Confirms `nvidia-smi` and `nvcc` are working |
-| **Utilities** | Installs `htop`, `tmux`, `tree`, `jq` |
+| **Utilities** | Installs `htop`, `tmux`, `tree`, `jq`, `ffmpeg` |
 | **Python venv** | Creates `~/venv` with `uv`, auto-activates in `~/.bashrc`. Use `--python-version` to pin a specific Python (e.g. `3.13`) |
 | **CUDA-aware PyTorch** | Detects CUDA toolkit version → installs PyTorch from the matching `cu{TAG}` wheel index |
 | **CUDA smoke test** | Runs `torch.cuda.is_available()` + GPU matmul to verify the stack |
````
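
The "CUDA-aware PyTorch" row above corresponds to the `KNOWN_CUDA_TAGS` logic that `remote_setup.sh` implements in shell. Below is an illustrative Python rendering of the same idea; the tag list is an example of published PyTorch wheel tags, not the script's actual array, and the "closest tag not exceeding the detected version" rule is an assumption.

```python
# Example tags only; cu118 corresponds to CUDA 11.8, cu128 to CUDA 12.8, and so on.
KNOWN_CUDA_TAGS = ["118", "121", "124", "126", "128"]


def wheel_index_for(cuda_version: str) -> str:
    """Map a detected CUDA toolkit version (e.g. '12.8') to a cu{TAG} wheel index URL."""
    major, minor = (int(part) for part in cuda_version.split(".")[:2])
    detected = major * 10 + minor  # "12.8" -> 128 (assumes minor < 10)
    candidates = [int(tag) for tag in KNOWN_CUDA_TAGS if int(tag) <= detected]
    tag = max(candidates) if candidates else int(KNOWN_CUDA_TAGS[0])
    return f"https://download.pytorch.org/whl/cu{tag}"


print(wheel_index_for("12.8"))  # -> https://download.pytorch.org/whl/cu128
```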

````diff
@@ -223,6 +234,30 @@ Then install the [Nsight VSCE extension](https://marketplace.visualstudio.com/it

 See [Nsight remote profiling guide](docs/nsight-remote-profiling.md) for more details on CUDA debugging and profiling workflows.

+### 📤 Structured Output
+
+All commands support `--output` / `-o` for machine-readable output — useful for scripting, piping to `jq`, or LLM tool-use:
+
+```bash
+# JSON output (pipe to jq)
+aws-bootstrap -o json status
+aws-bootstrap -o json status | jq '.instances[0].instance_id'
+
+# YAML output
+aws-bootstrap -o yaml status
+
+# Table output
+aws-bootstrap -o table status
+
+# Works with all commands
+aws-bootstrap -o json list instance-types | jq '.[].instance_type'
+aws-bootstrap -o json launch --dry-run
+aws-bootstrap -o json terminate --yes
+aws-bootstrap -o json cleanup --dry-run
+```
+
+Supported formats: `text` (default, human-readable with color), `json`, `yaml`, `table`. Commands that require confirmation (`terminate`, `cleanup`) require `--yes` in structured output modes.
+
 ### 📋 Listing Resources

 ```bash
````
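
Beyond `jq`, the JSON mode shown above is easy to consume from Python. The snippet below assumes only what the `jq` example implies, namely a top-level `instances` list whose items carry an `instance_id` field; other keys in the payload are not guaranteed here.

```python
import json
import subprocess


def running_instance_ids() -> list[str]:
    """Call the CLI in JSON mode and pull out the instance IDs."""
    result = subprocess.run(
        ["aws-bootstrap", "-o", "json", "status"],
        check=True, capture_output=True, text=True,
    )
    payload = json.loads(result.stdout)
    return [inst["instance_id"] for inst in payload.get("instances", [])]


if __name__ == "__main__":
    print(running_instance_ids())
```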

````diff
@@ -261,6 +296,9 @@ aws-bootstrap status --region us-east-1
 # Terminate all aws-bootstrap instances (with confirmation prompt)
 aws-bootstrap terminate

+# Terminate but preserve EBS data volumes for reuse
+aws-bootstrap terminate --keep-ebs
+
 # Terminate by SSH alias (resolved via ~/.ssh/config)
 aws-bootstrap terminate aws-gpu1

@@ -272,6 +310,15 @@ aws-bootstrap terminate aws-gpu1 i-def456

 # Skip confirmation prompt
 aws-bootstrap terminate --yes
+
+# Remove stale SSH config entries for terminated instances
+aws-bootstrap cleanup
+
+# Preview what would be removed without modifying config
+aws-bootstrap cleanup --dry-run
+
+# Skip confirmation prompt
+aws-bootstrap cleanup --yes
 ```

 `status --gpu` reports both the **installed CUDA toolkit** version (from `nvcc`) and the **maximum CUDA version supported by the driver** (from `nvidia-smi`), so you can see at a glance whether they match:
@@ -282,6 +329,31 @@ CUDA: 12.8 (driver supports up to 13.0)

 SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances. You can use aliases anywhere you'd use an instance ID, e.g. `aws-bootstrap terminate aws-gpu1`.

+## EBS Data Volumes
+
+Attach persistent EBS storage to keep datasets and model checkpoints across instance lifecycles. Volumes are mounted at `/data` and persist independently of the instance.
+
+```bash
+# Create a new 96 GB gp3 volume, formatted and mounted at /data
+aws-bootstrap launch --ebs-storage 96
+
+# After terminating with --keep-ebs, reattach the same volume to a new instance
+aws-bootstrap terminate --keep-ebs
+# Output: Preserving EBS volume: vol-0abc123...
+# Reattach with: aws-bootstrap launch --ebs-volume-id vol-0abc123...
+
+aws-bootstrap launch --ebs-volume-id vol-0abc123def456
+```
+
+Key behaviors:
+- `--ebs-storage` and `--ebs-volume-id` are mutually exclusive
+- New volumes are formatted as ext4; existing volumes are mounted as-is
+- Volumes are tagged for automatic discovery by `status` and `terminate`
+- `terminate` deletes data volumes by default; use `--keep-ebs` to preserve them
+- **Spot-safe** — data volumes survive spot interruptions. If AWS reclaims your instance, the volume detaches automatically and can be reattached to a new instance with `--ebs-volume-id`
+- EBS volumes must be in the same availability zone as the instance
+- Mount failures are non-fatal — the instance remains usable
+
 ## EC2 vCPU Quotas

 AWS accounts have [service quotas](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-resource-limits.html) that limit how many vCPUs you can run per instance family. New or lightly-used accounts often have a **default quota of 0 vCPUs** for GPU instance families (G and VT), which will cause errors on launch:
````
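
Volumes preserved with `--keep-ebs` can also be rediscovered programmatically through the `created-by` tag documented in the CLAUDE.md diff earlier. The sketch below uses standard EC2 `describe_volumes` filters and an example region; it is a convenience illustration, not part of the package.

```python
import boto3


def preserved_data_volumes(region: str) -> list[str]:
    """Return IDs of detached aws-bootstrap data volumes, ready for --ebs-volume-id."""
    ec2 = boto3.client("ec2", region_name=region)
    response = ec2.describe_volumes(
        Filters=[
            {"Name": "tag:created-by", "Values": ["aws-bootstrap-g4dn"]},
            {"Name": "status", "Values": ["available"]},  # detached, ready to reattach
        ]
    )
    return [vol["VolumeId"] for vol in response["Volumes"]]


if __name__ == "__main__":
    for volume_id in preserved_data_volumes("us-west-2"):  # example region
        print(f"aws-bootstrap launch --ebs-volume-id {volume_id}")
```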

**README.md**

````diff
@@ -25,7 +25,8 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
 | 📊 | **GPU benchmark included** | CNN (MNIST) + Transformer benchmarks with FP16/FP32/BF16 precision and tqdm progress |
 | 📓 | **Jupyter ready** | Lab server auto-starts as a systemd service on port 8888 — just SSH tunnel and open |
 | 🖥️ | **`status --gpu`** | Shows CUDA toolkit version, driver max, GPU architecture, spot pricing, uptime, and estimated cost |
-
+| 💾 | **EBS data volumes** | Attach persistent storage at `/data` — survives spot interruptions and termination, reattach to new instances |
+| 🗑️ | **Clean terminate** | Stops instances, removes SSH aliases, cleans up EBS volumes (or preserves with `--keep-ebs`) |

 ### 🎯 Target Workflows

@@ -113,16 +114,24 @@ aws-bootstrap launch --python-version 3.13
 # Use a non-default SSH port
 aws-bootstrap launch --ssh-port 2222

+# Attach a persistent EBS data volume (96 GB gp3, mounted at /data)
+aws-bootstrap launch --ebs-storage 96
+
+# Reattach an existing EBS volume from a previous instance
+aws-bootstrap launch --ebs-volume-id vol-0abc123def456
+
 # Use a specific AWS profile
 aws-bootstrap launch --profile my-aws-profile
 ```

 After launch, the CLI:

-1. **
-2. **
-3. **Runs
-4. **
+1. **Creates/attaches EBS volume** (if `--ebs-storage` or `--ebs-volume-id` was specified)
+2. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
+3. **Runs remote setup** — installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
+4. **Mounts EBS volume** at `/data` (if applicable — formats new volumes, mounts existing ones as-is)
+5. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
+6. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate

 ```bash
 ssh aws-gpu1 # venv auto-activates on login
@@ -135,7 +144,7 @@ The setup script runs automatically on the instance after SSH becomes available:
 | Step | What |
 |------|------|
 | **GPU verify** | Confirms `nvidia-smi` and `nvcc` are working |
-| **Utilities** | Installs `htop`, `tmux`, `tree`, `jq` |
+| **Utilities** | Installs `htop`, `tmux`, `tree`, `jq`, `ffmpeg` |
 | **Python venv** | Creates `~/venv` with `uv`, auto-activates in `~/.bashrc`. Use `--python-version` to pin a specific Python (e.g. `3.13`) |
 | **CUDA-aware PyTorch** | Detects CUDA toolkit version → installs PyTorch from the matching `cu{TAG}` wheel index |
 | **CUDA smoke test** | Runs `torch.cuda.is_available()` + GPU matmul to verify the stack |
@@ -204,6 +213,30 @@ Then install the [Nsight VSCE extension](https://marketplace.visualstudio.com/it

 See [Nsight remote profiling guide](docs/nsight-remote-profiling.md) for more details on CUDA debugging and profiling workflows.

+### 📤 Structured Output
+
+All commands support `--output` / `-o` for machine-readable output — useful for scripting, piping to `jq`, or LLM tool-use:
+
+```bash
+# JSON output (pipe to jq)
+aws-bootstrap -o json status
+aws-bootstrap -o json status | jq '.instances[0].instance_id'
+
+# YAML output
+aws-bootstrap -o yaml status
+
+# Table output
+aws-bootstrap -o table status
+
+# Works with all commands
+aws-bootstrap -o json list instance-types | jq '.[].instance_type'
+aws-bootstrap -o json launch --dry-run
+aws-bootstrap -o json terminate --yes
+aws-bootstrap -o json cleanup --dry-run
+```
+
+Supported formats: `text` (default, human-readable with color), `json`, `yaml`, `table`. Commands that require confirmation (`terminate`, `cleanup`) require `--yes` in structured output modes.
+
 ### 📋 Listing Resources

 ```bash
@@ -242,6 +275,9 @@ aws-bootstrap status --region us-east-1
 # Terminate all aws-bootstrap instances (with confirmation prompt)
 aws-bootstrap terminate

+# Terminate but preserve EBS data volumes for reuse
+aws-bootstrap terminate --keep-ebs
+
 # Terminate by SSH alias (resolved via ~/.ssh/config)
 aws-bootstrap terminate aws-gpu1

@@ -253,6 +289,15 @@ aws-bootstrap terminate aws-gpu1 i-def456

 # Skip confirmation prompt
 aws-bootstrap terminate --yes
+
+# Remove stale SSH config entries for terminated instances
+aws-bootstrap cleanup
+
+# Preview what would be removed without modifying config
+aws-bootstrap cleanup --dry-run
+
+# Skip confirmation prompt
+aws-bootstrap cleanup --yes
 ```

 `status --gpu` reports both the **installed CUDA toolkit** version (from `nvcc`) and the **maximum CUDA version supported by the driver** (from `nvidia-smi`), so you can see at a glance whether they match:
@@ -263,6 +308,31 @@ CUDA: 12.8 (driver supports up to 13.0)

 SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances. You can use aliases anywhere you'd use an instance ID, e.g. `aws-bootstrap terminate aws-gpu1`.

+## EBS Data Volumes
+
+Attach persistent EBS storage to keep datasets and model checkpoints across instance lifecycles. Volumes are mounted at `/data` and persist independently of the instance.
+
+```bash
+# Create a new 96 GB gp3 volume, formatted and mounted at /data
+aws-bootstrap launch --ebs-storage 96
+
+# After terminating with --keep-ebs, reattach the same volume to a new instance
+aws-bootstrap terminate --keep-ebs
+# Output: Preserving EBS volume: vol-0abc123...
+# Reattach with: aws-bootstrap launch --ebs-volume-id vol-0abc123...
+
+aws-bootstrap launch --ebs-volume-id vol-0abc123def456
+```
+
+Key behaviors:
+- `--ebs-storage` and `--ebs-volume-id` are mutually exclusive
+- New volumes are formatted as ext4; existing volumes are mounted as-is
+- Volumes are tagged for automatic discovery by `status` and `terminate`
+- `terminate` deletes data volumes by default; use `--keep-ebs` to preserve them
+- **Spot-safe** — data volumes survive spot interruptions. If AWS reclaims your instance, the volume detaches automatically and can be reattached to a new instance with `--ebs-volume-id`
+- EBS volumes must be in the same availability zone as the instance
+- Mount failures are non-fatal — the instance remains usable
+
 ## EC2 vCPU Quotas

 AWS accounts have [service quotas](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-resource-limits.html) that limit how many vCPUs you can run per instance family. New or lightly-used accounts often have a **default quota of 0 vCPUs** for GPU instance families (G and VT), which will cause errors on launch:
````