aws-bootstrap-g4dn 0.5.0__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/.pre-commit-config.yaml +2 -2
  2. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/CLAUDE.md +54 -5
  3. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/PKG-INFO +79 -7
  4. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/README.md +76 -6
  5. aws_bootstrap_g4dn-0.7.0/aws_bootstrap/cli.py +963 -0
  6. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/config.py +2 -0
  7. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/ec2.py +137 -8
  8. aws_bootstrap_g4dn-0.7.0/aws_bootstrap/output.py +106 -0
  9. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/resources/remote_setup.sh +2 -2
  10. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/ssh.py +142 -20
  11. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/tests/test_cli.py +652 -4
  12. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/tests/test_config.py +18 -0
  13. aws_bootstrap_g4dn-0.7.0/aws_bootstrap/tests/test_ebs.py +245 -0
  14. aws_bootstrap_g4dn-0.7.0/aws_bootstrap/tests/test_output.py +192 -0
  15. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/tests/test_ssh_config.py +76 -0
  16. aws_bootstrap_g4dn-0.7.0/aws_bootstrap/tests/test_ssh_ebs.py +76 -0
  17. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap_g4dn.egg-info/PKG-INFO +79 -7
  18. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap_g4dn.egg-info/SOURCES.txt +4 -0
  19. aws_bootstrap_g4dn-0.7.0/aws_bootstrap_g4dn.egg-info/requires.txt +4 -0
  20. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/pyproject.toml +5 -1
  21. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/uv.lock +101 -21
  22. aws_bootstrap_g4dn-0.5.0/aws_bootstrap/cli.py +0 -547
  23. aws_bootstrap_g4dn-0.5.0/aws_bootstrap_g4dn.egg-info/requires.txt +0 -2
  24. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  25. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  26. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/.github/workflows/ci.yml +0 -0
  27. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/.github/workflows/publish-to-pypi.yml +0 -0
  28. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/.gitignore +0 -0
  29. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/CODE_OF_CONDUCT.md +0 -0
  30. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/CONTRIBUTING.md +0 -0
  31. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/LICENSE +0 -0
  32. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/SECURITY.md +0 -0
  33. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/__init__.py +0 -0
  34. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/gpu.py +0 -0
  35. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/resources/__init__.py +0 -0
  36. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/resources/gpu_benchmark.py +0 -0
  37. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/resources/gpu_smoke_test.ipynb +0 -0
  38. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/resources/launch.json +0 -0
  39. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/resources/requirements.txt +0 -0
  40. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/resources/saxpy.cu +0 -0
  41. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/resources/tasks.json +0 -0
  42. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/tests/__init__.py +0 -0
  43. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/tests/test_ec2.py +0 -0
  44. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/tests/test_gpu.py +0 -0
  45. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap/tests/test_ssh_gpu.py +0 -0
  46. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap_g4dn.egg-info/dependency_links.txt +0 -0
  47. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap_g4dn.egg-info/entry_points.txt +0 -0
  48. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/aws_bootstrap_g4dn.egg-info/top_level.txt +0 -0
  49. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/docs/nsight-remote-profiling.md +0 -0
  50. {aws_bootstrap_g4dn-0.5.0 → aws_bootstrap_g4dn-0.7.0}/setup.cfg +0 -0
@@ -17,7 +17,7 @@ repos:
17
17
  - id: end-of-file-fixer
18
18
  - id: trailing-whitespace
19
19
  - repo: https://github.com/astral-sh/ruff-pre-commit
20
- rev: v0.14.7
20
+ rev: v0.15.0
21
21
  hooks:
22
22
  # Run the linter.
23
23
  - id: ruff-check
@@ -28,7 +28,7 @@ repos:
28
28
  rev: v1.19.0
29
29
  hooks:
30
30
  - id: mypy
31
- additional_dependencies: [types-pyyaml>=6.0.12]
31
+ additional_dependencies: [types-pyyaml>=6.0.12, types-tabulate>=0.9]
32
32
  - repo: local
33
33
  hooks:
34
34
  - id: pytest
@@ -13,6 +13,8 @@ Target workflows: Jupyter server-client, VSCode Remote SSH, and NVIDIA Nsight re
13
13
  - **Python 3.12+** with **uv** package manager (astral-sh/uv) — used for venv creation, dependency management, and running the project
14
14
  - **boto3** — AWS SDK for EC2 provisioning (AMI lookup, security groups, instance launch, waiters)
15
15
  - **click** — CLI framework with built-in color support (`click.secho`, `click.style`)
16
+ - **pyyaml** — YAML serialization for `--output yaml`
17
+ - **tabulate** — Table formatting for `--output table`
16
18
  - **setuptools + setuptools-scm** — build backend with git-tag-based versioning (configured in pyproject.toml)
17
19
  - **AWS CLI v2** with a configured AWS profile (`AWS_PROFILE` env var or `--profile` flag)
18
20
  - **direnv** for automatic venv activation (`.envrc` sources `.venv/bin/activate`)
@@ -32,9 +34,10 @@ aws_bootstrap/
32
34
  __init__.py # Package init
33
35
  cli.py # Click CLI entry point (launch, status, terminate commands)
34
36
  config.py # LaunchConfig dataclass with defaults
35
- ec2.py # AMI lookup, security group, instance launch/find/terminate, polling, spot pricing
37
+ ec2.py # AMI lookup, security group, instance launch/find/terminate, polling, spot pricing, EBS volume ops
36
38
  gpu.py # GPU architecture mapping and GpuInfo dataclass
37
- ssh.py # SSH key pair import, SSH readiness check, remote setup, ~/.ssh/config management, GPU queries
39
+ output.py # Output formatting: OutputFormat enum, emit(), echo/secho wrappers for structured output
40
+ ssh.py # SSH key pair import, SSH readiness check, remote setup, ~/.ssh/config management, GPU queries, EBS mount
38
41
  resources/ # Non-Python artifacts SCP'd to remote instances
39
42
  __init__.py
40
43
  gpu_benchmark.py # GPU throughput benchmark (CNN + Transformer), copied to ~/gpu_benchmark.py on instance
@@ -48,9 +51,12 @@ aws_bootstrap/
48
51
  test_config.py
49
52
  test_cli.py
50
53
  test_ec2.py
54
+ test_output.py
51
55
  test_gpu.py
52
56
  test_ssh_config.py
53
57
  test_ssh_gpu.py
58
+ test_ebs.py
59
+ test_ssh_ebs.py
54
60
  docs/
55
61
  nsight-remote-profiling.md # Nsight Compute, Nsight Systems, and Nsight VSCE remote profiling guide
56
62
  spot-request-lifecycle.md # Research notes on spot request cleanup
@@ -60,9 +66,12 @@ Entry point: `aws-bootstrap = "aws_bootstrap.cli:main"` (installed via `uv sync`
60
66
 
61
67
  ## CLI Commands
62
68
 
63
- - **`launch`** provisions an EC2 instance (spot by default, falls back to on-demand on capacity errors); adds SSH config alias (e.g. `aws-gpu1`) to `~/.ssh/config`; `--python-version` controls which Python `uv` installs in the remote venv; `--ssh-port` overrides the default SSH port (22) for security group ingress, connection checks, and SSH config
64
- - **`status`** — lists all non-terminated instances (including `shutting-down`) with type, IP, SSH alias, pricing (spot price/hr or on-demand), uptime, and estimated cost for running spot instances; `--gpu` flag queries GPU info via SSH, reporting both CUDA toolkit version (from `nvcc`) and driver-supported max (from `nvidia-smi`); `--instructions` (default: on) prints connection commands (SSH, Jupyter tunnel, VSCode Remote SSH, GPU benchmark) for each running instance; suppress with `--no-instructions`
65
- - **`terminate`** — terminates instances by ID or SSH alias (e.g. `aws-gpu1`, resolved via `~/.ssh/config`), or all aws-bootstrap instances in the region if no arguments given; removes SSH config aliases
69
+ **Global option:** `--output` / `-o` controls output format: `text` (default, human-readable with color), `json`, `yaml`, `table`. Structured formats (json/yaml/table) suppress all progress messages and emit machine-readable output. Commands requiring confirmation (`terminate`, `cleanup`) require `--yes` in structured modes.
70
+
71
+ - **`launch`** — provisions an EC2 instance (spot by default, falls back to on-demand on capacity errors); adds SSH config alias (e.g. `aws-gpu1`) to `~/.ssh/config`; `--python-version` controls which Python `uv` installs in the remote venv; `--ssh-port` overrides the default SSH port (22) for security group ingress, connection checks, and SSH config; `--ebs-storage SIZE` creates and attaches a new gp3 EBS data volume (mounted at `/data`); `--ebs-volume-id ID` attaches an existing EBS volume (mutually exclusive with `--ebs-storage`)
72
+ - **`status`** — lists all non-terminated instances (including `shutting-down`) with type, IP, SSH alias, EBS data volumes, pricing (spot price/hr or on-demand), uptime, and estimated cost for running spot instances; `--gpu` flag queries GPU info via SSH, reporting both CUDA toolkit version (from `nvcc`) and driver-supported max (from `nvidia-smi`); `--instructions` (default: on) prints connection commands (SSH, Jupyter tunnel, VSCode Remote SSH, GPU benchmark) for each running instance; suppress with `--no-instructions`
73
+ - **`terminate`** — terminates instances by ID or SSH alias (e.g. `aws-gpu1`, resolved via `~/.ssh/config`), or all aws-bootstrap instances in the region if no arguments given; removes SSH config aliases; deletes associated EBS data volumes by default; `--keep-ebs` preserves volumes and prints reattach commands
74
+ - **`cleanup`** — removes stale `~/.ssh/config` entries for terminated/non-existent instances; compares managed SSH config blocks against live EC2 instances; `--dry-run` previews removals without modifying config; `--yes` skips the confirmation prompt
66
75
  - **`list instance-types`** — lists EC2 instance types matching a family prefix (default: `g4dn`), showing vCPUs, memory, and GPU info
67
76
  - **`list amis`** — lists available AMIs matching a name pattern (default: Deep Learning Base OSS Nvidia Driver GPU AMIs), sorted newest-first
68
77
 
@@ -91,6 +100,18 @@ uv run pytest
91
100
 
92
101
  Use `uv add <package>` to add dependencies and `uv add --group dev <package>` for dev dependencies.
93
102
 
103
+ ## Structured Output Architecture
104
+
105
+ The `--output` option uses a context-aware suppression pattern via `aws_bootstrap/output.py`:
106
+
107
+ - **`output.echo()` / `output.secho()`** — wrap `click.echo`/`click.secho`; silent in non-text modes. Used in `ec2.py` and `ssh.py` for progress messages.
108
+ - **`is_text(ctx)`** — checks if the current output format is text. Used in `cli.py` to guard text-only blocks.
109
+ - **`emit(data, headers=..., ctx=...)`** — dispatches structured data to JSON/YAML/table renderers. No-op in text mode.
110
+ - **CLI helper guards** — `step()`, `info()`, `val()`, `success()`, `warn()` in `cli.py` check `is_text()` and return early in structured modes.
111
+ - Each CLI command builds a result dict alongside existing logic, emits it via `emit()` for non-text formats, and falls through to text output for text mode.
112
+ - **Confirmation prompts** (`terminate`, `cleanup`) require `--yes` in structured modes to avoid corrupting output.
113
+ - The spot-fallback `click.confirm()` in `ec2.py` auto-confirms in structured modes.
114
+
94
115
  ## CUDA-Aware PyTorch Installation
95
116
 
96
117
  `remote_setup.sh` detects the CUDA toolkit version on the instance (via `nvcc`, falling back to `nvidia-smi`) and installs PyTorch from the matching CUDA wheel index (`https://download.pytorch.org/whl/cu{TAG}`). This ensures `torch.version.cuda` matches the system's CUDA toolkit, which is required for compiling custom CUDA extensions with `nvcc`.
@@ -112,6 +133,34 @@ The `KNOWN_CUDA_TAGS` array in `remote_setup.sh` lists the CUDA wheel tags publi
112
133
 
113
134
  `resources/gpu_benchmark.py` is uploaded to `~/gpu_benchmark.py` on the remote instance during setup. It benchmarks GPU throughput with two modes: CNN on MNIST and a GPT-style Transformer on synthetic data. It reports samples/sec, batch times, and peak GPU memory. Supports `--precision` (fp32/fp16/bf16/tf32), `--diagnose` for CUDA smoke tests, and separate `--transformer-batch-size` (default 32, T4-safe). Dependencies (`torch`, `torchvision`, `tqdm`) are already installed by the setup script.
114
135
 
136
+ ## EBS Data Volumes
137
+
138
+ The `--ebs-storage` and `--ebs-volume-id` options on `launch` create or attach persistent gp3 EBS volumes mounted at `/data`. The implementation spans three modules:
139
+
140
+ - **`ec2.py`** — Volume lifecycle: `create_ebs_volume`, `validate_ebs_volume`, `attach_ebs_volume`, `detach_ebs_volume`, `delete_ebs_volume`, `find_ebs_volumes_for_instance`. Constants `EBS_DEVICE_NAME` (`/dev/sdf`) and `EBS_MOUNT_POINT` (`/data`).
141
+ - **`ssh.py`** — `mount_ebs_volume()` SSHs to the instance and runs a shell script that detects the device, optionally formats it, mounts it, and adds an fstab entry.
142
+ - **`cli.py`** — Orchestrates the flow: create/validate → attach → wait for SSH → mount. Mount failures are non-fatal (warn and continue).
143
+
144
+ ### Tagging strategy
145
+
146
+ Volumes are tagged for discovery by `status` and `terminate`:
147
+
148
+ | Tag | Value | Purpose |
149
+ |-----|-------|---------|
150
+ | `created-by` | `aws-bootstrap-g4dn` | Standard tool-managed resource tag |
151
+ | `Name` | `aws-bootstrap-data-{instance_id}` | Human-readable in AWS console |
152
+ | `aws-bootstrap-instance` | `i-xxxxxxxxx` | Links volume to instance for `find_ebs_volumes_for_instance` |
153
+
154
+ ### NVMe device detection
155
+
156
+ On Nitro instances (g4dn), `/dev/sdf` is remapped to `/dev/nvmeXn1`. The mount script detects the correct device by matching the volume ID serial number via `lsblk -o NAME,SERIAL -dpn`, with fallbacks to `/dev/nvme1n1`, `/dev/xvdf`, `/dev/sdf`.
157
+
158
+ ### Spot interruption and terminate cleanup
159
+
160
+ Non-root EBS volumes attached via API have `DeleteOnTermination=False` by default. This means data volumes **survive spot interruptions** — when AWS reclaims the instance, the volume detaches and becomes `available`, preserving all data. The user can reattach it to a new instance with `--ebs-volume-id`.
161
+
162
+ The `terminate` command discovers volumes via `find_ebs_volumes_for_instance`, waits for them to detach (become `available`), then deletes them. `--keep-ebs` skips deletion and prints the volume ID with a reattach command.
163
+
115
164
  ## Versioning & Publishing
116
165
 
117
166
  Version is derived automatically from git tags via **setuptools-scm** — no hardcoded version string in the codebase.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: aws-bootstrap-g4dn
3
- Version: 0.5.0
3
+ Version: 0.7.0
4
4
  Summary: Bootstrap AWS EC2 GPU instances for hybrid local-remote development
5
5
  Author: Adam Ever-Hadani
6
6
  License-Expression: MIT
@@ -15,6 +15,8 @@ Description-Content-Type: text/markdown
15
15
  License-File: LICENSE
16
16
  Requires-Dist: boto3>=1.35
17
17
  Requires-Dist: click>=8.1
18
+ Requires-Dist: pyyaml>=6.0.3
19
+ Requires-Dist: tabulate>=0.9.0
18
20
  Dynamic: license-file
19
21
 
20
22
  # aws-bootstrap-g4dn
@@ -44,7 +46,8 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
44
46
  | 📊 | **GPU benchmark included** | CNN (MNIST) + Transformer benchmarks with FP16/FP32/BF16 precision and tqdm progress |
45
47
  | 📓 | **Jupyter ready** | Lab server auto-starts as a systemd service on port 8888 — just SSH tunnel and open |
46
48
  | 🖥️ | **`status --gpu`** | Shows CUDA toolkit version, driver max, GPU architecture, spot pricing, uptime, and estimated cost |
47
- | 🗑️ | **Clean terminate** | Stops instances, removes SSH aliases, shows shutting-down state until fully gone |
49
+ | 💾 | **EBS data volumes** | Attach persistent storage at `/data` — survives spot interruptions and termination; reattach to new instances |
50
+ | 🗑️ | **Clean terminate** | Stops instances, removes SSH aliases, cleans up EBS volumes (or preserves with `--keep-ebs`) |
48
51
 
49
52
  ### 🎯 Target Workflows
50
53
 
@@ -132,16 +135,24 @@ aws-bootstrap launch --python-version 3.13
132
135
  # Use a non-default SSH port
133
136
  aws-bootstrap launch --ssh-port 2222
134
137
 
138
+ # Attach a persistent EBS data volume (96 GB gp3, mounted at /data)
139
+ aws-bootstrap launch --ebs-storage 96
140
+
141
+ # Reattach an existing EBS volume from a previous instance
142
+ aws-bootstrap launch --ebs-volume-id vol-0abc123def456
143
+
135
144
  # Use a specific AWS profile
136
145
  aws-bootstrap launch --profile my-aws-profile
137
146
  ```
138
147
 
139
148
  After launch, the CLI:
140
149
 
141
- 1. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
142
- 2. **Runs remote setup** installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
143
- 3. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
144
- 4. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate
150
+ 1. **Creates/attaches EBS volume** (if `--ebs-storage` or `--ebs-volume-id` was specified)
151
+ 2. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
152
+ 3. **Runs remote setup** — installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
153
+ 4. **Mounts EBS volume** at `/data` (if applicable — formats new volumes, mounts existing ones as-is)
154
+ 5. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
155
+ 6. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate
145
156
 
146
157
  ```bash
147
158
  ssh aws-gpu1 # venv auto-activates on login
@@ -154,7 +165,7 @@ The setup script runs automatically on the instance after SSH becomes available:
154
165
  | Step | What |
155
166
  |------|------|
156
167
  | **GPU verify** | Confirms `nvidia-smi` and `nvcc` are working |
157
- | **Utilities** | Installs `htop`, `tmux`, `tree`, `jq` |
168
+ | **Utilities** | Installs `htop`, `tmux`, `tree`, `jq`, `ffmpeg` |
158
169
  | **Python venv** | Creates `~/venv` with `uv`, auto-activates in `~/.bashrc`. Use `--python-version` to pin a specific Python (e.g. `3.13`) |
159
170
  | **CUDA-aware PyTorch** | Detects CUDA toolkit version → installs PyTorch from the matching `cu{TAG}` wheel index |
160
171
  | **CUDA smoke test** | Runs `torch.cuda.is_available()` + GPU matmul to verify the stack |
@@ -223,6 +234,30 @@ Then install the [Nsight VSCE extension](https://marketplace.visualstudio.com/it
223
234
 
224
235
  See [Nsight remote profiling guide](docs/nsight-remote-profiling.md) for more details on CUDA debugging and profiling workflows.
225
236
 
237
+ ### 📤 Structured Output
238
+
239
+ All commands support `--output` / `-o` for machine-readable output — useful for scripting, piping to `jq`, or LLM tool-use:
240
+
241
+ ```bash
242
+ # JSON output (pipe to jq)
243
+ aws-bootstrap -o json status
244
+ aws-bootstrap -o json status | jq '.instances[0].instance_id'
245
+
246
+ # YAML output
247
+ aws-bootstrap -o yaml status
248
+
249
+ # Table output
250
+ aws-bootstrap -o table status
251
+
252
+ # Works with all commands
253
+ aws-bootstrap -o json list instance-types | jq '.[].instance_type'
254
+ aws-bootstrap -o json launch --dry-run
255
+ aws-bootstrap -o json terminate --yes
256
+ aws-bootstrap -o json cleanup --dry-run
257
+ ```
258
+
259
+ Supported formats: `text` (default, human-readable with color), `json`, `yaml`, `table`. Commands that require confirmation (`terminate`, `cleanup`) require `--yes` in structured output modes.
260
+
226
261
  ### 📋 Listing Resources
227
262
 
228
263
  ```bash
@@ -261,6 +296,9 @@ aws-bootstrap status --region us-east-1
261
296
  # Terminate all aws-bootstrap instances (with confirmation prompt)
262
297
  aws-bootstrap terminate
263
298
 
299
+ # Terminate but preserve EBS data volumes for reuse
300
+ aws-bootstrap terminate --keep-ebs
301
+
264
302
  # Terminate by SSH alias (resolved via ~/.ssh/config)
265
303
  aws-bootstrap terminate aws-gpu1
266
304
 
@@ -272,6 +310,15 @@ aws-bootstrap terminate aws-gpu1 i-def456
272
310
 
273
311
  # Skip confirmation prompt
274
312
  aws-bootstrap terminate --yes
313
+
314
+ # Remove stale SSH config entries for terminated instances
315
+ aws-bootstrap cleanup
316
+
317
+ # Preview what would be removed without modifying config
318
+ aws-bootstrap cleanup --dry-run
319
+
320
+ # Skip confirmation prompt
321
+ aws-bootstrap cleanup --yes
275
322
  ```
276
323
 
277
324
  `status --gpu` reports both the **installed CUDA toolkit** version (from `nvcc`) and the **maximum CUDA version supported by the driver** (from `nvidia-smi`), so you can see at a glance whether they match:
@@ -282,6 +329,31 @@ CUDA: 12.8 (driver supports up to 13.0)
282
329
 
283
330
  SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances. You can use aliases anywhere you'd use an instance ID, e.g. `aws-bootstrap terminate aws-gpu1`.
284
331
 
332
+ ## EBS Data Volumes
333
+
334
+ Attach persistent EBS storage to keep datasets and model checkpoints across instance lifecycles. Volumes are mounted at `/data` and persist independently of the instance.
335
+
336
+ ```bash
337
+ # Create a new 96 GB gp3 volume, formatted and mounted at /data
338
+ aws-bootstrap launch --ebs-storage 96
339
+
340
+ # After terminating with --keep-ebs, reattach the same volume to a new instance
341
+ aws-bootstrap terminate --keep-ebs
342
+ # Output: Preserving EBS volume: vol-0abc123...
343
+ # Reattach with: aws-bootstrap launch --ebs-volume-id vol-0abc123...
344
+
345
+ aws-bootstrap launch --ebs-volume-id vol-0abc123def456
346
+ ```
347
+
348
+ Key behaviors:
349
+ - `--ebs-storage` and `--ebs-volume-id` are mutually exclusive
350
+ - New volumes are formatted as ext4; existing volumes are mounted as-is
351
+ - Volumes are tagged for automatic discovery by `status` and `terminate`
352
+ - `terminate` deletes data volumes by default; use `--keep-ebs` to preserve them
353
+ - **Spot-safe** — data volumes survive spot interruptions. If AWS reclaims your instance, the volume detaches automatically and can be reattached to a new instance with `--ebs-volume-id`
354
+ - EBS volumes must be in the same availability zone as the instance
355
+ - Mount failures are non-fatal — the instance remains usable
356
+
285
357
  ## EC2 vCPU Quotas
286
358
 
287
359
  AWS accounts have [service quotas](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-resource-limits.html) that limit how many vCPUs you can run per instance family. New or lightly-used accounts often have a **default quota of 0 vCPUs** for GPU instance families (G and VT), which will cause errors on launch:
@@ -25,7 +25,8 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
25
25
  | 📊 | **GPU benchmark included** | CNN (MNIST) + Transformer benchmarks with FP16/FP32/BF16 precision and tqdm progress |
26
26
  | 📓 | **Jupyter ready** | Lab server auto-starts as a systemd service on port 8888 — just SSH tunnel and open |
27
27
  | 🖥️ | **`status --gpu`** | Shows CUDA toolkit version, driver max, GPU architecture, spot pricing, uptime, and estimated cost |
28
- | 🗑️ | **Clean terminate** | Stops instances, removes SSH aliases, shows shutting-down state until fully gone |
28
+ | 💾 | **EBS data volumes** | Attach persistent storage at `/data` — survives spot interruptions and termination; reattach to new instances |
29
+ | 🗑️ | **Clean terminate** | Stops instances, removes SSH aliases, cleans up EBS volumes (or preserves with `--keep-ebs`) |
29
30
 
30
31
  ### 🎯 Target Workflows
31
32
 
@@ -113,16 +114,24 @@ aws-bootstrap launch --python-version 3.13
113
114
  # Use a non-default SSH port
114
115
  aws-bootstrap launch --ssh-port 2222
115
116
 
117
+ # Attach a persistent EBS data volume (96 GB gp3, mounted at /data)
118
+ aws-bootstrap launch --ebs-storage 96
119
+
120
+ # Reattach an existing EBS volume from a previous instance
121
+ aws-bootstrap launch --ebs-volume-id vol-0abc123def456
122
+
116
123
  # Use a specific AWS profile
117
124
  aws-bootstrap launch --profile my-aws-profile
118
125
  ```
119
126
 
120
127
  After launch, the CLI:
121
128
 
122
- 1. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
123
- 2. **Runs remote setup** installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
124
- 3. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
125
- 4. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate
129
+ 1. **Creates/attaches EBS volume** (if `--ebs-storage` or `--ebs-volume-id` was specified)
130
+ 2. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
131
+ 3. **Runs remote setup** — installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
132
+ 4. **Mounts EBS volume** at `/data` (if applicable — formats new volumes, mounts existing ones as-is)
133
+ 5. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
134
+ 6. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate
126
135
 
127
136
  ```bash
128
137
  ssh aws-gpu1 # venv auto-activates on login
@@ -135,7 +144,7 @@ The setup script runs automatically on the instance after SSH becomes available:
135
144
  | Step | What |
136
145
  |------|------|
137
146
  | **GPU verify** | Confirms `nvidia-smi` and `nvcc` are working |
138
- | **Utilities** | Installs `htop`, `tmux`, `tree`, `jq` |
147
+ | **Utilities** | Installs `htop`, `tmux`, `tree`, `jq`, `ffmpeg` |
139
148
  | **Python venv** | Creates `~/venv` with `uv`, auto-activates in `~/.bashrc`. Use `--python-version` to pin a specific Python (e.g. `3.13`) |
140
149
  | **CUDA-aware PyTorch** | Detects CUDA toolkit version → installs PyTorch from the matching `cu{TAG}` wheel index |
141
150
  | **CUDA smoke test** | Runs `torch.cuda.is_available()` + GPU matmul to verify the stack |
@@ -204,6 +213,30 @@ Then install the [Nsight VSCE extension](https://marketplace.visualstudio.com/it
204
213
 
205
214
  See [Nsight remote profiling guide](docs/nsight-remote-profiling.md) for more details on CUDA debugging and profiling workflows.
206
215
 
216
+ ### 📤 Structured Output
217
+
218
+ All commands support `--output` / `-o` for machine-readable output — useful for scripting, piping to `jq`, or LLM tool-use:
219
+
220
+ ```bash
221
+ # JSON output (pipe to jq)
222
+ aws-bootstrap -o json status
223
+ aws-bootstrap -o json status | jq '.instances[0].instance_id'
224
+
225
+ # YAML output
226
+ aws-bootstrap -o yaml status
227
+
228
+ # Table output
229
+ aws-bootstrap -o table status
230
+
231
+ # Works with all commands
232
+ aws-bootstrap -o json list instance-types | jq '.[].instance_type'
233
+ aws-bootstrap -o json launch --dry-run
234
+ aws-bootstrap -o json terminate --yes
235
+ aws-bootstrap -o json cleanup --dry-run
236
+ ```
237
+
238
+ Supported formats: `text` (default, human-readable with color), `json`, `yaml`, `table`. Commands that require confirmation (`terminate`, `cleanup`) require `--yes` in structured output modes.
239
+
207
240
  ### 📋 Listing Resources
208
241
 
209
242
  ```bash
@@ -242,6 +275,9 @@ aws-bootstrap status --region us-east-1
242
275
  # Terminate all aws-bootstrap instances (with confirmation prompt)
243
276
  aws-bootstrap terminate
244
277
 
278
+ # Terminate but preserve EBS data volumes for reuse
279
+ aws-bootstrap terminate --keep-ebs
280
+
245
281
  # Terminate by SSH alias (resolved via ~/.ssh/config)
246
282
  aws-bootstrap terminate aws-gpu1
247
283
 
@@ -253,6 +289,15 @@ aws-bootstrap terminate aws-gpu1 i-def456
253
289
 
254
290
  # Skip confirmation prompt
255
291
  aws-bootstrap terminate --yes
292
+
293
+ # Remove stale SSH config entries for terminated instances
294
+ aws-bootstrap cleanup
295
+
296
+ # Preview what would be removed without modifying config
297
+ aws-bootstrap cleanup --dry-run
298
+
299
+ # Skip confirmation prompt
300
+ aws-bootstrap cleanup --yes
256
301
  ```
257
302
 
258
303
  `status --gpu` reports both the **installed CUDA toolkit** version (from `nvcc`) and the **maximum CUDA version supported by the driver** (from `nvidia-smi`), so you can see at a glance whether they match:
@@ -263,6 +308,31 @@ CUDA: 12.8 (driver supports up to 13.0)
263
308
 
264
309
  SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances. You can use aliases anywhere you'd use an instance ID, e.g. `aws-bootstrap terminate aws-gpu1`.
265
310
 
311
+ ## EBS Data Volumes
312
+
313
+ Attach persistent EBS storage to keep datasets and model checkpoints across instance lifecycles. Volumes are mounted at `/data` and persist independently of the instance.
314
+
315
+ ```bash
316
+ # Create a new 96 GB gp3 volume, formatted and mounted at /data
317
+ aws-bootstrap launch --ebs-storage 96
318
+
319
+ # After terminating with --keep-ebs, reattach the same volume to a new instance
320
+ aws-bootstrap terminate --keep-ebs
321
+ # Output: Preserving EBS volume: vol-0abc123...
322
+ # Reattach with: aws-bootstrap launch --ebs-volume-id vol-0abc123...
323
+
324
+ aws-bootstrap launch --ebs-volume-id vol-0abc123def456
325
+ ```
326
+
327
+ Key behaviors:
328
+ - `--ebs-storage` and `--ebs-volume-id` are mutually exclusive
329
+ - New volumes are formatted as ext4; existing volumes are mounted as-is
330
+ - Volumes are tagged for automatic discovery by `status` and `terminate`
331
+ - `terminate` deletes data volumes by default; use `--keep-ebs` to preserve them
332
+ - **Spot-safe** — data volumes survive spot interruptions. If AWS reclaims your instance, the volume detaches automatically and can be reattached to a new instance with `--ebs-volume-id`
333
+ - EBS volumes must be in the same availability zone as the instance
334
+ - Mount failures are non-fatal — the instance remains usable
335
+
266
336
  ## EC2 vCPU Quotas
267
337
 
268
338
  AWS accounts have [service quotas](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-resource-limits.html) that limit how many vCPUs you can run per instance family. New or lightly-used accounts often have a **default quota of 0 vCPUs** for GPU instance families (G and VT), which will cause errors on launch: