aws-bootstrap-g4dn 0.2.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/.github/workflows/ci.yml +1 -1
  2. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/.pre-commit-config.yaml +1 -0
  3. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/CLAUDE.md +13 -5
  4. {aws_bootstrap_g4dn-0.2.0/aws_bootstrap_g4dn.egg-info → aws_bootstrap_g4dn-0.4.0}/PKG-INFO +41 -6
  5. aws_bootstrap_g4dn-0.2.0/PKG-INFO → aws_bootstrap_g4dn-0.4.0/README.md +36 -20
  6. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/cli.py +75 -11
  7. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/config.py +2 -0
  8. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/ec2.py +3 -3
  9. aws_bootstrap_g4dn-0.4.0/aws_bootstrap/gpu.py +27 -0
  10. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/resources/gpu_benchmark.py +15 -5
  11. aws_bootstrap_g4dn-0.4.0/aws_bootstrap/resources/launch.json +42 -0
  12. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/resources/remote_setup.sh +90 -6
  13. aws_bootstrap_g4dn-0.4.0/aws_bootstrap/resources/saxpy.cu +49 -0
  14. aws_bootstrap_g4dn-0.4.0/aws_bootstrap/resources/tasks.json +48 -0
  15. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/ssh.py +83 -47
  16. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/tests/test_cli.py +205 -7
  17. aws_bootstrap_g4dn-0.2.0/aws_bootstrap/tests/test_ssh_gpu.py → aws_bootstrap_g4dn-0.4.0/aws_bootstrap/tests/test_gpu.py +3 -43
  18. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/tests/test_ssh_config.py +36 -0
  19. aws_bootstrap_g4dn-0.4.0/aws_bootstrap/tests/test_ssh_gpu.py +44 -0
  20. aws_bootstrap_g4dn-0.2.0/README.md → aws_bootstrap_g4dn-0.4.0/aws_bootstrap_g4dn.egg-info/PKG-INFO +55 -4
  21. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap_g4dn.egg-info/SOURCES.txt +7 -1
  22. aws_bootstrap_g4dn-0.4.0/docs/nsight-remote-profiling.md +245 -0
  23. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/pyproject.toml +8 -3
  24. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/uv.lock +35 -1
  25. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  26. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  27. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/.github/workflows/publish-to-pypi.yml +0 -0
  28. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/.gitignore +0 -0
  29. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/CODE_OF_CONDUCT.md +0 -0
  30. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/CONTRIBUTING.md +0 -0
  31. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/LICENSE +0 -0
  32. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/SECURITY.md +0 -0
  33. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/__init__.py +0 -0
  34. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/resources/__init__.py +0 -0
  35. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/resources/gpu_smoke_test.ipynb +0 -0
  36. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/resources/requirements.txt +0 -0
  37. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/tests/__init__.py +0 -0
  38. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/tests/test_config.py +0 -0
  39. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/tests/test_ec2.py +0 -0
  40. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap_g4dn.egg-info/dependency_links.txt +0 -0
  41. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap_g4dn.egg-info/entry_points.txt +0 -0
  42. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap_g4dn.egg-info/requires.txt +0 -0
  43. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap_g4dn.egg-info/top_level.txt +0 -0
  44. {aws_bootstrap_g4dn-0.2.0 → aws_bootstrap_g4dn-0.4.0}/setup.cfg +0 -0
@@ -20,7 +20,7 @@ jobs:
20
20
  strategy:
21
21
  fail-fast: false
22
22
  matrix:
23
- python-version: ["3.14"]
23
+ python-version: ["3.12", "3.13", "3.14"]
24
24
 
25
25
  steps:
26
26
  - uses: actions/checkout@v4
@@ -8,6 +8,7 @@ repos:
8
8
  - id: fix-byte-order-marker
9
9
  - id: check-case-conflict
10
10
  - id: check-json
11
+ exclude: ^aws_bootstrap/resources/(launch|tasks)\.json$
11
12
  - id: check-yaml
12
13
  args: [ --unsafe ]
13
14
  - id: detect-aws-credentials
@@ -10,7 +10,7 @@ Target workflows: Jupyter server-client, VSCode Remote SSH, and NVIDIA Nsight re
10
10
 
11
11
  ## Tech Stack & Requirements
12
12
 
13
- - **Python 3.14+** with **uv** package manager (astral-sh/uv) — used for venv creation, dependency management, and running the project
13
+ - **Python 3.12+** with **uv** package manager (astral-sh/uv) — used for venv creation, dependency management, and running the project
14
14
  - **boto3** — AWS SDK for EC2 provisioning (AMI lookup, security groups, instance launch, waiters)
15
15
  - **click** — CLI framework with built-in color support (`click.secho`, `click.style`)
16
16
  - **setuptools + setuptools-scm** — build backend with git-tag-based versioning (configured in pyproject.toml)
@@ -33,20 +33,26 @@ aws_bootstrap/
33
33
  cli.py # Click CLI entry point (launch, status, terminate commands)
34
34
  config.py # LaunchConfig dataclass with defaults
35
35
  ec2.py # AMI lookup, security group, instance launch/find/terminate, polling, spot pricing
36
- ssh.py # SSH key pair import, SSH readiness check, remote setup, ~/.ssh/config management
36
+ gpu.py # GPU architecture mapping and GpuInfo dataclass
37
+ ssh.py # SSH key pair import, SSH readiness check, remote setup, ~/.ssh/config management, GPU queries
37
38
  resources/ # Non-Python artifacts SCP'd to remote instances
38
39
  __init__.py
39
40
  gpu_benchmark.py # GPU throughput benchmark (CNN + Transformer), copied to ~/gpu_benchmark.py on instance
40
41
  gpu_smoke_test.ipynb # Interactive Jupyter notebook for GPU verification, copied to ~/gpu_smoke_test.ipynb
42
+ launch.json # VSCode CUDA debug config template (deployed to ~/workspace/.vscode/launch.json)
43
+ saxpy.cu # Example CUDA SAXPY source (deployed to ~/workspace/saxpy.cu)
44
+ tasks.json # VSCode CUDA build tasks template (deployed to ~/workspace/.vscode/tasks.json)
41
45
  remote_setup.sh # Uploaded & run on instance post-boot (GPU verify, Jupyter, etc.)
42
46
  requirements.txt # Python dependencies installed on the remote instance
43
47
  tests/ # Unit tests (pytest)
44
48
  test_config.py
45
49
  test_cli.py
46
50
  test_ec2.py
51
+ test_gpu.py
47
52
  test_ssh_config.py
48
53
  test_ssh_gpu.py
49
54
  docs/
55
+ nsight-remote-profiling.md # Nsight Compute, Nsight Systems, and Nsight VSCE remote profiling guide
50
56
  spot-request-lifecycle.md # Research notes on spot request cleanup
51
57
  ```
52
58
 
@@ -54,8 +60,8 @@ Entry point: `aws-bootstrap = "aws_bootstrap.cli:main"` (installed via `uv sync`
54
60
 
55
61
  ## CLI Commands
56
62
 
57
- - **`launch`** — provisions an EC2 instance (spot by default, falls back to on-demand on capacity errors); adds SSH config alias (e.g. `aws-gpu1`) to `~/.ssh/config`
58
- - **`status`** — lists all non-terminated instances (including `shutting-down`) with type, IP, SSH alias, pricing (spot price/hr or on-demand), uptime, and estimated cost for running spot instances; `--gpu` flag queries GPU info via SSH, reporting both CUDA toolkit version (from `nvcc`) and driver-supported max (from `nvidia-smi`)
63
+ - **`launch`** — provisions an EC2 instance (spot by default, falls back to on-demand on capacity errors); adds SSH config alias (e.g. `aws-gpu1`) to `~/.ssh/config`; `--python-version` controls which Python `uv` installs in the remote venv; `--ssh-port` overrides the default SSH port (22) for security group ingress, connection checks, and SSH config
64
+ - **`status`** — lists all non-terminated instances (including `shutting-down`) with type, IP, SSH alias, pricing (spot price/hr or on-demand), uptime, and estimated cost for running spot instances; `--gpu` flag queries GPU info via SSH, reporting both CUDA toolkit version (from `nvcc`) and driver-supported max (from `nvidia-smi`); `--instructions` (default: on) prints connection commands (SSH, Jupyter tunnel, VSCode Remote SSH, GPU benchmark) for each running instance; suppress with `--no-instructions`
59
65
  - **`terminate`** — terminates instances by ID or all aws-bootstrap instances in the region; removes SSH config aliases
60
66
  - **`list instance-types`** — lists EC2 instance types matching a family prefix (default: `g4dn`), showing vCPUs, memory, and GPU info
61
67
  - **`list amis`** — lists available AMIs matching a name pattern (default: Deep Learning Base OSS Nvidia Driver GPU AMIs), sorted newest-first
@@ -96,9 +102,11 @@ The `KNOWN_CUDA_TAGS` array in `remote_setup.sh` lists the CUDA wheel tags publi
96
102
  ## Remote Setup Details
97
103
 
98
104
  `remote_setup.sh` also:
99
- - Creates `~/venv` and appends `source ~/venv/bin/activate` to `~/.bashrc` so the venv is auto-activated on SSH login
105
+ - Creates `~/venv` and appends `source ~/venv/bin/activate` to `~/.bashrc` so the venv is auto-activated on SSH login. When `--python-version` is passed to `launch`, the CLI sets `PYTHON_VERSION` as an inline env var on the SSH command; `remote_setup.sh` reads it to run `uv python install` and `uv venv --python` with the requested version
106
+ - Adds NVIDIA Nsight Systems (`nsys`) to PATH if installed under `/opt/nvidia/nsight-systems/` (pre-installed on Deep Learning AMIs but not on PATH by default). Fixes directory permissions, finds the latest version, and prepends its `bin/` to PATH in `~/.bashrc`
100
107
  - Runs a quick CUDA smoke test (`torch.cuda.is_available()` + GPU matmul) after PyTorch installation to verify the GPU stack; prints a WARNING on failure but does not abort
101
108
  - Copies `gpu_benchmark.py` to `~/gpu_benchmark.py` and `gpu_smoke_test.ipynb` to `~/gpu_smoke_test.ipynb`
109
+ - Sets up `~/workspace/.vscode/` with `launch.json` and `tasks.json` for CUDA debugging. Detects `cuda-gdb` path and GPU SM architecture (via `nvidia-smi --query-gpu=compute_cap`) at deploy time, replacing `__CUDA_GDB_PATH__` and `__GPU_ARCH__` placeholders in the template files via `sed`
102
110
 
103
111
  ## GPU Benchmark
104
112
 
@@ -1,13 +1,16 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: aws-bootstrap-g4dn
3
- Version: 0.2.0
3
+ Version: 0.4.0
4
4
  Summary: Bootstrap AWS EC2 GPU instances for hybrid local-remote development
5
5
  Author: Adam Ever-Hadani
6
6
  License-Expression: MIT
7
7
  Project-URL: Homepage, https://github.com/promptromp/aws-bootstrap-g4dn
8
8
  Project-URL: Issues, https://github.com/promptromp/aws-bootstrap-g4dn/issues
9
9
  Keywords: aws,ec2,gpu,cuda,deep-learning,spot-instances,cli
10
- Requires-Python: >=3.14
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Classifier: Programming Language :: Python :: 3.14
13
+ Requires-Python: >=3.12
11
14
  Description-Content-Type: text/markdown
12
15
  License-File: LICENSE
13
16
  Requires-Dist: boto3>=1.35
@@ -46,7 +49,7 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
46
49
  ### 🎯 Target Workflows
47
50
 
48
51
  1. **Jupyter server-client** — Jupyter runs on the instance, connect from your local browser
49
- 2. **VSCode Remote SSH** — `ssh aws-gpu1` just works with the Remote SSH extension
52
+ 2. **VSCode Remote SSH** — opens `~/workspace` with pre-configured CUDA debug/build tasks and an example `.cu` file
50
53
  3. **NVIDIA Nsight remote debugging** — GPU debugging over SSH
51
54
 
52
55
  ---
@@ -55,7 +58,7 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
55
58
 
56
59
  1. AWS profile configured with relevant permissions (profile name can be passed via `--profile` or read from `AWS_PROFILE` env var)
57
60
  2. AWS CLI v2 — see [here](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)
58
- 3. Python 3.14+ and [uv](https://github.com/astral-sh/uv)
61
+ 3. Python 3.12+ and [uv](https://github.com/astral-sh/uv)
59
62
  4. An SSH key pair (see below)
60
63
 
61
64
  ## Installation
@@ -123,6 +126,12 @@ aws-bootstrap launch --on-demand --instance-type g5.xlarge --region us-east-1
123
126
  # Launch without running the remote setup script
124
127
  aws-bootstrap launch --no-setup
125
128
 
129
+ # Use a specific Python version in the remote venv
130
+ aws-bootstrap launch --python-version 3.13
131
+
132
+ # Use a non-default SSH port
133
+ aws-bootstrap launch --ssh-port 2222
134
+
126
135
  # Use a specific AWS profile
127
136
  aws-bootstrap launch --profile my-aws-profile
128
137
  ```
@@ -146,13 +155,14 @@ The setup script runs automatically on the instance after SSH becomes available:
146
155
  |------|------|
147
156
  | **GPU verify** | Confirms `nvidia-smi` and `nvcc` are working |
148
157
  | **Utilities** | Installs `htop`, `tmux`, `tree`, `jq` |
149
- | **Python venv** | Creates `~/venv` with `uv`, auto-activates in `~/.bashrc` |
158
+ | **Python venv** | Creates `~/venv` with `uv`, auto-activates in `~/.bashrc`. Use `--python-version` to pin a specific Python (e.g. `3.13`) |
150
159
  | **CUDA-aware PyTorch** | Detects CUDA toolkit version → installs PyTorch from the matching `cu{TAG}` wheel index |
151
160
  | **CUDA smoke test** | Runs `torch.cuda.is_available()` + GPU matmul to verify the stack |
152
161
  | **GPU benchmark** | Copies `gpu_benchmark.py` to `~/gpu_benchmark.py` |
153
162
  | **GPU smoke test notebook** | Copies `gpu_smoke_test.ipynb` to `~/gpu_smoke_test.ipynb` (open in JupyterLab) |
154
163
  | **Jupyter** | Configures and starts JupyterLab as a systemd service on port 8888 |
155
164
  | **SSH keepalive** | Configures server-side keepalive to prevent idle disconnects |
165
+ | **VSCode workspace** | Creates `~/workspace/.vscode/` with `launch.json` and `tasks.json` (auto-detected `cuda-gdb` path and GPU arch), plus an example `saxpy.cu` |
156
166
 
157
167
  ### 📊 GPU Benchmark
158
168
 
@@ -191,6 +201,28 @@ ssh -i ~/.ssh/id_ed25519 -NL 8888:localhost:8888 ubuntu@<public-ip>
191
201
 
192
202
  A **GPU smoke test notebook** (`~/gpu_smoke_test.ipynb`) is pre-installed on every instance. Open it in JupyterLab to interactively verify the CUDA stack, run FP32/FP16 matmuls, train a small CNN on MNIST, and visualise training loss and GPU memory usage.
193
203
 
204
+ ### 🖥️ VSCode Remote SSH
205
+
206
+ The remote setup creates a `~/workspace` folder with pre-configured CUDA debug and build tasks:
207
+
208
+ ```
209
+ ~/workspace/
210
+ ├── .vscode/
211
+ │ ├── launch.json # CUDA debug configs (cuda-gdb path auto-detected)
212
+ │ └── tasks.json # nvcc build tasks (GPU arch auto-detected, e.g. sm_75)
213
+ └── saxpy.cu # Example CUDA source — open and press F5 to debug
214
+ ```
215
+
216
+ Connect directly from your terminal:
217
+
218
+ ```bash
219
+ code --folder-uri vscode-remote://ssh-remote+aws-gpu1/home/ubuntu/workspace
220
+ ```
221
+
222
+ Then install the [Nsight VSCE extension](https://marketplace.visualstudio.com/items?itemName=NVIDIA.nsight-vscode-edition) on the remote when prompted. Open `saxpy.cu`, set a breakpoint, and press F5.
223
+
224
+ See [Nsight remote profiling guide](docs/nsight-remote-profiling.md) for more details on CUDA debugging and profiling workflows.
225
+
194
226
  ### 📋 Listing Resources
195
227
 
196
228
  ```bash
@@ -220,6 +252,9 @@ aws-bootstrap status
220
252
  # Include GPU info (CUDA toolkit + driver version, GPU name, architecture) via SSH
221
253
  aws-bootstrap status --gpu
222
254
 
255
+ # Hide connection commands (shown by default for each running instance)
256
+ aws-bootstrap status --no-instructions
257
+
223
258
  # List instances in a specific region
224
259
  aws-bootstrap status --region us-east-1
225
260
 
@@ -310,7 +345,7 @@ aws-bootstrap launch --instance-type t3.medium --ami-filter "ubuntu/images/hvm-s
310
345
  | GPU instance pricing | [instances.vantage.sh](https://instances.vantage.sh/aws/ec2/g4dn.xlarge) |
311
346
  | Spot instance quotas | [AWS docs](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-spot-limits.html) |
312
347
  | Deep Learning AMIs | [AWS docs](https://docs.aws.amazon.com/dlami/latest/devguide/what-is-dlami.html) |
313
- | Nvidia Nsight remote debugging | [Nvidia docs](https://docs.nvidia.com/nsight-visual-studio-edition/3.2/Content/Setup_Remote_Debugging.htm) |
348
+ | Nsight remote GPU profiling | [Guide](docs/nsight-remote-profiling.md) — Nsight Compute, Nsight Systems, and Nsight VSCE on EC2 |
314
349
 
315
350
  Tutorials on setting up a CUDA environment on EC2 GPU instances:
316
351
 
@@ -1,19 +1,3 @@
1
- Metadata-Version: 2.4
2
- Name: aws-bootstrap-g4dn
3
- Version: 0.2.0
4
- Summary: Bootstrap AWS EC2 GPU instances for hybrid local-remote development
5
- Author: Adam Ever-Hadani
6
- License-Expression: MIT
7
- Project-URL: Homepage, https://github.com/promptromp/aws-bootstrap-g4dn
8
- Project-URL: Issues, https://github.com/promptromp/aws-bootstrap-g4dn/issues
9
- Keywords: aws,ec2,gpu,cuda,deep-learning,spot-instances,cli
10
- Requires-Python: >=3.14
11
- Description-Content-Type: text/markdown
12
- License-File: LICENSE
13
- Requires-Dist: boto3>=1.35
14
- Requires-Dist: click>=8.1
15
- Dynamic: license-file
16
-
17
1
  # aws-bootstrap-g4dn
18
2
 
19
3
  --------------------------------------------------------------------------------
@@ -46,7 +30,7 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
46
30
  ### 🎯 Target Workflows
47
31
 
48
32
  1. **Jupyter server-client** — Jupyter runs on the instance, connect from your local browser
49
- 2. **VSCode Remote SSH** — `ssh aws-gpu1` just works with the Remote SSH extension
33
+ 2. **VSCode Remote SSH** — opens `~/workspace` with pre-configured CUDA debug/build tasks and an example `.cu` file
50
34
  3. **NVIDIA Nsight remote debugging** — GPU debugging over SSH
51
35
 
52
36
  ---
@@ -55,7 +39,7 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
55
39
 
56
40
  1. AWS profile configured with relevant permissions (profile name can be passed via `--profile` or read from `AWS_PROFILE` env var)
57
41
  2. AWS CLI v2 — see [here](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)
58
- 3. Python 3.14+ and [uv](https://github.com/astral-sh/uv)
42
+ 3. Python 3.12+ and [uv](https://github.com/astral-sh/uv)
59
43
  4. An SSH key pair (see below)
60
44
 
61
45
  ## Installation
@@ -123,6 +107,12 @@ aws-bootstrap launch --on-demand --instance-type g5.xlarge --region us-east-1
123
107
  # Launch without running the remote setup script
124
108
  aws-bootstrap launch --no-setup
125
109
 
110
+ # Use a specific Python version in the remote venv
111
+ aws-bootstrap launch --python-version 3.13
112
+
113
+ # Use a non-default SSH port
114
+ aws-bootstrap launch --ssh-port 2222
115
+
126
116
  # Use a specific AWS profile
127
117
  aws-bootstrap launch --profile my-aws-profile
128
118
  ```
@@ -146,13 +136,14 @@ The setup script runs automatically on the instance after SSH becomes available:
146
136
  |------|------|
147
137
  | **GPU verify** | Confirms `nvidia-smi` and `nvcc` are working |
148
138
  | **Utilities** | Installs `htop`, `tmux`, `tree`, `jq` |
149
- | **Python venv** | Creates `~/venv` with `uv`, auto-activates in `~/.bashrc` |
139
+ | **Python venv** | Creates `~/venv` with `uv`, auto-activates in `~/.bashrc`. Use `--python-version` to pin a specific Python (e.g. `3.13`) |
150
140
  | **CUDA-aware PyTorch** | Detects CUDA toolkit version → installs PyTorch from the matching `cu{TAG}` wheel index |
151
141
  | **CUDA smoke test** | Runs `torch.cuda.is_available()` + GPU matmul to verify the stack |
152
142
  | **GPU benchmark** | Copies `gpu_benchmark.py` to `~/gpu_benchmark.py` |
153
143
  | **GPU smoke test notebook** | Copies `gpu_smoke_test.ipynb` to `~/gpu_smoke_test.ipynb` (open in JupyterLab) |
154
144
  | **Jupyter** | Configures and starts JupyterLab as a systemd service on port 8888 |
155
145
  | **SSH keepalive** | Configures server-side keepalive to prevent idle disconnects |
146
+ | **VSCode workspace** | Creates `~/workspace/.vscode/` with `launch.json` and `tasks.json` (auto-detected `cuda-gdb` path and GPU arch), plus an example `saxpy.cu` |
156
147
 
157
148
  ### 📊 GPU Benchmark
158
149
 
@@ -191,6 +182,28 @@ ssh -i ~/.ssh/id_ed25519 -NL 8888:localhost:8888 ubuntu@<public-ip>
191
182
 
192
183
  A **GPU smoke test notebook** (`~/gpu_smoke_test.ipynb`) is pre-installed on every instance. Open it in JupyterLab to interactively verify the CUDA stack, run FP32/FP16 matmuls, train a small CNN on MNIST, and visualise training loss and GPU memory usage.
193
184
 
185
+ ### 🖥️ VSCode Remote SSH
186
+
187
+ The remote setup creates a `~/workspace` folder with pre-configured CUDA debug and build tasks:
188
+
189
+ ```
190
+ ~/workspace/
191
+ ├── .vscode/
192
+ │ ├── launch.json # CUDA debug configs (cuda-gdb path auto-detected)
193
+ │ └── tasks.json # nvcc build tasks (GPU arch auto-detected, e.g. sm_75)
194
+ └── saxpy.cu # Example CUDA source — open and press F5 to debug
195
+ ```
196
+
197
+ Connect directly from your terminal:
198
+
199
+ ```bash
200
+ code --folder-uri vscode-remote://ssh-remote+aws-gpu1/home/ubuntu/workspace
201
+ ```
202
+
203
+ Then install the [Nsight VSCE extension](https://marketplace.visualstudio.com/items?itemName=NVIDIA.nsight-vscode-edition) on the remote when prompted. Open `saxpy.cu`, set a breakpoint, and press F5.
204
+
205
+ See [Nsight remote profiling guide](docs/nsight-remote-profiling.md) for more details on CUDA debugging and profiling workflows.
206
+
194
207
  ### 📋 Listing Resources
195
208
 
196
209
  ```bash
@@ -220,6 +233,9 @@ aws-bootstrap status
220
233
  # Include GPU info (CUDA toolkit + driver version, GPU name, architecture) via SSH
221
234
  aws-bootstrap status --gpu
222
235
 
236
+ # Hide connection commands (shown by default for each running instance)
237
+ aws-bootstrap status --no-instructions
238
+
223
239
  # List instances in a specific region
224
240
  aws-bootstrap status --region us-east-1
225
241
 
@@ -310,7 +326,7 @@ aws-bootstrap launch --instance-type t3.medium --ami-filter "ubuntu/images/hvm-s
310
326
  | GPU instance pricing | [instances.vantage.sh](https://instances.vantage.sh/aws/ec2/g4dn.xlarge) |
311
327
  | Spot instance quotas | [AWS docs](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-spot-limits.html) |
312
328
  | Deep Learning AMIs | [AWS docs](https://docs.aws.amazon.com/dlami/latest/devguide/what-is-dlami.html) |
313
- | Nvidia Nsight remote debugging | [Nvidia docs](https://docs.nvidia.com/nsight-visual-studio-edition/3.2/Content/Setup_Remote_Debugging.htm) |
329
+ | Nsight remote GPU profiling | [Guide](docs/nsight-remote-profiling.md) — Nsight Compute, Nsight Systems, and Nsight VSCE on EC2 |
314
330
 
315
331
  Tutorials on setting up a CUDA environment on EC2 GPU instances:
316
332
 
@@ -113,6 +113,12 @@ def main():
113
113
  @click.option("--no-setup", is_flag=True, default=False, help="Skip running the remote setup script.")
114
114
  @click.option("--dry-run", is_flag=True, default=False, help="Show what would be done without executing.")
115
115
  @click.option("--profile", default=None, help="AWS profile override (defaults to AWS_PROFILE env var).")
116
+ @click.option(
117
+ "--python-version",
118
+ default=None,
119
+ help="Python version for the remote venv (e.g. 3.13, 3.14.2). Passed to uv during setup.",
120
+ )
121
+ @click.option("--ssh-port", default=22, show_default=True, type=int, help="SSH port on the remote instance.")
116
122
  def launch(
117
123
  instance_type,
118
124
  ami_filter,
@@ -125,6 +131,8 @@ def launch(
125
131
  no_setup,
126
132
  dry_run,
127
133
  profile,
134
+ python_version,
135
+ ssh_port,
128
136
  ):
129
137
  """Launch a GPU-accelerated EC2 instance."""
130
138
  config = LaunchConfig(
@@ -137,6 +145,8 @@ def launch(
137
145
  volume_size=volume_size,
138
146
  run_setup=not no_setup,
139
147
  dry_run=dry_run,
148
+ ssh_port=ssh_port,
149
+ python_version=python_version,
140
150
  )
141
151
  if ami_filter:
142
152
  config.ami_filter = ami_filter
@@ -163,7 +173,7 @@ def launch(
163
173
 
164
174
  # Step 3: Security group
165
175
  step(3, 6, "Ensuring security group...")
166
- sg_id = ensure_security_group(ec2, config.security_group, config.tag_value)
176
+ sg_id = ensure_security_group(ec2, config.security_group, config.tag_value, ssh_port=config.ssh_port)
167
177
 
168
178
  pricing = "spot" if config.spot else "on-demand"
169
179
 
@@ -178,6 +188,10 @@ def launch(
178
188
  val("Volume", f"{config.volume_size} GB gp3")
179
189
  val("Region", config.region)
180
190
  val("Remote setup", "yes" if config.run_setup else "no")
191
+ if config.ssh_port != 22:
192
+ val("SSH port", str(config.ssh_port))
193
+ if config.python_version:
194
+ val("Python version", config.python_version)
181
195
  click.echo()
182
196
  click.secho("No resources launched (dry-run mode).", fg="yellow")
183
197
  return
@@ -202,9 +216,13 @@ def launch(
202
216
  # Step 6: SSH and remote setup
203
217
  step(6, 6, "Waiting for SSH access...")
204
218
  private_key = private_key_path(config.key_path)
205
- if not wait_for_ssh(public_ip, config.ssh_user, config.key_path):
219
+ if not wait_for_ssh(public_ip, config.ssh_user, config.key_path, port=config.ssh_port):
206
220
  warn("SSH did not become available within the timeout.")
207
- info(f"Instance is running try connecting manually: ssh -i {private_key} {config.ssh_user}@{public_ip}")
221
+ port_flag = f" -p {config.ssh_port}" if config.ssh_port != 22 else ""
222
+ info(
223
+ f"Instance is running — try connecting manually:"
224
+ f" ssh -i {private_key}{port_flag} {config.ssh_user}@{public_ip}"
225
+ )
208
226
  return
209
227
 
210
228
  if config.run_setup:
@@ -212,7 +230,9 @@ def launch(
212
230
  warn(f"Setup script not found at {SETUP_SCRIPT}, skipping.")
213
231
  else:
214
232
  info("Running remote setup...")
215
- if run_remote_setup(public_ip, config.ssh_user, config.key_path, SETUP_SCRIPT):
233
+ if run_remote_setup(
234
+ public_ip, config.ssh_user, config.key_path, SETUP_SCRIPT, config.python_version, port=config.ssh_port
235
+ ):
216
236
  success("Remote setup completed successfully.")
217
237
  else:
218
238
  warn("Remote setup failed. Instance is still running.")
@@ -224,6 +244,7 @@ def launch(
224
244
  user=config.ssh_user,
225
245
  key_path=config.key_path,
226
246
  alias_prefix=config.alias_prefix,
247
+ port=config.ssh_port,
227
248
  )
228
249
  success(f"Added SSH config alias: {alias}")
229
250
 
@@ -239,18 +260,27 @@ def launch(
239
260
  val("Pricing", pricing)
240
261
  val("SSH alias", alias)
241
262
 
263
+ port_flag = f" -p {config.ssh_port}" if config.ssh_port != 22 else ""
264
+
242
265
  click.echo()
243
266
  click.secho(" SSH:", fg="cyan")
244
- click.secho(f" ssh {alias}", bold=True)
245
- info(f"or: ssh -i {private_key} {config.ssh_user}@{public_ip}")
267
+ click.secho(f" ssh{port_flag} {alias}", bold=True)
268
+ info(f"or: ssh -i {private_key}{port_flag} {config.ssh_user}@{public_ip}")
246
269
 
247
270
  click.echo()
248
271
  click.secho(" Jupyter (via SSH tunnel):", fg="cyan")
249
- click.secho(f" ssh -NL 8888:localhost:8888 {alias}", bold=True)
250
- info(f"or: ssh -i {private_key} -NL 8888:localhost:8888 {config.ssh_user}@{public_ip}")
272
+ click.secho(f" ssh -NL 8888:localhost:8888{port_flag} {alias}", bold=True)
273
+ info(f"or: ssh -i {private_key} -NL 8888:localhost:8888{port_flag} {config.ssh_user}@{public_ip}")
251
274
  info("Then open: http://localhost:8888")
252
275
  info("Notebook: ~/gpu_smoke_test.ipynb (GPU smoke test)")
253
276
 
277
+ click.echo()
278
+ click.secho(" VSCode Remote SSH:", fg="cyan")
279
+ click.secho(
280
+ f" code --folder-uri vscode-remote://ssh-remote+{alias}/home/{config.ssh_user}/workspace",
281
+ bold=True,
282
+ )
283
+
254
284
  click.echo()
255
285
  click.secho(" GPU Benchmark:", fg="cyan")
256
286
  click.secho(f" ssh {alias} 'python ~/gpu_benchmark.py'", bold=True)
@@ -266,7 +296,14 @@ def launch(
266
296
  @click.option("--region", default="us-west-2", show_default=True, help="AWS region.")
267
297
  @click.option("--profile", default=None, help="AWS profile override.")
268
298
  @click.option("--gpu", is_flag=True, default=False, help="Query GPU info (CUDA, driver) via SSH.")
269
- def status(region, profile, gpu):
299
+ @click.option(
300
+ "--instructions/--no-instructions",
301
+ "-I",
302
+ default=True,
303
+ show_default=True,
304
+ help="Show connection commands (SSH, Jupyter, VSCode) for each running instance.",
305
+ )
306
+ def status(region, profile, gpu, instructions):
270
307
  """Show running instances created by aws-bootstrap."""
271
308
  session = boto3.Session(profile_name=profile, region_name=region)
272
309
  ec2 = session.client("ec2")
@@ -305,11 +342,15 @@ def status(region, profile, gpu):
305
342
  if inst["PublicIp"]:
306
343
  val(" IP", inst["PublicIp"])
307
344
 
345
+ # Look up SSH config details once (used by --gpu and --instructions)
346
+ details = None
347
+ if (gpu or instructions) and state == "running" and inst["PublicIp"]:
348
+ details = get_ssh_host_details(inst["InstanceId"])
349
+
308
350
  # GPU info (opt-in, only for running instances with a public IP)
309
351
  if gpu and state == "running" and inst["PublicIp"]:
310
- details = get_ssh_host_details(inst["InstanceId"])
311
352
  if details:
312
- gpu_info = query_gpu_info(details.hostname, details.user, details.identity_file)
353
+ gpu_info = query_gpu_info(details.hostname, details.user, details.identity_file, port=details.port)
313
354
  else:
314
355
  gpu_info = query_gpu_info(
315
356
  inst["PublicIp"],
@@ -353,6 +394,29 @@ def status(region, profile, gpu):
353
394
  val(" Est. cost", f"~${est_cost:.4f}")
354
395
 
355
396
  val(" Launched", str(inst["LaunchTime"]))
397
+
398
+ # Connection instructions (on by default; suppress with --no-instructions; only for running instances with a public IP and alias)
399
+ if instructions and state == "running" and inst["PublicIp"] and alias:
400
+ user = details.user if details else "ubuntu"
401
+ port = details.port if details else 22
402
+ port_flag = f" -p {port}" if port != 22 else ""
403
+
404
+ click.echo()
405
+ click.secho(" SSH:", fg="cyan")
406
+ click.secho(f" ssh{port_flag} {alias}", bold=True)
407
+
408
+ click.secho(" Jupyter (via SSH tunnel):", fg="cyan")
409
+ click.secho(f" ssh -NL 8888:localhost:8888{port_flag} {alias}", bold=True)
410
+
411
+ click.secho(" VSCode Remote SSH:", fg="cyan")
412
+ click.secho(
413
+ f" code --folder-uri vscode-remote://ssh-remote+{alias}/home/{user}/workspace",
414
+ bold=True,
415
+ )
416
+
417
+ click.secho(" GPU Benchmark:", fg="cyan")
418
+ click.secho(f" ssh {alias} 'python ~/gpu_benchmark.py'", bold=True)
419
+
356
420
  click.echo()
357
421
  first_id = instances[0]["InstanceId"]
358
422
  click.echo(" To terminate: " + click.style(f"aws-bootstrap terminate {first_id}", bold=True))
@@ -22,3 +22,5 @@ class LaunchConfig:
22
22
  ssh_user: str = "ubuntu"
23
23
  tag_value: str = "aws-bootstrap-g4dn"
24
24
  alias_prefix: str = "aws-gpu"
25
+ ssh_port: int = 22
26
+ python_version: str | None = None
@@ -59,7 +59,7 @@ def get_latest_ami(ec2_client, ami_filter: str) -> dict:
59
59
  return images[0]
60
60
 
61
61
 
62
- def ensure_security_group(ec2_client, name: str, tag_value: str) -> str:
62
+ def ensure_security_group(ec2_client, name: str, tag_value: str, ssh_port: int = 22) -> str:
63
63
  """Find or create a security group with SSH ingress in the default VPC."""
64
64
  # Find default VPC
65
65
  vpcs = ec2_client.describe_vpcs(Filters=[{"Name": "isDefault", "Values": ["true"]}])
@@ -103,8 +103,8 @@ def ensure_security_group(ec2_client, name: str, tag_value: str) -> str:
103
103
  IpPermissions=[
104
104
  {
105
105
  "IpProtocol": "tcp",
106
- "FromPort": 22,
107
- "ToPort": 22,
106
+ "FromPort": ssh_port,
107
+ "ToPort": ssh_port,
108
108
  "IpRanges": [{"CidrIp": "0.0.0.0/0", "Description": "SSH access"}],
109
109
  }
110
110
  ],
@@ -0,0 +1,27 @@
1
+ """GPU architecture mapping and GPU info dataclass."""
2
+
3
+ from __future__ import annotations
4
+ from dataclasses import dataclass
5
+
6
+
7
+ _GPU_ARCHITECTURES: dict[str, str] = {
8
+ "7.0": "Volta",
9
+ "7.5": "Turing",
10
+ "8.0": "Ampere",
11
+ "8.6": "Ampere",
12
+ "8.7": "Ampere",
13
+ "8.9": "Ada Lovelace",
14
+ "9.0": "Hopper",
15
+ }
16
+
17
+
18
+ @dataclass
19
+ class GpuInfo:
20
+ """GPU information retrieved via nvidia-smi and nvcc."""
21
+
22
+ driver_version: str
23
+ cuda_driver_version: str # max CUDA version supported by driver (from nvidia-smi)
24
+ cuda_toolkit_version: str | None # actual CUDA toolkit installed (from nvcc), None if unavailable
25
+ gpu_name: str
26
+ compute_capability: str
27
+ architecture: str
@@ -628,7 +628,9 @@ def configure_precision(device: torch.device, requested: PrecisionMode) -> Preci
628
628
  return PrecisionMode.FP32
629
629
 
630
630
 
631
- def print_system_info(requested_precision: PrecisionMode) -> tuple[torch.device, PrecisionMode]:
631
+ def print_system_info(
632
+ requested_precision: PrecisionMode, force_cpu: bool = False
633
+ ) -> tuple[torch.device, PrecisionMode]:
632
634
  """Print system and CUDA information, return device and actual precision mode."""
633
635
  print("\n" + "=" * 60)
634
636
  print("System Information")
@@ -636,7 +638,7 @@ def print_system_info(requested_precision: PrecisionMode) -> tuple[torch.device,
636
638
  print(f"PyTorch version: {torch.__version__}")
637
639
  print(f"Python version: {sys.version.split()[0]}")
638
640
 
639
- if torch.cuda.is_available():
641
+ if torch.cuda.is_available() and not force_cpu:
640
642
  device = torch.device("cuda")
641
643
  print("CUDA available: Yes")
642
644
  print(f"CUDA version: {torch.version.cuda}")
@@ -666,8 +668,11 @@ def print_system_info(requested_precision: PrecisionMode) -> tuple[torch.device,
666
668
  else:
667
669
  device = torch.device("cpu")
668
670
  actual_precision = PrecisionMode.FP32
669
- print("CUDA available: No (running on CPU)")
670
- print("WARNING: GPU benchmark results will not be representative!")
671
+ if force_cpu:
672
+ print("CPU-only mode requested (--cpu flag)")
673
+ else:
674
+ print("CUDA available: No (running on CPU)")
675
+ print("Running on CPU for benchmarking")
671
676
 
672
677
  print("=" * 60)
673
678
  return device, actual_precision
@@ -724,10 +729,15 @@ def main() -> None:
724
729
  action="store_true",
725
730
  help="Run CUDA/cuBLAS diagnostic tests before benchmarking",
726
731
  )
732
+ parser.add_argument(
733
+ "--cpu",
734
+ action="store_true",
735
+ help="Force CPU-only execution (for CPU vs GPU comparison)",
736
+ )
727
737
  args = parser.parse_args()
728
738
 
729
739
  requested_precision = PrecisionMode(args.precision)
730
- device, actual_precision = print_system_info(requested_precision)
740
+ device, actual_precision = print_system_info(requested_precision, force_cpu=args.cpu)
731
741
 
732
742
  # Run diagnostics if requested
733
743
  if args.diagnose:
@@ -0,0 +1,42 @@
1
+ {
2
+ // CUDA debug configurations for VSCode
3
+ // Deployed to: ~/workspace/.vscode/launch.json
4
+ //
5
+ // Usage: Open any .cu file, press F5 to build and debug
6
+ "version": "0.2.0",
7
+ "configurations": [
8
+ {
9
+ "name": "CUDA: Build and Debug Active File",
10
+ "type": "cuda-gdb",
11
+ "request": "launch",
12
+ "program": "${fileDirname}/${fileBasenameNoExtension}",
13
+ "args": [],
14
+ "cwd": "${fileDirname}",
15
+ "miDebuggerPath": "__CUDA_GDB_PATH__",
16
+ "stopAtEntry": false,
17
+ "preLaunchTask": "nvcc: build active file (debug)"
18
+ },
19
+ {
20
+ "name": "CUDA: Build and Debug (stop at main)",
21
+ "type": "cuda-gdb",
22
+ "request": "launch",
23
+ "program": "${fileDirname}/${fileBasenameNoExtension}",
24
+ "args": [],
25
+ "cwd": "${fileDirname}",
26
+ "miDebuggerPath": "__CUDA_GDB_PATH__",
27
+ "stopAtEntry": true,
28
+ "preLaunchTask": "nvcc: build active file (debug)"
29
+ },
30
+ {
31
+ "name": "CUDA: Run Active File (no debug)",
32
+ "type": "cuda-gdb",
33
+ "request": "launch",
34
+ "program": "${fileDirname}/${fileBasenameNoExtension}",
35
+ "args": [],
36
+ "cwd": "${fileDirname}",
37
+ "miDebuggerPath": "__CUDA_GDB_PATH__",
38
+ "stopAtEntry": false,
39
+ "preLaunchTask": "nvcc: build active file (release)"
40
+ }
41
+ ]
42
+ }