aws-bootstrap-g4dn 0.3.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/.pre-commit-config.yaml +1 -0
  2. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/CLAUDE.md +6 -0
  3. {aws_bootstrap_g4dn-0.3.0/aws_bootstrap_g4dn.egg-info → aws_bootstrap_g4dn-0.4.0}/PKG-INFO +26 -3
  4. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/README.md +25 -2
  5. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/cli.py +2 -2
  6. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/resources/gpu_benchmark.py +15 -5
  7. aws_bootstrap_g4dn-0.4.0/aws_bootstrap/resources/launch.json +42 -0
  8. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/resources/remote_setup.sh +83 -5
  9. aws_bootstrap_g4dn-0.4.0/aws_bootstrap/resources/saxpy.cu +49 -0
  10. aws_bootstrap_g4dn-0.4.0/aws_bootstrap/resources/tasks.json +48 -0
  11. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/ssh.py +36 -0
  12. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/tests/test_cli.py +1 -1
  13. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0/aws_bootstrap_g4dn.egg-info}/PKG-INFO +26 -3
  14. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap_g4dn.egg-info/SOURCES.txt +5 -1
  15. aws_bootstrap_g4dn-0.4.0/docs/nsight-remote-profiling.md +245 -0
  16. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/pyproject.toml +1 -1
  17. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  18. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  19. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/.github/workflows/ci.yml +0 -0
  20. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/.github/workflows/publish-to-pypi.yml +0 -0
  21. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/.gitignore +0 -0
  22. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/CODE_OF_CONDUCT.md +0 -0
  23. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/CONTRIBUTING.md +0 -0
  24. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/LICENSE +0 -0
  25. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/SECURITY.md +0 -0
  26. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/__init__.py +0 -0
  27. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/config.py +0 -0
  28. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/ec2.py +0 -0
  29. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/gpu.py +0 -0
  30. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/resources/__init__.py +0 -0
  31. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/resources/gpu_smoke_test.ipynb +0 -0
  32. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/resources/requirements.txt +0 -0
  33. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/tests/__init__.py +0 -0
  34. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/tests/test_config.py +0 -0
  35. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/tests/test_ec2.py +0 -0
  36. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/tests/test_gpu.py +0 -0
  37. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/tests/test_ssh_config.py +0 -0
  38. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap/tests/test_ssh_gpu.py +0 -0
  39. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap_g4dn.egg-info/dependency_links.txt +0 -0
  40. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap_g4dn.egg-info/entry_points.txt +0 -0
  41. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap_g4dn.egg-info/requires.txt +0 -0
  42. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/aws_bootstrap_g4dn.egg-info/top_level.txt +0 -0
  43. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/setup.cfg +0 -0
  44. {aws_bootstrap_g4dn-0.3.0 → aws_bootstrap_g4dn-0.4.0}/uv.lock +0 -0
@@ -8,6 +8,7 @@ repos:
8
8
  - id: fix-byte-order-marker
9
9
  - id: check-case-conflict
10
10
  - id: check-json
11
+ exclude: ^aws_bootstrap/resources/(launch|tasks)\.json$
11
12
  - id: check-yaml
12
13
  args: [ --unsafe ]
13
14
  - id: detect-aws-credentials
@@ -39,6 +39,9 @@ aws_bootstrap/
39
39
  __init__.py
40
40
  gpu_benchmark.py # GPU throughput benchmark (CNN + Transformer), copied to ~/gpu_benchmark.py on instance
41
41
  gpu_smoke_test.ipynb # Interactive Jupyter notebook for GPU verification, copied to ~/gpu_smoke_test.ipynb
42
+ launch.json # VSCode CUDA debug config template (deployed to ~/workspace/.vscode/launch.json)
43
+ saxpy.cu # Example CUDA SAXPY source (deployed to ~/workspace/saxpy.cu)
44
+ tasks.json # VSCode CUDA build tasks template (deployed to ~/workspace/.vscode/tasks.json)
42
45
  remote_setup.sh # Uploaded & run on instance post-boot (GPU verify, Jupyter, etc.)
43
46
  requirements.txt # Python dependencies installed on the remote instance
44
47
  tests/ # Unit tests (pytest)
@@ -49,6 +52,7 @@ aws_bootstrap/
49
52
  test_ssh_config.py
50
53
  test_ssh_gpu.py
51
54
  docs/
55
+ nsight-remote-profiling.md # Nsight Compute, Nsight Systems, and Nsight VSCE remote profiling guide
52
56
  spot-request-lifecycle.md # Research notes on spot request cleanup
53
57
  ```
54
58
 
@@ -99,8 +103,10 @@ The `KNOWN_CUDA_TAGS` array in `remote_setup.sh` lists the CUDA wheel tags publi
99
103
 
100
104
  `remote_setup.sh` also:
101
105
  - Creates `~/venv` and appends `source ~/venv/bin/activate` to `~/.bashrc` so the venv is auto-activated on SSH login. When `--python-version` is passed to `launch`, the CLI sets `PYTHON_VERSION` as an inline env var on the SSH command; `remote_setup.sh` reads it to run `uv python install` and `uv venv --python` with the requested version
106
+ - Adds NVIDIA Nsight Systems (`nsys`) to PATH if installed under `/opt/nvidia/nsight-systems/` (pre-installed on Deep Learning AMIs but not on PATH by default). Fixes directory permissions, finds the latest version, and prepends its `bin/` to PATH in `~/.bashrc`
102
107
  - Runs a quick CUDA smoke test (`torch.cuda.is_available()` + GPU matmul) after PyTorch installation to verify the GPU stack; prints a WARNING on failure but does not abort
103
108
  - Copies `gpu_benchmark.py` to `~/gpu_benchmark.py` and `gpu_smoke_test.ipynb` to `~/gpu_smoke_test.ipynb`
109
+ - Sets up `~/workspace/.vscode/` with `launch.json` and `tasks.json` for CUDA debugging. Detects `cuda-gdb` path and GPU SM architecture (via `nvidia-smi --query-gpu=compute_cap`) at deploy time, replacing `__CUDA_GDB_PATH__` and `__GPU_ARCH__` placeholders in the template files via `sed`
104
110
 
105
111
  ## GPU Benchmark
106
112
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: aws-bootstrap-g4dn
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: Bootstrap AWS EC2 GPU instances for hybrid local-remote development
5
5
  Author: Adam Ever-Hadani
6
6
  License-Expression: MIT
@@ -49,7 +49,7 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
49
49
  ### 🎯 Target Workflows
50
50
 
51
51
  1. **Jupyter server-client** — Jupyter runs on the instance, connect from your local browser
52
- 2. **VSCode Remote SSH** — `ssh aws-gpu1` just works with the Remote SSH extension
52
+ 2. **VSCode Remote SSH** — opens `~/workspace` with pre-configured CUDA debug/build tasks and an example `.cu` file
53
53
  3. **NVIDIA Nsight remote debugging** — GPU debugging over SSH
54
54
 
55
55
  ---
@@ -162,6 +162,7 @@ The setup script runs automatically on the instance after SSH becomes available:
162
162
  | **GPU smoke test notebook** | Copies `gpu_smoke_test.ipynb` to `~/gpu_smoke_test.ipynb` (open in JupyterLab) |
163
163
  | **Jupyter** | Configures and starts JupyterLab as a systemd service on port 8888 |
164
164
  | **SSH keepalive** | Configures server-side keepalive to prevent idle disconnects |
165
+ | **VSCode workspace** | Creates `~/workspace/.vscode/` with `launch.json` and `tasks.json` (auto-detected `cuda-gdb` path and GPU arch), plus an example `saxpy.cu` |
165
166
 
166
167
  ### 📊 GPU Benchmark
167
168
 
@@ -200,6 +201,28 @@ ssh -i ~/.ssh/id_ed25519 -NL 8888:localhost:8888 ubuntu@<public-ip>
200
201
 
201
202
  A **GPU smoke test notebook** (`~/gpu_smoke_test.ipynb`) is pre-installed on every instance. Open it in JupyterLab to interactively verify the CUDA stack, run FP32/FP16 matmuls, train a small CNN on MNIST, and visualise training loss and GPU memory usage.
202
203
 
204
+ ### 🖥️ VSCode Remote SSH
205
+
206
+ The remote setup creates a `~/workspace` folder with pre-configured CUDA debug and build tasks:
207
+
208
+ ```
209
+ ~/workspace/
210
+ ├── .vscode/
211
+ │ ├── launch.json # CUDA debug configs (cuda-gdb path auto-detected)
212
+ │ └── tasks.json # nvcc build tasks (GPU arch auto-detected, e.g. sm_75)
213
+ └── saxpy.cu # Example CUDA source — open and press F5 to debug
214
+ ```
215
+
216
+ Connect directly from your terminal:
217
+
218
+ ```bash
219
+ code --folder-uri vscode-remote://ssh-remote+aws-gpu1/home/ubuntu/workspace
220
+ ```
221
+
222
+ Then install the [Nsight VSCE extension](https://marketplace.visualstudio.com/items?itemName=NVIDIA.nsight-vscode-edition) on the remote when prompted. Open `saxpy.cu`, set a breakpoint, and press F5.
223
+
224
+ See [Nsight remote profiling guide](docs/nsight-remote-profiling.md) for more details on CUDA debugging and profiling workflows.
225
+
203
226
  ### 📋 Listing Resources
204
227
 
205
228
  ```bash
@@ -322,7 +345,7 @@ aws-bootstrap launch --instance-type t3.medium --ami-filter "ubuntu/images/hvm-s
322
345
  | GPU instance pricing | [instances.vantage.sh](https://instances.vantage.sh/aws/ec2/g4dn.xlarge) |
323
346
  | Spot instance quotas | [AWS docs](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-spot-limits.html) |
324
347
  | Deep Learning AMIs | [AWS docs](https://docs.aws.amazon.com/dlami/latest/devguide/what-is-dlami.html) |
325
- | Nvidia Nsight remote debugging | [Nvidia docs](https://docs.nvidia.com/nsight-visual-studio-edition/3.2/Content/Setup_Remote_Debugging.htm) |
348
+ | Nsight remote GPU profiling | [Guide](docs/nsight-remote-profiling.md) — Nsight Compute, Nsight Systems, and Nsight VSCE on EC2 |
326
349
 
327
350
  Tutorials on setting up a CUDA environment on EC2 GPU instances:
328
351
 
@@ -30,7 +30,7 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
30
30
  ### 🎯 Target Workflows
31
31
 
32
32
  1. **Jupyter server-client** — Jupyter runs on the instance, connect from your local browser
33
- 2. **VSCode Remote SSH** — `ssh aws-gpu1` just works with the Remote SSH extension
33
+ 2. **VSCode Remote SSH** — opens `~/workspace` with pre-configured CUDA debug/build tasks and an example `.cu` file
34
34
  3. **NVIDIA Nsight remote debugging** — GPU debugging over SSH
35
35
 
36
36
  ---
@@ -143,6 +143,7 @@ The setup script runs automatically on the instance after SSH becomes available:
143
143
  | **GPU smoke test notebook** | Copies `gpu_smoke_test.ipynb` to `~/gpu_smoke_test.ipynb` (open in JupyterLab) |
144
144
  | **Jupyter** | Configures and starts JupyterLab as a systemd service on port 8888 |
145
145
  | **SSH keepalive** | Configures server-side keepalive to prevent idle disconnects |
146
+ | **VSCode workspace** | Creates `~/workspace/.vscode/` with `launch.json` and `tasks.json` (auto-detected `cuda-gdb` path and GPU arch), plus an example `saxpy.cu` |
146
147
 
147
148
  ### 📊 GPU Benchmark
148
149
 
@@ -181,6 +182,28 @@ ssh -i ~/.ssh/id_ed25519 -NL 8888:localhost:8888 ubuntu@<public-ip>
181
182
 
182
183
  A **GPU smoke test notebook** (`~/gpu_smoke_test.ipynb`) is pre-installed on every instance. Open it in JupyterLab to interactively verify the CUDA stack, run FP32/FP16 matmuls, train a small CNN on MNIST, and visualise training loss and GPU memory usage.
183
184
 
185
+ ### 🖥️ VSCode Remote SSH
186
+
187
+ The remote setup creates a `~/workspace` folder with pre-configured CUDA debug and build tasks:
188
+
189
+ ```
190
+ ~/workspace/
191
+ ├── .vscode/
192
+ │ ├── launch.json # CUDA debug configs (cuda-gdb path auto-detected)
193
+ │ └── tasks.json # nvcc build tasks (GPU arch auto-detected, e.g. sm_75)
194
+ └── saxpy.cu # Example CUDA source — open and press F5 to debug
195
+ ```
196
+
197
+ Connect directly from your terminal:
198
+
199
+ ```bash
200
+ code --folder-uri vscode-remote://ssh-remote+aws-gpu1/home/ubuntu/workspace
201
+ ```
202
+
203
+ Then install the [Nsight VSCE extension](https://marketplace.visualstudio.com/items?itemName=NVIDIA.nsight-vscode-edition) on the remote when prompted. Open `saxpy.cu`, set a breakpoint, and press F5.
204
+
205
+ See [Nsight remote profiling guide](docs/nsight-remote-profiling.md) for more details on CUDA debugging and profiling workflows.
206
+
184
207
  ### 📋 Listing Resources
185
208
 
186
209
  ```bash
@@ -303,7 +326,7 @@ aws-bootstrap launch --instance-type t3.medium --ami-filter "ubuntu/images/hvm-s
303
326
  | GPU instance pricing | [instances.vantage.sh](https://instances.vantage.sh/aws/ec2/g4dn.xlarge) |
304
327
  | Spot instance quotas | [AWS docs](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-spot-limits.html) |
305
328
  | Deep Learning AMIs | [AWS docs](https://docs.aws.amazon.com/dlami/latest/devguide/what-is-dlami.html) |
306
- | Nvidia Nsight remote debugging | [Nvidia docs](https://docs.nvidia.com/nsight-visual-studio-edition/3.2/Content/Setup_Remote_Debugging.htm) |
329
+ | Nsight remote GPU profiling | [Guide](docs/nsight-remote-profiling.md) — Nsight Compute, Nsight Systems, and Nsight VSCE on EC2 |
307
330
 
308
331
  Tutorials on setting up a CUDA environment on EC2 GPU instances:
309
332
 
@@ -277,7 +277,7 @@ def launch(
277
277
  click.echo()
278
278
  click.secho(" VSCode Remote SSH:", fg="cyan")
279
279
  click.secho(
280
- f" code --folder-uri vscode-remote://ssh-remote+{alias}/home/{config.ssh_user}",
280
+ f" code --folder-uri vscode-remote://ssh-remote+{alias}/home/{config.ssh_user}/workspace",
281
281
  bold=True,
282
282
  )
283
283
 
@@ -410,7 +410,7 @@ def status(region, profile, gpu, instructions):
410
410
 
411
411
  click.secho(" VSCode Remote SSH:", fg="cyan")
412
412
  click.secho(
413
- f" code --folder-uri vscode-remote://ssh-remote+{alias}/home/{user}",
413
+ f" code --folder-uri vscode-remote://ssh-remote+{alias}/home/{user}/workspace",
414
414
  bold=True,
415
415
  )
416
416
 
@@ -628,7 +628,9 @@ def configure_precision(device: torch.device, requested: PrecisionMode) -> Preci
628
628
  return PrecisionMode.FP32
629
629
 
630
630
 
631
- def print_system_info(requested_precision: PrecisionMode) -> tuple[torch.device, PrecisionMode]:
631
+ def print_system_info(
632
+ requested_precision: PrecisionMode, force_cpu: bool = False
633
+ ) -> tuple[torch.device, PrecisionMode]:
632
634
  """Print system and CUDA information, return device and actual precision mode."""
633
635
  print("\n" + "=" * 60)
634
636
  print("System Information")
@@ -636,7 +638,7 @@ def print_system_info(requested_precision: PrecisionMode) -> tuple[torch.device,
636
638
  print(f"PyTorch version: {torch.__version__}")
637
639
  print(f"Python version: {sys.version.split()[0]}")
638
640
 
639
- if torch.cuda.is_available():
641
+ if torch.cuda.is_available() and not force_cpu:
640
642
  device = torch.device("cuda")
641
643
  print("CUDA available: Yes")
642
644
  print(f"CUDA version: {torch.version.cuda}")
@@ -666,8 +668,11 @@ def print_system_info(requested_precision: PrecisionMode) -> tuple[torch.device,
666
668
  else:
667
669
  device = torch.device("cpu")
668
670
  actual_precision = PrecisionMode.FP32
669
- print("CUDA available: No (running on CPU)")
670
- print("WARNING: GPU benchmark results will not be representative!")
671
+ if force_cpu:
672
+ print("CPU-only mode requested (--cpu flag)")
673
+ else:
674
+ print("CUDA available: No (running on CPU)")
675
+ print("Running on CPU for benchmarking")
671
676
 
672
677
  print("=" * 60)
673
678
  return device, actual_precision
@@ -724,10 +729,15 @@ def main() -> None:
724
729
  action="store_true",
725
730
  help="Run CUDA/cuBLAS diagnostic tests before benchmarking",
726
731
  )
732
+ parser.add_argument(
733
+ "--cpu",
734
+ action="store_true",
735
+ help="Force CPU-only execution (for CPU vs GPU comparison)",
736
+ )
727
737
  args = parser.parse_args()
728
738
 
729
739
  requested_precision = PrecisionMode(args.precision)
730
- device, actual_precision = print_system_info(requested_precision)
740
+ device, actual_precision = print_system_info(requested_precision, force_cpu=args.cpu)
731
741
 
732
742
  # Run diagnostics if requested
733
743
  if args.diagnose:
@@ -0,0 +1,42 @@
1
+ {
2
+ // CUDA debug configurations for VSCode
3
+ // Deployed to: ~/workspace/.vscode/launch.json
4
+ //
5
+ // Usage: Open any .cu file, press F5 to build and debug
6
+ "version": "0.2.0",
7
+ "configurations": [
8
+ {
9
+ "name": "CUDA: Build and Debug Active File",
10
+ "type": "cuda-gdb",
11
+ "request": "launch",
12
+ "program": "${fileDirname}/${fileBasenameNoExtension}",
13
+ "args": [],
14
+ "cwd": "${fileDirname}",
15
+ "miDebuggerPath": "__CUDA_GDB_PATH__",
16
+ "stopAtEntry": false,
17
+ "preLaunchTask": "nvcc: build active file (debug)"
18
+ },
19
+ {
20
+ "name": "CUDA: Build and Debug (stop at main)",
21
+ "type": "cuda-gdb",
22
+ "request": "launch",
23
+ "program": "${fileDirname}/${fileBasenameNoExtension}",
24
+ "args": [],
25
+ "cwd": "${fileDirname}",
26
+ "miDebuggerPath": "__CUDA_GDB_PATH__",
27
+ "stopAtEntry": true,
28
+ "preLaunchTask": "nvcc: build active file (debug)"
29
+ },
30
+ {
31
+ "name": "CUDA: Run Active File (no debug)",
32
+ "type": "cuda-gdb",
33
+ "request": "launch",
34
+ "program": "${fileDirname}/${fileBasenameNoExtension}",
35
+ "args": [],
36
+ "cwd": "${fileDirname}",
37
+ "miDebuggerPath": "__CUDA_GDB_PATH__",
38
+ "stopAtEntry": false,
39
+ "preLaunchTask": "nvcc: build active file (release)"
40
+ }
41
+ ]
42
+ }
@@ -7,7 +7,7 @@ echo "=== aws-bootstrap-g4dn remote setup ==="
7
7
 
8
8
  # 1. Verify GPU
9
9
  echo ""
10
- echo "[1/5] Verifying GPU and CUDA..."
10
+ echo "[1/6] Verifying GPU and CUDA..."
11
11
  if command -v nvidia-smi &>/dev/null; then
12
12
  nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader
13
13
  else
@@ -20,15 +20,40 @@ else
20
20
  echo "WARNING: nvcc not found (CUDA toolkit may not be installed)"
21
21
  fi
22
22
 
23
+ # Make Nsight Systems (nsys) available on PATH if installed under /opt/nvidia
24
+ if ! command -v nsys &>/dev/null; then
25
+ NSIGHT_DIR="/opt/nvidia/nsight-systems"
26
+ if [ -d "$NSIGHT_DIR" ]; then
27
+ # Fix permissions — the parent dir is often root-only (drwx------)
28
+ sudo chmod o+rx "$NSIGHT_DIR"
29
+ # Find the latest version directory (version-aware sort via `sort -V`, so e.g. 2024.10 sorts after 2024.9)
30
+ NSYS_VERSION=$(ls -1 "$NSIGHT_DIR" | sort -V | tail -1)
31
+ if [ -n "$NSYS_VERSION" ] && [ -x "$NSIGHT_DIR/$NSYS_VERSION/bin/nsys" ]; then
32
+ NSYS_BIN="$NSIGHT_DIR/$NSYS_VERSION/bin"
33
+ if ! grep -q "nsight-systems" ~/.bashrc 2>/dev/null; then
34
+ echo "export PATH=\"$NSYS_BIN:\$PATH\"" >> ~/.bashrc
35
+ fi
36
+ export PATH="$NSYS_BIN:$PATH"
37
+ echo " Nsight Systems $NSYS_VERSION added to PATH ($NSYS_BIN)"
38
+ else
39
+ echo " WARNING: Nsight Systems directory found but no nsys binary"
40
+ fi
41
+ else
42
+ echo " Nsight Systems not found at $NSIGHT_DIR"
43
+ fi
44
+ else
45
+ echo " nsys already on PATH: $(command -v nsys)"
46
+ fi
47
+
23
48
  # 2. Install utilities
24
49
  echo ""
25
- echo "[2/5] Installing utilities..."
50
+ echo "[2/6] Installing utilities..."
26
51
  sudo apt-get update -qq
27
52
  sudo apt-get install -y -qq htop tmux tree jq
28
53
 
29
54
  # 3. Set up Python environment with uv
30
55
  echo ""
31
- echo "[3/5] Setting up Python environment with uv..."
56
+ echo "[3/6] Setting up Python environment with uv..."
32
57
  if ! command -v uv &>/dev/null; then
33
58
  curl -LsSf https://astral.sh/uv/install.sh | sh
34
59
  fi
@@ -153,7 +178,7 @@ echo " Jupyter config written to $JUPYTER_CONFIG_DIR/jupyter_lab_config.py"
153
178
 
154
179
  # 4. Jupyter systemd service
155
180
  echo ""
156
- echo "[4/5] Setting up Jupyter systemd service..."
181
+ echo "[4/6] Setting up Jupyter systemd service..."
157
182
  LOGIN_USER=$(whoami)
158
183
 
159
184
  sudo tee /etc/systemd/system/jupyter.service > /dev/null << SVCEOF
@@ -180,7 +205,7 @@ echo " Jupyter service started (port 8888)"
180
205
 
181
206
  # 5. SSH keepalive
182
207
  echo ""
183
- echo "[5/5] Configuring SSH keepalive..."
208
+ echo "[5/6] Configuring SSH keepalive..."
184
209
  if ! grep -q "ClientAliveInterval" /etc/ssh/sshd_config; then
185
210
  echo "ClientAliveInterval 60" | sudo tee -a /etc/ssh/sshd_config > /dev/null
186
211
  echo "ClientAliveCountMax 10" | sudo tee -a /etc/ssh/sshd_config > /dev/null
@@ -190,5 +215,58 @@ else
190
215
  echo " SSH keepalive already configured"
191
216
  fi
192
217
 
218
+ # 6. VSCode workspace setup
219
+ echo ""
220
+ echo "[6/6] Setting up VSCode workspace..."
221
+ mkdir -p ~/workspace/.vscode
222
+
223
+ # Detect cuda-gdb path
224
+ CUDA_GDB_PATH=""
225
+ if command -v cuda-gdb &>/dev/null; then
226
+ CUDA_GDB_PATH=$(command -v cuda-gdb)
227
+ elif [ -x /usr/local/cuda/bin/cuda-gdb ]; then
228
+ CUDA_GDB_PATH="/usr/local/cuda/bin/cuda-gdb"
229
+ else
230
+ # Try glob for versioned CUDA installs
231
+ for p in /usr/local/cuda-*/bin/cuda-gdb; do
232
+ if [ -x "$p" ]; then
233
+ CUDA_GDB_PATH="$p"
234
+ fi
235
+ done
236
+ fi
237
+ if [ -z "$CUDA_GDB_PATH" ]; then
238
+ echo " WARNING: cuda-gdb not found — using placeholder in launch.json"
239
+ CUDA_GDB_PATH="cuda-gdb"
240
+ else
241
+ echo " cuda-gdb: $CUDA_GDB_PATH"
242
+ fi
243
+
244
+ # Detect GPU SM architecture
245
+ GPU_ARCH=""
246
+ if command -v nvidia-smi &>/dev/null; then
247
+ COMPUTE_CAP=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -1 | tr -d '[:space:]')
248
+ if [ -n "$COMPUTE_CAP" ]; then
249
+ GPU_ARCH="sm_$(echo "$COMPUTE_CAP" | tr -d '.')"
250
+ fi
251
+ fi
252
+ if [ -z "$GPU_ARCH" ]; then
253
+ echo " WARNING: Could not detect GPU arch — defaulting to sm_75"
254
+ GPU_ARCH="sm_75"
255
+ else
256
+ echo " GPU arch: $GPU_ARCH"
257
+ fi
258
+
259
+ # Copy example CUDA source into workspace
260
+ cp /tmp/saxpy.cu ~/workspace/saxpy.cu
261
+ echo " Deployed saxpy.cu"
262
+
263
+ # Deploy launch.json with cuda-gdb path
264
+ sed "s|__CUDA_GDB_PATH__|${CUDA_GDB_PATH}|g" /tmp/launch.json > ~/workspace/.vscode/launch.json
265
+ echo " Deployed launch.json"
266
+
267
+ # Deploy tasks.json with GPU architecture
268
+ sed "s|__GPU_ARCH__|${GPU_ARCH}|g" /tmp/tasks.json > ~/workspace/.vscode/tasks.json
269
+ echo " Deployed tasks.json"
270
+
193
271
  echo ""
194
272
  echo "=== Remote setup complete ==="
@@ -0,0 +1,49 @@
1
+ /**
2
+ * SAXPY Example, CUDA Style
3
+ * Source: https://developer.nvidia.com/blog/easy-introduction-cuda-c-and-c/
4
+ *
5
+ * This is included as an example CUDA C++ source file to try out the VS Code launch configuration we include on the host machine.
6
+ *
7
+ */
8
+ #include <stdio.h>
9
+
10
+ __global__
11
+ void saxpy(int n, float a, float *x, float *y)
12
+ {
13
+ int i = blockIdx.x*blockDim.x + threadIdx.x;
14
+ if (i < n) y[i] = a*x[i] + y[i];
15
+ }
16
+
17
+ int main(void)
18
+ {
19
+ int N = 1<<20;
20
+ float *x, *y, *d_x, *d_y;
21
+ x = (float*)malloc(N*sizeof(float));
22
+ y = (float*)malloc(N*sizeof(float));
23
+
24
+ cudaMalloc(&d_x, N*sizeof(float));
25
+ cudaMalloc(&d_y, N*sizeof(float));
26
+
27
+ for (int i = 0; i < N; i++) {
28
+ x[i] = 1.0f;
29
+ y[i] = 2.0f;
30
+ }
31
+
32
+ cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
33
+ cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);
34
+
35
+ // Perform SAXPY on 1M elements
36
+ saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
37
+
38
+ cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost);
39
+
40
+ float maxError = 0.0f;
41
+ for (int i = 0; i < N; i++)
42
+ maxError = max(maxError, abs(y[i]-4.0f));
43
+ printf("Max error: %f\n", maxError);
44
+
45
+ cudaFree(d_x);
46
+ cudaFree(d_y);
47
+ free(x);
48
+ free(y);
49
+ }
@@ -0,0 +1,48 @@
1
+ {
2
+ // CUDA build tasks for VSCode
3
+ // Deployed to: ~/workspace/.vscode/tasks.json
4
+ "version": "2.0.0",
5
+ "tasks": [
6
+ {
7
+ "label": "nvcc: build active file (debug)",
8
+ "type": "shell",
9
+ "command": "nvcc",
10
+ "args": [
11
+ "-g", // Host debug symbols
12
+ "-G", // Device (GPU) debug symbols
13
+ "-O0", // No optimization
14
+ "-arch=__GPU_ARCH__", // GPU arch (auto-detected)
15
+ "-o",
16
+ "${fileDirname}/${fileBasenameNoExtension}",
17
+ "${file}"
18
+ ],
19
+ "options": {
20
+ "cwd": "${fileDirname}"
21
+ },
22
+ "problemMatcher": ["$nvcc"],
23
+ "group": {
24
+ "kind": "build",
25
+ "isDefault": true
26
+ },
27
+ "detail": "Compile active .cu file with debug symbols (-g -G)"
28
+ },
29
+ {
30
+ "label": "nvcc: build active file (release)",
31
+ "type": "shell",
32
+ "command": "nvcc",
33
+ "args": [
34
+ "-O3",
35
+ "-arch=__GPU_ARCH__",
36
+ "-o",
37
+ "${fileDirname}/${fileBasenameNoExtension}",
38
+ "${file}"
39
+ ],
40
+ "options": {
41
+ "cwd": "${fileDirname}"
42
+ },
43
+ "problemMatcher": ["$nvcc"],
44
+ "group": "build",
45
+ "detail": "Compile active .cu file optimized (no debug)"
46
+ }
47
+ ]
48
+ }
@@ -159,6 +159,42 @@ def run_remote_setup(
159
159
  click.secho(f" SCP failed: {nb_result.stderr}", fg="red", err=True)
160
160
  return False
161
161
 
162
+ # SCP the CUDA example source
163
+ saxpy_path = script_path.parent / "saxpy.cu"
164
+ click.echo(" Uploading saxpy.cu...")
165
+ saxpy_result = subprocess.run(
166
+ ["scp", *ssh_opts, *scp_port_opts, str(saxpy_path), f"{user}@{host}:/tmp/saxpy.cu"],
167
+ capture_output=True,
168
+ text=True,
169
+ )
170
+ if saxpy_result.returncode != 0:
171
+ click.secho(f" SCP failed: {saxpy_result.stderr}", fg="red", err=True)
172
+ return False
173
+
174
+ # SCP the VSCode launch.json
175
+ launch_json_path = script_path.parent / "launch.json"
176
+ click.echo(" Uploading launch.json...")
177
+ launch_result = subprocess.run(
178
+ ["scp", *ssh_opts, *scp_port_opts, str(launch_json_path), f"{user}@{host}:/tmp/launch.json"],
179
+ capture_output=True,
180
+ text=True,
181
+ )
182
+ if launch_result.returncode != 0:
183
+ click.secho(f" SCP failed: {launch_result.stderr}", fg="red", err=True)
184
+ return False
185
+
186
+ # SCP the VSCode tasks.json
187
+ tasks_json_path = script_path.parent / "tasks.json"
188
+ click.echo(" Uploading tasks.json...")
189
+ tasks_result = subprocess.run(
190
+ ["scp", *ssh_opts, *scp_port_opts, str(tasks_json_path), f"{user}@{host}:/tmp/tasks.json"],
191
+ capture_output=True,
192
+ text=True,
193
+ )
194
+ if tasks_result.returncode != 0:
195
+ click.secho(f" SCP failed: {tasks_result.stderr}", fg="red", err=True)
196
+ return False
197
+
162
198
  # SCP the script
163
199
  click.echo(" Uploading remote_setup.sh...")
164
200
  scp_result = subprocess.run(
@@ -565,7 +565,7 @@ def test_status_instructions_shown_by_default(mock_find, mock_spot, mock_session
565
565
  assert result.exit_code == 0
566
566
  assert "ssh aws-gpu1" in result.output
567
567
  assert "ssh -NL 8888:localhost:8888 aws-gpu1" in result.output
568
- assert "vscode-remote://ssh-remote+aws-gpu1/home/ubuntu" in result.output
568
+ assert "vscode-remote://ssh-remote+aws-gpu1/home/ubuntu/workspace" in result.output
569
569
  assert "python ~/gpu_benchmark.py" in result.output
570
570
 
571
571
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: aws-bootstrap-g4dn
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: Bootstrap AWS EC2 GPU instances for hybrid local-remote development
5
5
  Author: Adam Ever-Hadani
6
6
  License-Expression: MIT
@@ -49,7 +49,7 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
49
49
  ### 🎯 Target Workflows
50
50
 
51
51
  1. **Jupyter server-client** — Jupyter runs on the instance, connect from your local browser
52
- 2. **VSCode Remote SSH** — `ssh aws-gpu1` just works with the Remote SSH extension
52
+ 2. **VSCode Remote SSH** — opens `~/workspace` with pre-configured CUDA debug/build tasks and an example `.cu` file
53
53
  3. **NVIDIA Nsight remote debugging** — GPU debugging over SSH
54
54
 
55
55
  ---
@@ -162,6 +162,7 @@ The setup script runs automatically on the instance after SSH becomes available:
162
162
  | **GPU smoke test notebook** | Copies `gpu_smoke_test.ipynb` to `~/gpu_smoke_test.ipynb` (open in JupyterLab) |
163
163
  | **Jupyter** | Configures and starts JupyterLab as a systemd service on port 8888 |
164
164
  | **SSH keepalive** | Configures server-side keepalive to prevent idle disconnects |
165
+ | **VSCode workspace** | Creates `~/workspace/.vscode/` with `launch.json` and `tasks.json` (auto-detected `cuda-gdb` path and GPU arch), plus an example `saxpy.cu` |
165
166
 
166
167
  ### 📊 GPU Benchmark
167
168
 
@@ -200,6 +201,28 @@ ssh -i ~/.ssh/id_ed25519 -NL 8888:localhost:8888 ubuntu@<public-ip>
200
201
 
201
202
  A **GPU smoke test notebook** (`~/gpu_smoke_test.ipynb`) is pre-installed on every instance. Open it in JupyterLab to interactively verify the CUDA stack, run FP32/FP16 matmuls, train a small CNN on MNIST, and visualise training loss and GPU memory usage.
202
203
 
204
+ ### 🖥️ VSCode Remote SSH
205
+
206
+ The remote setup creates a `~/workspace` folder with pre-configured CUDA debug and build tasks:
207
+
208
+ ```
209
+ ~/workspace/
210
+ ├── .vscode/
211
+ │ ├── launch.json # CUDA debug configs (cuda-gdb path auto-detected)
212
+ │ └── tasks.json # nvcc build tasks (GPU arch auto-detected, e.g. sm_75)
213
+ └── saxpy.cu # Example CUDA source — open and press F5 to debug
214
+ ```
215
+
216
+ Connect directly from your terminal:
217
+
218
+ ```bash
219
+ code --folder-uri vscode-remote://ssh-remote+aws-gpu1/home/ubuntu/workspace
220
+ ```
221
+
222
+ Then install the [Nsight VSCE extension](https://marketplace.visualstudio.com/items?itemName=NVIDIA.nsight-vscode-edition) on the remote when prompted. Open `saxpy.cu`, set a breakpoint, and press F5.
223
+
224
+ See [Nsight remote profiling guide](docs/nsight-remote-profiling.md) for more details on CUDA debugging and profiling workflows.
225
+
203
226
  ### 📋 Listing Resources
204
227
 
205
228
  ```bash
@@ -322,7 +345,7 @@ aws-bootstrap launch --instance-type t3.medium --ami-filter "ubuntu/images/hvm-s
322
345
  | GPU instance pricing | [instances.vantage.sh](https://instances.vantage.sh/aws/ec2/g4dn.xlarge) |
323
346
  | Spot instance quotas | [AWS docs](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-spot-limits.html) |
324
347
  | Deep Learning AMIs | [AWS docs](https://docs.aws.amazon.com/dlami/latest/devguide/what-is-dlami.html) |
325
- | Nvidia Nsight remote debugging | [Nvidia docs](https://docs.nvidia.com/nsight-visual-studio-edition/3.2/Content/Setup_Remote_Debugging.htm) |
348
+ | Nsight remote GPU profiling | [Guide](docs/nsight-remote-profiling.md) — Nsight Compute, Nsight Systems, and Nsight VSCE on EC2 |
326
349
 
327
350
  Tutorials on setting up a CUDA environment on EC2 GPU instances:
328
351
 
@@ -21,8 +21,11 @@ aws_bootstrap/ssh.py
21
21
  aws_bootstrap/resources/__init__.py
22
22
  aws_bootstrap/resources/gpu_benchmark.py
23
23
  aws_bootstrap/resources/gpu_smoke_test.ipynb
24
+ aws_bootstrap/resources/launch.json
24
25
  aws_bootstrap/resources/remote_setup.sh
25
26
  aws_bootstrap/resources/requirements.txt
27
+ aws_bootstrap/resources/saxpy.cu
28
+ aws_bootstrap/resources/tasks.json
26
29
  aws_bootstrap/tests/__init__.py
27
30
  aws_bootstrap/tests/test_cli.py
28
31
  aws_bootstrap/tests/test_config.py
@@ -35,4 +38,5 @@ aws_bootstrap_g4dn.egg-info/SOURCES.txt
35
38
  aws_bootstrap_g4dn.egg-info/dependency_links.txt
36
39
  aws_bootstrap_g4dn.egg-info/entry_points.txt
37
40
  aws_bootstrap_g4dn.egg-info/requires.txt
38
- aws_bootstrap_g4dn.egg-info/top_level.txt
41
+ aws_bootstrap_g4dn.egg-info/top_level.txt
42
+ docs/nsight-remote-profiling.md
@@ -0,0 +1,245 @@
1
+ # NVIDIA Nsight Remote GPU Profiling on EC2
2
+
3
+ Guide to using NVIDIA's Nsight profiling and debugging tools with remote EC2 GPU instances provisioned by `aws-bootstrap`.
4
+
5
+ ## Overview
6
+
7
+ NVIDIA provides several Nsight tools for GPU profiling and debugging. The most relevant ones for remote EC2 work are:
8
+
9
+ | Tool | Purpose | macOS Host | Ports Required | Best Approach |
10
+ |------|---------|-----------|----------------|---------------|
11
+ | **Nsight Compute** | CUDA kernel profiling | Native GUI | SSH only (22) | GUI remote or CLI + local viewer |
12
+ | **Nsight Systems** | System-wide tracing | Native GUI | SSH (22) + 45555 | CLI + local viewer |
13
+ | **Nsight VSCE** | Interactive CUDA debugging | Via VSCode | SSH only (22) | VSCode Remote SSH |
14
+ | **Nsight Graphics** | Graphics/shader profiling | No | SSH only (22) | CLI captures (graphics workloads only) |
15
+
16
+ ---
17
+
18
+ ## Nsight Compute (Kernel-Level Profiler)
19
+
20
+ Nsight Compute is the most straightforward tool for remote profiling over SSH. It provides per-kernel performance metrics, roofline analysis, occupancy analysis, and memory throughput data.
21
+
22
+ ### How It Works
23
+
24
+ The GUI (`ncu-ui`) runs on your local machine and connects to the EC2 instance over SSH. Nsight Compute automatically deploys its CLI tools to a deployment directory on the remote target on first connection. All profiling traffic is tunneled through SSH — no extra ports needed.
25
+
26
+ Two profiling modes are available:
27
+
28
+ - **Interactive:** A SOCKS proxy tunnels through SSH, letting you step through kernels and control execution in real time.
29
+ - **Non-Interactive:** The profiler runs to completion on the remote and copies the report back automatically via SSH remote forwarding.
30
+
31
+ ### Setup
32
+
33
+ **Local machine (macOS/Linux/Windows):**
34
+
35
+ 1. Download Nsight Compute from [NVIDIA Developer](https://developer.nvidia.com/tools-overview/nsight-compute/get-started) (free, requires NVIDIA developer account)
36
+ 2. Install `ncu-ui` (the GUI application). As of 2025, macOS ARM64 (Apple Silicon) is natively supported.
37
+
38
+ **Remote EC2 instance:**
39
+
40
+ Nothing extra is needed — the GUI auto-deploys the CLI on first connection. The Deep Learning AMI already includes the CUDA toolkit.
41
+
42
+ ### GPU Performance Counter Permissions
43
+
44
+ By default, non-admin users cannot access GPU performance counters, which results in `ERR_NVGPUCTRPERM` errors. To fix this:
45
+
46
+ ```bash
47
+ ssh aws-gpu1
48
+ sudo bash -c 'echo "options nvidia NVreg_RestrictProfilingToAdminUsers=0" > /etc/modprobe.d/nvidia.conf'
49
+ sudo update-initramfs -u -k all
50
+ sudo reboot
51
+ ```
52
+
53
+ > **Important:** An in-place reboot (`sudo reboot`) keeps the instance's public IP; only a stop/start cycle assigns a new public IP to an instance without an Elastic IP. If the SSH alias stops working after the instance comes back, run `aws-bootstrap status` to see the current IP and update the SSH config alias. You may need to `aws-bootstrap terminate` and re-launch, or manually update `~/.ssh/config`. The permission change itself is a one-time setup per instance.
54
+
55
+ ### Workflow A: GUI Remote Profiling
56
+
57
+ 1. Open `ncu-ui` locally.
58
+ 2. Click **Connect** and add a new SSH connection:
59
+ - **Host:** your EC2 public IP (from `aws-bootstrap status`)
60
+ - **Username:** `ubuntu`
61
+ - **Port:** 22 (or your custom `--ssh-port`)
62
+ - **Authentication:** Private key (`~/.ssh/id_ed25519`)
63
+ 3. Select the CUDA binary to profile on the remote machine.
64
+ 4. Choose an output file location on your local machine.
65
+ 5. Click **Launch** to start profiling.
66
+
67
+ Nsight Compute supports `ProxyJump` and `ProxyCommand` SSH options if you need to reach the instance through a bastion host.
68
+
69
+ ### Workflow B: CLI on Remote, View Locally (Recommended)
70
+
71
+ This is the most reliable approach — avoids real-time connection issues:
72
+
73
+ ```bash
74
+ # Profile on the remote instance
75
+ ssh aws-gpu1 'ncu -o /tmp/profile --set full ./my_cuda_app'
76
+
77
+ # Download the report
78
+ scp aws-gpu1:/tmp/profile.ncu-rep .
79
+
80
+ # Open locally in the GUI
81
+ ncu-ui profile.ncu-rep
82
+ ```
83
+
84
+ For source-level correlation, compile with `nvcc --lineinfo`.
85
+
86
+ ### References
87
+
88
+ - [Nsight Compute Documentation](https://docs.nvidia.com/nsight-compute/NsightCompute/index.html)
89
+ - [How to Set Up Nsight Compute on EC2](https://tspeterkim.github.io/posts/nsight-setup-on-ec2) — step-by-step walkthrough with screenshots
90
+
91
+ ---
92
+
93
+ ## Nsight Systems (System-Wide Profiler)
94
+
95
+ Nsight Systems traces CPU activity, GPU workloads (CUDA, Vulkan), OS runtime, threading, memory transfers, and NVTX annotations on a unified timeline. Useful for understanding end-to-end application performance.
96
+
97
+ ### Security Caveat
98
+
99
+ Unlike Nsight Compute, Nsight Systems uses SSH only for the initial connection. **Actual profiling data transfers over a raw, unencrypted TCP socket on port 45555.** NVIDIA explicitly warns against using this on untrusted networks.
100
+
101
+ For EC2, you can mitigate this by tunneling port 45555 through SSH:
102
+
103
+ ```bash
104
+ ssh -L 45555:localhost:45555 aws-gpu1
105
+ ```
106
+
107
+ Then configure the Nsight Systems GUI to connect to `localhost` instead of the remote IP.
108
+
109
+ ### Setup
110
+
111
+ **Local machine:**
112
+
113
+ Download Nsight Systems from [NVIDIA Developer](https://developer.nvidia.com/nsight-systems/get-started). The GUI (`nsys-ui`) is available for macOS, Linux, and Windows.
114
+
115
+ **Remote EC2 instance:**
116
+
117
+ The `nsys` CLI is typically included with the CUDA toolkit on Deep Learning AMIs. Verify with:
118
+
119
+ ```bash
120
+ ssh aws-gpu1 'nsys status -e'
121
+ ```
122
+
123
+ Additionally, Netcat must be installed (required by the remote profiling daemon):
124
+
125
+ ```bash
126
+ ssh aws-gpu1 'sudo apt-get install -y netcat-openbsd'
127
+ ```
128
+
129
+ ### Port Requirements
130
+
131
+ If using GUI remote profiling (not the CLI workflow), you need **port 45555** open in the EC2 security group in addition to SSH. The current `aws-bootstrap` security group only opens SSH — you would need to manually add the rule via the AWS console or CLI, or use the SSH tunnel approach described above.
132
+
133
+ ### Workflow: CLI on Remote, View Locally (Recommended)
134
+
135
+ This avoids the port 45555 requirement entirely:
136
+
137
+ ```bash
138
+ # Profile on the remote instance
139
+ ssh aws-gpu1 'nsys profile --trace=cuda,nvtx --output=/tmp/report ./my_app'
140
+
141
+ # Download the report
142
+ scp aws-gpu1:/tmp/report.nsys-rep .
143
+
144
+ # Open locally in the GUI
145
+ nsys-ui report.nsys-rep
146
+ ```
147
+
148
+ ### References
149
+
150
+ - [Nsight Systems User Guide](https://docs.nvidia.com/nsight-systems/UserGuide/index.html)
151
+ - [Nsight Systems Installation Guide](https://docs.nvidia.com/nsight-systems/InstallationGuide/index.html)
152
+
153
+ ---
154
+
155
+ ## Nsight Visual Studio Code Edition (CUDA Debugger)
156
+
157
+ Nsight VSCE is a VS Code extension for building and debugging CUDA applications. This is the most natural fit for the `aws-bootstrap` workflow since it works directly with VSCode Remote SSH.
158
+
159
+ ### How It Works
160
+
161
+ The extension provides CUDA debugging via `cuda-gdb` (or `cuda-gdbserver` for explicit remote setups). When used with VSCode Remote SSH, everything runs on the remote instance — the extension, the compiler, the debugger.
162
+
163
+ Features include:
164
+ - Breakpoints in GPU device code (including conditional breakpoints)
165
+ - GPU register, variable, and call-stack inspection
166
+ - Warp and lane focus controls (switch between streaming multiprocessors, warps, lanes)
167
+ - Full CPU thread inspection while stopped in GPU code, and vice versa
168
+ - CUDA-aware syntax highlighting and IntelliSense
169
+
170
+ ### Setup
171
+
172
+ **Local machine:**
173
+
174
+ 1. Install [VSCode](https://code.visualstudio.com/)
175
+ 2. Install the [Remote - SSH](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh) extension
176
+
177
+ **Remote EC2 instance (via VSCode Remote SSH):**
178
+
179
+ 1. Connect to the instance: `code --folder-uri vscode-remote://ssh-remote+aws-gpu1/home/ubuntu/workspace`
180
+ 2. Install the [Nsight VSCE extension](https://marketplace.visualstudio.com/items?itemName=NVIDIA.nsight-vscode-edition) on the remote (VS Code will prompt)
181
+ 3. `cuda-gdb` is included with the CUDA toolkit on Deep Learning AMIs
182
+
183
+ ### Debugging Workflow
184
+
185
+ 1. Connect to `aws-gpu1` via VSCode Remote SSH (opens `~/workspace`).
186
+ 2. `launch.json` and `tasks.json` are pre-configured in `~/workspace/.vscode/` with the detected `cuda-gdb` path and GPU architecture.
187
+ 3. Open or create `.cu` files in `~/workspace`.
188
+ 4. Set breakpoints in your `.cu` files.
189
+ 5. Press F5 to start debugging.
190
+
191
+ ### Known Issues
192
+
193
+ - `cuda-gdb` may require root privileges for GPU access. The same `NVreg_RestrictProfilingToAdminUsers=0` modprobe fix (described in the Nsight Compute section) resolves this. Alternatively, create a sudoers entry for `cuda-gdb`.
194
+ - Some users report the debugger failing to start on certain Remote SSH configurations. Check the Debug Console output for error details.
195
+
196
+ ### References
197
+
198
+ - [Nsight VSCE Documentation](https://docs.nvidia.com/nsight-visual-studio-code-edition/latest/)
199
+ - [Nsight VSCE on VS Code Marketplace](https://marketplace.visualstudio.com/items?itemName=NVIDIA.nsight-vscode-edition)
200
+ - [Nsight VSCE on GitHub](https://github.com/NVIDIA/nsight-vscode-edition)
201
+
202
+ ---
203
+
204
+ ## Quick Reference
205
+
206
+ ### Common Setup: GPU Performance Counter Access
207
+
208
+ Required for Nsight Compute profiling and `cuda-gdb` debugging. This is a one-time setup per instance but **requires a reboot**:
209
+
210
+ ```bash
211
+ ssh aws-gpu1
212
+ sudo bash -c 'echo "options nvidia NVreg_RestrictProfilingToAdminUsers=0" > /etc/modprobe.d/nvidia.conf'
213
+ sudo update-initramfs -u -k all
214
+ sudo reboot
215
+ ```
216
+
217
+ After an in-place reboot the instance keeps its public IP; only a stop/start cycle assigns a new one (unless using an Elastic IP). If the SSH alias no longer connects, run `aws-bootstrap status` to see the current address.
218
+
219
+ ### Recommended Approach: CLI Profiling + Local Viewer
220
+
221
+ The most practical and secure workflow for `aws-bootstrap` instances:
222
+
223
+ ```bash
224
+ # Kernel profiling with Nsight Compute
225
+ ssh aws-gpu1 'ncu -o /tmp/profile --set full ./my_cuda_app'
226
+ scp aws-gpu1:/tmp/profile.ncu-rep .
227
+ ncu-ui profile.ncu-rep
228
+
229
+ # System profiling with Nsight Systems
230
+ ssh aws-gpu1 'nsys profile --trace=cuda,nvtx --output=/tmp/report ./my_app'
231
+ scp aws-gpu1:/tmp/report.nsys-rep .
232
+ nsys-ui report.nsys-rep
233
+ ```
234
+
235
+ This requires no additional ports, no security group changes, and works with the existing SSH configuration that `aws-bootstrap` sets up.
236
+
237
+ ### Port Summary
238
+
239
+ | Tool | Method | Ports |
240
+ |------|--------|-------|
241
+ | Nsight Compute (GUI remote) | SSH tunnel | 22 only |
242
+ | Nsight Compute (CLI + scp) | SSH | 22 only |
243
+ | Nsight Systems (GUI remote) | SSH + raw socket | 22 + 45555 |
244
+ | Nsight Systems (CLI + scp) | SSH | 22 only |
245
+ | Nsight VSCE (VSCode) | Remote SSH | 22 only |
@@ -33,7 +33,7 @@ aws-bootstrap = "aws_bootstrap.cli:main"
33
33
  include = ["aws_bootstrap*"]
34
34
 
35
35
  [tool.setuptools.package-data]
36
- "aws_bootstrap.resources" = ["*.sh", "*.txt", "*.ipynb"]
36
+ "aws_bootstrap.resources" = ["*.sh", "*.txt", "*.ipynb", "*.json", "*.cu"]
37
37
 
38
38
  [tool.setuptools_scm]
39
39