aws-bootstrap-g4dn 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
aws_bootstrap/cli.py CHANGED
@@ -29,6 +29,7 @@ from .ssh import (
29
29
  private_key_path,
30
30
  query_gpu_info,
31
31
  remove_ssh_host,
32
+ resolve_instance_id,
32
33
  run_remote_setup,
33
34
  wait_for_ssh,
34
35
  )
@@ -277,7 +278,7 @@ def launch(
277
278
  click.echo()
278
279
  click.secho(" VSCode Remote SSH:", fg="cyan")
279
280
  click.secho(
280
- f" code --folder-uri vscode-remote://ssh-remote+{alias}/home/{config.ssh_user}",
281
+ f" code --folder-uri vscode-remote://ssh-remote+{alias}/home/{config.ssh_user}/workspace",
281
282
  bold=True,
282
283
  )
283
284
 
@@ -288,7 +289,7 @@ def launch(
288
289
 
289
290
  click.echo()
290
291
  click.secho(" Terminate:", fg="cyan")
291
- click.secho(f" aws-bootstrap terminate {instance_id} --region {config.region}", bold=True)
292
+ click.secho(f" aws-bootstrap terminate {alias} --region {config.region}", bold=True)
292
293
  click.echo()
293
294
 
294
295
 
@@ -410,7 +411,7 @@ def status(region, profile, gpu, instructions):
410
411
 
411
412
  click.secho(" VSCode Remote SSH:", fg="cyan")
412
413
  click.secho(
413
- f" code --folder-uri vscode-remote://ssh-remote+{alias}/home/{user}",
414
+ f" code --folder-uri vscode-remote://ssh-remote+{alias}/home/{user}/workspace",
414
415
  bold=True,
415
416
  )
416
417
 
@@ -419,7 +420,8 @@ def status(region, profile, gpu, instructions):
419
420
 
420
421
  click.echo()
421
422
  first_id = instances[0]["InstanceId"]
422
- click.echo(" To terminate: " + click.style(f"aws-bootstrap terminate {first_id}", bold=True))
423
+ first_ref = ssh_hosts.get(first_id, first_id)
424
+ click.echo(" To terminate: " + click.style(f"aws-bootstrap terminate {first_ref}", bold=True))
423
425
  click.echo()
424
426
 
425
427
 
@@ -427,18 +429,28 @@ def status(region, profile, gpu, instructions):
427
429
  @click.option("--region", default="us-west-2", show_default=True, help="AWS region.")
428
430
  @click.option("--profile", default=None, help="AWS profile override.")
429
431
  @click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.")
430
- @click.argument("instance_ids", nargs=-1)
432
+ @click.argument("instance_ids", nargs=-1, metavar="[INSTANCE_ID_OR_ALIAS]...")
431
433
  def terminate(region, profile, yes, instance_ids):
432
434
  """Terminate instances created by aws-bootstrap.
433
435
 
434
- Pass specific instance IDs to terminate, or omit to terminate all
435
- aws-bootstrap instances in the region.
436
+ Pass specific instance IDs or SSH aliases (e.g. aws-gpu1) to terminate,
437
+ or omit to terminate all aws-bootstrap instances in the region.
436
438
  """
437
439
  session = boto3.Session(profile_name=profile, region_name=region)
438
440
  ec2 = session.client("ec2")
439
441
 
440
442
  if instance_ids:
441
- targets = list(instance_ids)
443
+ targets = []
444
+ for value in instance_ids:
445
+ resolved = resolve_instance_id(value)
446
+ if resolved is None:
447
+ raise CLIError(
448
+ f"Could not resolve '{value}' to an instance ID.\n\n"
449
+ " It is not a valid instance ID or a known SSH alias."
450
+ )
451
+ if resolved != value:
452
+ info(f"Resolved alias '{value}' -> {resolved}")
453
+ targets.append(resolved)
442
454
  else:
443
455
  instances = find_tagged_instances(ec2, "aws-bootstrap-g4dn")
444
456
  if not instances:
@@ -628,7 +628,9 @@ def configure_precision(device: torch.device, requested: PrecisionMode) -> Preci
628
628
  return PrecisionMode.FP32
629
629
 
630
630
 
631
- def print_system_info(requested_precision: PrecisionMode) -> tuple[torch.device, PrecisionMode]:
631
+ def print_system_info(
632
+ requested_precision: PrecisionMode, force_cpu: bool = False
633
+ ) -> tuple[torch.device, PrecisionMode]:
632
634
  """Print system and CUDA information, return device and actual precision mode."""
633
635
  print("\n" + "=" * 60)
634
636
  print("System Information")
@@ -636,7 +638,7 @@ def print_system_info(requested_precision: PrecisionMode) -> tuple[torch.device,
636
638
  print(f"PyTorch version: {torch.__version__}")
637
639
  print(f"Python version: {sys.version.split()[0]}")
638
640
 
639
- if torch.cuda.is_available():
641
+ if torch.cuda.is_available() and not force_cpu:
640
642
  device = torch.device("cuda")
641
643
  print("CUDA available: Yes")
642
644
  print(f"CUDA version: {torch.version.cuda}")
@@ -666,8 +668,11 @@ def print_system_info(requested_precision: PrecisionMode) -> tuple[torch.device,
666
668
  else:
667
669
  device = torch.device("cpu")
668
670
  actual_precision = PrecisionMode.FP32
669
- print("CUDA available: No (running on CPU)")
670
- print("WARNING: GPU benchmark results will not be representative!")
671
+ if force_cpu:
672
+ print("CPU-only mode requested (--cpu flag)")
673
+ else:
674
+ print("CUDA available: No (running on CPU)")
675
+ print("Running on CPU for benchmarking")
671
676
 
672
677
  print("=" * 60)
673
678
  return device, actual_precision
@@ -724,10 +729,15 @@ def main() -> None:
724
729
  action="store_true",
725
730
  help="Run CUDA/cuBLAS diagnostic tests before benchmarking",
726
731
  )
732
+ parser.add_argument(
733
+ "--cpu",
734
+ action="store_true",
735
+ help="Force CPU-only execution (for CPU vs GPU comparison)",
736
+ )
727
737
  args = parser.parse_args()
728
738
 
729
739
  requested_precision = PrecisionMode(args.precision)
730
- device, actual_precision = print_system_info(requested_precision)
740
+ device, actual_precision = print_system_info(requested_precision, force_cpu=args.cpu)
731
741
 
732
742
  # Run diagnostics if requested
733
743
  if args.diagnose:
@@ -0,0 +1,42 @@
1
+ {
2
+ // CUDA debug configurations for VSCode
3
+ // Deployed to: ~/workspace/.vscode/launch.json
4
+ //
5
+ // Usage: Open any .cu file, press F5 to build and debug
6
+ "version": "0.2.0",
7
+ "configurations": [
8
+ {
9
+ "name": "CUDA: Build and Debug Active File",
10
+ "type": "cuda-gdb",
11
+ "request": "launch",
12
+ "program": "${fileDirname}/${fileBasenameNoExtension}",
13
+ "args": [],
14
+ "cwd": "${fileDirname}",
15
+ "miDebuggerPath": "__CUDA_GDB_PATH__",
16
+ "stopAtEntry": false,
17
+ "preLaunchTask": "nvcc: build active file (debug)"
18
+ },
19
+ {
20
+ "name": "CUDA: Build and Debug (stop at main)",
21
+ "type": "cuda-gdb",
22
+ "request": "launch",
23
+ "program": "${fileDirname}/${fileBasenameNoExtension}",
24
+ "args": [],
25
+ "cwd": "${fileDirname}",
26
+ "miDebuggerPath": "__CUDA_GDB_PATH__",
27
+ "stopAtEntry": true,
28
+ "preLaunchTask": "nvcc: build active file (debug)"
29
+ },
30
+ {
31
+ "name": "CUDA: Run Active File (no debug)",
32
+ "type": "cuda-gdb",
33
+ "request": "launch",
34
+ "program": "${fileDirname}/${fileBasenameNoExtension}",
35
+ "args": [],
36
+ "cwd": "${fileDirname}",
37
+ "miDebuggerPath": "__CUDA_GDB_PATH__",
38
+ "stopAtEntry": false,
39
+ "preLaunchTask": "nvcc: build active file (release)"
40
+ }
41
+ ]
42
+ }
@@ -7,7 +7,7 @@ echo "=== aws-bootstrap-g4dn remote setup ==="
7
7
 
8
8
  # 1. Verify GPU
9
9
  echo ""
10
- echo "[1/5] Verifying GPU and CUDA..."
10
+ echo "[1/6] Verifying GPU and CUDA..."
11
11
  if command -v nvidia-smi &>/dev/null; then
12
12
  nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader
13
13
  else
@@ -20,15 +20,40 @@ else
20
20
  echo "WARNING: nvcc not found (CUDA toolkit may not be installed)"
21
21
  fi
22
22
 
23
+ # Make Nsight Systems (nsys) available on PATH if installed under /opt/nvidia
24
+ if ! command -v nsys &>/dev/null; then
25
+ NSIGHT_DIR="/opt/nvidia/nsight-systems"
26
+ if [ -d "$NSIGHT_DIR" ]; then
27
+ # Fix permissions — the parent dir is often root-only (drwx------)
28
+ sudo chmod o+rx "$NSIGHT_DIR"
29
+ # Find the latest version directory (version-aware sort via `sort -V`)
30
+ NSYS_VERSION=$(ls -1 "$NSIGHT_DIR" | sort -V | tail -1)
31
+ if [ -n "$NSYS_VERSION" ] && [ -x "$NSIGHT_DIR/$NSYS_VERSION/bin/nsys" ]; then
32
+ NSYS_BIN="$NSIGHT_DIR/$NSYS_VERSION/bin"
33
+ if ! grep -q "nsight-systems" ~/.bashrc 2>/dev/null; then
34
+ echo "export PATH=\"$NSYS_BIN:\$PATH\"" >> ~/.bashrc
35
+ fi
36
+ export PATH="$NSYS_BIN:$PATH"
37
+ echo " Nsight Systems $NSYS_VERSION added to PATH ($NSYS_BIN)"
38
+ else
39
+ echo " WARNING: Nsight Systems directory found but no nsys binary"
40
+ fi
41
+ else
42
+ echo " Nsight Systems not found at $NSIGHT_DIR"
43
+ fi
44
+ else
45
+ echo " nsys already on PATH: $(command -v nsys)"
46
+ fi
47
+
23
48
  # 2. Install utilities
24
49
  echo ""
25
- echo "[2/5] Installing utilities..."
50
+ echo "[2/6] Installing utilities..."
26
51
  sudo apt-get update -qq
27
52
  sudo apt-get install -y -qq htop tmux tree jq
28
53
 
29
54
  # 3. Set up Python environment with uv
30
55
  echo ""
31
- echo "[3/5] Setting up Python environment with uv..."
56
+ echo "[3/6] Setting up Python environment with uv..."
32
57
  if ! command -v uv &>/dev/null; then
33
58
  curl -LsSf https://astral.sh/uv/install.sh | sh
34
59
  fi
@@ -153,7 +178,7 @@ echo " Jupyter config written to $JUPYTER_CONFIG_DIR/jupyter_lab_config.py"
153
178
 
154
179
  # 4. Jupyter systemd service
155
180
  echo ""
156
- echo "[4/5] Setting up Jupyter systemd service..."
181
+ echo "[4/6] Setting up Jupyter systemd service..."
157
182
  LOGIN_USER=$(whoami)
158
183
 
159
184
  sudo tee /etc/systemd/system/jupyter.service > /dev/null << SVCEOF
@@ -180,7 +205,7 @@ echo " Jupyter service started (port 8888)"
180
205
 
181
206
  # 5. SSH keepalive
182
207
  echo ""
183
- echo "[5/5] Configuring SSH keepalive..."
208
+ echo "[5/6] Configuring SSH keepalive..."
184
209
  if ! grep -q "ClientAliveInterval" /etc/ssh/sshd_config; then
185
210
  echo "ClientAliveInterval 60" | sudo tee -a /etc/ssh/sshd_config > /dev/null
186
211
  echo "ClientAliveCountMax 10" | sudo tee -a /etc/ssh/sshd_config > /dev/null
@@ -190,5 +215,58 @@ else
190
215
  echo " SSH keepalive already configured"
191
216
  fi
192
217
 
218
+ # 6. VSCode workspace setup
219
+ echo ""
220
+ echo "[6/6] Setting up VSCode workspace..."
221
+ mkdir -p ~/workspace/.vscode
222
+
223
+ # Detect cuda-gdb path
224
+ CUDA_GDB_PATH=""
225
+ if command -v cuda-gdb &>/dev/null; then
226
+ CUDA_GDB_PATH=$(command -v cuda-gdb)
227
+ elif [ -x /usr/local/cuda/bin/cuda-gdb ]; then
228
+ CUDA_GDB_PATH="/usr/local/cuda/bin/cuda-gdb"
229
+ else
230
+ # Try glob for versioned CUDA installs
231
+ for p in /usr/local/cuda-*/bin/cuda-gdb; do
232
+ if [ -x "$p" ]; then
233
+ CUDA_GDB_PATH="$p"
234
+ fi
235
+ done
236
+ fi
237
+ if [ -z "$CUDA_GDB_PATH" ]; then
238
+ echo " WARNING: cuda-gdb not found — using placeholder in launch.json"
239
+ CUDA_GDB_PATH="cuda-gdb"
240
+ else
241
+ echo " cuda-gdb: $CUDA_GDB_PATH"
242
+ fi
243
+
244
+ # Detect GPU SM architecture
245
+ GPU_ARCH=""
246
+ if command -v nvidia-smi &>/dev/null; then
247
+ COMPUTE_CAP=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -1 | tr -d '[:space:]')
248
+ if [ -n "$COMPUTE_CAP" ]; then
249
+ GPU_ARCH="sm_$(echo "$COMPUTE_CAP" | tr -d '.')"
250
+ fi
251
+ fi
252
+ if [ -z "$GPU_ARCH" ]; then
253
+ echo " WARNING: Could not detect GPU arch — defaulting to sm_75"
254
+ GPU_ARCH="sm_75"
255
+ else
256
+ echo " GPU arch: $GPU_ARCH"
257
+ fi
258
+
259
+ # Copy example CUDA source into workspace
260
+ cp /tmp/saxpy.cu ~/workspace/saxpy.cu
261
+ echo " Deployed saxpy.cu"
262
+
263
+ # Deploy launch.json with cuda-gdb path
264
+ sed "s|__CUDA_GDB_PATH__|${CUDA_GDB_PATH}|g" /tmp/launch.json > ~/workspace/.vscode/launch.json
265
+ echo " Deployed launch.json"
266
+
267
+ # Deploy tasks.json with GPU architecture
268
+ sed "s|__GPU_ARCH__|${GPU_ARCH}|g" /tmp/tasks.json > ~/workspace/.vscode/tasks.json
269
+ echo " Deployed tasks.json"
270
+
193
271
  echo ""
194
272
  echo "=== Remote setup complete ==="
@@ -0,0 +1,49 @@
1
+ /**
2
+ * SAXPY Example, CUDA Style
3
+ * Source: https://developer.nvidia.com/blog/easy-introduction-cuda-c-and-c/
4
+ *
5
+ * This is included as an example CUDA C++ source file to try out the VS Code launch configuration we include on the host machine.
6
+ *
7
+ */
8
+ #include <stdio.h>
9
+
10
// SAXPY kernel: each thread updates one element, y[idx] = a * x[idx] + y[idx].
//   n - number of elements in x and y
//   a - scalar multiplier
//   x - input vector (read only here)
//   y - input/output vector, updated in place
__global__
void saxpy(int n, float a, float *x, float *y)
{
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= n) {
    return;  // thread index falls outside the vectors; nothing to do
  }
  y[idx] = a * x[idx] + y[idx];
}
16
+
17
// Evaluate a CUDA runtime call; on failure print a diagnostic and leave main()
// with a non-zero exit code. Only usable inside main() because it returns.
// Resources are not released on this path; the process is exiting anyway.
#define CUDA_CHECK(call)                                                \
  do {                                                                  \
    cudaError_t status_ = (call);                                       \
    if (status_ != cudaSuccess) {                                       \
      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,  \
              cudaGetErrorString(status_));                             \
      return 1;                                                         \
    }                                                                   \
  } while (0)

// Run SAXPY (y = 2*x + y) on 1M elements with x = 1 and y = 2, copy the result
// back and print the maximum deviation from the expected value 4.
// Returns 0 on success, 1 if a host allocation or any CUDA call fails.
int main(void)
{
  const int N = 1<<20;
  const size_t bytes = N*sizeof(float);
  float *x, *y, *d_x, *d_y;

  x = (float*)malloc(bytes);
  y = (float*)malloc(bytes);
  if (x == NULL || y == NULL) {
    fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
    free(x);  // free(NULL) is a no-op, so this is safe for either pointer
    free(y);
    return 1;
  }

  CUDA_CHECK(cudaMalloc(&d_x, bytes));
  CUDA_CHECK(cudaMalloc(&d_y, bytes));

  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  CUDA_CHECK(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice));

  // Perform SAXPY on 1M elements
  saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
  // A kernel launch returns no status itself; query the launch error explicitly.
  CUDA_CHECK(cudaGetLastError());

  // This blocking copy also reports errors raised while the kernel was running.
  CUDA_CHECK(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost));

  float maxError = 0.0f;
  for (int i = 0; i < N; i++) {
    // Explicit float variants avoid depending on which abs/max overloads are visible.
    maxError = fmaxf(maxError, fabsf(y[i]-4.0f));
  }
  printf("Max error: %f\n", maxError);

  CUDA_CHECK(cudaFree(d_x));
  CUDA_CHECK(cudaFree(d_y));
  free(x);
  free(y);
  return 0;
}
@@ -0,0 +1,48 @@
1
+ {
2
+ // CUDA build tasks for VSCode
3
+ // Deployed to: ~/workspace/.vscode/tasks.json
4
+ "version": "2.0.0",
5
+ "tasks": [
6
+ {
7
+ "label": "nvcc: build active file (debug)",
8
+ "type": "shell",
9
+ "command": "nvcc",
10
+ "args": [
11
+ "-g", // Host debug symbols
12
+ "-G", // Device (GPU) debug symbols
13
+ "-O0", // No optimization
14
+ "-arch=__GPU_ARCH__", // GPU arch (auto-detected)
15
+ "-o",
16
+ "${fileDirname}/${fileBasenameNoExtension}",
17
+ "${file}"
18
+ ],
19
+ "options": {
20
+ "cwd": "${fileDirname}"
21
+ },
22
+ "problemMatcher": ["$nvcc"],
23
+ "group": {
24
+ "kind": "build",
25
+ "isDefault": true
26
+ },
27
+ "detail": "Compile active .cu file with debug symbols (-g -G)"
28
+ },
29
+ {
30
+ "label": "nvcc: build active file (release)",
31
+ "type": "shell",
32
+ "command": "nvcc",
33
+ "args": [
34
+ "-O3",
35
+ "-arch=__GPU_ARCH__",
36
+ "-o",
37
+ "${fileDirname}/${fileBasenameNoExtension}",
38
+ "${file}"
39
+ ],
40
+ "options": {
41
+ "cwd": "${fileDirname}"
42
+ },
43
+ "problemMatcher": ["$nvcc"],
44
+ "group": "build",
45
+ "detail": "Compile active .cu file optimized (no debug)"
46
+ }
47
+ ]
48
+ }
aws_bootstrap/ssh.py CHANGED
@@ -159,6 +159,42 @@ def run_remote_setup(
159
159
  click.secho(f" SCP failed: {nb_result.stderr}", fg="red", err=True)
160
160
  return False
161
161
 
162
+ # SCP the CUDA example source
163
+ saxpy_path = script_path.parent / "saxpy.cu"
164
+ click.echo(" Uploading saxpy.cu...")
165
+ saxpy_result = subprocess.run(
166
+ ["scp", *ssh_opts, *scp_port_opts, str(saxpy_path), f"{user}@{host}:/tmp/saxpy.cu"],
167
+ capture_output=True,
168
+ text=True,
169
+ )
170
+ if saxpy_result.returncode != 0:
171
+ click.secho(f" SCP failed: {saxpy_result.stderr}", fg="red", err=True)
172
+ return False
173
+
174
+ # SCP the VSCode launch.json
175
+ launch_json_path = script_path.parent / "launch.json"
176
+ click.echo(" Uploading launch.json...")
177
+ launch_result = subprocess.run(
178
+ ["scp", *ssh_opts, *scp_port_opts, str(launch_json_path), f"{user}@{host}:/tmp/launch.json"],
179
+ capture_output=True,
180
+ text=True,
181
+ )
182
+ if launch_result.returncode != 0:
183
+ click.secho(f" SCP failed: {launch_result.stderr}", fg="red", err=True)
184
+ return False
185
+
186
+ # SCP the VSCode tasks.json
187
+ tasks_json_path = script_path.parent / "tasks.json"
188
+ click.echo(" Uploading tasks.json...")
189
+ tasks_result = subprocess.run(
190
+ ["scp", *ssh_opts, *scp_port_opts, str(tasks_json_path), f"{user}@{host}:/tmp/tasks.json"],
191
+ capture_output=True,
192
+ text=True,
193
+ )
194
+ if tasks_result.returncode != 0:
195
+ click.secho(f" SCP failed: {tasks_result.stderr}", fg="red", err=True)
196
+ return False
197
+
162
198
  # SCP the script
163
199
  click.echo(" Uploading remote_setup.sh...")
164
200
  scp_result = subprocess.run(
@@ -338,6 +374,34 @@ def list_ssh_hosts(config_path: Path | None = None) -> dict[str, str]:
338
374
  return result
339
375
 
340
376
 
377
+ _INSTANCE_ID_RE = re.compile(r"^i-[0-9a-f]{8,17}$")
378
+
379
+
380
+ def _is_instance_id(value: str) -> bool:
381
+ """Return ``True`` if *value* looks like an EC2 instance ID (``i-`` + hex)."""
382
+ return _INSTANCE_ID_RE.match(value) is not None
383
+
384
+
385
def resolve_instance_id(value: str, config_path: Path | None = None) -> str | None:
    """Turn an instance ID or SSH host alias into an EC2 instance ID.

    A *value* that already has the shape of an instance ID is handed back
    untouched. Anything else is treated as an SSH host alias and searched for
    among the hosts reported by ``list_ssh_hosts(config_path)``.

    Returns the matching instance ID, or ``None`` when no host carries that
    alias.
    """
    if not _is_instance_id(value):
        # list_ssh_hosts maps instance_id -> alias, so scan for the alias and
        # take the first instance ID that owns it.
        known_hosts = list_ssh_hosts(config_path)
        return next(
            (instance_id for instance_id, host_alias in known_hosts.items() if host_alias == value),
            None,
        )
    return value
403
+
404
+
341
405
  @dataclass
342
406
  class SSHHostDetails:
343
407
  """Connection details parsed from an SSH config stanza."""
@@ -170,6 +170,58 @@ def test_terminate_with_confirm(mock_terminate, mock_find, mock_session, mock_re
170
170
  assert mock_terminate.call_args[0][1] == ["i-abc123"]
171
171
 
172
172
 
173
+ @patch("aws_bootstrap.cli.remove_ssh_host", return_value=None)
174
+ @patch("aws_bootstrap.cli.boto3.Session")
175
+ @patch("aws_bootstrap.cli.terminate_tagged_instances")
176
+ @patch("aws_bootstrap.cli.resolve_instance_id", return_value="i-abc123")
177
+ def test_terminate_by_alias(mock_resolve, mock_terminate, mock_session, mock_remove_ssh):
178
+ mock_terminate.return_value = [
179
+ {
180
+ "InstanceId": "i-abc123",
181
+ "PreviousState": {"Name": "running"},
182
+ "CurrentState": {"Name": "shutting-down"},
183
+ }
184
+ ]
185
+ runner = CliRunner()
186
+ result = runner.invoke(main, ["terminate", "--yes", "aws-gpu1"])
187
+ assert result.exit_code == 0
188
+ assert "Resolved alias 'aws-gpu1' -> i-abc123" in result.output
189
+ assert "Terminated 1" in result.output
190
+ mock_resolve.assert_called_once_with("aws-gpu1")
191
+ mock_terminate.assert_called_once()
192
+ assert mock_terminate.call_args[0][1] == ["i-abc123"]
193
+
194
+
195
+ @patch("aws_bootstrap.cli.boto3.Session")
196
+ @patch("aws_bootstrap.cli.resolve_instance_id", return_value=None)
197
+ def test_terminate_unknown_alias_errors(mock_resolve, mock_session):
198
+ runner = CliRunner()
199
+ result = runner.invoke(main, ["terminate", "--yes", "aws-gpu99"])
200
+ assert result.exit_code != 0
201
+ assert "Could not resolve 'aws-gpu99'" in result.output
202
+
203
+
204
+ @patch("aws_bootstrap.cli.remove_ssh_host", return_value=None)
205
+ @patch("aws_bootstrap.cli.boto3.Session")
206
+ @patch("aws_bootstrap.cli.terminate_tagged_instances")
207
+ @patch("aws_bootstrap.cli.resolve_instance_id", return_value="i-abc123")
208
+ def test_terminate_by_instance_id_passthrough(mock_resolve, mock_terminate, mock_session, mock_remove_ssh):
209
+ """Instance IDs are passed through without resolution message."""
210
+ mock_resolve.return_value = "i-abc123"
211
+ mock_terminate.return_value = [
212
+ {
213
+ "InstanceId": "i-abc123",
214
+ "PreviousState": {"Name": "running"},
215
+ "CurrentState": {"Name": "shutting-down"},
216
+ }
217
+ ]
218
+ runner = CliRunner()
219
+ result = runner.invoke(main, ["terminate", "--yes", "i-abc123"])
220
+ assert result.exit_code == 0
221
+ assert "Resolved alias" not in result.output
222
+ assert "Terminated 1" in result.output
223
+
224
+
173
225
  @patch("aws_bootstrap.cli.boto3.Session")
174
226
  @patch("aws_bootstrap.cli.find_tagged_instances")
175
227
  def test_terminate_cancelled(mock_find, mock_session):
@@ -565,7 +617,7 @@ def test_status_instructions_shown_by_default(mock_find, mock_spot, mock_session
565
617
  assert result.exit_code == 0
566
618
  assert "ssh aws-gpu1" in result.output
567
619
  assert "ssh -NL 8888:localhost:8888 aws-gpu1" in result.output
568
- assert "vscode-remote://ssh-remote+aws-gpu1/home/ubuntu" in result.output
620
+ assert "vscode-remote://ssh-remote+aws-gpu1/home/ubuntu/workspace" in result.output
569
621
  assert "python ~/gpu_benchmark.py" in result.output
570
622
 
571
623
 
@@ -6,6 +6,7 @@ import stat
6
6
  from pathlib import Path
7
7
 
8
8
  from aws_bootstrap.ssh import (
9
+ _is_instance_id,
9
10
  _next_alias,
10
11
  _read_ssh_config,
11
12
  add_ssh_host,
@@ -13,6 +14,7 @@ from aws_bootstrap.ssh import (
13
14
  get_ssh_host_details,
14
15
  list_ssh_hosts,
15
16
  remove_ssh_host,
17
+ resolve_instance_id,
16
18
  )
17
19
 
18
20
 
@@ -331,3 +333,77 @@ def test_get_ssh_host_details_default_port(tmp_path):
331
333
  details = get_ssh_host_details("i-abc123", config_path=cfg)
332
334
  assert details is not None
333
335
  assert details.port == 22
336
+
337
+
338
+ # ---------------------------------------------------------------------------
339
+ # Instance ID detection
340
+ # ---------------------------------------------------------------------------
341
+
342
+
343
+ def test_is_instance_id_valid_short():
344
+ assert _is_instance_id("i-abcdef01") is True
345
+
346
+
347
+ def test_is_instance_id_valid_long():
348
+ assert _is_instance_id("i-0123456789abcdef0") is True
349
+
350
+
351
+ def test_is_instance_id_rejects_alias():
352
+ assert _is_instance_id("aws-gpu1") is False
353
+
354
+
355
+ def test_is_instance_id_rejects_empty():
356
+ assert _is_instance_id("") is False
357
+
358
+
359
+ def test_is_instance_id_rejects_prefix_only():
360
+ assert _is_instance_id("i-") is False
361
+
362
+
363
+ def test_is_instance_id_rejects_uppercase():
364
+ assert _is_instance_id("i-ABCDEF01") is False
365
+
366
+
367
+ def test_is_instance_id_rejects_too_short():
368
+ assert _is_instance_id("i-abc") is False
369
+
370
+
371
+ # ---------------------------------------------------------------------------
372
+ # resolve_instance_id
373
+ # ---------------------------------------------------------------------------
374
+
375
+
376
+ def test_resolve_passthrough_instance_id(tmp_path):
377
+ """Instance IDs are returned as-is without consulting SSH config."""
378
+ cfg = _config_path(tmp_path)
379
+ cfg.parent.mkdir(parents=True, exist_ok=True)
380
+ cfg.write_text("")
381
+ result = resolve_instance_id("i-0123456789abcdef0", config_path=cfg)
382
+ assert result == "i-0123456789abcdef0"
383
+
384
+
385
+ def test_resolve_alias_to_instance_id(tmp_path):
386
+ cfg = _config_path(tmp_path)
387
+ add_ssh_host("i-abc12345", "1.2.3.4", "ubuntu", KEY_PATH, config_path=cfg)
388
+ result = resolve_instance_id("aws-gpu1", config_path=cfg)
389
+ assert result == "i-abc12345"
390
+
391
+
392
+ def test_resolve_alias_multiple_hosts(tmp_path):
393
+ cfg = _config_path(tmp_path)
394
+ add_ssh_host("i-111aaa11", "1.1.1.1", "ubuntu", KEY_PATH, config_path=cfg)
395
+ add_ssh_host("i-222bbb22", "2.2.2.2", "ubuntu", KEY_PATH, config_path=cfg)
396
+ assert resolve_instance_id("aws-gpu1", config_path=cfg) == "i-111aaa11"
397
+ assert resolve_instance_id("aws-gpu2", config_path=cfg) == "i-222bbb22"
398
+
399
+
400
+ def test_resolve_unknown_alias_returns_none(tmp_path):
401
+ cfg = _config_path(tmp_path)
402
+ cfg.parent.mkdir(parents=True, exist_ok=True)
403
+ cfg.write_text("")
404
+ assert resolve_instance_id("aws-gpu99", config_path=cfg) is None
405
+
406
+
407
+ def test_resolve_nonexistent_config_returns_none(tmp_path):
408
+ cfg = tmp_path / "no_such_file"
409
+ assert resolve_instance_id("aws-gpu1", config_path=cfg) is None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: aws-bootstrap-g4dn
3
- Version: 0.3.0
3
+ Version: 0.5.0
4
4
  Summary: Bootstrap AWS EC2 GPU instances for hybrid local-remote development
5
5
  Author: Adam Ever-Hadani
6
6
  License-Expression: MIT
@@ -49,7 +49,7 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
49
49
  ### 🎯 Target Workflows
50
50
 
51
51
  1. **Jupyter server-client** — Jupyter runs on the instance, connect from your local browser
52
- 2. **VSCode Remote SSH** — `ssh aws-gpu1` just works with the Remote SSH extension
52
+ 2. **VSCode Remote SSH** — opens `~/workspace` with pre-configured CUDA debug/build tasks and an example `.cu` file
53
53
  3. **NVIDIA Nsight remote debugging** — GPU debugging over SSH
54
54
 
55
55
  ---
@@ -162,6 +162,7 @@ The setup script runs automatically on the instance after SSH becomes available:
162
162
  | **GPU smoke test notebook** | Copies `gpu_smoke_test.ipynb` to `~/gpu_smoke_test.ipynb` (open in JupyterLab) |
163
163
  | **Jupyter** | Configures and starts JupyterLab as a systemd service on port 8888 |
164
164
  | **SSH keepalive** | Configures server-side keepalive to prevent idle disconnects |
165
+ | **VSCode workspace** | Creates `~/workspace/.vscode/` with `launch.json` and `tasks.json` (auto-detected `cuda-gdb` path and GPU arch), plus an example `saxpy.cu` |
165
166
 
166
167
  ### 📊 GPU Benchmark
167
168
 
@@ -200,6 +201,28 @@ ssh -i ~/.ssh/id_ed25519 -NL 8888:localhost:8888 ubuntu@<public-ip>
200
201
 
201
202
  A **GPU smoke test notebook** (`~/gpu_smoke_test.ipynb`) is pre-installed on every instance. Open it in JupyterLab to interactively verify the CUDA stack, run FP32/FP16 matmuls, train a small CNN on MNIST, and visualise training loss and GPU memory usage.
202
203
 
204
+ ### 🖥️ VSCode Remote SSH
205
+
206
+ The remote setup creates a `~/workspace` folder with pre-configured CUDA debug and build tasks:
207
+
208
+ ```
209
+ ~/workspace/
210
+ ├── .vscode/
211
+ │ ├── launch.json # CUDA debug configs (cuda-gdb path auto-detected)
212
+ │ └── tasks.json # nvcc build tasks (GPU arch auto-detected, e.g. sm_75)
213
+ └── saxpy.cu # Example CUDA source — open and press F5 to debug
214
+ ```
215
+
216
+ Connect directly from your terminal:
217
+
218
+ ```bash
219
+ code --folder-uri vscode-remote://ssh-remote+aws-gpu1/home/ubuntu/workspace
220
+ ```
221
+
222
+ Then install the [Nsight VSCE extension](https://marketplace.visualstudio.com/items?itemName=NVIDIA.nsight-vscode-edition) on the remote when prompted. Open `saxpy.cu`, set a breakpoint, and press F5.
223
+
224
+ See [Nsight remote profiling guide](docs/nsight-remote-profiling.md) for more details on CUDA debugging and profiling workflows.
225
+
203
226
  ### 📋 Listing Resources
204
227
 
205
228
  ```bash
@@ -238,8 +261,14 @@ aws-bootstrap status --region us-east-1
238
261
  # Terminate all aws-bootstrap instances (with confirmation prompt)
239
262
  aws-bootstrap terminate
240
263
 
241
- # Terminate specific instances
242
- aws-bootstrap terminate i-abc123 i-def456
264
+ # Terminate by SSH alias (resolved via ~/.ssh/config)
265
+ aws-bootstrap terminate aws-gpu1
266
+
267
+ # Terminate by instance ID
268
+ aws-bootstrap terminate i-0123456789abcdef0
269
+
270
+ # Mix aliases and instance IDs
271
+ aws-bootstrap terminate aws-gpu1 i-0fedcba9876543210
243
272
 
244
273
  # Skip confirmation prompt
245
274
  aws-bootstrap terminate --yes
@@ -251,7 +280,7 @@ aws-bootstrap terminate --yes
251
280
  CUDA: 12.8 (driver supports up to 13.0)
252
281
  ```
253
282
 
254
- SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances.
283
+ SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances. You can use aliases anywhere you'd use an instance ID, e.g. `aws-bootstrap terminate aws-gpu1`.
255
284
 
256
285
  ## EC2 vCPU Quotas
257
286
 
@@ -322,7 +351,7 @@ aws-bootstrap launch --instance-type t3.medium --ami-filter "ubuntu/images/hvm-s
322
351
  | GPU instance pricing | [instances.vantage.sh](https://instances.vantage.sh/aws/ec2/g4dn.xlarge) |
323
352
  | Spot instance quotas | [AWS docs](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-spot-limits.html) |
324
353
  | Deep Learning AMIs | [AWS docs](https://docs.aws.amazon.com/dlami/latest/devguide/what-is-dlami.html) |
325
- | Nvidia Nsight remote debugging | [Nvidia docs](https://docs.nvidia.com/nsight-visual-studio-edition/3.2/Content/Setup_Remote_Debugging.htm) |
354
+ | Nsight remote GPU profiling | [Guide](docs/nsight-remote-profiling.md) — Nsight Compute, Nsight Systems, and Nsight VSCE on EC2 |
326
355
 
327
356
  Tutorials on setting up a CUDA environment on EC2 GPU instances:
328
357
 
@@ -0,0 +1,27 @@
1
+ aws_bootstrap/__init__.py,sha256=kl_jvrunGyIyizdRqAP6ROb5P1BBrXX5PTq5gq1ipU0,82
2
+ aws_bootstrap/cli.py,sha256=N2hT0XEC-4k5Cs3iGfA_xt_onc__NMNmh8fCaV4frgc,21076
3
+ aws_bootstrap/config.py,sha256=TeCOYDlijT-KD5SFIzc-VvBhOqcq9YCgen9NK63rka8,895
4
+ aws_bootstrap/ec2.py,sha256=LHpzW91ayK45gsWV_B4LanSZIhWggqTsL31qHUceiaA,12274
5
+ aws_bootstrap/gpu.py,sha256=WTnHR0s3mQHDlnzqRgqAC6omWz7nT5YtGpcs0Bf88jk,692
6
+ aws_bootstrap/ssh.py,sha256=0acHNX7IG6PUvp6T72l9kHTwUs5sVXFAyJXvUfA3qnE,20131
7
+ aws_bootstrap/resources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ aws_bootstrap/resources/gpu_benchmark.py,sha256=1eFt_3MXvoLhs9HahrRPhbxvtdjFaXG2Ty3GEg7Gud0,29366
9
+ aws_bootstrap/resources/gpu_smoke_test.ipynb,sha256=XvAOEIPa5H9ri5mRZqOdknmwOwKNvCME6DzBGuhRYfg,10698
10
+ aws_bootstrap/resources/launch.json,sha256=ZOcvHLy3-zBOqRTtFzuyn-_2tB64yuEn8PrJOoZ-PgE,1484
11
+ aws_bootstrap/resources/remote_setup.sh,sha256=z_YGdzwEHWInkE3dZVbBNa0F_joTeVhnOpCYOj1CK30,8331
12
+ aws_bootstrap/resources/requirements.txt,sha256=gpYl1MFCfWXiAhbIUgAjuTHONz3MKci25msIyOkMmUk,75
13
+ aws_bootstrap/resources/saxpy.cu,sha256=1BSESEwGGCx3KWx9ZJ8jiPHQ42KzQN6i2aP0I28bPsA,1178
14
+ aws_bootstrap/resources/tasks.json,sha256=6U8pB1N8YIWgUCfFet4ne3nYnI92tWv5D5kPiQG3Zlg,1576
15
+ aws_bootstrap/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
+ aws_bootstrap/tests/test_cli.py,sha256=m_4tIX0xYZ8BxDkHPGAWiPAKo4vETaTSKSJbyn3K1Cg,34731
17
+ aws_bootstrap/tests/test_config.py,sha256=arvET6KNl4Vqsz0zFrSdhciXGU688bfsvCr3dSpziN0,1050
18
+ aws_bootstrap/tests/test_ec2.py,sha256=Jmqsjv973hxXbZWfGgECtm6aa2156Lzji227sYMBuMg,10547
19
+ aws_bootstrap/tests/test_gpu.py,sha256=rbMuda_sIVbaCzkWXoLv9YIfnWztgRoP7NuVL8XHrUY,3871
20
+ aws_bootstrap/tests/test_ssh_config.py,sha256=YYtv82zBBLGioTo58iC31_5jUli1s0eoGV9VRCobOgY,14059
21
+ aws_bootstrap/tests/test_ssh_gpu.py,sha256=dRp86Og-8GqiATSff3rxhu83mBZdGgqI4UOnoC00Ln0,1454
22
+ aws_bootstrap_g4dn-0.5.0.dist-info/licenses/LICENSE,sha256=Hen77Mt8sazSQJ9DgrmZuAvDwo2vc5JAkR_avuFV-CM,1067
23
+ aws_bootstrap_g4dn-0.5.0.dist-info/METADATA,sha256=t8m53ZodJlZyMffeSu3Wk5bMt-Dm_Jl3q_HTbRLQbYE,13728
24
+ aws_bootstrap_g4dn-0.5.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
25
+ aws_bootstrap_g4dn-0.5.0.dist-info/entry_points.txt,sha256=T8FXfOgmLEvFi8DHaFJ3tCzId9J3_d2Y6qT98OXxCjA,57
26
+ aws_bootstrap_g4dn-0.5.0.dist-info/top_level.txt,sha256=mix9gZRs8JUv0OMSB_rwdGcRnTKzsKgHrE5fyAn5zJw,14
27
+ aws_bootstrap_g4dn-0.5.0.dist-info/RECORD,,
@@ -1,24 +0,0 @@
1
- aws_bootstrap/__init__.py,sha256=kl_jvrunGyIyizdRqAP6ROb5P1BBrXX5PTq5gq1ipU0,82
2
- aws_bootstrap/cli.py,sha256=H7Lud1PWk0O5zKGf1StARCEahrMErickuHXsWk42j3A,20481
3
- aws_bootstrap/config.py,sha256=TeCOYDlijT-KD5SFIzc-VvBhOqcq9YCgen9NK63rka8,895
4
- aws_bootstrap/ec2.py,sha256=LHpzW91ayK45gsWV_B4LanSZIhWggqTsL31qHUceiaA,12274
5
- aws_bootstrap/gpu.py,sha256=WTnHR0s3mQHDlnzqRgqAC6omWz7nT5YtGpcs0Bf88jk,692
6
- aws_bootstrap/ssh.py,sha256=RK5Ahiwpol9-4MUvurKyNa1JorQW9VkkNtSSfPzryrU,17851
7
- aws_bootstrap/resources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
- aws_bootstrap/resources/gpu_benchmark.py,sha256=2uoss2bZGhg7c3D7Hg1-EJlOVDtzAH4co1ahSvF_lVU,29080
9
- aws_bootstrap/resources/gpu_smoke_test.ipynb,sha256=XvAOEIPa5H9ri5mRZqOdknmwOwKNvCME6DzBGuhRYfg,10698
10
- aws_bootstrap/resources/remote_setup.sh,sha256=n1joNO-6EizLsz2BPOPruFhe90kEQ9Np2SBhYXnOJRs,5648
11
- aws_bootstrap/resources/requirements.txt,sha256=gpYl1MFCfWXiAhbIUgAjuTHONz3MKci25msIyOkMmUk,75
12
- aws_bootstrap/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
- aws_bootstrap/tests/test_cli.py,sha256=vyoVVqSakC7Y2BCEFpyf2ghTUvT-QWBQC9-yvEFz3gw,32554
14
- aws_bootstrap/tests/test_config.py,sha256=arvET6KNl4Vqsz0zFrSdhciXGU688bfsvCr3dSpziN0,1050
15
- aws_bootstrap/tests/test_ec2.py,sha256=Jmqsjv973hxXbZWfGgECtm6aa2156Lzji227sYMBuMg,10547
16
- aws_bootstrap/tests/test_gpu.py,sha256=rbMuda_sIVbaCzkWXoLv9YIfnWztgRoP7NuVL8XHrUY,3871
17
- aws_bootstrap/tests/test_ssh_config.py,sha256=iQDd3hJ8to-2-QHW26Brtglfl0q0P6sCE6U_itxoNyY,11609
18
- aws_bootstrap/tests/test_ssh_gpu.py,sha256=dRp86Og-8GqiATSff3rxhu83mBZdGgqI4UOnoC00Ln0,1454
19
- aws_bootstrap_g4dn-0.3.0.dist-info/licenses/LICENSE,sha256=Hen77Mt8sazSQJ9DgrmZuAvDwo2vc5JAkR_avuFV-CM,1067
20
- aws_bootstrap_g4dn-0.3.0.dist-info/METADATA,sha256=tfsBYTSqVQf8A46P22qwdFsb_ur-Ge57hQfuDaj0mgE,12417
21
- aws_bootstrap_g4dn-0.3.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
22
- aws_bootstrap_g4dn-0.3.0.dist-info/entry_points.txt,sha256=T8FXfOgmLEvFi8DHaFJ3tCzId9J3_d2Y6qT98OXxCjA,57
23
- aws_bootstrap_g4dn-0.3.0.dist-info/top_level.txt,sha256=mix9gZRs8JUv0OMSB_rwdGcRnTKzsKgHrE5fyAn5zJw,14
24
- aws_bootstrap_g4dn-0.3.0.dist-info/RECORD,,