aws-bootstrap-g4dn 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,245 @@
+ """Tests for EBS data volume operations in ec2.py."""
+
+ from __future__ import annotations
+ from unittest.mock import MagicMock
+
+ import botocore.exceptions
+ import pytest
+
+ from aws_bootstrap.ec2 import (
+     EBS_DEVICE_NAME,
+     CLIError,
+     attach_ebs_volume,
+     create_ebs_volume,
+     delete_ebs_volume,
+     detach_ebs_volume,
+     find_ebs_volumes_for_instance,
+     validate_ebs_volume,
+ )
+
+
+ # ---------------------------------------------------------------------------
+ # create_ebs_volume
+ # ---------------------------------------------------------------------------
+
+
+ def test_create_ebs_volume():
+     ec2 = MagicMock()
+     ec2.create_volume.return_value = {"VolumeId": "vol-abc123"}
+     waiter = MagicMock()
+     ec2.get_waiter.return_value = waiter
+
+     vol_id = create_ebs_volume(ec2, 96, "us-west-2a", "aws-bootstrap-g4dn", "i-test123")
+
+     assert vol_id == "vol-abc123"
+     ec2.create_volume.assert_called_once()
+     create_kwargs = ec2.create_volume.call_args[1]
+     assert create_kwargs["AvailabilityZone"] == "us-west-2a"
+     assert create_kwargs["Size"] == 96
+     assert create_kwargs["VolumeType"] == "gp3"
+
+     # Check tags
+     tags = create_kwargs["TagSpecifications"][0]["Tags"]
+     tag_dict = {t["Key"]: t["Value"] for t in tags}
+     assert tag_dict["created-by"] == "aws-bootstrap-g4dn"
+     assert tag_dict["Name"] == "aws-bootstrap-data-i-test123"
+     assert tag_dict["aws-bootstrap-instance"] == "i-test123"
+
+     ec2.get_waiter.assert_called_once_with("volume_available")
+     waiter.wait.assert_called_once()
+
+
+ # ---------------------------------------------------------------------------
+ # validate_ebs_volume
+ # ---------------------------------------------------------------------------
+
+
+ def test_validate_ebs_volume_valid():
+     ec2 = MagicMock()
+     ec2.describe_volumes.return_value = {
+         "Volumes": [
+             {
+                 "VolumeId": "vol-abc123",
+                 "State": "available",
+                 "AvailabilityZone": "us-west-2a",
+                 "Size": 100,
+             }
+         ]
+     }
+     vol = validate_ebs_volume(ec2, "vol-abc123", "us-west-2a")
+     assert vol["VolumeId"] == "vol-abc123"
+
+
+ def test_validate_ebs_volume_wrong_az():
+     ec2 = MagicMock()
+     ec2.describe_volumes.return_value = {
+         "Volumes": [
+             {
+                 "VolumeId": "vol-abc123",
+                 "State": "available",
+                 "AvailabilityZone": "us-east-1a",
+                 "Size": 100,
+             }
+         ]
+     }
+     with pytest.raises(CLIError, match="us-east-1a"):
+         validate_ebs_volume(ec2, "vol-abc123", "us-west-2a")
+
+
+ def test_validate_ebs_volume_in_use():
+     ec2 = MagicMock()
+     ec2.describe_volumes.return_value = {
+         "Volumes": [
+             {
+                 "VolumeId": "vol-abc123",
+                 "State": "in-use",
+                 "AvailabilityZone": "us-west-2a",
+                 "Size": 100,
+             }
+         ]
+     }
+     with pytest.raises(CLIError, match="in-use"):
+         validate_ebs_volume(ec2, "vol-abc123", "us-west-2a")
+
+
+ def test_validate_ebs_volume_not_found():
+     ec2 = MagicMock()
+     ec2.describe_volumes.side_effect = botocore.exceptions.ClientError(
+         {"Error": {"Code": "InvalidVolume.NotFound", "Message": "not found"}},
+         "DescribeVolumes",
+     )
+     with pytest.raises(CLIError, match="not found"):
+         validate_ebs_volume(ec2, "vol-notfound", "us-west-2a")
+
+
+ def test_validate_ebs_volume_empty_response():
+     ec2 = MagicMock()
+     ec2.describe_volumes.return_value = {"Volumes": []}
+     with pytest.raises(CLIError, match="not found"):
+         validate_ebs_volume(ec2, "vol-empty", "us-west-2a")
+
+
+ # ---------------------------------------------------------------------------
+ # attach_ebs_volume
+ # ---------------------------------------------------------------------------
+
+
+ def test_attach_ebs_volume():
+     ec2 = MagicMock()
+     waiter = MagicMock()
+     ec2.get_waiter.return_value = waiter
+
+     attach_ebs_volume(ec2, "vol-abc123", "i-test123")
+
+     ec2.attach_volume.assert_called_once_with(
+         VolumeId="vol-abc123",
+         InstanceId="i-test123",
+         Device=EBS_DEVICE_NAME,
+     )
+     ec2.get_waiter.assert_called_once_with("volume_in_use")
+     waiter.wait.assert_called_once()
+
+
+ def test_attach_ebs_volume_custom_device():
+     ec2 = MagicMock()
+     waiter = MagicMock()
+     ec2.get_waiter.return_value = waiter
+
+     attach_ebs_volume(ec2, "vol-abc123", "i-test123", device_name="/dev/sdg")
+
+     ec2.attach_volume.assert_called_once_with(
+         VolumeId="vol-abc123",
+         InstanceId="i-test123",
+         Device="/dev/sdg",
+     )
+
+
+ # ---------------------------------------------------------------------------
+ # detach_ebs_volume
+ # ---------------------------------------------------------------------------
+
+
+ def test_detach_ebs_volume():
+     ec2 = MagicMock()
+     waiter = MagicMock()
+     ec2.get_waiter.return_value = waiter
+
+     detach_ebs_volume(ec2, "vol-abc123")
+
+     ec2.detach_volume.assert_called_once_with(VolumeId="vol-abc123")
+     ec2.get_waiter.assert_called_once_with("volume_available")
+     waiter.wait.assert_called_once()
+
+
+ # ---------------------------------------------------------------------------
+ # delete_ebs_volume
+ # ---------------------------------------------------------------------------
+
+
+ def test_delete_ebs_volume():
+     ec2 = MagicMock()
+     delete_ebs_volume(ec2, "vol-abc123")
+     ec2.delete_volume.assert_called_once_with(VolumeId="vol-abc123")
+
+
+ # ---------------------------------------------------------------------------
+ # find_ebs_volumes_for_instance
+ # ---------------------------------------------------------------------------
+
+
+ def test_find_ebs_volumes_for_instance():
+     ec2 = MagicMock()
+     ec2.describe_volumes.return_value = {
+         "Volumes": [
+             {
+                 "VolumeId": "vol-data1",
+                 "Size": 96,
+                 "State": "in-use",
+                 "Attachments": [{"Device": "/dev/sdf", "InstanceId": "i-test123"}],
+             }
+         ]
+     }
+     volumes = find_ebs_volumes_for_instance(ec2, "i-test123", "aws-bootstrap-g4dn")
+     assert len(volumes) == 1
+     assert volumes[0]["VolumeId"] == "vol-data1"
+     assert volumes[0]["Size"] == 96
+     assert volumes[0]["Device"] == "/dev/sdf"
+     assert volumes[0]["State"] == "in-use"
+
+
+ def test_find_ebs_volumes_empty():
+     ec2 = MagicMock()
+     ec2.describe_volumes.return_value = {"Volumes": []}
+     volumes = find_ebs_volumes_for_instance(ec2, "i-test123", "aws-bootstrap-g4dn")
+     assert volumes == []
+
+
+ def test_find_ebs_volumes_includes_available():
+     """Detached (available) volumes are still discovered by tags."""
+     ec2 = MagicMock()
+     ec2.describe_volumes.return_value = {
+         "Volumes": [
+             {
+                 "VolumeId": "vol-avail",
+                 "Size": 50,
+                 "State": "available",
+                 "Attachments": [],
+             }
+         ]
+     }
+     volumes = find_ebs_volumes_for_instance(ec2, "i-old", "aws-bootstrap-g4dn")
+     assert len(volumes) == 1
+     assert volumes[0]["VolumeId"] == "vol-avail"
+     assert volumes[0]["State"] == "available"
+     assert volumes[0]["Device"] == ""
+
+
+ def test_find_ebs_volumes_client_error_returns_empty():
+     """ClientError (e.g. permissions) returns empty list instead of raising."""
+     ec2 = MagicMock()
+     ec2.describe_volumes.side_effect = botocore.exceptions.ClientError(
+         {"Error": {"Code": "UnauthorizedOperation", "Message": "no access"}},
+         "DescribeVolumes",
+     )
+     volumes = find_ebs_volumes_for_instance(ec2, "i-test", "aws-bootstrap-g4dn")
+     assert volumes == []
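The hunk above adds a new EBS test module that pins down the helpers in `aws_bootstrap/ec2.py` purely through mocked boto3 calls. For orientation, here is a minimal sketch of what `create_ebs_volume` could look like given those assertions; the function name and argument order come from the tests, while the waiter arguments and everything else are assumptions rather than the packaged implementation:

```python
# Hypothetical sketch inferred from the assertions in test_create_ebs_volume;
# not the code shipped in the wheel.
def create_ebs_volume(ec2, size_gb, availability_zone, created_by, instance_id):
    """Create a tagged gp3 data volume and block until it is available."""
    response = ec2.create_volume(
        AvailabilityZone=availability_zone,
        Size=size_gb,
        VolumeType="gp3",
        TagSpecifications=[
            {
                "ResourceType": "volume",
                "Tags": [
                    {"Key": "created-by", "Value": created_by},
                    {"Key": "Name", "Value": f"aws-bootstrap-data-{instance_id}"},
                    {"Key": "aws-bootstrap-instance", "Value": instance_id},
                ],
            }
        ],
    )
    volume_id = response["VolumeId"]
    # The tests only require that the "volume_available" waiter is obtained and waited on.
    ec2.get_waiter("volume_available").wait(VolumeIds=[volume_id])
    return volume_id
```

The tags are what later let `find_ebs_volumes_for_instance`, `status`, and `terminate` rediscover the volume without tracking any local state.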
@@ -6,13 +6,17 @@ import stat
  from pathlib import Path

  from aws_bootstrap.ssh import (
+     _is_instance_id,
      _next_alias,
      _read_ssh_config,
      add_ssh_host,
+     cleanup_stale_ssh_hosts,
      find_ssh_alias,
+     find_stale_ssh_hosts,
      get_ssh_host_details,
      list_ssh_hosts,
      remove_ssh_host,
+     resolve_instance_id,
  )

@@ -331,3 +335,151 @@ def test_get_ssh_host_details_default_port(tmp_path):
      details = get_ssh_host_details("i-abc123", config_path=cfg)
      assert details is not None
      assert details.port == 22
+
+
+ # ---------------------------------------------------------------------------
+ # Instance ID detection
+ # ---------------------------------------------------------------------------
+
+
+ def test_is_instance_id_valid_short():
+     assert _is_instance_id("i-abcdef01") is True
+
+
+ def test_is_instance_id_valid_long():
+     assert _is_instance_id("i-0123456789abcdef0") is True
+
+
+ def test_is_instance_id_rejects_alias():
+     assert _is_instance_id("aws-gpu1") is False
+
+
+ def test_is_instance_id_rejects_empty():
+     assert _is_instance_id("") is False
+
+
+ def test_is_instance_id_rejects_prefix_only():
+     assert _is_instance_id("i-") is False
+
+
+ def test_is_instance_id_rejects_uppercase():
+     assert _is_instance_id("i-ABCDEF01") is False
+
+
+ def test_is_instance_id_rejects_too_short():
+     assert _is_instance_id("i-abc") is False
+
+
+ # ---------------------------------------------------------------------------
+ # resolve_instance_id
+ # ---------------------------------------------------------------------------
+
+
+ def test_resolve_passthrough_instance_id(tmp_path):
+     """Instance IDs are returned as-is without consulting SSH config."""
+     cfg = _config_path(tmp_path)
+     cfg.parent.mkdir(parents=True, exist_ok=True)
+     cfg.write_text("")
+     result = resolve_instance_id("i-0123456789abcdef0", config_path=cfg)
+     assert result == "i-0123456789abcdef0"
+
+
+ def test_resolve_alias_to_instance_id(tmp_path):
+     cfg = _config_path(tmp_path)
+     add_ssh_host("i-abc12345", "1.2.3.4", "ubuntu", KEY_PATH, config_path=cfg)
+     result = resolve_instance_id("aws-gpu1", config_path=cfg)
+     assert result == "i-abc12345"
+
+
+ def test_resolve_alias_multiple_hosts(tmp_path):
+     cfg = _config_path(tmp_path)
+     add_ssh_host("i-111aaa11", "1.1.1.1", "ubuntu", KEY_PATH, config_path=cfg)
+     add_ssh_host("i-222bbb22", "2.2.2.2", "ubuntu", KEY_PATH, config_path=cfg)
+     assert resolve_instance_id("aws-gpu1", config_path=cfg) == "i-111aaa11"
+     assert resolve_instance_id("aws-gpu2", config_path=cfg) == "i-222bbb22"
+
+
+ def test_resolve_unknown_alias_returns_none(tmp_path):
+     cfg = _config_path(tmp_path)
+     cfg.parent.mkdir(parents=True, exist_ok=True)
+     cfg.write_text("")
+     assert resolve_instance_id("aws-gpu99", config_path=cfg) is None
+
+
+ def test_resolve_nonexistent_config_returns_none(tmp_path):
+     cfg = tmp_path / "no_such_file"
+     assert resolve_instance_id("aws-gpu1", config_path=cfg) is None
+
+
+ # ---------------------------------------------------------------------------
+ # Cleanup: find_stale_ssh_hosts / cleanup_stale_ssh_hosts
+ # ---------------------------------------------------------------------------
+
+
+ def test_find_stale_ssh_hosts_finds_orphans(tmp_path):
+     cfg = _config_path(tmp_path)
+     add_ssh_host("i-111aaaa1", "1.1.1.1", "ubuntu", KEY_PATH, config_path=cfg)
+     add_ssh_host("i-222bbbb2", "2.2.2.2", "ubuntu", KEY_PATH, config_path=cfg)
+     stale = find_stale_ssh_hosts({"i-111aaaa1"}, config_path=cfg)
+     assert stale == [("i-222bbbb2", "aws-gpu2")]
+
+
+ def test_find_stale_ssh_hosts_none_stale(tmp_path):
+     cfg = _config_path(tmp_path)
+     add_ssh_host("i-111aaaa1", "1.1.1.1", "ubuntu", KEY_PATH, config_path=cfg)
+     add_ssh_host("i-222bbbb2", "2.2.2.2", "ubuntu", KEY_PATH, config_path=cfg)
+     stale = find_stale_ssh_hosts({"i-111aaaa1", "i-222bbbb2"}, config_path=cfg)
+     assert stale == []
+
+
+ def test_find_stale_ssh_hosts_all_stale(tmp_path):
+     cfg = _config_path(tmp_path)
+     add_ssh_host("i-111aaaa1", "1.1.1.1", "ubuntu", KEY_PATH, config_path=cfg)
+     add_ssh_host("i-222bbbb2", "2.2.2.2", "ubuntu", KEY_PATH, config_path=cfg)
+     stale = find_stale_ssh_hosts(set(), config_path=cfg)
+     assert len(stale) == 2
+     assert ("i-111aaaa1", "aws-gpu1") in stale
+     assert ("i-222bbbb2", "aws-gpu2") in stale
+
+
+ def test_find_stale_ssh_hosts_empty_config(tmp_path):
+     cfg = _config_path(tmp_path)
+     cfg.parent.mkdir(parents=True, exist_ok=True)
+     cfg.write_text("")
+     stale = find_stale_ssh_hosts(set(), config_path=cfg)
+     assert stale == []
+
+
+ def test_cleanup_stale_ssh_hosts_removes(tmp_path):
+     cfg = _config_path(tmp_path)
+     add_ssh_host("i-111aaaa1", "1.1.1.1", "ubuntu", KEY_PATH, config_path=cfg)
+     add_ssh_host("i-222bbbb2", "2.2.2.2", "ubuntu", KEY_PATH, config_path=cfg)
+     results = cleanup_stale_ssh_hosts({"i-111aaaa1"}, config_path=cfg)
+     assert len(results) == 1
+     assert results[0].instance_id == "i-222bbbb2"
+     assert results[0].alias == "aws-gpu2"
+     assert results[0].removed is True
+     # Verify it was actually removed from the config
+     content = cfg.read_text()
+     assert "i-222bbbb2" not in content
+     assert "i-111aaaa1" in content
+
+
+ def test_cleanup_stale_ssh_hosts_dry_run(tmp_path):
+     cfg = _config_path(tmp_path)
+     add_ssh_host("i-111aaaa1", "1.1.1.1", "ubuntu", KEY_PATH, config_path=cfg)
+     add_ssh_host("i-222bbbb2", "2.2.2.2", "ubuntu", KEY_PATH, config_path=cfg)
+     results = cleanup_stale_ssh_hosts({"i-111aaaa1"}, config_path=cfg, dry_run=True)
+     assert len(results) == 1
+     assert results[0].removed is False
+     # Verify config is unchanged
+     content = cfg.read_text()
+     assert "i-222bbbb2" in content
+     assert "i-111aaaa1" in content
+
+
+ def test_cleanup_stale_ssh_hosts_no_stale(tmp_path):
+     cfg = _config_path(tmp_path)
+     add_ssh_host("i-111aaaa1", "1.1.1.1", "ubuntu", KEY_PATH, config_path=cfg)
+     results = cleanup_stale_ssh_hosts({"i-111aaaa1"}, config_path=cfg)
+     assert results == []
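The `_is_instance_id` cases above accept the 8-character and 17-character lowercase-hex forms of EC2 instance IDs and reject everything else, which is what lets `resolve_instance_id` pass real IDs through untouched and treat anything else as an SSH alias to look up. A regex along these lines would satisfy those cases; the actual helper in `aws_bootstrap/ssh.py` may be written differently:

```python
import re

# Hypothetical sketch consistent with the test cases above; not necessarily the packaged helper.
_INSTANCE_ID_RE = re.compile(r"i-[0-9a-f]{8}(?:[0-9a-f]{9})?")


def _is_instance_id(value: str) -> bool:
    """True for EC2 instance IDs such as 'i-abcdef01' or 'i-0123456789abcdef0'."""
    return _INSTANCE_ID_RE.fullmatch(value) is not None
```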
@@ -0,0 +1,76 @@
+ """Tests for mount_ebs_volume SSH function."""
+
+ from __future__ import annotations
+ from pathlib import Path
+ from unittest.mock import MagicMock, patch
+
+ from aws_bootstrap.ssh import mount_ebs_volume
+
+
+ KEY_PATH = Path("/home/user/.ssh/id_ed25519.pub")
+
+
+ @patch("aws_bootstrap.ssh.subprocess.run")
+ def test_mount_ebs_volume_success_format(mock_run):
+     """New volume: SSH command includes mkfs."""
+     mock_run.return_value = MagicMock(returncode=0)
+
+     result = mount_ebs_volume("1.2.3.4", "ubuntu", KEY_PATH, "vol-abc123", format_volume=True)
+
+     assert result is True
+     mock_run.assert_called_once()
+     cmd = mock_run.call_args[0][0]
+     script = cmd[-1]
+     assert "mkfs.ext4" in script
+     assert "/data" in script
+     assert "volabc123" in script  # stripped vol- hyphen
+
+
+ @patch("aws_bootstrap.ssh.subprocess.run")
+ def test_mount_ebs_volume_success_no_format(mock_run):
+     """Existing volume: SSH command skips mkfs."""
+     mock_run.return_value = MagicMock(returncode=0)
+
+     result = mount_ebs_volume("1.2.3.4", "ubuntu", KEY_PATH, "vol-abc123", format_volume=False)
+
+     assert result is True
+     mock_run.assert_called_once()
+     cmd = mock_run.call_args[0][0]
+     script = cmd[-1]
+     assert "mkfs" not in script
+     assert "/data" in script
+
+
+ @patch("aws_bootstrap.ssh.subprocess.run")
+ def test_mount_ebs_volume_failure(mock_run):
+     """Non-zero exit code returns False."""
+     mock_run.return_value = MagicMock(returncode=1)
+
+     result = mount_ebs_volume("1.2.3.4", "ubuntu", KEY_PATH, "vol-abc123")
+
+     assert result is False
+
+
+ @patch("aws_bootstrap.ssh.subprocess.run")
+ def test_mount_ebs_volume_custom_port(mock_run):
+     """Non-default port is passed as -p flag."""
+     mock_run.return_value = MagicMock(returncode=0)
+
+     mount_ebs_volume("1.2.3.4", "ubuntu", KEY_PATH, "vol-abc123", port=2222)
+
+     cmd = mock_run.call_args[0][0]
+     assert "-p" in cmd
+     port_idx = cmd.index("-p")
+     assert cmd[port_idx + 1] == "2222"
+
+
+ @patch("aws_bootstrap.ssh.subprocess.run")
+ def test_mount_ebs_volume_custom_mount_point(mock_run):
+     """Custom mount point appears in the SSH script."""
+     mock_run.return_value = MagicMock(returncode=0)
+
+     mount_ebs_volume("1.2.3.4", "ubuntu", KEY_PATH, "vol-abc123", mount_point="/mnt/data")
+
+     cmd = mock_run.call_args[0][0]
+     script = cmd[-1]
+     assert "/mnt/data" in script
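These tests constrain `mount_ebs_volume` fairly tightly: the remote script is the last argument of the `ssh` command, `mkfs.ext4` appears only when `format_volume=True`, the mount point defaults to `/data`, a non-default port is passed via `-p`, and the volume ID appears with its hyphen stripped (which is how Nitro instances expose EBS volumes in `/dev/disk/by-id` NVMe links). A sketch that would pass them, with the device-resolution strategy and script details as assumptions rather than the packaged implementation:

```python
import subprocess
from pathlib import Path


# Hypothetical sketch shaped only by the assertions above; not the shipped code.
def mount_ebs_volume_sketch(
    host: str,
    user: str,
    key_path: Path,
    volume_id: str,
    *,
    format_volume: bool = False,
    mount_point: str = "/data",
    port: int = 22,
) -> bool:
    # NVMe by-id links embed the volume ID without its hyphen, e.g. vol-abc123 -> volabc123.
    serial = volume_id.replace("-", "", 1)
    steps = [
        "set -euo pipefail",
        f"DEV=$(readlink -f /dev/disk/by-id/nvme-Amazon_Elastic_Block_Store_{serial})",
    ]
    if format_volume:
        steps.append('sudo mkfs.ext4 -F "$DEV"')  # only brand-new volumes get a filesystem
    steps += [
        f"sudo mkdir -p {mount_point}",
        f'sudo mount "$DEV" {mount_point}',
        f"sudo chown {user}:{user} {mount_point}",
    ]
    script = "\n".join(steps)
    cmd = ["ssh", "-i", str(key_path), "-p", str(port), f"{user}@{host}", script]
    return subprocess.run(cmd, check=False).returncode == 0
```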
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: aws-bootstrap-g4dn
- Version: 0.4.0
+ Version: 0.6.0
  Summary: Bootstrap AWS EC2 GPU instances for hybrid local-remote development
  Author: Adam Ever-Hadani
  License-Expression: MIT
@@ -44,7 +44,8 @@ ssh aws-gpu1 # You're in, venv activated, PyTorch works
  | 📊 | **GPU benchmark included** | CNN (MNIST) + Transformer benchmarks with FP16/FP32/BF16 precision and tqdm progress |
  | 📓 | **Jupyter ready** | Lab server auto-starts as a systemd service on port 8888 — just SSH tunnel and open |
  | 🖥️ | **`status --gpu`** | Shows CUDA toolkit version, driver max, GPU architecture, spot pricing, uptime, and estimated cost |
- | 🗑️ | **Clean terminate** | Stops instances, removes SSH aliases, shows shutting-down state until fully gone |
+ | 💾 | **EBS data volumes** | Persistent storage mounted at `/data` that survives spot interruptions and termination and can be reattached to new instances |
+ | 🗑️ | **Clean terminate** | Stops instances, removes SSH aliases, cleans up EBS volumes (or preserves with `--keep-ebs`) |

  ### 🎯 Target Workflows

@@ -132,16 +133,24 @@ aws-bootstrap launch --python-version 3.13
  # Use a non-default SSH port
  aws-bootstrap launch --ssh-port 2222

+ # Attach a persistent EBS data volume (96 GB gp3, mounted at /data)
+ aws-bootstrap launch --ebs-storage 96
+
+ # Reattach an existing EBS volume from a previous instance
+ aws-bootstrap launch --ebs-volume-id vol-0abc123def456
+
  # Use a specific AWS profile
  aws-bootstrap launch --profile my-aws-profile
  ```

  After launch, the CLI:

- 1. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
- 2. **Runs remote setup** installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
- 3. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
- 4. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate
+ 1. **Creates/attaches EBS volume** (if `--ebs-storage` or `--ebs-volume-id` was specified)
+ 2. **Adds an SSH alias** (e.g. `aws-gpu1`) to `~/.ssh/config`
+ 3. **Runs remote setup** — installs utilities, creates a Python venv, installs CUDA-matched PyTorch, sets up Jupyter
+ 4. **Mounts the EBS volume** at `/data` (if applicable: new volumes are formatted, existing volumes are mounted as-is)
+ 5. **Runs a CUDA smoke test** — verifies `torch.cuda.is_available()` and runs a quick GPU matmul
+ 6. **Prints connection commands** — SSH, Jupyter tunnel, GPU benchmark, and terminate

  ```bash
  ssh aws-gpu1 # venv auto-activates on login
@@ -154,7 +163,7 @@ The setup script runs automatically on the instance after SSH becomes available:
  | Step | What |
  |------|------|
  | **GPU verify** | Confirms `nvidia-smi` and `nvcc` are working |
- | **Utilities** | Installs `htop`, `tmux`, `tree`, `jq` |
+ | **Utilities** | Installs `htop`, `tmux`, `tree`, `jq`, `ffmpeg` |
  | **Python venv** | Creates `~/venv` with `uv`, auto-activates in `~/.bashrc`. Use `--python-version` to pin a specific Python (e.g. `3.13`) |
  | **CUDA-aware PyTorch** | Detects CUDA toolkit version → installs PyTorch from the matching `cu{TAG}` wheel index |
  | **CUDA smoke test** | Runs `torch.cuda.is_available()` + GPU matmul to verify the stack |
@@ -261,11 +270,29 @@ aws-bootstrap status --region us-east-1
  # Terminate all aws-bootstrap instances (with confirmation prompt)
  aws-bootstrap terminate

- # Terminate specific instances
- aws-bootstrap terminate i-abc123 i-def456
+ # Terminate but preserve EBS data volumes for reuse
+ aws-bootstrap terminate --keep-ebs
+
+ # Terminate by SSH alias (resolved via ~/.ssh/config)
+ aws-bootstrap terminate aws-gpu1
+
+ # Terminate by instance ID
+ aws-bootstrap terminate i-abc123
+
+ # Mix aliases and instance IDs
+ aws-bootstrap terminate aws-gpu1 i-def456

  # Skip confirmation prompt
  aws-bootstrap terminate --yes
+
+ # Remove stale SSH config entries for terminated instances
+ aws-bootstrap cleanup
+
+ # Preview what would be removed without modifying config
+ aws-bootstrap cleanup --dry-run
+
+ # Skip confirmation prompt
+ aws-bootstrap cleanup --yes
  ```

  `status --gpu` reports both the **installed CUDA toolkit** version (from `nvcc`) and the **maximum CUDA version supported by the driver** (from `nvidia-smi`), so you can see at a glance whether they match:
@@ -274,7 +301,32 @@ aws-bootstrap terminate --yes
  CUDA: 12.8 (driver supports up to 13.0)
  ```

- SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances.
+ SSH aliases are managed automatically — they're created on `launch`, shown in `status`, and cleaned up on `terminate`. Aliases use sequential numbering (`aws-gpu1`, `aws-gpu2`, etc.) and never reuse numbers from previous instances. You can use aliases anywhere you'd use an instance ID, e.g. `aws-bootstrap terminate aws-gpu1`.
+
+ ## EBS Data Volumes
+
+ Attach persistent EBS storage to keep datasets and model checkpoints across instance lifecycles. Volumes are mounted at `/data` and persist independently of the instance.
+
+ ```bash
+ # Create a new 96 GB gp3 volume, formatted and mounted at /data
+ aws-bootstrap launch --ebs-storage 96
+
+ # After terminating with --keep-ebs, reattach the same volume to a new instance
+ aws-bootstrap terminate --keep-ebs
+ # Output: Preserving EBS volume: vol-0abc123...
+ # Reattach with: aws-bootstrap launch --ebs-volume-id vol-0abc123...
+
+ aws-bootstrap launch --ebs-volume-id vol-0abc123def456
+ ```
+
+ Key behaviors:
+ - `--ebs-storage` and `--ebs-volume-id` are mutually exclusive
+ - New volumes are formatted as ext4; existing volumes are mounted as-is
+ - Volumes are tagged for automatic discovery by `status` and `terminate`
+ - `terminate` deletes data volumes by default; use `--keep-ebs` to preserve them
+ - **Spot-safe** — data volumes survive spot interruptions. If AWS reclaims your instance, the volume detaches automatically and can be reattached to a new instance with `--ebs-volume-id`
+ - EBS volumes must be in the same availability zone as the instance
+ - Mount failures are non-fatal — the instance remains usable

  ## EC2 vCPU Quotas

@@ -1,27 +1,29 @@
  aws_bootstrap/__init__.py,sha256=kl_jvrunGyIyizdRqAP6ROb5P1BBrXX5PTq5gq1ipU0,82
- aws_bootstrap/cli.py,sha256=XqCKxyc294krVtggrsqm2cYrHR6DWaqQeuzrRAN5u_c,20501
- aws_bootstrap/config.py,sha256=TeCOYDlijT-KD5SFIzc-VvBhOqcq9YCgen9NK63rka8,895
- aws_bootstrap/ec2.py,sha256=LHpzW91ayK45gsWV_B4LanSZIhWggqTsL31qHUceiaA,12274
+ aws_bootstrap/cli.py,sha256=n3Ep_7zhBiRSU4ZUeGVqTRb81nzo98mxzQSKdAuiopY,27788
+ aws_bootstrap/config.py,sha256=p770XgjfuK1-wVkAEeBdtJSVkc58DKFHgaJlZ-zbGmk,967
+ aws_bootstrap/ec2.py,sha256=uNqxWWfPfGCbujQ3eonvqjjxLE76fsEyNchPS6byR6c,16719
  aws_bootstrap/gpu.py,sha256=WTnHR0s3mQHDlnzqRgqAC6omWz7nT5YtGpcs0Bf88jk,692
- aws_bootstrap/ssh.py,sha256=UFRDgNR8cljV-lwMvCy_BAJQBz7gj4a_cQIulf-2A10,19226
+ aws_bootstrap/ssh.py,sha256=xY0Yn5q4aA0Xb3ejNY-KCbooZArXRGpimSnbJiBLI_w,24059
  aws_bootstrap/resources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  aws_bootstrap/resources/gpu_benchmark.py,sha256=1eFt_3MXvoLhs9HahrRPhbxvtdjFaXG2Ty3GEg7Gud0,29366
  aws_bootstrap/resources/gpu_smoke_test.ipynb,sha256=XvAOEIPa5H9ri5mRZqOdknmwOwKNvCME6DzBGuhRYfg,10698
  aws_bootstrap/resources/launch.json,sha256=ZOcvHLy3-zBOqRTtFzuyn-_2tB64yuEn8PrJOoZ-PgE,1484
- aws_bootstrap/resources/remote_setup.sh,sha256=z_YGdzwEHWInkE3dZVbBNa0F_joTeVhnOpCYOj1CK30,8331
+ aws_bootstrap/resources/remote_setup.sh,sha256=i9qXzAqYrnDWt6nFrqeghYWVHmbbzP-OS7O_YJB3GYU,8400
  aws_bootstrap/resources/requirements.txt,sha256=gpYl1MFCfWXiAhbIUgAjuTHONz3MKci25msIyOkMmUk,75
  aws_bootstrap/resources/saxpy.cu,sha256=1BSESEwGGCx3KWx9ZJ8jiPHQ42KzQN6i2aP0I28bPsA,1178
  aws_bootstrap/resources/tasks.json,sha256=6U8pB1N8YIWgUCfFet4ne3nYnI92tWv5D5kPiQG3Zlg,1576
  aws_bootstrap/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- aws_bootstrap/tests/test_cli.py,sha256=Lwzpdovq_iJFB6qZ8NuySqzHFkQ_2Q8AAGXdITXi1Vo,32564
- aws_bootstrap/tests/test_config.py,sha256=arvET6KNl4Vqsz0zFrSdhciXGU688bfsvCr3dSpziN0,1050
+ aws_bootstrap/tests/test_cli.py,sha256=OOYzopo8gkp6fO1MT10fsqAhIs5pN9cCpJyOzWNXHCg,48638
+ aws_bootstrap/tests/test_config.py,sha256=vspSGoben_i7m4Fh6UGSes6Fkr789Y1eaOLe54fRSGc,1524
+ aws_bootstrap/tests/test_ebs.py,sha256=B2HrgSmS7yroz6zzRuPxKIXmQGlWesuGqOtybyZmHJQ,7582
  aws_bootstrap/tests/test_ec2.py,sha256=Jmqsjv973hxXbZWfGgECtm6aa2156Lzji227sYMBuMg,10547
  aws_bootstrap/tests/test_gpu.py,sha256=rbMuda_sIVbaCzkWXoLv9YIfnWztgRoP7NuVL8XHrUY,3871
- aws_bootstrap/tests/test_ssh_config.py,sha256=iQDd3hJ8to-2-QHW26Brtglfl0q0P6sCE6U_itxoNyY,11609
+ aws_bootstrap/tests/test_ssh_config.py,sha256=qy3UDdvkTfrALiF-W3m8aKvnQj3BeCrZdLjG75tcVJU,17131
+ aws_bootstrap/tests/test_ssh_ebs.py,sha256=ipt0xOzdf3kfkVt42Dgr_z7D6JDIMuRi3DqX0OP8sm0,2342
  aws_bootstrap/tests/test_ssh_gpu.py,sha256=dRp86Og-8GqiATSff3rxhu83mBZdGgqI4UOnoC00Ln0,1454
- aws_bootstrap_g4dn-0.4.0.dist-info/licenses/LICENSE,sha256=Hen77Mt8sazSQJ9DgrmZuAvDwo2vc5JAkR_avuFV-CM,1067
- aws_bootstrap_g4dn-0.4.0.dist-info/METADATA,sha256=0OQsG5kVwsbfT7dfaZoNrkOlfNRUrKr9NwljtLBKj1I,13483
- aws_bootstrap_g4dn-0.4.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
- aws_bootstrap_g4dn-0.4.0.dist-info/entry_points.txt,sha256=T8FXfOgmLEvFi8DHaFJ3tCzId9J3_d2Y6qT98OXxCjA,57
- aws_bootstrap_g4dn-0.4.0.dist-info/top_level.txt,sha256=mix9gZRs8JUv0OMSB_rwdGcRnTKzsKgHrE5fyAn5zJw,14
- aws_bootstrap_g4dn-0.4.0.dist-info/RECORD,,
+ aws_bootstrap_g4dn-0.6.0.dist-info/licenses/LICENSE,sha256=Hen77Mt8sazSQJ9DgrmZuAvDwo2vc5JAkR_avuFV-CM,1067
+ aws_bootstrap_g4dn-0.6.0.dist-info/METADATA,sha256=Ot9yCJfJup1ZzW-0cq99zT9bbswHnRJ4SWxEsJ-pK58,15859
+ aws_bootstrap_g4dn-0.6.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ aws_bootstrap_g4dn-0.6.0.dist-info/entry_points.txt,sha256=T8FXfOgmLEvFi8DHaFJ3tCzId9J3_d2Y6qT98OXxCjA,57
+ aws_bootstrap_g4dn-0.6.0.dist-info/top_level.txt,sha256=mix9gZRs8JUv0OMSB_rwdGcRnTKzsKgHrE5fyAn5zJw,14
+ aws_bootstrap_g4dn-0.6.0.dist-info/RECORD,,