aws-bootstrap-g4dn 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aws_bootstrap/cli.py +172 -8
- aws_bootstrap/config.py +2 -0
- aws_bootstrap/ec2.py +128 -0
- aws_bootstrap/resources/remote_setup.sh +2 -2
- aws_bootstrap/ssh.py +121 -0
- aws_bootstrap/tests/test_cli.py +372 -4
- aws_bootstrap/tests/test_config.py +18 -0
- aws_bootstrap/tests/test_ebs.py +245 -0
- aws_bootstrap/tests/test_ssh_config.py +76 -0
- aws_bootstrap/tests/test_ssh_ebs.py +76 -0
- {aws_bootstrap_g4dn-0.5.0.dist-info → aws_bootstrap_g4dn-0.6.0.dist-info}/METADATA +53 -7
- {aws_bootstrap_g4dn-0.5.0.dist-info → aws_bootstrap_g4dn-0.6.0.dist-info}/RECORD +16 -14
- {aws_bootstrap_g4dn-0.5.0.dist-info → aws_bootstrap_g4dn-0.6.0.dist-info}/WHEEL +0 -0
- {aws_bootstrap_g4dn-0.5.0.dist-info → aws_bootstrap_g4dn-0.6.0.dist-info}/entry_points.txt +0 -0
- {aws_bootstrap_g4dn-0.5.0.dist-info → aws_bootstrap_g4dn-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {aws_bootstrap_g4dn-0.5.0.dist-info → aws_bootstrap_g4dn-0.6.0.dist-info}/top_level.txt +0 -0
aws_bootstrap/cli.py
CHANGED
|
@@ -10,8 +10,13 @@ import click
|
|
|
10
10
|
|
|
11
11
|
from .config import LaunchConfig
|
|
12
12
|
from .ec2 import (
|
|
13
|
+
EBS_MOUNT_POINT,
|
|
13
14
|
CLIError,
|
|
15
|
+
attach_ebs_volume,
|
|
16
|
+
create_ebs_volume,
|
|
17
|
+
delete_ebs_volume,
|
|
14
18
|
ensure_security_group,
|
|
19
|
+
find_ebs_volumes_for_instance,
|
|
15
20
|
find_tagged_instances,
|
|
16
21
|
get_latest_ami,
|
|
17
22
|
get_spot_price,
|
|
@@ -19,13 +24,17 @@ from .ec2 import (
|
|
|
19
24
|
list_amis,
|
|
20
25
|
list_instance_types,
|
|
21
26
|
terminate_tagged_instances,
|
|
27
|
+
validate_ebs_volume,
|
|
22
28
|
wait_instance_ready,
|
|
23
29
|
)
|
|
24
30
|
from .ssh import (
|
|
25
31
|
add_ssh_host,
|
|
32
|
+
cleanup_stale_ssh_hosts,
|
|
33
|
+
find_stale_ssh_hosts,
|
|
26
34
|
get_ssh_host_details,
|
|
27
35
|
import_key_pair,
|
|
28
36
|
list_ssh_hosts,
|
|
37
|
+
mount_ebs_volume,
|
|
29
38
|
private_key_path,
|
|
30
39
|
query_gpu_info,
|
|
31
40
|
remove_ssh_host,
|
|
@@ -120,6 +129,18 @@ def main():
|
|
|
120
129
|
help="Python version for the remote venv (e.g. 3.13, 3.14.2). Passed to uv during setup.",
|
|
121
130
|
)
|
|
122
131
|
@click.option("--ssh-port", default=22, show_default=True, type=int, help="SSH port on the remote instance.")
|
|
132
|
+
@click.option(
|
|
133
|
+
"--ebs-storage",
|
|
134
|
+
default=None,
|
|
135
|
+
type=int,
|
|
136
|
+
help="Create and attach a new EBS data volume (size in GB, gp3). Mounted at /data.",
|
|
137
|
+
)
|
|
138
|
+
@click.option(
|
|
139
|
+
"--ebs-volume-id",
|
|
140
|
+
default=None,
|
|
141
|
+
type=str,
|
|
142
|
+
help="Attach an existing EBS volume by ID (e.g. vol-0abc123). Mounted at /data.",
|
|
143
|
+
)
|
|
123
144
|
def launch(
|
|
124
145
|
instance_type,
|
|
125
146
|
ami_filter,
|
|
@@ -134,8 +155,13 @@ def launch(
|
|
|
134
155
|
profile,
|
|
135
156
|
python_version,
|
|
136
157
|
ssh_port,
|
|
158
|
+
ebs_storage,
|
|
159
|
+
ebs_volume_id,
|
|
137
160
|
):
|
|
138
161
|
"""Launch a GPU-accelerated EC2 instance."""
|
|
162
|
+
if ebs_storage is not None and ebs_volume_id is not None:
|
|
163
|
+
raise CLIError("--ebs-storage and --ebs-volume-id are mutually exclusive.")
|
|
164
|
+
|
|
139
165
|
config = LaunchConfig(
|
|
140
166
|
instance_type=instance_type,
|
|
141
167
|
spot=spot,
|
|
@@ -148,6 +174,8 @@ def launch(
|
|
|
148
174
|
dry_run=dry_run,
|
|
149
175
|
ssh_port=ssh_port,
|
|
150
176
|
python_version=python_version,
|
|
177
|
+
ebs_storage=ebs_storage,
|
|
178
|
+
ebs_volume_id=ebs_volume_id,
|
|
151
179
|
)
|
|
152
180
|
if ami_filter:
|
|
153
181
|
config.ami_filter = ami_filter
|
|
@@ -162,18 +190,21 @@ def launch(
|
|
|
162
190
|
session = boto3.Session(profile_name=config.profile, region_name=config.region)
|
|
163
191
|
ec2 = session.client("ec2")
|
|
164
192
|
|
|
193
|
+
has_ebs = config.ebs_storage is not None or config.ebs_volume_id is not None
|
|
194
|
+
total_steps = 7 if has_ebs else 6
|
|
195
|
+
|
|
165
196
|
# Step 1: AMI lookup
|
|
166
|
-
step(1,
|
|
197
|
+
step(1, total_steps, "Looking up AMI...")
|
|
167
198
|
ami = get_latest_ami(ec2, config.ami_filter)
|
|
168
199
|
info(f"Found: {ami['Name']}")
|
|
169
200
|
val("AMI ID", ami["ImageId"])
|
|
170
201
|
|
|
171
202
|
# Step 2: SSH key pair
|
|
172
|
-
step(2,
|
|
203
|
+
step(2, total_steps, "Importing SSH key pair...")
|
|
173
204
|
import_key_pair(ec2, config.key_name, config.key_path)
|
|
174
205
|
|
|
175
206
|
# Step 3: Security group
|
|
176
|
-
step(3,
|
|
207
|
+
step(3, total_steps, "Ensuring security group...")
|
|
177
208
|
sg_id = ensure_security_group(ec2, config.security_group, config.tag_value, ssh_port=config.ssh_port)
|
|
178
209
|
|
|
179
210
|
pricing = "spot" if config.spot else "on-demand"
|
|
@@ -193,18 +224,22 @@ def launch(
|
|
|
193
224
|
val("SSH port", str(config.ssh_port))
|
|
194
225
|
if config.python_version:
|
|
195
226
|
val("Python version", config.python_version)
|
|
227
|
+
if config.ebs_storage:
|
|
228
|
+
val("EBS data volume", f"{config.ebs_storage} GB gp3 (new, mounted at {EBS_MOUNT_POINT})")
|
|
229
|
+
if config.ebs_volume_id:
|
|
230
|
+
val("EBS data volume", f"{config.ebs_volume_id} (existing, mounted at {EBS_MOUNT_POINT})")
|
|
196
231
|
click.echo()
|
|
197
232
|
click.secho("No resources launched (dry-run mode).", fg="yellow")
|
|
198
233
|
return
|
|
199
234
|
|
|
200
235
|
# Step 4: Launch instance
|
|
201
|
-
step(4,
|
|
236
|
+
step(4, total_steps, f"Launching {config.instance_type} instance ({pricing})...")
|
|
202
237
|
instance = launch_instance(ec2, config, ami["ImageId"], sg_id)
|
|
203
238
|
instance_id = instance["InstanceId"]
|
|
204
239
|
val("Instance ID", instance_id)
|
|
205
240
|
|
|
206
241
|
# Step 5: Wait for ready
|
|
207
|
-
step(5,
|
|
242
|
+
step(5, total_steps, "Waiting for instance to be ready...")
|
|
208
243
|
instance = wait_instance_ready(ec2, instance_id)
|
|
209
244
|
public_ip = instance.get("PublicIpAddress")
|
|
210
245
|
if not public_ip:
|
|
@@ -213,9 +248,39 @@ def launch(
|
|
|
213
248
|
return
|
|
214
249
|
|
|
215
250
|
val("Public IP", public_ip)
|
|
251
|
+
az = instance["Placement"]["AvailabilityZone"]
|
|
252
|
+
|
|
253
|
+
# Step 5.5 (optional): EBS data volume
|
|
254
|
+
ebs_volume_attached = None
|
|
255
|
+
ebs_format = False
|
|
256
|
+
if has_ebs:
|
|
257
|
+
step(6, total_steps, "Setting up EBS data volume...")
|
|
258
|
+
if config.ebs_storage:
|
|
259
|
+
info(f"Creating {config.ebs_storage} GB gp3 volume in {az}...")
|
|
260
|
+
ebs_volume_attached = create_ebs_volume(ec2, config.ebs_storage, az, config.tag_value, instance_id)
|
|
261
|
+
val("Volume ID", ebs_volume_attached)
|
|
262
|
+
ebs_format = True
|
|
263
|
+
elif config.ebs_volume_id:
|
|
264
|
+
info(f"Validating volume {config.ebs_volume_id}...")
|
|
265
|
+
validate_ebs_volume(ec2, config.ebs_volume_id, az)
|
|
266
|
+
ebs_volume_attached = config.ebs_volume_id
|
|
267
|
+
# Tag the existing volume for discovery
|
|
268
|
+
ec2.create_tags(
|
|
269
|
+
Resources=[ebs_volume_attached],
|
|
270
|
+
Tags=[
|
|
271
|
+
{"Key": "aws-bootstrap-instance", "Value": instance_id},
|
|
272
|
+
{"Key": "created-by", "Value": config.tag_value},
|
|
273
|
+
],
|
|
274
|
+
)
|
|
275
|
+
ebs_format = False
|
|
216
276
|
|
|
217
|
-
|
|
218
|
-
|
|
277
|
+
info(f"Attaching {ebs_volume_attached} to {instance_id}...")
|
|
278
|
+
attach_ebs_volume(ec2, ebs_volume_attached, instance_id)
|
|
279
|
+
success("EBS volume attached.")
|
|
280
|
+
|
|
281
|
+
# SSH and remote setup step
|
|
282
|
+
ssh_step = 7 if has_ebs else 6
|
|
283
|
+
step(ssh_step, total_steps, "Waiting for SSH access...")
|
|
219
284
|
private_key = private_key_path(config.key_path)
|
|
220
285
|
if not wait_for_ssh(public_ip, config.ssh_user, config.key_path, port=config.ssh_port):
|
|
221
286
|
warn("SSH did not become available within the timeout.")
|
|
@@ -238,6 +303,22 @@ def launch(
|
|
|
238
303
|
else:
|
|
239
304
|
warn("Remote setup failed. Instance is still running.")
|
|
240
305
|
|
|
306
|
+
# Mount EBS volume via SSH (after setup so the instance is fully ready)
|
|
307
|
+
if ebs_volume_attached:
|
|
308
|
+
info(f"Mounting EBS volume at {EBS_MOUNT_POINT}...")
|
|
309
|
+
if mount_ebs_volume(
|
|
310
|
+
public_ip,
|
|
311
|
+
config.ssh_user,
|
|
312
|
+
config.key_path,
|
|
313
|
+
ebs_volume_attached,
|
|
314
|
+
mount_point=EBS_MOUNT_POINT,
|
|
315
|
+
format_volume=ebs_format,
|
|
316
|
+
port=config.ssh_port,
|
|
317
|
+
):
|
|
318
|
+
success(f"EBS volume mounted at {EBS_MOUNT_POINT}.")
|
|
319
|
+
else:
|
|
320
|
+
warn(f"Failed to mount EBS volume at {EBS_MOUNT_POINT}. You may need to mount it manually.")
|
|
321
|
+
|
|
241
322
|
# Add SSH config alias
|
|
242
323
|
alias = add_ssh_host(
|
|
243
324
|
instance_id=instance_id,
|
|
@@ -260,6 +341,12 @@ def launch(
|
|
|
260
341
|
val("Instance", config.instance_type)
|
|
261
342
|
val("Pricing", pricing)
|
|
262
343
|
val("SSH alias", alias)
|
|
344
|
+
if ebs_volume_attached:
|
|
345
|
+
if config.ebs_storage:
|
|
346
|
+
ebs_label = f"{ebs_volume_attached} ({config.ebs_storage} GB, {EBS_MOUNT_POINT})"
|
|
347
|
+
else:
|
|
348
|
+
ebs_label = f"{ebs_volume_attached} ({EBS_MOUNT_POINT})"
|
|
349
|
+
val("EBS data volume", ebs_label)
|
|
263
350
|
|
|
264
351
|
port_flag = f" -p {config.ssh_port}" if config.ssh_port != 22 else ""
|
|
265
352
|
|
|
@@ -371,6 +458,12 @@ def status(region, profile, gpu, instructions):
|
|
|
371
458
|
else:
|
|
372
459
|
click.echo(" GPU: " + click.style("unavailable", dim=True))
|
|
373
460
|
|
|
461
|
+
# EBS data volumes
|
|
462
|
+
ebs_volumes = find_ebs_volumes_for_instance(ec2, inst["InstanceId"], "aws-bootstrap-g4dn")
|
|
463
|
+
for vol in ebs_volumes:
|
|
464
|
+
vol_state = f", {vol['State']}" if vol["State"] != "in-use" else ""
|
|
465
|
+
val(" EBS", f"{vol['VolumeId']} ({vol['Size']} GB, {EBS_MOUNT_POINT}{vol_state})")
|
|
466
|
+
|
|
374
467
|
lifecycle = inst["Lifecycle"]
|
|
375
468
|
is_spot = lifecycle == "spot"
|
|
376
469
|
|
|
@@ -429,8 +522,9 @@ def status(region, profile, gpu, instructions):
|
|
|
429
522
|
@click.option("--region", default="us-west-2", show_default=True, help="AWS region.")
|
|
430
523
|
@click.option("--profile", default=None, help="AWS profile override.")
|
|
431
524
|
@click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.")
|
|
525
|
+
@click.option("--keep-ebs", is_flag=True, default=False, help="Preserve EBS data volumes instead of deleting them.")
|
|
432
526
|
@click.argument("instance_ids", nargs=-1, metavar="[INSTANCE_ID_OR_ALIAS]...")
|
|
433
|
-
def terminate(region, profile, yes, instance_ids):
|
|
527
|
+
def terminate(region, profile, yes, keep_ebs, instance_ids):
|
|
434
528
|
"""Terminate instances created by aws-bootstrap.
|
|
435
529
|
|
|
436
530
|
Pass specific instance IDs or SSH aliases (e.g. aws-gpu1) to terminate,
|
|
@@ -468,6 +562,13 @@ def terminate(region, profile, yes, instance_ids):
|
|
|
468
562
|
click.secho(" Cancelled.", fg="yellow")
|
|
469
563
|
return
|
|
470
564
|
|
|
565
|
+
# Discover EBS volumes before termination (while instances still exist)
|
|
566
|
+
ebs_by_instance: dict[str, list[dict]] = {}
|
|
567
|
+
for target in targets:
|
|
568
|
+
volumes = find_ebs_volumes_for_instance(ec2, target, "aws-bootstrap-g4dn")
|
|
569
|
+
if volumes:
|
|
570
|
+
ebs_by_instance[target] = volumes
|
|
571
|
+
|
|
471
572
|
changes = terminate_tagged_instances(ec2, targets)
|
|
472
573
|
click.echo()
|
|
473
574
|
for change in changes:
|
|
@@ -479,10 +580,73 @@ def terminate(region, profile, yes, instance_ids):
|
|
|
479
580
|
removed_alias = remove_ssh_host(change["InstanceId"])
|
|
480
581
|
if removed_alias:
|
|
481
582
|
info(f"Removed SSH config alias: {removed_alias}")
|
|
583
|
+
|
|
584
|
+
# Handle EBS volume cleanup
|
|
585
|
+
for _iid, volumes in ebs_by_instance.items():
|
|
586
|
+
for vol in volumes:
|
|
587
|
+
vid = vol["VolumeId"]
|
|
588
|
+
if keep_ebs:
|
|
589
|
+
click.echo()
|
|
590
|
+
info(f"Preserving EBS volume: {vid} ({vol['Size']} GB)")
|
|
591
|
+
info(f"Reattach with: aws-bootstrap launch --ebs-volume-id {vid}")
|
|
592
|
+
else:
|
|
593
|
+
click.echo()
|
|
594
|
+
info(f"Waiting for EBS volume {vid} to detach...")
|
|
595
|
+
try:
|
|
596
|
+
waiter = ec2.get_waiter("volume_available")
|
|
597
|
+
waiter.wait(VolumeIds=[vid], WaiterConfig={"Delay": 10, "MaxAttempts": 30})
|
|
598
|
+
delete_ebs_volume(ec2, vid)
|
|
599
|
+
success(f"Deleted EBS volume: {vid}")
|
|
600
|
+
except Exception as e:
|
|
601
|
+
warn(f"Failed to delete EBS volume {vid}: {e}")
|
|
602
|
+
|
|
482
603
|
click.echo()
|
|
483
604
|
success(f"Terminated {len(changes)} instance(s).")
|
|
484
605
|
|
|
485
606
|
|
|
607
|
+
@main.command()
@click.option("--dry-run", is_flag=True, default=False, help="Show what would be removed without removing.")
@click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.")
@click.option("--region", default="us-west-2", show_default=True, help="AWS region.")
@click.option("--profile", default=None, help="AWS profile override.")
def cleanup(dry_run, yes, region, profile):
    """Remove stale SSH config entries for terminated instances.

    An entry is treated as stale when its instance ID is not among the
    instances returned by ``find_tagged_instances`` for the selected
    region/profile. With ``--dry-run`` the entries are only listed; otherwise
    the user is asked to confirm unless ``--yes`` was given.
    """
    ec2 = boto3.Session(profile_name=profile, region_name=region).client("ec2")

    # Instance IDs that still exist according to EC2; everything else is stale.
    live_ids = {instance["InstanceId"] for instance in find_tagged_instances(ec2, "aws-bootstrap-g4dn")}

    stale_entries = find_stale_ssh_hosts(live_ids)
    if not stale_entries:
        click.secho("No stale SSH config entries found.", fg="green")
        return

    click.secho(f"\n Found {len(stale_entries)} stale SSH config entry(ies):\n", bold=True, fg="cyan")
    for instance_id, host_alias in stale_entries:
        click.echo(" " + click.style(host_alias, fg="bright_white") + f" ({instance_id})")

    if dry_run:
        # Report only; the SSH config is left untouched.
        click.echo()
        for instance_id, host_alias in stale_entries:
            info(f"Would remove {host_alias} ({instance_id})")
        return

    if not yes:
        click.echo()
        confirmed = click.confirm(f" Remove {len(stale_entries)} stale entry(ies)?")
        if not confirmed:
            click.secho(" Cancelled.", fg="yellow")
            return

    outcomes = cleanup_stale_ssh_hosts(live_ids)
    click.echo()
    for outcome in outcomes:
        success(f"Removed {outcome.alias} ({outcome.instance_id})")

    click.echo()
    success(f"Cleaned up {len(outcomes)} stale entry(ies).")
|
|
648
|
+
|
|
649
|
+
|
|
486
650
|
# ---------------------------------------------------------------------------
|
|
487
651
|
# list command group
|
|
488
652
|
# ---------------------------------------------------------------------------
|
aws_bootstrap/config.py
CHANGED
aws_bootstrap/ec2.py
CHANGED
|
@@ -9,6 +9,10 @@ import click
|
|
|
9
9
|
from .config import LaunchConfig
|
|
10
10
|
|
|
11
11
|
|
|
12
|
+
EBS_DEVICE_NAME = "/dev/sdf"
|
|
13
|
+
EBS_MOUNT_POINT = "/data"
|
|
14
|
+
|
|
15
|
+
|
|
12
16
|
class CLIError(click.ClickException):
|
|
13
17
|
"""A ClickException that displays the error message in red."""
|
|
14
18
|
|
|
@@ -339,3 +343,127 @@ def wait_instance_ready(ec2_client, instance_id: str) -> dict:
|
|
|
339
343
|
desc = ec2_client.describe_instances(InstanceIds=[instance_id])
|
|
340
344
|
instance = desc["Reservations"][0]["Instances"][0]
|
|
341
345
|
return instance
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
# ---------------------------------------------------------------------------
|
|
349
|
+
# EBS data volume operations
|
|
350
|
+
# ---------------------------------------------------------------------------
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def create_ebs_volume(ec2_client, size_gb: int, availability_zone: str, tag_value: str, instance_id: str) -> str:
    """Create a gp3 data volume and block until EC2 reports it available.

    The volume is tagged at creation time with ``created-by`` (*tag_value*),
    ``Name`` (``aws-bootstrap-data-<instance_id>``) and
    ``aws-bootstrap-instance`` (*instance_id*).

    Returns the ID of the new volume.
    """
    volume_tags = [
        {"Key": "created-by", "Value": tag_value},
        {"Key": "Name", "Value": f"aws-bootstrap-data-{instance_id}"},
        {"Key": "aws-bootstrap-instance", "Value": instance_id},
    ]
    created = ec2_client.create_volume(
        AvailabilityZone=availability_zone,
        Size=size_gb,
        VolumeType="gp3",
        TagSpecifications=[{"ResourceType": "volume", "Tags": volume_tags}],
    )
    new_volume_id = created["VolumeId"]

    # Poll every 5 seconds, at most 24 times, for the "available" state.
    ec2_client.get_waiter("volume_available").wait(
        VolumeIds=[new_volume_id],
        WaiterConfig={"Delay": 5, "MaxAttempts": 24},
    )
    return new_volume_id
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def validate_ebs_volume(ec2_client, volume_id: str, availability_zone: str) -> dict:
    """Check that an existing EBS volume can be attached to the instance.

    The volume must exist, be in the ``available`` state, and live in
    *availability_zone*.

    Returns the volume description dict from ``describe_volumes``.

    Raises CLIError when any of those checks fails; other client errors
    propagate unchanged.
    """
    try:
        described = ec2_client.describe_volumes(VolumeIds=[volume_id])
    except botocore.exceptions.ClientError as err:
        # Only the "no such volume" error is translated into a CLIError.
        if err.response["Error"]["Code"] != "InvalidVolume.NotFound":
            raise
        raise CLIError(f"EBS volume not found: {volume_id}") from None

    candidates = described["Volumes"]
    if not candidates:
        raise CLIError(f"EBS volume not found: {volume_id}")
    volume = candidates[0]

    state = volume["State"]
    if state != "available":
        raise CLIError(
            f"EBS volume {volume_id} is currently '{state}' (must be 'available').\n"
            " Detach it from its current instance first."
        )

    volume_az = volume["AvailabilityZone"]
    if volume_az != availability_zone:
        raise CLIError(
            f"EBS volume {volume_id} is in {volume_az} "
            f"but the instance is in {availability_zone}.\n"
            " EBS volumes must be in the same availability zone as the instance."
        )

    return volume
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
def attach_ebs_volume(ec2_client, volume_id: str, instance_id: str, device_name: str = EBS_DEVICE_NAME) -> None:
    """Attach *volume_id* to *instance_id* as *device_name* and wait until it is in use."""
    attachment = {
        "VolumeId": volume_id,
        "InstanceId": instance_id,
        "Device": device_name,
    }
    ec2_client.attach_volume(**attachment)

    # Poll every 5 seconds, at most 24 times, for the "in-use" state.
    ec2_client.get_waiter("volume_in_use").wait(
        VolumeIds=[volume_id],
        WaiterConfig={"Delay": 5, "MaxAttempts": 24},
    )
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def detach_ebs_volume(ec2_client, volume_id: str) -> None:
    """Request detachment of *volume_id* and wait until it is available again."""
    ec2_client.detach_volume(VolumeId=volume_id)

    # Poll every 5 seconds, at most 24 times, for the "available" state.
    polling = {"Delay": 5, "MaxAttempts": 24}
    ec2_client.get_waiter("volume_available").wait(VolumeIds=[volume_id], WaiterConfig=polling)
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def delete_ebs_volume(ec2_client, volume_id: str) -> None:
    """Issue a delete request for the EBS volume *volume_id*."""
    request = {"VolumeId": volume_id}
    ec2_client.delete_volume(**request)
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def find_ebs_volumes_for_instance(ec2_client, instance_id: str, tag_value: str) -> list[dict]:
    """Look up the data volumes that are tagged as belonging to an instance.

    Volumes are matched purely by tags: ``aws-bootstrap-instance`` must equal
    *instance_id* and ``created-by`` must equal *tag_value*, so untagged
    volumes (such as the root volume) are not returned.

    Returns a list of dicts with the keys ``VolumeId``, ``Size``, ``Device``
    and ``State``. ``Device`` is an empty string when the volume has no
    attachment. An EC2 client error yields an empty list.
    """
    tag_filters = [
        {"Name": "tag:aws-bootstrap-instance", "Values": [instance_id]},
        {"Name": "tag:created-by", "Values": [tag_value]},
    ]
    try:
        described = ec2_client.describe_volumes(Filters=tag_filters)
    except botocore.exceptions.ClientError:
        # Best-effort lookup: treat API failures as "no volumes found".
        return []

    def _summarize(volume: dict) -> dict:
        # Only the first attachment's device name is reported.
        attachments = volume.get("Attachments")
        device = attachments[0].get("Device", "") if attachments else ""
        return {
            "VolumeId": volume["VolumeId"],
            "Size": volume["Size"],
            "Device": device,
            "State": volume["State"],
        }

    return [_summarize(volume) for volume in described.get("Volumes", [])]
|
|
@@ -48,8 +48,8 @@ fi
|
|
|
48
48
|
# 2. Install utilities
|
|
49
49
|
echo ""
|
|
50
50
|
echo "[2/6] Installing utilities..."
|
|
51
|
-
sudo apt-get update -qq
|
|
52
|
-
sudo apt-get install -y -qq htop tmux tree jq
|
|
51
|
+
sudo DEBIAN_FRONTEND=noninteractive apt-get update -qq
|
|
52
|
+
sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq htop tmux tree jq ffmpeg
|
|
53
53
|
|
|
54
54
|
# 3. Set up Python environment with uv
|
|
55
55
|
echo ""
|
aws_bootstrap/ssh.py
CHANGED
|
@@ -374,6 +374,37 @@ def list_ssh_hosts(config_path: Path | None = None) -> dict[str, str]:
|
|
|
374
374
|
return result
|
|
375
375
|
|
|
376
376
|
|
|
377
|
+
def find_stale_ssh_hosts(live_instance_ids: set[str], config_path: Path | None = None) -> list[tuple[str, str]]:
    """List SSH config entries that point at instances which are no longer live.

    Every entry reported by ``list_ssh_hosts`` whose instance ID is absent
    from *live_instance_ids* is considered stale.

    Returns ``[(instance_id, alias), ...]`` ordered by alias.
    """
    known_hosts = list_ssh_hosts(config_path)
    stale_pairs = (
        (instance_id, alias)
        for instance_id, alias in known_hosts.items()
        if instance_id not in live_instance_ids
    )
    # Order by the alias (second element of each pair).
    return sorted(stale_pairs, key=lambda pair: pair[1])
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def cleanup_stale_ssh_hosts(
    live_instance_ids: set[str],
    config_path: Path | None = None,
    dry_run: bool = False,
) -> list[CleanupResult]:
    """Remove SSH config entries for terminated/non-existent instances.

    Stale entries are those returned by ``find_stale_ssh_hosts`` for
    *live_instance_ids*. If *dry_run* is ``True``, entries are identified but
    not removed.

    Returns one :class:`CleanupResult` per stale entry. ``removed`` is
    ``True`` only when ``remove_ssh_host`` reported that it actually removed
    the entry; it is ``False`` for dry runs and for entries that could not be
    removed.
    """
    results: list[CleanupResult] = []
    for iid, alias in find_stale_ssh_hosts(live_instance_ids, config_path):
        # remove_ssh_host hands back the removed alias (callers in cli.py test
        # its truthiness), so record the real outcome instead of assuming
        # every non-dry-run removal succeeded.
        removed = False if dry_run else bool(remove_ssh_host(iid, config_path))
        results.append(CleanupResult(instance_id=iid, alias=alias, removed=removed))
    return results
|
|
406
|
+
|
|
407
|
+
|
|
377
408
|
_INSTANCE_ID_RE = re.compile(r"^i-[0-9a-f]{8,17}$")
|
|
378
409
|
|
|
379
410
|
|
|
@@ -402,6 +433,15 @@ def resolve_instance_id(value: str, config_path: Path | None = None) -> str | No
|
|
|
402
433
|
return None
|
|
403
434
|
|
|
404
435
|
|
|
436
|
+
@dataclass
class CleanupResult:
    """Outcome for one stale SSH config entry processed during cleanup."""

    # EC2 instance ID the SSH config entry was recorded for.
    instance_id: str
    # Host alias of that entry in the SSH config.
    alias: str
    # False when the entry was only identified (e.g. a dry run) and left in place.
    removed: bool
|
|
443
|
+
|
|
444
|
+
|
|
405
445
|
@dataclass
|
|
406
446
|
class SSHHostDetails:
|
|
407
447
|
"""Connection details parsed from an SSH config stanza."""
|
|
@@ -515,6 +555,87 @@ def query_gpu_info(host: str, user: str, key_path: Path, timeout: int = 10, port
|
|
|
515
555
|
return None
|
|
516
556
|
|
|
517
557
|
|
|
558
|
+
# ---------------------------------------------------------------------------
|
|
559
|
+
# EBS volume mount
|
|
560
|
+
# ---------------------------------------------------------------------------
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def mount_ebs_volume(
    host: str,
    user: str,
    key_path: Path,
    volume_id: str,
    mount_point: str = "/data",
    format_volume: bool = True,
    port: int = 22,
) -> bool:
    """Mount an EBS volume on the remote instance via SSH.

    Runs a shell script on *host* that locates the block device whose serial
    matches the volume ID (falling back to a few common device paths),
    optionally formats it as ext4 when it carries no filesystem, mounts it at
    *mount_point*, hands ownership to *user*, and adds an fstab entry keyed by
    UUID for persistence.

    Caller-supplied values (*mount_point*, *user*, the serial) are shell-quoted
    once and only expanded inside double quotes, so they cannot be split into
    extra words or inject commands into the remote script.

    Returns True on success, False on failure.
    """
    import shlex  # local import: only this function needs it

    ssh_opts = _ssh_opts(key_path)
    port_opts = ["-p", str(port)] if port != 22 else []

    # The serial compared against lsblk's SERIAL column is the volume ID with
    # its hyphen removed ("vol-0abc" -> "vol0abc"); the "vol" prefix is kept.
    vol_serial = volume_id.replace("-", "")

    format_cmd = ""
    if format_volume:
        # Only format when blkid finds no existing filesystem on the device.
        format_cmd = (
            'if ! sudo blkid "$DEVICE" > /dev/null 2>&1; then\n'
            '    echo "Formatting $DEVICE as ext4..."\n'
            '    sudo mkfs.ext4 "$DEVICE"\n'
            "fi\n"
        )

    # NOTE(review): the fallback below picks the first existing path without
    # checking which disk it is — confirm it cannot select a non-EBS disk on
    # the target instance types when format_volume is True.
    remote_script = (
        "set -e\n"
        f"MOUNT_POINT={shlex.quote(mount_point)}\n"
        f"MOUNT_USER={shlex.quote(user)}\n"
        f"SERIAL={shlex.quote(vol_serial)}\n"
        "# Detect EBS device by NVMe serial (Nitro instances)\n"
        "DEVICE=$(lsblk -o NAME,SERIAL -dpn 2>/dev/null | "
        "awk -v s=\"$SERIAL\" '$2 == s {print $1}' | head -1)\n"
        "# Fallback to common device paths\n"
        'if [ -z "$DEVICE" ]; then\n'
        "    for dev in /dev/nvme1n1 /dev/xvdf /dev/sdf; do\n"
        '        if [ -b "$dev" ]; then DEVICE="$dev"; break; fi\n'
        "    done\n"
        "fi\n"
        'if [ -z "$DEVICE" ]; then\n'
        '    echo "ERROR: Could not find EBS device" >&2\n'
        "    exit 1\n"
        "fi\n"
        'echo "Found EBS device: $DEVICE"\n'
        f"{format_cmd}"
        'sudo mkdir -p "$MOUNT_POINT"\n'
        'sudo mount "$DEVICE" "$MOUNT_POINT"\n'
        'sudo chown "$MOUNT_USER:$MOUNT_USER" "$MOUNT_POINT"\n'
        "# Add fstab entry for reboot persistence\n"
        'UUID=$(sudo blkid -s UUID -o value "$DEVICE")\n'
        'if [ -n "$UUID" ]; then\n'
        '    if ! grep -q "$UUID" /etc/fstab; then\n'
        '        echo "UUID=$UUID $MOUNT_POINT ext4 defaults,nofail 0 2" | sudo tee -a /etc/fstab > /dev/null\n'
        "    fi\n"
        "fi\n"
        'echo "Mounted $DEVICE at $MOUNT_POINT"'
    )

    cmd = [
        "ssh",
        *ssh_opts,
        *port_opts,
        "-o",
        "ConnectTimeout=10",
        f"{user}@{host}",
        remote_script,
    ]

    # Output is not captured, so the remote script's messages reach the terminal.
    result = subprocess.run(cmd, capture_output=False)
    return result.returncode == 0
|
|
637
|
+
|
|
638
|
+
|
|
518
639
|
# ---------------------------------------------------------------------------
|
|
519
640
|
# Internal helpers
|
|
520
641
|
# ---------------------------------------------------------------------------
|