aws-bootstrap-g4dn 0.4.0-py3-none-any.whl → 0.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aws_bootstrap/cli.py +190 -14
- aws_bootstrap/config.py +2 -0
- aws_bootstrap/ec2.py +128 -0
- aws_bootstrap/resources/remote_setup.sh +2 -2
- aws_bootstrap/ssh.py +149 -0
- aws_bootstrap/tests/test_cli.py +424 -4
- aws_bootstrap/tests/test_config.py +18 -0
- aws_bootstrap/tests/test_ebs.py +245 -0
- aws_bootstrap/tests/test_ssh_config.py +152 -0
- aws_bootstrap/tests/test_ssh_ebs.py +76 -0
- {aws_bootstrap_g4dn-0.4.0.dist-info → aws_bootstrap_g4dn-0.6.0.dist-info}/METADATA +62 -10
- {aws_bootstrap_g4dn-0.4.0.dist-info → aws_bootstrap_g4dn-0.6.0.dist-info}/RECORD +16 -14
- {aws_bootstrap_g4dn-0.4.0.dist-info → aws_bootstrap_g4dn-0.6.0.dist-info}/WHEEL +0 -0
- {aws_bootstrap_g4dn-0.4.0.dist-info → aws_bootstrap_g4dn-0.6.0.dist-info}/entry_points.txt +0 -0
- {aws_bootstrap_g4dn-0.4.0.dist-info → aws_bootstrap_g4dn-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {aws_bootstrap_g4dn-0.4.0.dist-info → aws_bootstrap_g4dn-0.6.0.dist-info}/top_level.txt +0 -0
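
Taken together, the user-facing additions in 0.6.0 are EBS data-volume support on launch, alias-aware terminate with a new --keep-ebs flag, and a new cleanup command for stale SSH config entries. A usage sketch assembled from the option help text and docstrings in this diff (the aws-gpu1 alias is the illustrative one from the terminate docstring):

    aws-bootstrap launch --ebs-storage 200             # create a 200 GB gp3 data volume, mounted at /data
    aws-bootstrap launch --ebs-volume-id vol-0abc123   # attach an existing volume instead
    aws-bootstrap terminate aws-gpu1 --keep-ebs        # terminate by SSH alias, preserving the data volume
    aws-bootstrap cleanup --dry-run                    # preview removal of stale SSH config entries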
aws_bootstrap/cli.py
CHANGED
@@ -10,8 +10,13 @@ import click
 
 from .config import LaunchConfig
 from .ec2 import (
+    EBS_MOUNT_POINT,
     CLIError,
+    attach_ebs_volume,
+    create_ebs_volume,
+    delete_ebs_volume,
     ensure_security_group,
+    find_ebs_volumes_for_instance,
     find_tagged_instances,
     get_latest_ami,
     get_spot_price,
@@ -19,16 +24,21 @@ from .ec2 import (
     list_amis,
     list_instance_types,
     terminate_tagged_instances,
+    validate_ebs_volume,
     wait_instance_ready,
 )
 from .ssh import (
     add_ssh_host,
+    cleanup_stale_ssh_hosts,
+    find_stale_ssh_hosts,
     get_ssh_host_details,
     import_key_pair,
     list_ssh_hosts,
+    mount_ebs_volume,
     private_key_path,
     query_gpu_info,
     remove_ssh_host,
+    resolve_instance_id,
     run_remote_setup,
     wait_for_ssh,
 )
@@ -119,6 +129,18 @@ def main():
     help="Python version for the remote venv (e.g. 3.13, 3.14.2). Passed to uv during setup.",
 )
 @click.option("--ssh-port", default=22, show_default=True, type=int, help="SSH port on the remote instance.")
+@click.option(
+    "--ebs-storage",
+    default=None,
+    type=int,
+    help="Create and attach a new EBS data volume (size in GB, gp3). Mounted at /data.",
+)
+@click.option(
+    "--ebs-volume-id",
+    default=None,
+    type=str,
+    help="Attach an existing EBS volume by ID (e.g. vol-0abc123). Mounted at /data.",
+)
 def launch(
     instance_type,
     ami_filter,
@@ -133,8 +155,13 @@ def launch(
     profile,
     python_version,
     ssh_port,
+    ebs_storage,
+    ebs_volume_id,
 ):
     """Launch a GPU-accelerated EC2 instance."""
+    if ebs_storage is not None and ebs_volume_id is not None:
+        raise CLIError("--ebs-storage and --ebs-volume-id are mutually exclusive.")
+
     config = LaunchConfig(
         instance_type=instance_type,
         spot=spot,
@@ -147,6 +174,8 @@ def launch(
         dry_run=dry_run,
         ssh_port=ssh_port,
         python_version=python_version,
+        ebs_storage=ebs_storage,
+        ebs_volume_id=ebs_volume_id,
     )
     if ami_filter:
         config.ami_filter = ami_filter
@@ -161,18 +190,21 @@ def launch(
     session = boto3.Session(profile_name=config.profile, region_name=config.region)
     ec2 = session.client("ec2")
 
+    has_ebs = config.ebs_storage is not None or config.ebs_volume_id is not None
+    total_steps = 7 if has_ebs else 6
+
     # Step 1: AMI lookup
-    step(1,
+    step(1, total_steps, "Looking up AMI...")
     ami = get_latest_ami(ec2, config.ami_filter)
     info(f"Found: {ami['Name']}")
     val("AMI ID", ami["ImageId"])
 
     # Step 2: SSH key pair
-    step(2,
+    step(2, total_steps, "Importing SSH key pair...")
     import_key_pair(ec2, config.key_name, config.key_path)
 
     # Step 3: Security group
-    step(3,
+    step(3, total_steps, "Ensuring security group...")
     sg_id = ensure_security_group(ec2, config.security_group, config.tag_value, ssh_port=config.ssh_port)
 
     pricing = "spot" if config.spot else "on-demand"
@@ -192,18 +224,22 @@ def launch(
         val("SSH port", str(config.ssh_port))
         if config.python_version:
             val("Python version", config.python_version)
+        if config.ebs_storage:
+            val("EBS data volume", f"{config.ebs_storage} GB gp3 (new, mounted at {EBS_MOUNT_POINT})")
+        if config.ebs_volume_id:
+            val("EBS data volume", f"{config.ebs_volume_id} (existing, mounted at {EBS_MOUNT_POINT})")
         click.echo()
         click.secho("No resources launched (dry-run mode).", fg="yellow")
         return
 
     # Step 4: Launch instance
-    step(4,
+    step(4, total_steps, f"Launching {config.instance_type} instance ({pricing})...")
     instance = launch_instance(ec2, config, ami["ImageId"], sg_id)
     instance_id = instance["InstanceId"]
     val("Instance ID", instance_id)
 
     # Step 5: Wait for ready
-    step(5,
+    step(5, total_steps, "Waiting for instance to be ready...")
     instance = wait_instance_ready(ec2, instance_id)
     public_ip = instance.get("PublicIpAddress")
     if not public_ip:
@@ -212,9 +248,39 @@ def launch(
         return
 
     val("Public IP", public_ip)
+    az = instance["Placement"]["AvailabilityZone"]
+
+    # Step 5.5 (optional): EBS data volume
+    ebs_volume_attached = None
+    ebs_format = False
+    if has_ebs:
+        step(6, total_steps, "Setting up EBS data volume...")
+        if config.ebs_storage:
+            info(f"Creating {config.ebs_storage} GB gp3 volume in {az}...")
+            ebs_volume_attached = create_ebs_volume(ec2, config.ebs_storage, az, config.tag_value, instance_id)
+            val("Volume ID", ebs_volume_attached)
+            ebs_format = True
+        elif config.ebs_volume_id:
+            info(f"Validating volume {config.ebs_volume_id}...")
+            validate_ebs_volume(ec2, config.ebs_volume_id, az)
+            ebs_volume_attached = config.ebs_volume_id
+            # Tag the existing volume for discovery
+            ec2.create_tags(
+                Resources=[ebs_volume_attached],
+                Tags=[
+                    {"Key": "aws-bootstrap-instance", "Value": instance_id},
+                    {"Key": "created-by", "Value": config.tag_value},
+                ],
+            )
+            ebs_format = False
 
-
-
+        info(f"Attaching {ebs_volume_attached} to {instance_id}...")
+        attach_ebs_volume(ec2, ebs_volume_attached, instance_id)
+        success("EBS volume attached.")
+
+    # SSH and remote setup step
+    ssh_step = 7 if has_ebs else 6
+    step(ssh_step, total_steps, "Waiting for SSH access...")
     private_key = private_key_path(config.key_path)
     if not wait_for_ssh(public_ip, config.ssh_user, config.key_path, port=config.ssh_port):
         warn("SSH did not become available within the timeout.")
@@ -237,6 +303,22 @@ def launch(
     else:
         warn("Remote setup failed. Instance is still running.")
 
+    # Mount EBS volume via SSH (after setup so the instance is fully ready)
+    if ebs_volume_attached:
+        info(f"Mounting EBS volume at {EBS_MOUNT_POINT}...")
+        if mount_ebs_volume(
+            public_ip,
+            config.ssh_user,
+            config.key_path,
+            ebs_volume_attached,
+            mount_point=EBS_MOUNT_POINT,
+            format_volume=ebs_format,
+            port=config.ssh_port,
+        ):
+            success(f"EBS volume mounted at {EBS_MOUNT_POINT}.")
+        else:
+            warn(f"Failed to mount EBS volume at {EBS_MOUNT_POINT}. You may need to mount it manually.")
+
     # Add SSH config alias
     alias = add_ssh_host(
         instance_id=instance_id,
@@ -259,6 +341,12 @@ def launch(
     val("Instance", config.instance_type)
     val("Pricing", pricing)
     val("SSH alias", alias)
+    if ebs_volume_attached:
+        if config.ebs_storage:
+            ebs_label = f"{ebs_volume_attached} ({config.ebs_storage} GB, {EBS_MOUNT_POINT})"
+        else:
+            ebs_label = f"{ebs_volume_attached} ({EBS_MOUNT_POINT})"
+        val("EBS data volume", ebs_label)
 
     port_flag = f" -p {config.ssh_port}" if config.ssh_port != 22 else ""
 
@@ -288,7 +376,7 @@ def launch(
 
     click.echo()
     click.secho(" Terminate:", fg="cyan")
-    click.secho(f" aws-bootstrap terminate {
+    click.secho(f" aws-bootstrap terminate {alias} --region {config.region}", bold=True)
     click.echo()
 
 
@@ -370,6 +458,12 @@ def status(region, profile, gpu, instructions):
         else:
             click.echo(" GPU: " + click.style("unavailable", dim=True))
 
+        # EBS data volumes
+        ebs_volumes = find_ebs_volumes_for_instance(ec2, inst["InstanceId"], "aws-bootstrap-g4dn")
+        for vol in ebs_volumes:
+            vol_state = f", {vol['State']}" if vol["State"] != "in-use" else ""
+            val(" EBS", f"{vol['VolumeId']} ({vol['Size']} GB, {EBS_MOUNT_POINT}{vol_state})")
+
         lifecycle = inst["Lifecycle"]
         is_spot = lifecycle == "spot"
 
@@ -419,7 +513,8 @@ def status(region, profile, gpu, instructions):
 
     click.echo()
    first_id = instances[0]["InstanceId"]
-
+    first_ref = ssh_hosts.get(first_id, first_id)
+    click.echo(" To terminate: " + click.style(f"aws-bootstrap terminate {first_ref}", bold=True))
     click.echo()
 
 
@@ -427,18 +522,29 @@ def status(region, profile, gpu, instructions):
 @click.option("--region", default="us-west-2", show_default=True, help="AWS region.")
 @click.option("--profile", default=None, help="AWS profile override.")
 @click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.")
-@click.
-
+@click.option("--keep-ebs", is_flag=True, default=False, help="Preserve EBS data volumes instead of deleting them.")
+@click.argument("instance_ids", nargs=-1, metavar="[INSTANCE_ID_OR_ALIAS]...")
+def terminate(region, profile, yes, keep_ebs, instance_ids):
     """Terminate instances created by aws-bootstrap.
 
-    Pass specific instance IDs
-    aws-bootstrap instances in the region.
+    Pass specific instance IDs or SSH aliases (e.g. aws-gpu1) to terminate,
+    or omit to terminate all aws-bootstrap instances in the region.
     """
     session = boto3.Session(profile_name=profile, region_name=region)
     ec2 = session.client("ec2")
 
     if instance_ids:
-        targets =
+        targets = []
+        for value in instance_ids:
+            resolved = resolve_instance_id(value)
+            if resolved is None:
+                raise CLIError(
+                    f"Could not resolve '{value}' to an instance ID.\n\n"
+                    " It is not a valid instance ID or a known SSH alias."
+                )
+            if resolved != value:
+                info(f"Resolved alias '{value}' -> {resolved}")
+            targets.append(resolved)
     else:
         instances = find_tagged_instances(ec2, "aws-bootstrap-g4dn")
         if not instances:
@@ -456,6 +562,13 @@ def terminate(region, profile, yes, instance_ids):
             click.secho(" Cancelled.", fg="yellow")
             return
 
+    # Discover EBS volumes before termination (while instances still exist)
+    ebs_by_instance: dict[str, list[dict]] = {}
+    for target in targets:
+        volumes = find_ebs_volumes_for_instance(ec2, target, "aws-bootstrap-g4dn")
+        if volumes:
+            ebs_by_instance[target] = volumes
+
     changes = terminate_tagged_instances(ec2, targets)
     click.echo()
     for change in changes:
@@ -467,10 +580,73 @@ def terminate(region, profile, yes, instance_ids):
         removed_alias = remove_ssh_host(change["InstanceId"])
         if removed_alias:
             info(f"Removed SSH config alias: {removed_alias}")
+
+    # Handle EBS volume cleanup
+    for _iid, volumes in ebs_by_instance.items():
+        for vol in volumes:
+            vid = vol["VolumeId"]
+            if keep_ebs:
+                click.echo()
+                info(f"Preserving EBS volume: {vid} ({vol['Size']} GB)")
+                info(f"Reattach with: aws-bootstrap launch --ebs-volume-id {vid}")
+            else:
+                click.echo()
+                info(f"Waiting for EBS volume {vid} to detach...")
+                try:
+                    waiter = ec2.get_waiter("volume_available")
+                    waiter.wait(VolumeIds=[vid], WaiterConfig={"Delay": 10, "MaxAttempts": 30})
+                    delete_ebs_volume(ec2, vid)
+                    success(f"Deleted EBS volume: {vid}")
+                except Exception as e:
+                    warn(f"Failed to delete EBS volume {vid}: {e}")
 
     click.echo()
     success(f"Terminated {len(changes)} instance(s).")
 
 
+@main.command()
+@click.option("--dry-run", is_flag=True, default=False, help="Show what would be removed without removing.")
+@click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.")
+@click.option("--region", default="us-west-2", show_default=True, help="AWS region.")
+@click.option("--profile", default=None, help="AWS profile override.")
+def cleanup(dry_run, yes, region, profile):
+    """Remove stale SSH config entries for terminated instances."""
+    session = boto3.Session(profile_name=profile, region_name=region)
+    ec2 = session.client("ec2")
+
+    live_instances = find_tagged_instances(ec2, "aws-bootstrap-g4dn")
+    live_ids = {inst["InstanceId"] for inst in live_instances}
+
+    stale = find_stale_ssh_hosts(live_ids)
+    if not stale:
+        click.secho("No stale SSH config entries found.", fg="green")
+        return
+
+    click.secho(f"\n Found {len(stale)} stale SSH config entry(ies):\n", bold=True, fg="cyan")
+    for iid, alias in stale:
+        click.echo(" " + click.style(alias, fg="bright_white") + f" ({iid})")
+
+    if dry_run:
+        click.echo()
+        for iid, alias in stale:
+            info(f"Would remove {alias} ({iid})")
+        return
+
+    if not yes:
+        click.echo()
+        if not click.confirm(f" Remove {len(stale)} stale entry(ies)?"):
+            click.secho(" Cancelled.", fg="yellow")
+            return
+
+    results = cleanup_stale_ssh_hosts(live_ids)
+    click.echo()
+    for r in results:
+        success(f"Removed {r.alias} ({r.instance_id})")
+
+    click.echo()
+    success(f"Cleaned up {len(results)} stale entry(ies).")
+
+
 # ---------------------------------------------------------------------------
 # list command group
 # ---------------------------------------------------------------------------
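
The mutual-exclusion guard at the top of launch fails before any AWS call is made, so it can be exercised without credentials. A minimal sketch using click's CliRunner (a hypothetical check, not taken from the package's own test_cli.py):

    from click.testing import CliRunner

    from aws_bootstrap.cli import main

    # CLIError is a ClickException, so click prints the message
    # ("--ebs-storage and --ebs-volume-id are mutually exclusive.")
    # and exits non-zero before boto3.Session is ever constructed.
    runner = CliRunner()
    result = runner.invoke(main, ["launch", "--ebs-storage", "100", "--ebs-volume-id", "vol-0abc123"])
    assert result.exit_code != 0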
aws_bootstrap/config.py
CHANGED
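
The config.py hunk is collapsed here, but its +2 -0 count together with the new LaunchConfig(..., ebs_storage=..., ebs_volume_id=...) call in cli.py implies two new optional fields. A sketch of the presumed shape (names from the call site; types and defaults are assumptions):

    # Presumed LaunchConfig additions (two lines, matching the +2 -0 summary above).
    ebs_storage: int | None = None    # size in GB for a new gp3 data volume
    ebs_volume_id: str | None = None  # existing volume to attach instead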
aws_bootstrap/ec2.py
CHANGED
@@ -9,6 +9,10 @@ import click
 from .config import LaunchConfig
 
 
+EBS_DEVICE_NAME = "/dev/sdf"
+EBS_MOUNT_POINT = "/data"
+
+
 class CLIError(click.ClickException):
     """A ClickException that displays the error message in red."""
 
@@ -339,3 +343,127 @@ def wait_instance_ready(ec2_client, instance_id: str) -> dict:
     desc = ec2_client.describe_instances(InstanceIds=[instance_id])
     instance = desc["Reservations"][0]["Instances"][0]
     return instance
+
+
+# ---------------------------------------------------------------------------
+# EBS data volume operations
+# ---------------------------------------------------------------------------
+
+
+def create_ebs_volume(ec2_client, size_gb: int, availability_zone: str, tag_value: str, instance_id: str) -> str:
+    """Create a gp3 EBS volume and wait for it to become available.
+
+    Returns the volume ID.
+    """
+    response = ec2_client.create_volume(
+        AvailabilityZone=availability_zone,
+        Size=size_gb,
+        VolumeType="gp3",
+        TagSpecifications=[
+            {
+                "ResourceType": "volume",
+                "Tags": [
+                    {"Key": "created-by", "Value": tag_value},
+                    {"Key": "Name", "Value": f"aws-bootstrap-data-{instance_id}"},
+                    {"Key": "aws-bootstrap-instance", "Value": instance_id},
+                ],
+            }
+        ],
+    )
+    volume_id = response["VolumeId"]
+
+    waiter = ec2_client.get_waiter("volume_available")
+    waiter.wait(VolumeIds=[volume_id], WaiterConfig={"Delay": 5, "MaxAttempts": 24})
+    return volume_id
+
+
+def validate_ebs_volume(ec2_client, volume_id: str, availability_zone: str) -> dict:
+    """Validate that an existing EBS volume can be attached.
+
+    Checks that the volume exists, is available (not in-use), and is in the
+    correct availability zone. Returns the volume description dict.
+
+    Raises CLIError for validation failures.
+    """
+    try:
+        response = ec2_client.describe_volumes(VolumeIds=[volume_id])
+    except botocore.exceptions.ClientError as e:
+        if e.response["Error"]["Code"] == "InvalidVolume.NotFound":
+            raise CLIError(f"EBS volume not found: {volume_id}") from None
+        raise
+
+    volumes = response["Volumes"]
+    if not volumes:
+        raise CLIError(f"EBS volume not found: {volume_id}")
+
+    vol = volumes[0]
+
+    if vol["State"] != "available":
+        raise CLIError(
+            f"EBS volume {volume_id} is currently '{vol['State']}' (must be 'available').\n"
+            " Detach it from its current instance first."
+        )
+
+    if vol["AvailabilityZone"] != availability_zone:
+        raise CLIError(
+            f"EBS volume {volume_id} is in {vol['AvailabilityZone']} "
+            f"but the instance is in {availability_zone}.\n"
+            " EBS volumes must be in the same availability zone as the instance."
+        )
+
+    return vol
+
+
+def attach_ebs_volume(ec2_client, volume_id: str, instance_id: str, device_name: str = EBS_DEVICE_NAME) -> None:
+    """Attach an EBS volume to an instance and wait for it to be in-use."""
+    ec2_client.attach_volume(
+        VolumeId=volume_id,
+        InstanceId=instance_id,
+        Device=device_name,
+    )
+    waiter = ec2_client.get_waiter("volume_in_use")
+    waiter.wait(VolumeIds=[volume_id], WaiterConfig={"Delay": 5, "MaxAttempts": 24})
+
+
+def detach_ebs_volume(ec2_client, volume_id: str) -> None:
+    """Detach an EBS volume and wait for it to become available."""
+    ec2_client.detach_volume(VolumeId=volume_id)
+    waiter = ec2_client.get_waiter("volume_available")
+    waiter.wait(VolumeIds=[volume_id], WaiterConfig={"Delay": 5, "MaxAttempts": 24})
+
+
+def delete_ebs_volume(ec2_client, volume_id: str) -> None:
+    """Delete an EBS volume."""
+    ec2_client.delete_volume(VolumeId=volume_id)
+
+
+def find_ebs_volumes_for_instance(ec2_client, instance_id: str, tag_value: str) -> list[dict]:
+    """Find EBS data volumes associated with an instance via tags.
+
+    Returns a list of dicts with VolumeId, Size, Device, and State.
+    Excludes root volumes (only returns volumes tagged by aws-bootstrap).
+    """
+    try:
+        response = ec2_client.describe_volumes(
+            Filters=[
+                {"Name": "tag:aws-bootstrap-instance", "Values": [instance_id]},
+                {"Name": "tag:created-by", "Values": [tag_value]},
+            ]
+        )
+    except botocore.exceptions.ClientError:
+        return []
+
+    volumes = []
+    for vol in response.get("Volumes", []):
+        device = ""
+        if vol.get("Attachments"):
+            device = vol["Attachments"][0].get("Device", "")
+        volumes.append(
+            {
+                "VolumeId": vol["VolumeId"],
+                "Size": vol["Size"],
+                "Device": device,
+                "State": vol["State"],
+            }
+        )
+    return volumes
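
Because every EBS helper takes the client as its first argument, the whole create/attach/discover flow can be exercised offline. A minimal sketch using moto's mock_aws decorator (assuming moto >= 5; this is not the package's own test_ebs.py):

    import boto3
    from moto import mock_aws

    from aws_bootstrap.ec2 import (
        attach_ebs_volume,
        create_ebs_volume,
        find_ebs_volumes_for_instance,
    )

    @mock_aws
    def volume_round_trip() -> None:
        ec2 = boto3.client("ec2", region_name="us-west-2")
        # Any mock AMI will do; moto ships a set of default images.
        ami = ec2.describe_images()["Images"][0]["ImageId"]
        inst = ec2.run_instances(ImageId=ami, MinCount=1, MaxCount=1)["Instances"][0]

        vid = create_ebs_volume(
            ec2, 100, inst["Placement"]["AvailabilityZone"], "aws-bootstrap-g4dn", inst["InstanceId"]
        )
        attach_ebs_volume(ec2, vid, inst["InstanceId"])  # waits for volume_in_use

        found = find_ebs_volumes_for_instance(ec2, inst["InstanceId"], "aws-bootstrap-g4dn")
        assert [v["VolumeId"] for v in found] == [vid]
        assert found[0]["Device"] == "/dev/sdf"  # EBS_DEVICE_NAME default

    volume_round_trip()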
aws_bootstrap/resources/remote_setup.sh
CHANGED
@@ -48,8 +48,8 @@ fi
 # 2. Install utilities
 echo ""
 echo "[2/6] Installing utilities..."
-sudo apt-get update -qq
-sudo apt-get install -y -qq htop tmux tree jq
+sudo DEBIAN_FRONTEND=noninteractive apt-get update -qq
+sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq htop tmux tree jq ffmpeg
 
 # 3. Set up Python environment with uv
 echo ""
aws_bootstrap/ssh.py
CHANGED
@@ -374,6 +374,74 @@ def list_ssh_hosts(config_path: Path | None = None) -> dict[str, str]:
     return result
 
 
+def find_stale_ssh_hosts(live_instance_ids: set[str], config_path: Path | None = None) -> list[tuple[str, str]]:
+    """Identify SSH config entries whose instances no longer exist.
+
+    Returns ``[(instance_id, alias), ...]`` for entries where the instance ID
+    is **not** in *live_instance_ids*, sorted by alias.
+    """
+    hosts = list_ssh_hosts(config_path)
+    stale = [(iid, alias) for iid, alias in hosts.items() if iid not in live_instance_ids]
+    stale.sort(key=lambda t: t[1])
+    return stale
+
+
+def cleanup_stale_ssh_hosts(
+    live_instance_ids: set[str],
+    config_path: Path | None = None,
+    dry_run: bool = False,
+) -> list[CleanupResult]:
+    """Remove SSH config entries for terminated/non-existent instances.
+
+    If *dry_run* is ``True``, entries are identified but not removed.
+    Returns a list of :class:`CleanupResult` objects.
+    """
+    stale = find_stale_ssh_hosts(live_instance_ids, config_path)
+    results: list[CleanupResult] = []
+    for iid, alias in stale:
+        if not dry_run:
+            remove_ssh_host(iid, config_path)
+        results.append(CleanupResult(instance_id=iid, alias=alias, removed=not dry_run))
+    return results
+
+
+_INSTANCE_ID_RE = re.compile(r"^i-[0-9a-f]{8,17}$")
+
+
+def _is_instance_id(value: str) -> bool:
+    """Return ``True`` if *value* looks like an EC2 instance ID (``i-`` + hex)."""
+    return _INSTANCE_ID_RE.match(value) is not None
+
+
+def resolve_instance_id(value: str, config_path: Path | None = None) -> str | None:
+    """Resolve *value* to an EC2 instance ID.
+
+    If *value* already looks like an instance ID (``i-`` prefix followed by hex
+    digits) it is returned as-is. Otherwise it is treated as an SSH host alias
+    and looked up in the managed SSH config blocks.
+
+    Returns the instance ID on success, or ``None`` if the alias was not found.
+    """
+    if _is_instance_id(value):
+        return value
+
+    hosts = list_ssh_hosts(config_path)
+    # Reverse lookup: alias -> instance_id
+    for iid, alias in hosts.items():
+        if alias == value:
+            return iid
+    return None
+
+
+@dataclass
+class CleanupResult:
+    """Result of cleaning up a single stale SSH config entry."""
+
+    instance_id: str
+    alias: str
+    removed: bool
+
+
 @dataclass
 class SSHHostDetails:
     """Connection details parsed from an SSH config stanza."""
@@ -487,6 +555,87 @@ def query_gpu_info(host: str, user: str, key_path: Path, timeout: int = 10, port
     return None
 
 
+# ---------------------------------------------------------------------------
+# EBS volume mount
+# ---------------------------------------------------------------------------
+
+
+def mount_ebs_volume(
+    host: str,
+    user: str,
+    key_path: Path,
+    volume_id: str,
+    mount_point: str = "/data",
+    format_volume: bool = True,
+    port: int = 22,
+) -> bool:
+    """Mount an EBS volume on the remote instance via SSH.
+
+    Detects the NVMe device by volume ID serial, formats if requested,
+    mounts at *mount_point*, and adds an fstab entry for persistence.
+
+    Returns True on success, False on failure.
+    """
+    ssh_opts = _ssh_opts(key_path)
+    port_opts = ["-p", str(port)] if port != 22 else []
+
+    # Strip the hyphen from the volume ID for NVMe serial matching
+    vol_serial = volume_id.replace("-", "")
+
+    format_cmd = ""
+    if format_volume:
+        format_cmd = (
+            ' if ! sudo blkid "$DEVICE" > /dev/null 2>&1; then\n'
+            ' echo "Formatting $DEVICE as ext4..."\n'
+            ' sudo mkfs.ext4 "$DEVICE"\n'
+            " fi\n"
+        )
+
+    remote_script = (
+        "set -e\n"
+        "# Detect EBS device by NVMe serial (Nitro instances)\n"
+        f'SERIAL="{vol_serial}"\n'
+        "DEVICE=$(lsblk -o NAME,SERIAL -dpn 2>/dev/null | "
+        "awk -v s=\"$SERIAL\" '$2 == s {print $1}' | head -1)\n"
+        "# Fallback to common device paths\n"
+        'if [ -z "$DEVICE" ]; then\n'
+        " for dev in /dev/nvme1n1 /dev/xvdf /dev/sdf; do\n"
+        ' if [ -b "$dev" ]; then DEVICE="$dev"; break; fi\n'
+        " done\n"
+        "fi\n"
+        'if [ -z "$DEVICE" ]; then\n'
+        ' echo "ERROR: Could not find EBS device" >&2\n'
+        " exit 1\n"
+        "fi\n"
+        'echo "Found EBS device: $DEVICE"\n'
+        f"{format_cmd}"
+        f"sudo mkdir -p {mount_point}\n"
+        f'sudo mount "$DEVICE" {mount_point}\n'
+        f"sudo chown {user}:{user} {mount_point}\n"
+        "# Add fstab entry for reboot persistence\n"
+        'UUID=$(sudo blkid -s UUID -o value "$DEVICE")\n'
+        'if [ -n "$UUID" ]; then\n'
+        f' if ! grep -q "$UUID" /etc/fstab; then\n'
+        f' echo "UUID=$UUID {mount_point} ext4 defaults,nofail 0 2" | sudo tee -a /etc/fstab > /dev/null\n'
+        " fi\n"
+        "fi\n"
+        f'echo "Mounted $DEVICE at {mount_point}"'
+    )
+
+    cmd = [
+        "ssh",
+        *ssh_opts,
+        *port_opts,
+        "-o",
+        "ConnectTimeout=10",
+        f"{user}@{host}",
+        remote_script,
+    ]
+
+    result = subprocess.run(cmd, capture_output=False)
+    return result.returncode == 0
+
+
 # ---------------------------------------------------------------------------
 # Internal helpers
 # ---------------------------------------------------------------------------
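
The device detection in mount_ebs_volume leans on a Nitro-instance convention: the kernel exposes each EBS volume's ID, with the hyphen removed, as the NVMe serial, so lsblk -o NAME,SERIAL can map a volume ID to its /dev/nvmeXn1 node. A standalone sketch of the mapping (the example ID is illustrative):

    # "vol-0abc123def4567890" appears in lsblk's SERIAL column as
    # "vol0abc123def4567890": the hyphen is dropped, the vol prefix kept.
    def nvme_serial(volume_id: str) -> str:
        return volume_id.replace("-", "")

    assert nvme_serial("vol-0abc123def4567890") == "vol0abc123def4567890"

The fstab entry written afterwards uses defaults,nofail, so a later boot does not hang if the data volume has since been detached or deleted.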