aws-bootstrap-g4dn 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
aws_bootstrap/cli.py CHANGED
@@ -10,8 +10,13 @@ import click
10
10
 
11
11
  from .config import LaunchConfig
12
12
  from .ec2 import (
13
+ EBS_MOUNT_POINT,
13
14
  CLIError,
15
+ attach_ebs_volume,
16
+ create_ebs_volume,
17
+ delete_ebs_volume,
14
18
  ensure_security_group,
19
+ find_ebs_volumes_for_instance,
15
20
  find_tagged_instances,
16
21
  get_latest_ami,
17
22
  get_spot_price,
@@ -19,16 +24,21 @@ from .ec2 import (
19
24
  list_amis,
20
25
  list_instance_types,
21
26
  terminate_tagged_instances,
27
+ validate_ebs_volume,
22
28
  wait_instance_ready,
23
29
  )
24
30
  from .ssh import (
25
31
  add_ssh_host,
32
+ cleanup_stale_ssh_hosts,
33
+ find_stale_ssh_hosts,
26
34
  get_ssh_host_details,
27
35
  import_key_pair,
28
36
  list_ssh_hosts,
37
+ mount_ebs_volume,
29
38
  private_key_path,
30
39
  query_gpu_info,
31
40
  remove_ssh_host,
41
+ resolve_instance_id,
32
42
  run_remote_setup,
33
43
  wait_for_ssh,
34
44
  )
@@ -119,6 +129,18 @@ def main():
119
129
  help="Python version for the remote venv (e.g. 3.13, 3.14.2). Passed to uv during setup.",
120
130
  )
121
131
  @click.option("--ssh-port", default=22, show_default=True, type=int, help="SSH port on the remote instance.")
132
+ @click.option(
133
+ "--ebs-storage",
134
+ default=None,
135
+ type=int,
136
+ help="Create and attach a new EBS data volume (size in GB, gp3). Mounted at /data.",
137
+ )
138
+ @click.option(
139
+ "--ebs-volume-id",
140
+ default=None,
141
+ type=str,
142
+ help="Attach an existing EBS volume by ID (e.g. vol-0abc123). Mounted at /data.",
143
+ )
122
144
  def launch(
123
145
  instance_type,
124
146
  ami_filter,
@@ -133,8 +155,13 @@ def launch(
133
155
  profile,
134
156
  python_version,
135
157
  ssh_port,
158
+ ebs_storage,
159
+ ebs_volume_id,
136
160
  ):
137
161
  """Launch a GPU-accelerated EC2 instance."""
162
+ if ebs_storage is not None and ebs_volume_id is not None:
163
+ raise CLIError("--ebs-storage and --ebs-volume-id are mutually exclusive.")
164
+
138
165
  config = LaunchConfig(
139
166
  instance_type=instance_type,
140
167
  spot=spot,
@@ -147,6 +174,8 @@ def launch(
147
174
  dry_run=dry_run,
148
175
  ssh_port=ssh_port,
149
176
  python_version=python_version,
177
+ ebs_storage=ebs_storage,
178
+ ebs_volume_id=ebs_volume_id,
150
179
  )
151
180
  if ami_filter:
152
181
  config.ami_filter = ami_filter
@@ -161,18 +190,21 @@ def launch(
161
190
  session = boto3.Session(profile_name=config.profile, region_name=config.region)
162
191
  ec2 = session.client("ec2")
163
192
 
193
+ has_ebs = config.ebs_storage is not None or config.ebs_volume_id is not None
194
+ total_steps = 7 if has_ebs else 6
195
+
164
196
  # Step 1: AMI lookup
165
- step(1, 6, "Looking up AMI...")
197
+ step(1, total_steps, "Looking up AMI...")
166
198
  ami = get_latest_ami(ec2, config.ami_filter)
167
199
  info(f"Found: {ami['Name']}")
168
200
  val("AMI ID", ami["ImageId"])
169
201
 
170
202
  # Step 2: SSH key pair
171
- step(2, 6, "Importing SSH key pair...")
203
+ step(2, total_steps, "Importing SSH key pair...")
172
204
  import_key_pair(ec2, config.key_name, config.key_path)
173
205
 
174
206
  # Step 3: Security group
175
- step(3, 6, "Ensuring security group...")
207
+ step(3, total_steps, "Ensuring security group...")
176
208
  sg_id = ensure_security_group(ec2, config.security_group, config.tag_value, ssh_port=config.ssh_port)
177
209
 
178
210
  pricing = "spot" if config.spot else "on-demand"
@@ -192,18 +224,22 @@ def launch(
192
224
  val("SSH port", str(config.ssh_port))
193
225
  if config.python_version:
194
226
  val("Python version", config.python_version)
227
+ if config.ebs_storage:
228
+ val("EBS data volume", f"{config.ebs_storage} GB gp3 (new, mounted at {EBS_MOUNT_POINT})")
229
+ if config.ebs_volume_id:
230
+ val("EBS data volume", f"{config.ebs_volume_id} (existing, mounted at {EBS_MOUNT_POINT})")
195
231
  click.echo()
196
232
  click.secho("No resources launched (dry-run mode).", fg="yellow")
197
233
  return
198
234
 
199
235
  # Step 4: Launch instance
200
- step(4, 6, f"Launching {config.instance_type} instance ({pricing})...")
236
+ step(4, total_steps, f"Launching {config.instance_type} instance ({pricing})...")
201
237
  instance = launch_instance(ec2, config, ami["ImageId"], sg_id)
202
238
  instance_id = instance["InstanceId"]
203
239
  val("Instance ID", instance_id)
204
240
 
205
241
  # Step 5: Wait for ready
206
- step(5, 6, "Waiting for instance to be ready...")
242
+ step(5, total_steps, "Waiting for instance to be ready...")
207
243
  instance = wait_instance_ready(ec2, instance_id)
208
244
  public_ip = instance.get("PublicIpAddress")
209
245
  if not public_ip:
@@ -212,9 +248,39 @@ def launch(
212
248
  return
213
249
 
214
250
  val("Public IP", public_ip)
251
+ az = instance["Placement"]["AvailabilityZone"]
252
+
253
+ # Step 5.5 (optional): EBS data volume
254
+ ebs_volume_attached = None
255
+ ebs_format = False
256
+ if has_ebs:
257
+ step(6, total_steps, "Setting up EBS data volume...")
258
+ if config.ebs_storage:
259
+ info(f"Creating {config.ebs_storage} GB gp3 volume in {az}...")
260
+ ebs_volume_attached = create_ebs_volume(ec2, config.ebs_storage, az, config.tag_value, instance_id)
261
+ val("Volume ID", ebs_volume_attached)
262
+ ebs_format = True
263
+ elif config.ebs_volume_id:
264
+ info(f"Validating volume {config.ebs_volume_id}...")
265
+ validate_ebs_volume(ec2, config.ebs_volume_id, az)
266
+ ebs_volume_attached = config.ebs_volume_id
267
+ # Tag the existing volume for discovery
268
+ ec2.create_tags(
269
+ Resources=[ebs_volume_attached],
270
+ Tags=[
271
+ {"Key": "aws-bootstrap-instance", "Value": instance_id},
272
+ {"Key": "created-by", "Value": config.tag_value},
273
+ ],
274
+ )
275
+ ebs_format = False
215
276
 
216
- # Step 6: SSH and remote setup
217
- step(6, 6, "Waiting for SSH access...")
277
+ info(f"Attaching {ebs_volume_attached} to {instance_id}...")
278
+ attach_ebs_volume(ec2, ebs_volume_attached, instance_id)
279
+ success("EBS volume attached.")
280
+
281
+ # SSH and remote setup step
282
+ ssh_step = 7 if has_ebs else 6
283
+ step(ssh_step, total_steps, "Waiting for SSH access...")
218
284
  private_key = private_key_path(config.key_path)
219
285
  if not wait_for_ssh(public_ip, config.ssh_user, config.key_path, port=config.ssh_port):
220
286
  warn("SSH did not become available within the timeout.")
@@ -237,6 +303,22 @@ def launch(
237
303
  else:
238
304
  warn("Remote setup failed. Instance is still running.")
239
305
 
306
+ # Mount EBS volume via SSH (after setup so the instance is fully ready)
307
+ if ebs_volume_attached:
308
+ info(f"Mounting EBS volume at {EBS_MOUNT_POINT}...")
309
+ if mount_ebs_volume(
310
+ public_ip,
311
+ config.ssh_user,
312
+ config.key_path,
313
+ ebs_volume_attached,
314
+ mount_point=EBS_MOUNT_POINT,
315
+ format_volume=ebs_format,
316
+ port=config.ssh_port,
317
+ ):
318
+ success(f"EBS volume mounted at {EBS_MOUNT_POINT}.")
319
+ else:
320
+ warn(f"Failed to mount EBS volume at {EBS_MOUNT_POINT}. You may need to mount it manually.")
321
+
240
322
  # Add SSH config alias
241
323
  alias = add_ssh_host(
242
324
  instance_id=instance_id,
@@ -259,6 +341,12 @@ def launch(
259
341
  val("Instance", config.instance_type)
260
342
  val("Pricing", pricing)
261
343
  val("SSH alias", alias)
344
+ if ebs_volume_attached:
345
+ if config.ebs_storage:
346
+ ebs_label = f"{ebs_volume_attached} ({config.ebs_storage} GB, {EBS_MOUNT_POINT})"
347
+ else:
348
+ ebs_label = f"{ebs_volume_attached} ({EBS_MOUNT_POINT})"
349
+ val("EBS data volume", ebs_label)
262
350
 
263
351
  port_flag = f" -p {config.ssh_port}" if config.ssh_port != 22 else ""
264
352
 
@@ -288,7 +376,7 @@ def launch(
288
376
 
289
377
  click.echo()
290
378
  click.secho(" Terminate:", fg="cyan")
291
- click.secho(f" aws-bootstrap terminate {instance_id} --region {config.region}", bold=True)
379
+ click.secho(f" aws-bootstrap terminate {alias} --region {config.region}", bold=True)
292
380
  click.echo()
293
381
 
294
382
 
@@ -370,6 +458,12 @@ def status(region, profile, gpu, instructions):
370
458
  else:
371
459
  click.echo(" GPU: " + click.style("unavailable", dim=True))
372
460
 
461
+ # EBS data volumes
462
+ ebs_volumes = find_ebs_volumes_for_instance(ec2, inst["InstanceId"], "aws-bootstrap-g4dn")
463
+ for vol in ebs_volumes:
464
+ vol_state = f", {vol['State']}" if vol["State"] != "in-use" else ""
465
+ val(" EBS", f"{vol['VolumeId']} ({vol['Size']} GB, {EBS_MOUNT_POINT}{vol_state})")
466
+
373
467
  lifecycle = inst["Lifecycle"]
374
468
  is_spot = lifecycle == "spot"
375
469
 
@@ -419,7 +513,8 @@ def status(region, profile, gpu, instructions):
419
513
 
420
514
  click.echo()
421
515
  first_id = instances[0]["InstanceId"]
422
- click.echo(" To terminate: " + click.style(f"aws-bootstrap terminate {first_id}", bold=True))
516
+ first_ref = ssh_hosts.get(first_id, first_id)
517
+ click.echo(" To terminate: " + click.style(f"aws-bootstrap terminate {first_ref}", bold=True))
423
518
  click.echo()
424
519
 
425
520
 
@@ -427,18 +522,29 @@ def status(region, profile, gpu, instructions):
427
522
  @click.option("--region", default="us-west-2", show_default=True, help="AWS region.")
428
523
  @click.option("--profile", default=None, help="AWS profile override.")
429
524
  @click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.")
430
- @click.argument("instance_ids", nargs=-1)
431
- def terminate(region, profile, yes, instance_ids):
525
+ @click.option("--keep-ebs", is_flag=True, default=False, help="Preserve EBS data volumes instead of deleting them.")
526
+ @click.argument("instance_ids", nargs=-1, metavar="[INSTANCE_ID_OR_ALIAS]...")
527
+ def terminate(region, profile, yes, keep_ebs, instance_ids):
432
528
  """Terminate instances created by aws-bootstrap.
433
529
 
434
- Pass specific instance IDs to terminate, or omit to terminate all
435
- aws-bootstrap instances in the region.
530
+ Pass specific instance IDs or SSH aliases (e.g. aws-gpu1) to terminate,
531
+ or omit to terminate all aws-bootstrap instances in the region.
436
532
  """
437
533
  session = boto3.Session(profile_name=profile, region_name=region)
438
534
  ec2 = session.client("ec2")
439
535
 
440
536
  if instance_ids:
441
- targets = list(instance_ids)
537
+ targets = []
538
+ for value in instance_ids:
539
+ resolved = resolve_instance_id(value)
540
+ if resolved is None:
541
+ raise CLIError(
542
+ f"Could not resolve '{value}' to an instance ID.\n\n"
543
+ " It is not a valid instance ID or a known SSH alias."
544
+ )
545
+ if resolved != value:
546
+ info(f"Resolved alias '{value}' -> {resolved}")
547
+ targets.append(resolved)
442
548
  else:
443
549
  instances = find_tagged_instances(ec2, "aws-bootstrap-g4dn")
444
550
  if not instances:
@@ -456,6 +562,13 @@ def terminate(region, profile, yes, instance_ids):
456
562
  click.secho(" Cancelled.", fg="yellow")
457
563
  return
458
564
 
565
+ # Discover EBS volumes before termination (while instances still exist)
566
+ ebs_by_instance: dict[str, list[dict]] = {}
567
+ for target in targets:
568
+ volumes = find_ebs_volumes_for_instance(ec2, target, "aws-bootstrap-g4dn")
569
+ if volumes:
570
+ ebs_by_instance[target] = volumes
571
+
459
572
  changes = terminate_tagged_instances(ec2, targets)
460
573
  click.echo()
461
574
  for change in changes:
@@ -467,10 +580,73 @@ def terminate(region, profile, yes, instance_ids):
467
580
  removed_alias = remove_ssh_host(change["InstanceId"])
468
581
  if removed_alias:
469
582
  info(f"Removed SSH config alias: {removed_alias}")
583
+
584
+ # Handle EBS volume cleanup
585
+ for _iid, volumes in ebs_by_instance.items():
586
+ for vol in volumes:
587
+ vid = vol["VolumeId"]
588
+ if keep_ebs:
589
+ click.echo()
590
+ info(f"Preserving EBS volume: {vid} ({vol['Size']} GB)")
591
+ info(f"Reattach with: aws-bootstrap launch --ebs-volume-id {vid}")
592
+ else:
593
+ click.echo()
594
+ info(f"Waiting for EBS volume {vid} to detach...")
595
+ try:
596
+ waiter = ec2.get_waiter("volume_available")
597
+ waiter.wait(VolumeIds=[vid], WaiterConfig={"Delay": 10, "MaxAttempts": 30})
598
+ delete_ebs_volume(ec2, vid)
599
+ success(f"Deleted EBS volume: {vid}")
600
+ except Exception as e:
601
+ warn(f"Failed to delete EBS volume {vid}: {e}")
602
+
470
603
  click.echo()
471
604
  success(f"Terminated {len(changes)} instance(s).")
472
605
 
473
606
 
607
+ @main.command()
608
+ @click.option("--dry-run", is_flag=True, default=False, help="Show what would be removed without removing.")
609
+ @click.option("--yes", "-y", is_flag=True, default=False, help="Skip confirmation prompt.")
610
+ @click.option("--region", default="us-west-2", show_default=True, help="AWS region.")
611
+ @click.option("--profile", default=None, help="AWS profile override.")
612
+ def cleanup(dry_run, yes, region, profile):
613
+ """Remove stale SSH config entries for terminated instances."""
614
+ session = boto3.Session(profile_name=profile, region_name=region)
615
+ ec2 = session.client("ec2")
616
+
617
+ live_instances = find_tagged_instances(ec2, "aws-bootstrap-g4dn")
618
+ live_ids = {inst["InstanceId"] for inst in live_instances}
619
+
620
+ stale = find_stale_ssh_hosts(live_ids)
621
+ if not stale:
622
+ click.secho("No stale SSH config entries found.", fg="green")
623
+ return
624
+
625
+ click.secho(f"\n Found {len(stale)} stale SSH config entry(ies):\n", bold=True, fg="cyan")
626
+ for iid, alias in stale:
627
+ click.echo(" " + click.style(alias, fg="bright_white") + f" ({iid})")
628
+
629
+ if dry_run:
630
+ click.echo()
631
+ for iid, alias in stale:
632
+ info(f"Would remove {alias} ({iid})")
633
+ return
634
+
635
+ if not yes:
636
+ click.echo()
637
+ if not click.confirm(f" Remove {len(stale)} stale entry(ies)?"):
638
+ click.secho(" Cancelled.", fg="yellow")
639
+ return
640
+
641
+ results = cleanup_stale_ssh_hosts(live_ids)
642
+ click.echo()
643
+ for r in results:
644
+ success(f"Removed {r.alias} ({r.instance_id})")
645
+
646
+ click.echo()
647
+ success(f"Cleaned up {len(results)} stale entry(ies).")
648
+
649
+
474
650
  # ---------------------------------------------------------------------------
475
651
  # list command group
476
652
  # ---------------------------------------------------------------------------
aws_bootstrap/config.py CHANGED
@@ -24,3 +24,5 @@ class LaunchConfig:
24
24
  alias_prefix: str = "aws-gpu"
25
25
  ssh_port: int = 22
26
26
  python_version: str | None = None
27
+ ebs_storage: int | None = None
28
+ ebs_volume_id: str | None = None
aws_bootstrap/ec2.py CHANGED
@@ -9,6 +9,10 @@ import click
9
9
  from .config import LaunchConfig
10
10
 
11
11
 
12
+ EBS_DEVICE_NAME = "/dev/sdf"
13
+ EBS_MOUNT_POINT = "/data"
14
+
15
+
12
16
  class CLIError(click.ClickException):
13
17
  """A ClickException that displays the error message in red."""
14
18
 
@@ -339,3 +343,127 @@ def wait_instance_ready(ec2_client, instance_id: str) -> dict:
339
343
  desc = ec2_client.describe_instances(InstanceIds=[instance_id])
340
344
  instance = desc["Reservations"][0]["Instances"][0]
341
345
  return instance
346
+
347
+
348
+ # ---------------------------------------------------------------------------
349
+ # EBS data volume operations
350
+ # ---------------------------------------------------------------------------
351
+
352
+
353
+ def create_ebs_volume(ec2_client, size_gb: int, availability_zone: str, tag_value: str, instance_id: str) -> str:
354
+ """Create a gp3 EBS volume and wait for it to become available.
355
+
356
+ Returns the volume ID.
357
+ """
358
+ response = ec2_client.create_volume(
359
+ AvailabilityZone=availability_zone,
360
+ Size=size_gb,
361
+ VolumeType="gp3",
362
+ TagSpecifications=[
363
+ {
364
+ "ResourceType": "volume",
365
+ "Tags": [
366
+ {"Key": "created-by", "Value": tag_value},
367
+ {"Key": "Name", "Value": f"aws-bootstrap-data-{instance_id}"},
368
+ {"Key": "aws-bootstrap-instance", "Value": instance_id},
369
+ ],
370
+ }
371
+ ],
372
+ )
373
+ volume_id = response["VolumeId"]
374
+
375
+ waiter = ec2_client.get_waiter("volume_available")
376
+ waiter.wait(VolumeIds=[volume_id], WaiterConfig={"Delay": 5, "MaxAttempts": 24})
377
+ return volume_id
378
+
379
+
380
+ def validate_ebs_volume(ec2_client, volume_id: str, availability_zone: str) -> dict:
381
+ """Validate that an existing EBS volume can be attached.
382
+
383
+ Checks that the volume exists, is available (not in-use), and is in the
384
+ correct availability zone. Returns the volume description dict.
385
+
386
+ Raises CLIError for validation failures.
387
+ """
388
+ try:
389
+ response = ec2_client.describe_volumes(VolumeIds=[volume_id])
390
+ except botocore.exceptions.ClientError as e:
391
+ if e.response["Error"]["Code"] == "InvalidVolume.NotFound":
392
+ raise CLIError(f"EBS volume not found: {volume_id}") from None
393
+ raise
394
+
395
+ volumes = response["Volumes"]
396
+ if not volumes:
397
+ raise CLIError(f"EBS volume not found: {volume_id}")
398
+
399
+ vol = volumes[0]
400
+
401
+ if vol["State"] != "available":
402
+ raise CLIError(
403
+ f"EBS volume {volume_id} is currently '{vol['State']}' (must be 'available').\n"
404
+ " Detach it from its current instance first."
405
+ )
406
+
407
+ if vol["AvailabilityZone"] != availability_zone:
408
+ raise CLIError(
409
+ f"EBS volume {volume_id} is in {vol['AvailabilityZone']} "
410
+ f"but the instance is in {availability_zone}.\n"
411
+ " EBS volumes must be in the same availability zone as the instance."
412
+ )
413
+
414
+ return vol
415
+
416
+
417
+ def attach_ebs_volume(ec2_client, volume_id: str, instance_id: str, device_name: str = EBS_DEVICE_NAME) -> None:
418
+ """Attach an EBS volume to an instance and wait for it to be in-use."""
419
+ ec2_client.attach_volume(
420
+ VolumeId=volume_id,
421
+ InstanceId=instance_id,
422
+ Device=device_name,
423
+ )
424
+ waiter = ec2_client.get_waiter("volume_in_use")
425
+ waiter.wait(VolumeIds=[volume_id], WaiterConfig={"Delay": 5, "MaxAttempts": 24})
426
+
427
+
428
+ def detach_ebs_volume(ec2_client, volume_id: str) -> None:
429
+ """Detach an EBS volume and wait for it to become available."""
430
+ ec2_client.detach_volume(VolumeId=volume_id)
431
+ waiter = ec2_client.get_waiter("volume_available")
432
+ waiter.wait(VolumeIds=[volume_id], WaiterConfig={"Delay": 5, "MaxAttempts": 24})
433
+
434
+
435
+ def delete_ebs_volume(ec2_client, volume_id: str) -> None:
436
+ """Delete an EBS volume."""
437
+ ec2_client.delete_volume(VolumeId=volume_id)
438
+
439
+
440
+ def find_ebs_volumes_for_instance(ec2_client, instance_id: str, tag_value: str) -> list[dict]:
441
+ """Find EBS data volumes associated with an instance via tags.
442
+
443
+ Returns a list of dicts with VolumeId, Size, Device, and State.
444
+ Excludes root volumes (only returns volumes tagged by aws-bootstrap).
445
+ """
446
+ try:
447
+ response = ec2_client.describe_volumes(
448
+ Filters=[
449
+ {"Name": "tag:aws-bootstrap-instance", "Values": [instance_id]},
450
+ {"Name": "tag:created-by", "Values": [tag_value]},
451
+ ]
452
+ )
453
+ except botocore.exceptions.ClientError:
454
+ return []
455
+
456
+ volumes = []
457
+ for vol in response.get("Volumes", []):
458
+ device = ""
459
+ if vol.get("Attachments"):
460
+ device = vol["Attachments"][0].get("Device", "")
461
+ volumes.append(
462
+ {
463
+ "VolumeId": vol["VolumeId"],
464
+ "Size": vol["Size"],
465
+ "Device": device,
466
+ "State": vol["State"],
467
+ }
468
+ )
469
+ return volumes
@@ -48,8 +48,8 @@ fi
48
48
  # 2. Install utilities
49
49
  echo ""
50
50
  echo "[2/6] Installing utilities..."
51
- sudo apt-get update -qq
52
- sudo apt-get install -y -qq htop tmux tree jq
51
+ sudo DEBIAN_FRONTEND=noninteractive apt-get update -qq
52
+ sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq htop tmux tree jq ffmpeg
53
53
 
54
54
  # 3. Set up Python environment with uv
55
55
  echo ""
aws_bootstrap/ssh.py CHANGED
@@ -374,6 +374,74 @@ def list_ssh_hosts(config_path: Path | None = None) -> dict[str, str]:
374
374
  return result
375
375
 
376
376
 
377
+ def find_stale_ssh_hosts(live_instance_ids: set[str], config_path: Path | None = None) -> list[tuple[str, str]]:
378
+ """Identify SSH config entries whose instances no longer exist.
379
+
380
+ Returns ``[(instance_id, alias), ...]`` for entries where the instance ID
381
+ is **not** in *live_instance_ids*, sorted by alias.
382
+ """
383
+ hosts = list_ssh_hosts(config_path)
384
+ stale = [(iid, alias) for iid, alias in hosts.items() if iid not in live_instance_ids]
385
+ stale.sort(key=lambda t: t[1])
386
+ return stale
387
+
388
+
389
+ def cleanup_stale_ssh_hosts(
390
+ live_instance_ids: set[str],
391
+ config_path: Path | None = None,
392
+ dry_run: bool = False,
393
+ ) -> list[CleanupResult]:
394
+ """Remove SSH config entries for terminated/non-existent instances.
395
+
396
+ If *dry_run* is ``True``, entries are identified but not removed.
397
+ Returns a list of :class:`CleanupResult` objects.
398
+ """
399
+ stale = find_stale_ssh_hosts(live_instance_ids, config_path)
400
+ results: list[CleanupResult] = []
401
+ for iid, alias in stale:
402
+ if not dry_run:
403
+ remove_ssh_host(iid, config_path)
404
+ results.append(CleanupResult(instance_id=iid, alias=alias, removed=not dry_run))
405
+ return results
406
+
407
+
408
+ _INSTANCE_ID_RE = re.compile(r"^i-[0-9a-f]{8,17}$")
409
+
410
+
411
+ def _is_instance_id(value: str) -> bool:
412
+ """Return ``True`` if *value* looks like an EC2 instance ID (``i-`` + hex)."""
413
+ return _INSTANCE_ID_RE.match(value) is not None
414
+
415
+
416
+ def resolve_instance_id(value: str, config_path: Path | None = None) -> str | None:
417
+ """Resolve *value* to an EC2 instance ID.
418
+
419
+ If *value* already looks like an instance ID (``i-`` prefix followed by hex
420
+ digits) it is returned as-is. Otherwise it is treated as an SSH host alias
421
+ and looked up in the managed SSH config blocks.
422
+
423
+ Returns the instance ID on success, or ``None`` if the alias was not found.
424
+ """
425
+ if _is_instance_id(value):
426
+ return value
427
+
428
+ hosts = list_ssh_hosts(config_path)
429
+ # Reverse lookup: alias -> instance_id
430
+ for iid, alias in hosts.items():
431
+ if alias == value:
432
+ return iid
433
+ return None
434
+
435
+
436
+ @dataclass
437
+ class CleanupResult:
438
+ """Result of cleaning up a single stale SSH config entry."""
439
+
440
+ instance_id: str
441
+ alias: str
442
+ removed: bool
443
+
444
+
377
445
  @dataclass
378
446
  class SSHHostDetails:
379
447
  """Connection details parsed from an SSH config stanza."""
@@ -487,6 +555,87 @@ def query_gpu_info(host: str, user: str, key_path: Path, timeout: int = 10, port
487
555
  return None
488
556
 
489
557
 
558
+ # ---------------------------------------------------------------------------
559
+ # EBS volume mount
560
+ # ---------------------------------------------------------------------------
561
+
562
+
563
+ def mount_ebs_volume(
564
+ host: str,
565
+ user: str,
566
+ key_path: Path,
567
+ volume_id: str,
568
+ mount_point: str = "/data",
569
+ format_volume: bool = True,
570
+ port: int = 22,
571
+ ) -> bool:
572
+ """Mount an EBS volume on the remote instance via SSH.
573
+
574
+ Detects the NVMe device by volume ID serial, formats if requested,
575
+ mounts at *mount_point*, and adds an fstab entry for persistence.
576
+
577
+ Returns True on success, False on failure.
578
+ """
579
+ ssh_opts = _ssh_opts(key_path)
580
+ port_opts = ["-p", str(port)] if port != 22 else []
581
+
582
+ # Strip the vol- prefix and hyphen for NVMe serial matching
583
+ vol_serial = volume_id.replace("-", "")
584
+
585
+ format_cmd = ""
586
+ if format_volume:
587
+ format_cmd = (
588
+ ' if ! sudo blkid "$DEVICE" > /dev/null 2>&1; then\n'
589
+ ' echo "Formatting $DEVICE as ext4..."\n'
590
+ ' sudo mkfs.ext4 "$DEVICE"\n'
591
+ " fi\n"
592
+ )
593
+
594
+ remote_script = (
595
+ "set -e\n"
596
+ "# Detect EBS device by NVMe serial (Nitro instances)\n"
597
+ f'SERIAL="{vol_serial}"\n'
598
+ "DEVICE=$(lsblk -o NAME,SERIAL -dpn 2>/dev/null | "
599
+ "awk -v s=\"$SERIAL\" '$2 == s {print $1}' | head -1)\n"
600
+ "# Fallback to common device paths\n"
601
+ 'if [ -z "$DEVICE" ]; then\n'
602
+ " for dev in /dev/nvme1n1 /dev/xvdf /dev/sdf; do\n"
603
+ ' if [ -b "$dev" ]; then DEVICE="$dev"; break; fi\n'
604
+ " done\n"
605
+ "fi\n"
606
+ 'if [ -z "$DEVICE" ]; then\n'
607
+ ' echo "ERROR: Could not find EBS device" >&2\n'
608
+ " exit 1\n"
609
+ "fi\n"
610
+ 'echo "Found EBS device: $DEVICE"\n'
611
+ f"{format_cmd}"
612
+ f"sudo mkdir -p {mount_point}\n"
613
+ f'sudo mount "$DEVICE" {mount_point}\n'
614
+ f"sudo chown {user}:{user} {mount_point}\n"
615
+ "# Add fstab entry for reboot persistence\n"
616
+ 'UUID=$(sudo blkid -s UUID -o value "$DEVICE")\n'
617
+ 'if [ -n "$UUID" ]; then\n'
618
+ f' if ! grep -q "$UUID" /etc/fstab; then\n'
619
+ f' echo "UUID=$UUID {mount_point} ext4 defaults,nofail 0 2" | sudo tee -a /etc/fstab > /dev/null\n'
620
+ " fi\n"
621
+ "fi\n"
622
+ f'echo "Mounted $DEVICE at {mount_point}"'
623
+ )
624
+
625
+ cmd = [
626
+ "ssh",
627
+ *ssh_opts,
628
+ *port_opts,
629
+ "-o",
630
+ "ConnectTimeout=10",
631
+ f"{user}@{host}",
632
+ remote_script,
633
+ ]
634
+
635
+ result = subprocess.run(cmd, capture_output=False)
636
+ return result.returncode == 0
637
+
638
+
490
639
  # ---------------------------------------------------------------------------
491
640
  # Internal helpers
492
641
  # ---------------------------------------------------------------------------