dayhoff-tools 1.5.2__py3-none-any.whl → 1.5.3__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.

Potentially problematic release.


This version of dayhoff-tools might be problematic. Click here for more details.

@@ -1,6 +1,7 @@
1
1
  """Engine and Studio management commands for DHT CLI."""
2
2
 
3
3
  import json
4
+ import re
4
5
  import shutil
5
6
  import subprocess
6
7
  import sys
@@ -19,7 +20,6 @@ from rich.panel import Panel
19
20
  from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
20
21
  from rich.prompt import Confirm, IntPrompt, Prompt
21
22
  from rich.table import Table
22
- import re
23
23
 
24
24
  # Initialize Typer apps
25
25
  engine_app = typer.Typer(help="Manage compute engines for development.")
@@ -34,6 +34,10 @@ HOURLY_COSTS = {
34
34
  "t4": 0.75, # g4dn.2xlarge
35
35
  "a10g": 1.50, # g5.2xlarge
36
36
  "a100": 21.96, # p4d.24xlarge
37
+ "4_t4": 3.91, # g4dn.12xlarge
38
+ "8_t4": 7.83, # g4dn.metal
39
+ "4_a10g": 6.24, # g5.12xlarge
40
+ "8_a10g": 16.29, # g5.48xlarge
37
41
  }
38
42
 
39
43
  # SSH config management
@@ -43,6 +47,7 @@ SSH_MANAGED_COMMENT = "# Managed by dh engine"
43
47
  # Bootstrap stage helpers
44
48
  # --------------------------------------------------------------------------------
45
49
 
50
+
46
51
  def _colour_stage(stage: str) -> str:
47
52
  """Return colourised stage name for table output."""
48
53
  if not stage:
@@ -67,7 +72,14 @@ def _fetch_init_stages(instance_ids: List[str]) -> Dict[str, str]:
67
72
  for res in page["Reservations"]:
68
73
  for inst in res["Instances"]:
69
74
  iid = inst["InstanceId"]
70
- tag_val = next((t["Value"] for t in inst.get("Tags", []) if t["Key"] == "DayhoffInitStage"), None)
75
+ tag_val = next(
76
+ (
77
+ t["Value"]
78
+ for t in inst.get("Tags", [])
79
+ if t["Key"] == "DayhoffInitStage"
80
+ ),
81
+ None,
82
+ )
71
83
  if tag_val:
72
84
  stages[iid] = tag_val
73
85
  except Exception:
@@ -167,13 +179,13 @@ def format_duration(duration: timedelta) -> str:
167
179
 
168
180
  def get_disk_usage_via_ssm(instance_id: str) -> Optional[str]:
169
181
  """Get disk usage for an engine via SSM.
170
-
182
+
171
183
  Returns:
172
184
  String like "17/50 GB" or None if failed
173
185
  """
174
186
  try:
175
187
  ssm = boto3.client("ssm", region_name="us-east-1")
176
-
188
+
177
189
  # Run df command to get disk usage
178
190
  response = ssm.send_command(
179
191
  InstanceIds=[instance_id],
@@ -181,14 +193,14 @@ def get_disk_usage_via_ssm(instance_id: str) -> Optional[str]:
181
193
  Parameters={
182
194
  "commands": [
183
195
  # Get root filesystem usage in GB
184
- "df -BG / | tail -1 | awk '{gsub(/G/, \"\", $2); gsub(/G/, \"\", $3); print $3 \"/\" $2 \" GB\"}'"
196
+ 'df -BG / | tail -1 | awk \'{gsub(/G/, "", $2); gsub(/G/, "", $3); print $3 "/" $2 " GB"}\''
185
197
  ],
186
198
  "executionTimeout": ["10"],
187
199
  },
188
200
  )
189
-
201
+
190
202
  command_id = response["Command"]["CommandId"]
191
-
203
+
192
204
  # Wait for command to complete (with timeout)
193
205
  for _ in range(5): # 5 second timeout
194
206
  time.sleep(1)
@@ -198,13 +210,13 @@ def get_disk_usage_via_ssm(instance_id: str) -> Optional[str]:
198
210
  )
199
211
  if result["Status"] in ["Success", "Failed"]:
200
212
  break
201
-
213
+
202
214
  if result["Status"] == "Success":
203
215
  output = result["StandardOutputContent"].strip()
204
216
  return output if output else None
205
-
217
+
206
218
  return None
207
-
219
+
208
220
  except Exception as e:
209
221
  # logger.debug(f"Failed to get disk usage for {instance_id}: {e}") # Original code had this line commented out
210
222
  return None
@@ -212,13 +224,13 @@ def get_disk_usage_via_ssm(instance_id: str) -> Optional[str]:
212
224
 
213
225
  def get_studio_disk_usage_via_ssm(instance_id: str, username: str) -> Optional[str]:
214
226
  """Get disk usage for a studio via SSM.
215
-
227
+
216
228
  Returns:
217
229
  String like "333/500 GB" or None if failed
218
230
  """
219
231
  try:
220
232
  ssm = boto3.client("ssm", region_name="us-east-1")
221
-
233
+
222
234
  # Run df command to get studio disk usage
223
235
  response = ssm.send_command(
224
236
  InstanceIds=[instance_id],
@@ -226,14 +238,14 @@ def get_studio_disk_usage_via_ssm(instance_id: str, username: str) -> Optional[s
226
238
  Parameters={
227
239
  "commands": [
228
240
  # Get studio filesystem usage in GB
229
- f"df -BG /studios/{username} 2>/dev/null | tail -1 | awk '{{gsub(/G/, \"\", $2); gsub(/G/, \"\", $3); print $3 \"/\" $2 \" GB\"}}'"
241
+ f'df -BG /studios/{username} 2>/dev/null | tail -1 | awk \'{{gsub(/G/, "", $2); gsub(/G/, "", $3); print $3 "/" $2 " GB"}}\''
230
242
  ],
231
243
  "executionTimeout": ["10"],
232
244
  },
233
245
  )
234
-
246
+
235
247
  command_id = response["Command"]["CommandId"]
236
-
248
+
237
249
  # Wait for command to complete (with timeout)
238
250
  for _ in range(5): # 5 second timeout
239
251
  time.sleep(1)
@@ -243,13 +255,13 @@ def get_studio_disk_usage_via_ssm(instance_id: str, username: str) -> Optional[s
243
255
  )
244
256
  if result["Status"] in ["Success", "Failed"]:
245
257
  break
246
-
258
+
247
259
  if result["Status"] == "Success":
248
260
  output = result["StandardOutputContent"].strip()
249
261
  return output if output else None
250
-
262
+
251
263
  return None
252
-
264
+
253
265
  except Exception:
254
266
  return None
255
267
 
@@ -434,7 +446,7 @@ def launch_engine(
434
446
  "cpu",
435
447
  "--type",
436
448
  "-t",
437
- help="Engine type: cpu, cpumax, t4, a10g, a100",
449
+ help="Engine type: cpu, cpumax, t4, a10g, a100, 4_t4, 8_t4, 4_a10g, 8_a10g",
438
450
  ),
439
451
  user: Optional[str] = typer.Option(None, "--user", "-u", help="Override username"),
440
452
  boot_disk_size: Optional[int] = typer.Option(
@@ -455,7 +467,17 @@ def launch_engine(
455
467
  username = user
456
468
 
457
469
  # Validate engine type
458
- valid_types = ["cpu", "cpumax", "t4", "a10g", "a100"]
470
+ valid_types = [
471
+ "cpu",
472
+ "cpumax",
473
+ "t4",
474
+ "a10g",
475
+ "a100",
476
+ "4_t4",
477
+ "8_t4",
478
+ "4_a10g",
479
+ "8_a10g",
480
+ ]
459
481
  if engine_type not in valid_types:
460
482
  console.print(f"[red]❌ Invalid engine type: {engine_type}[/red]")
461
483
  console.print(f"Valid types: {', '.join(valid_types)}")
@@ -489,7 +511,7 @@ def launch_engine(
489
511
  "engine_type": engine_type,
490
512
  }
491
513
  if boot_disk_size is not None:
492
- request_data["boot_disk_size"] = boot_disk_size
514
+ request_data["boot_disk_size"] = str(boot_disk_size)
493
515
  if availability_zone:
494
516
  request_data["availability_zone"] = availability_zone
495
517
 
@@ -555,22 +577,17 @@ def list_engines(
555
577
  table.add_column("Disk Usage")
556
578
  table.add_column("Uptime/Since")
557
579
  table.add_column("$/hour", justify="right")
558
- table.add_column("Cost Today", justify="right", style="yellow")
559
580
 
560
- total_cost = 0.0
561
581
  for engine in engines:
562
582
  launch_time = parse_launch_time(engine["launch_time"])
563
583
  uptime = datetime.now(timezone.utc) - launch_time
564
584
  hourly_cost = HOURLY_COSTS.get(engine["engine_type"], 0)
565
585
 
566
586
  if engine["state"].lower() == "running":
567
- daily_cost = hourly_cost * min(uptime.total_seconds() / 3600, 24)
568
- total_cost += daily_cost
569
587
  time_str = format_duration(uptime)
570
588
  # Get disk usage for running engines
571
589
  disk_usage = get_disk_usage_via_ssm(engine["instance_id"]) or "-"
572
590
  else:
573
- daily_cost = 0
574
591
  time_str = launch_time.strftime("%Y-%m-%d %H:%M")
575
592
  disk_usage = "-"
576
593
 
@@ -583,13 +600,9 @@ def list_engines(
583
600
  disk_usage,
584
601
  time_str,
585
602
  f"${hourly_cost:.2f}",
586
- f"${daily_cost:.2f}" if daily_cost > 0 else "-",
587
603
  )
588
604
 
589
605
  console.print(table)
590
-
591
- if total_cost > 0:
592
- console.print(f"\n[yellow]Total cost today: ${total_cost:.2f}[/yellow]")
593
606
  else:
594
607
  error = response.json().get("error", "Unknown error")
595
608
  console.print(f"[red]❌ Failed to list engines: {error}[/red]")
@@ -617,7 +630,7 @@ def engine_status(
617
630
  if response.status_code != 200:
618
631
  console.print("[red]❌ Failed to fetch engine details[/red]")
619
632
  raise typer.Exit(1)
620
-
633
+
621
634
  engine_details = response.json()
622
635
  engine = engine_details.get("engine", engine) # Use detailed info if available
623
636
  idle_detector = engine_details.get("idle_detector", {})
@@ -650,26 +663,38 @@ def engine_status(
650
663
  res = ssm.send_command(
651
664
  InstanceIds=[engine["instance_id"]],
652
665
  DocumentName="AWS-RunShellScript",
653
- Parameters={"commands": ["cat /opt/dayhoff/state/engine-health.json 2>/dev/null || cat /var/run/engine-health.json 2>/dev/null || true"], "executionTimeout": ["10"]},
666
+ Parameters={
667
+ "commands": [
668
+ "cat /opt/dayhoff/state/engine-health.json 2>/dev/null || cat /var/run/engine-health.json 2>/dev/null || true"
669
+ ],
670
+ "executionTimeout": ["10"],
671
+ },
654
672
  )
655
673
  cid = res["Command"]["CommandId"]
656
674
  time.sleep(1)
657
- inv = ssm.get_command_invocation(CommandId=cid, InstanceId=engine["instance_id"])
675
+ inv = ssm.get_command_invocation(
676
+ CommandId=cid, InstanceId=engine["instance_id"]
677
+ )
658
678
  if inv["Status"] == "Success":
659
679
  import json as _json
680
+
660
681
  health = _json.loads(inv["StandardOutputContent"].strip() or "{}")
661
682
  status_lines.append("")
662
683
  status_lines.append("[bold]Health:[/bold]")
663
- status_lines.append(f" • GPU Drivers: {'OK' if health.get('drivers_ok') else 'MISSING'}")
664
- status_lines.append(f" • Idle Detector: {health.get('idle_detector_timer', 'unknown')}")
684
+ status_lines.append(
685
+ f" • GPU Drivers: {'OK' if health.get('drivers_ok') else 'MISSING'}"
686
+ )
687
+ status_lines.append(
688
+ f" • Idle Detector: {health.get('idle_detector_timer', 'unknown')}"
689
+ )
665
690
  except Exception:
666
691
  pass
667
-
692
+
668
693
  # Idle detector status (from new API endpoint)
669
694
  if idle_detector.get("available"):
670
695
  status_lines.append("")
671
696
  status_lines.append("[bold]Idle Detector:[/bold]")
672
-
697
+
673
698
  # Overall status
674
699
  if idle_detector["status"] == "active":
675
700
  status_lines.append(" [green]✓ Engine ACTIVE[/green]")
@@ -678,33 +703,41 @@ def engine_status(
678
703
  idle_threshold = idle_detector.get("idle_threshold", 1800)
679
704
  idle_minutes = idle_seconds // 60
680
705
  threshold_minutes = idle_threshold // 60
681
- status_lines.append(f" [yellow]⏱ Engine IDLE ({idle_minutes}/{threshold_minutes} minutes)[/yellow]")
682
-
706
+ status_lines.append(
707
+ f" [yellow]⏱ Engine IDLE ({idle_minutes}/{threshold_minutes} minutes)[/yellow]"
708
+ )
709
+
683
710
  # Coffee lock
684
711
  if idle_detector.get("coffee_lock"):
685
- status_lines.append(f" • [cyan]☕ Caffeinated for another {idle_detector['coffee_lock']}[/cyan]")
686
-
712
+ status_lines.append(
713
+ f" • [cyan]☕ Caffeinated for another {idle_detector['coffee_lock']}[/cyan]"
714
+ )
715
+
687
716
  # SSH sessions
688
717
  ssh_sessions = idle_detector.get("ssh_sessions", [])
689
718
  if ssh_sessions:
690
719
  status_lines.append(f" • [blue]SSH Sessions ({len(ssh_sessions)}):[/blue]")
691
720
  for session in ssh_sessions:
692
- status_lines.append(f" - {session['tty']} (pid {session['pid']}, idle {session['idle_time']}) from {session['from_ip']}")
693
-
721
+ status_lines.append(
722
+ f" - {session['tty']} (pid {session['pid']}, idle {session['idle_time']}) from {session['from_ip']}"
723
+ )
724
+
694
725
  # IDE connections
695
726
  ide_conn = idle_detector.get("ide_connections")
696
727
  if ide_conn:
697
- status_lines.append(f" • [magenta]🖥 IDE connected ({ide_conn['connection_count']} connections)[/magenta]")
728
+ status_lines.append(
729
+ f" • [magenta]🖥 IDE connected ({ide_conn['connection_count']} connections)[/magenta]"
730
+ )
698
731
 
699
732
  if attached_studios:
700
733
  status_lines.append("")
701
734
  status_lines.append("[bold]Attached Studios:[/bold]")
702
735
  for studio in attached_studios:
703
- status_lines.append(
704
- f" • {studio['user']} ({studio['studio_id']})"
705
- )
736
+ status_lines.append(f" • {studio['user']} ({studio['studio_id']})")
706
737
 
707
- console.print(Panel("\n".join(status_lines), title="Engine Status", border_style="blue"))
738
+ console.print(
739
+ Panel("\n".join(status_lines), title="Engine Status", border_style="blue")
740
+ )
708
741
 
709
742
  if show_log:
710
743
  console.print("\n[bold]Bootstrap Log:[/bold]")
@@ -713,11 +746,18 @@ def engine_status(
713
746
  resp = ssm.send_command(
714
747
  InstanceIds=[engine["instance_id"]],
715
748
  DocumentName="AWS-RunShellScript",
716
- Parameters={"commands": ["cat /var/log/engine-setup.log 2>/dev/null || echo 'No setup log found'"], "executionTimeout": ["15"]},
749
+ Parameters={
750
+ "commands": [
751
+ "cat /var/log/engine-setup.log 2>/dev/null || echo 'No setup log found'"
752
+ ],
753
+ "executionTimeout": ["15"],
754
+ },
717
755
  )
718
756
  cid = resp["Command"]["CommandId"]
719
757
  time.sleep(2)
720
- inv = ssm.get_command_invocation(CommandId=cid, InstanceId=engine["instance_id"])
758
+ inv = ssm.get_command_invocation(
759
+ CommandId=cid, InstanceId=engine["instance_id"]
760
+ )
721
761
  if inv["Status"] == "Success":
722
762
  log_content = inv["StandardOutputContent"].strip()
723
763
  if log_content:
@@ -856,7 +896,9 @@ def terminate_engine(
856
896
  @engine_app.command("ssh")
857
897
  def ssh_engine(
858
898
  name_or_id: str = typer.Argument(help="Engine name or instance ID"),
859
- admin: bool = typer.Option(False, "--admin", help="Connect as ec2-user instead of the engine owner user"),
899
+ admin: bool = typer.Option(
900
+ False, "--admin", help="Connect as ec2-user instead of the engine owner user"
901
+ ),
860
902
  ):
861
903
  """Connect to an engine via SSH.
862
904
 
@@ -864,7 +906,7 @@ def ssh_engine(
864
906
  Pass `--admin` to connect with the underlying [`ec2-user`] account for break-glass or debugging.
865
907
  """
866
908
  username = check_aws_sso()
867
-
909
+
868
910
  # Check for Session Manager Plugin
869
911
  if not check_session_manager_plugin():
870
912
  raise typer.Exit(1)
@@ -886,7 +928,9 @@ def ssh_engine(
886
928
  ssh_user = "ec2-user" if admin else username
887
929
 
888
930
  # Update SSH config
889
- console.print(f"Updating SSH config for [cyan]{engine['name']}[/cyan] (user: {ssh_user})...")
931
+ console.print(
932
+ f"Updating SSH config for [cyan]{engine['name']}[/cyan] (user: {ssh_user})..."
933
+ )
890
934
  update_ssh_config_entry(engine["name"], engine["instance_id"], ssh_user)
891
935
 
892
936
  # Connect
@@ -900,7 +944,11 @@ def config_ssh(
900
944
  all_engines: bool = typer.Option(
901
945
  False, "--all", "-a", help="Include all engines from all users"
902
946
  ),
903
- admin: bool = typer.Option(False, "--admin", help="Generate entries that use ec2-user instead of per-engine owner user"),
947
+ admin: bool = typer.Option(
948
+ False,
949
+ "--admin",
950
+ help="Generate entries that use ec2-user instead of per-engine owner user",
951
+ ),
904
952
  ):
905
953
  """Update SSH config with available engines."""
906
954
  username = check_aws_sso()
@@ -964,7 +1012,7 @@ def config_ssh(
964
1012
  if not clean:
965
1013
  for engine in running_engines:
966
1014
  # Determine ssh user based on --admin flag
967
- ssh_user = 'ec2-user' if admin else username
1015
+ ssh_user = "ec2-user" if admin else username
968
1016
  new_lines.extend(
969
1017
  [
970
1018
  "",
@@ -998,7 +1046,9 @@ def config_ssh(
998
1046
  def coffee(
999
1047
  name_or_id: str = typer.Argument(help="Engine name or instance ID"),
1000
1048
  duration: str = typer.Argument("4h", help="Duration (e.g., 2h, 30m, 2h30m)"),
1001
- cancel: bool = typer.Option(False, "--cancel", help="Cancel existing coffee lock instead of extending"),
1049
+ cancel: bool = typer.Option(
1050
+ False, "--cancel", help="Cancel existing coffee lock instead of extending"
1051
+ ),
1002
1052
  ):
1003
1053
  """Pour ☕ for an engine: keeps it awake for the given duration (or cancel)."""
1004
1054
  username = check_aws_sso()
@@ -1036,7 +1086,9 @@ def coffee(
1036
1086
  if cancel:
1037
1087
  console.print(f"Cancelling coffee for [cyan]{engine['name']}[/cyan]…")
1038
1088
  else:
1039
- console.print(f"Pouring coffee for [cyan]{engine['name']}[/cyan] for {duration}…")
1089
+ console.print(
1090
+ f"Pouring coffee for [cyan]{engine['name']}[/cyan] for {duration}…"
1091
+ )
1040
1092
 
1041
1093
  # Use SSM to run the engine coffee command
1042
1094
  ssm = boto3.client("ssm", region_name="us-east-1")
@@ -1046,7 +1098,11 @@ def coffee(
1046
1098
  DocumentName="AWS-RunShellScript",
1047
1099
  Parameters={
1048
1100
  "commands": [
1049
- ("/usr/local/bin/engine-coffee --cancel" if cancel else f"/usr/local/bin/engine-coffee {seconds_total}")
1101
+ (
1102
+ "/usr/local/bin/engine-coffee --cancel"
1103
+ if cancel
1104
+ else f"/usr/local/bin/engine-coffee {seconds_total}"
1105
+ )
1050
1106
  ],
1051
1107
  "executionTimeout": ["60"],
1052
1108
  },
@@ -1066,7 +1122,9 @@ def coffee(
1066
1122
 
1067
1123
  if result["Status"] == "Success":
1068
1124
  if cancel:
1069
- console.print("[green]✓ Coffee cancelled – auto-shutdown re-enabled[/green]")
1125
+ console.print(
1126
+ "[green]✓ Coffee cancelled – auto-shutdown re-enabled[/green]"
1127
+ )
1070
1128
  else:
1071
1129
  console.print(f"[green]✓ Coffee poured for {duration}[/green]")
1072
1130
  console.print(
@@ -1089,7 +1147,9 @@ def resize_engine(
1089
1147
  name_or_id: str = typer.Argument(help="Engine name or instance ID"),
1090
1148
  size: int = typer.Option(..., "--size", "-s", help="New size in GB"),
1091
1149
  online: bool = typer.Option(
1092
- False, "--online", help="Resize while running (requires manual filesystem expansion)"
1150
+ False,
1151
+ "--online",
1152
+ help="Resize while running (requires manual filesystem expansion)",
1093
1153
  ),
1094
1154
  force: bool = typer.Option(
1095
1155
  False, "--force", "-f", help="Force resize and detach all studios"
@@ -1109,59 +1169,65 @@ def resize_engine(
1109
1169
 
1110
1170
  # Get current volume info to validate size
1111
1171
  ec2 = boto3.client("ec2", region_name="us-east-1")
1112
-
1172
+
1113
1173
  try:
1114
1174
  # Get instance details to find root volume
1115
1175
  instance_info = ec2.describe_instances(InstanceIds=[engine["instance_id"]])
1116
1176
  instance = instance_info["Reservations"][0]["Instances"][0]
1117
-
1177
+
1118
1178
  # Find root volume
1119
1179
  root_device = instance.get("RootDeviceName", "/dev/xvda")
1120
1180
  root_volume_id = None
1121
-
1181
+
1122
1182
  for bdm in instance.get("BlockDeviceMappings", []):
1123
1183
  if bdm["DeviceName"] == root_device:
1124
1184
  root_volume_id = bdm["Ebs"]["VolumeId"]
1125
1185
  break
1126
-
1186
+
1127
1187
  if not root_volume_id:
1128
1188
  console.print("[red]❌ Could not find root volume[/red]")
1129
1189
  raise typer.Exit(1)
1130
-
1190
+
1131
1191
  # Get current volume size
1132
1192
  volumes = ec2.describe_volumes(VolumeIds=[root_volume_id])
1133
1193
  current_size = volumes["Volumes"][0]["Size"]
1134
-
1194
+
1135
1195
  if size <= current_size:
1136
- console.print(f"[red]❌ New size ({size}GB) must be larger than current size ({current_size}GB)[/red]")
1196
+ console.print(
1197
+ f"[red]❌ New size ({size}GB) must be larger than current size ({current_size}GB)[/red]"
1198
+ )
1137
1199
  raise typer.Exit(1)
1138
-
1139
- console.print(f"[yellow]Resizing engine boot disk from {current_size}GB to {size}GB[/yellow]")
1140
-
1200
+
1201
+ console.print(
1202
+ f"[yellow]Resizing engine boot disk from {current_size}GB to {size}GB[/yellow]"
1203
+ )
1204
+
1141
1205
  # Check if we need to stop the instance
1142
1206
  if not online and engine["state"].lower() == "running":
1143
1207
  console.print("Stopping engine for offline resize...")
1144
1208
  stop_response = make_api_request(
1145
- "POST", f"/engines/{engine['instance_id']}/stop", json_data={"detach_studios": False}
1209
+ "POST",
1210
+ f"/engines/{engine['instance_id']}/stop",
1211
+ json_data={"detach_studios": False},
1146
1212
  )
1147
1213
  if stop_response.status_code != 200:
1148
1214
  console.print("[red]❌ Failed to stop engine[/red]")
1149
1215
  raise typer.Exit(1)
1150
-
1216
+
1151
1217
  # Wait for instance to stop
1152
1218
  console.print("Waiting for engine to stop...")
1153
1219
  waiter = ec2.get_waiter("instance_stopped")
1154
1220
  waiter.wait(InstanceIds=[engine["instance_id"]])
1155
1221
  console.print("[green]✓ Engine stopped[/green]")
1156
-
1222
+
1157
1223
  # Call the resize API
1158
1224
  console.print("Resizing volume...")
1159
1225
  resize_response = make_api_request(
1160
- "POST",
1226
+ "POST",
1161
1227
  f"/engines/{engine['instance_id']}/resize",
1162
- json_data={"size": size, "detach_studios": force}
1228
+ json_data={"size": size, "detach_studios": force},
1163
1229
  )
1164
-
1230
+
1165
1231
  if resize_response.status_code == 409 and not force:
1166
1232
  # Engine has attached studios
1167
1233
  data = resize_response.json()
@@ -1175,71 +1241,85 @@ def resize_engine(
1175
1241
  resize_response = make_api_request(
1176
1242
  "POST",
1177
1243
  f"/engines/{engine['instance_id']}/resize",
1178
- json_data={"size": size, "detach_studios": True}
1244
+ json_data={"size": size, "detach_studios": True},
1179
1245
  )
1180
1246
  else:
1181
1247
  console.print("Resize cancelled.")
1182
1248
  return
1183
-
1249
+
1184
1250
  if resize_response.status_code != 200:
1185
1251
  error = resize_response.json().get("error", "Unknown error")
1186
1252
  console.print(f"[red]❌ Failed to resize engine: {error}[/red]")
1187
1253
  raise typer.Exit(1)
1188
-
1254
+
1189
1255
  # Check if studios were detached
1190
1256
  data = resize_response.json()
1191
1257
  detached_studios = data.get("detached_studios", 0)
1192
1258
  if detached_studios > 0:
1193
- console.print(f"[green]✓ Detached {detached_studios} studio(s) before resize[/green]")
1194
-
1259
+ console.print(
1260
+ f"[green]✓ Detached {detached_studios} studio(s) before resize[/green]"
1261
+ )
1262
+
1195
1263
  # Wait for modification to complete
1196
1264
  console.print("Waiting for volume modification to complete...")
1197
1265
  while True:
1198
1266
  mod_state = ec2.describe_volumes_modifications(VolumeIds=[root_volume_id])
1199
1267
  if not mod_state["VolumesModifications"]:
1200
1268
  break # Modification complete
1201
-
1269
+
1202
1270
  modification = mod_state["VolumesModifications"][0]
1203
1271
  state = modification["ModificationState"]
1204
1272
  progress = modification.get("Progress", 0)
1205
-
1273
+
1206
1274
  # Show progress updates only for the resize phase
1207
1275
  if state == "modifying":
1208
1276
  console.print(f"[yellow]Progress: {progress}%[/yellow]")
1209
-
1277
+
1210
1278
  # Exit as soon as optimization starts (resize is complete)
1211
1279
  if state == "optimizing":
1212
1280
  console.print("[green]✓ Volume resized successfully[/green]")
1213
- console.print("[dim]AWS is optimizing the volume in the background (no action needed).[/dim]")
1281
+ console.print(
1282
+ "[dim]AWS is optimizing the volume in the background (no action needed).[/dim]"
1283
+ )
1214
1284
  break
1215
-
1285
+
1216
1286
  if state == "completed":
1217
1287
  console.print("[green]✓ Volume resized successfully[/green]")
1218
1288
  break
1219
1289
  elif state == "failed":
1220
1290
  console.print("[red]❌ Volume modification failed[/red]")
1221
1291
  raise typer.Exit(1)
1222
-
1292
+
1223
1293
  time.sleep(2) # Check more frequently for better UX
1224
-
1294
+
1225
1295
  # If offline resize, start the instance back up
1226
1296
  if not online and engine["state"].lower() == "running":
1227
1297
  console.print("Starting engine back up...")
1228
- start_response = make_api_request("POST", f"/engines/{engine['instance_id']}/start")
1298
+ start_response = make_api_request(
1299
+ "POST", f"/engines/{engine['instance_id']}/start"
1300
+ )
1229
1301
  if start_response.status_code != 200:
1230
- console.print("[yellow]⚠️ Failed to restart engine automatically[/yellow]")
1231
- console.print(f"Please start it manually: [cyan]dh engine start {engine['name']}[/cyan]")
1302
+ console.print(
1303
+ "[yellow]⚠️ Failed to restart engine automatically[/yellow]"
1304
+ )
1305
+ console.print(
1306
+ f"Please start it manually: [cyan]dh engine start {engine['name']}[/cyan]"
1307
+ )
1232
1308
  else:
1233
1309
  console.print("[green]✓ Engine started[/green]")
1234
1310
  console.print("The filesystem will be automatically expanded on boot.")
1235
-
1311
+
1236
1312
  elif online and engine["state"].lower() == "running":
1237
- console.print("\n[yellow]⚠️ Online resize complete. You must now expand the filesystem:[/yellow]")
1313
+ console.print(
1314
+ "\n[yellow]⚠️ Online resize complete. You must now expand the filesystem:[/yellow]"
1315
+ )
1238
1316
  console.print(f"1. SSH into the engine: [cyan]ssh {engine['name']}[/cyan]")
1239
1317
  console.print("2. Find the root device: [cyan]lsblk[/cyan]")
1240
- console.print("3. Expand the partition: [cyan]sudo growpart /dev/nvme0n1 1[/cyan] (adjust device name as needed)")
1318
+ console.print(
1319
+ "3. Expand the partition: [cyan]sudo growpart /dev/nvme0n1 1[/cyan] (adjust device name as needed)"
1320
+ )
1241
1321
  console.print("4. Expand the filesystem: [cyan]sudo xfs_growfs /[/cyan]")
1242
-
1322
+
1243
1323
  except ClientError as e:
1244
1324
  console.print(f"[red]❌ Failed to resize engine: {e}[/red]")
1245
1325
  raise typer.Exit(1)
@@ -1355,30 +1435,44 @@ def create_ami(
1355
1435
  # If any user studios are still attached we must detach them before the instance reboots
1356
1436
  # for snapshot consistency; otherwise Studio-Manager metadata becomes stale.
1357
1437
 
1358
- attached_resp = make_api_request("GET", f"/engines/{engine['instance_id']}/studios")
1359
- attached_studios = attached_resp.json().get("studios", []) if attached_resp.status_code == 200 else []
1438
+ attached_resp = make_api_request(
1439
+ "GET", f"/engines/{engine['instance_id']}/studios"
1440
+ )
1441
+ attached_studios = (
1442
+ attached_resp.json().get("studios", [])
1443
+ if attached_resp.status_code == 200
1444
+ else []
1445
+ )
1360
1446
 
1361
1447
  if attached_studios:
1362
- console.print(f"Detaching {len(attached_studios)} studio(s) from this engine…")
1448
+ console.print(
1449
+ f"Detaching {len(attached_studios)} studio(s) from this engine…"
1450
+ )
1363
1451
  for s in attached_studios:
1364
1452
  console.print(f" • {s['user']} ({s['studio_id']})")
1365
1453
 
1366
1454
  for s in attached_studios:
1367
1455
  resp = make_api_request("POST", f"/studios/{s['studio_id']}/detach")
1368
1456
  if resp.status_code != 200:
1369
- console.print(f"[red]❌ Failed to detach {s['studio_id']} – aborting.[/red]")
1457
+ console.print(
1458
+ f"[red]❌ Failed to detach {s['studio_id']} – aborting.[/red]"
1459
+ )
1370
1460
  return
1371
1461
 
1372
1462
  # Wait briefly for volumes to become available (max 2 min)
1373
1463
  # (time is already imported at module level)
1374
1464
  ec2_wait = boto3.client("ec2", region_name="us-east-1")
1375
- vol_ids = [s['studio_id'] for s in attached_studios]
1465
+ vol_ids = [s["studio_id"] for s in attached_studios]
1376
1466
  console.print("Waiting for volumes to detach…")
1377
1467
  waiter = ec2_wait.get_waiter("volume_available")
1378
1468
  try:
1379
- waiter.wait(VolumeIds=vol_ids, WaiterConfig={"Delay": 5, "MaxAttempts": 24})
1469
+ waiter.wait(
1470
+ VolumeIds=vol_ids, WaiterConfig={"Delay": 5, "MaxAttempts": 24}
1471
+ )
1380
1472
  except Exception:
1381
- console.print("[yellow]Proceeding even though some volumes may still be detaching.[/yellow]")
1473
+ console.print(
1474
+ "[yellow]Proceeding even though some volumes may still be detaching.[/yellow]"
1475
+ )
1382
1476
 
1383
1477
  # Create the AMI
1384
1478
  with Progress(
@@ -1386,7 +1480,9 @@ def create_ami(
1386
1480
  TextColumn("[progress.description]{task.description}"),
1387
1481
  transient=True,
1388
1482
  ) as progress:
1389
- progress.add_task("Creating AMI (this will take several minutes)...", total=None)
1483
+ progress.add_task(
1484
+ "Creating AMI (this will take several minutes)...", total=None
1485
+ )
1390
1486
 
1391
1487
  create_params = {
1392
1488
  "InstanceId": engine["instance_id"],
@@ -1519,18 +1615,22 @@ def create_studio(
1519
1615
 
1520
1616
  @studio_app.command("status")
1521
1617
  def studio_status(
1522
- user: Optional[str] = typer.Option(None, "--user", "-u", help="Check status for a different user (admin only)"),
1618
+ user: Optional[str] = typer.Option(
1619
+ None, "--user", "-u", help="Check status for a different user (admin only)"
1620
+ ),
1523
1621
  ):
1524
1622
  """Show status of your studio."""
1525
1623
  username = check_aws_sso()
1526
-
1624
+
1527
1625
  # Use specified user if provided, otherwise use current user
1528
1626
  target_user = user if user else username
1529
-
1627
+
1530
1628
  # Add warning when checking another user's studio
1531
1629
  if target_user != username:
1532
- console.print(f"[yellow]⚠️ Checking studio status for user: {target_user}[/yellow]")
1533
-
1630
+ console.print(
1631
+ f"[yellow]⚠️ Checking studio status for user: {target_user}[/yellow]"
1632
+ )
1633
+
1534
1634
  studio = get_user_studio(target_user)
1535
1635
  if not studio:
1536
1636
  if target_user == username:
@@ -1585,18 +1685,20 @@ def studio_status(
1585
1685
  @studio_app.command("attach")
1586
1686
  def attach_studio(
1587
1687
  engine_name_or_id: str = typer.Argument(help="Engine name or instance ID"),
1588
- user: Optional[str] = typer.Option(None, "--user", "-u", help="Attach a different user's studio (admin only)"),
1688
+ user: Optional[str] = typer.Option(
1689
+ None, "--user", "-u", help="Attach a different user's studio (admin only)"
1690
+ ),
1589
1691
  ):
1590
1692
  """Attach your studio to an engine."""
1591
1693
  username = check_aws_sso()
1592
-
1694
+
1593
1695
  # Check for Session Manager Plugin since we'll update SSH config
1594
1696
  if not check_session_manager_plugin():
1595
1697
  raise typer.Exit(1)
1596
-
1698
+
1597
1699
  # Use specified user if provided, otherwise use current user
1598
1700
  target_user = user if user else username
1599
-
1701
+
1600
1702
  # Add confirmation when attaching another user's studio
1601
1703
  if target_user != username:
1602
1704
  console.print(f"[yellow]⚠️ Managing studio for user: {target_user}[/yellow]")
@@ -1682,7 +1784,7 @@ def attach_studio(
1682
1784
 
1683
1785
  # Determine retry strategy
1684
1786
  max_attempts = 40 if engine_started_now else 3
1685
- retry_delay = 10 if engine_started_now else 3
1787
+ retry_delay = 10 if engine_started_now else 3
1686
1788
 
1687
1789
  if engine_started_now:
1688
1790
  # Long spinner-based loop while the freshly started engine finishes booting
@@ -1692,17 +1794,24 @@ def attach_studio(
1692
1794
  TextColumn("[progress.description]{task.description}"),
1693
1795
  transient=True,
1694
1796
  ) as prog:
1695
- task = prog.add_task("Attaching studio (engine is still booting)…", total=None)
1797
+ task = prog.add_task(
1798
+ "Attaching studio (engine is still booting)…", total=None
1799
+ )
1696
1800
 
1697
1801
  for attempt in range(max_attempts):
1698
- success, error_msg = _attempt_studio_attach(studio, engine, target_user, public_key)
1802
+ success, error_msg = _attempt_studio_attach(
1803
+ studio, engine, target_user, public_key
1804
+ )
1699
1805
 
1700
1806
  if success:
1701
1807
  break # success!
1702
1808
 
1703
1809
  # Update spinner every 3rd try to avoid log spam
1704
1810
  if attempt % 3 == 0:
1705
- prog.update(task, description=f"Attaching studio (engine is still booting)… {attempt+1}/{max_attempts}")
1811
+ prog.update(
1812
+ task,
1813
+ description=f"Attaching studio (engine is still booting)… {attempt+1}/{max_attempts}",
1814
+ )
1706
1815
 
1707
1816
  if error_msg:
1708
1817
  console.print(f"[red]❌ Failed to attach studio: {error_msg}[/red]")
@@ -1711,15 +1820,19 @@ def attach_studio(
1711
1820
  time.sleep(retry_delay)
1712
1821
 
1713
1822
  else:
1714
- console.print("[yellow]Engine is still starting up – please retry in a minute.[/yellow]")
1823
+ console.print(
1824
+ "[yellow]Engine is still starting up – please retry in a minute.[/yellow]"
1825
+ )
1715
1826
  return
1716
1827
  else:
1717
1828
  # Give the (already-running) engine a little breathing room – e.g. it may still be mounting EFS
1718
1829
  max_attempts = 10 # ~1 min total
1719
- retry_delay = 6
1830
+ retry_delay = 6
1720
1831
 
1721
1832
  for attempt in range(max_attempts):
1722
- success, error_msg = _attempt_studio_attach(studio, engine, target_user, public_key)
1833
+ success, error_msg = _attempt_studio_attach(
1834
+ studio, engine, target_user, public_key
1835
+ )
1723
1836
 
1724
1837
  if success:
1725
1838
  break # attached!
@@ -1735,7 +1848,9 @@ def attach_studio(
1735
1848
  time.sleep(retry_delay)
1736
1849
 
1737
1850
  else:
1738
- console.print("[yellow]Engine is busy or still initialising – please retry in about a minute.[/yellow]")
1851
+ console.print(
1852
+ "[yellow]Engine is busy or still initialising – please retry in about a minute.[/yellow]"
1853
+ )
1739
1854
  return
1740
1855
 
1741
1856
  # Successful attach path
@@ -1799,14 +1914,16 @@ def _attempt_studio_attach(studio, engine, target_user, public_key):
1799
1914
 
1800
1915
  @studio_app.command("detach")
1801
1916
  def detach_studio(
1802
- user: Optional[str] = typer.Option(None, "--user", "-u", help="Detach a different user's studio (admin only)"),
1917
+ user: Optional[str] = typer.Option(
1918
+ None, "--user", "-u", help="Detach a different user's studio (admin only)"
1919
+ ),
1803
1920
  ):
1804
1921
  """Detach your studio from its current engine."""
1805
1922
  username = check_aws_sso()
1806
-
1923
+
1807
1924
  # Use specified user if provided, otherwise use current user
1808
1925
  target_user = user if user else username
1809
-
1926
+
1810
1927
  # Add confirmation when detaching another user's studio
1811
1928
  if target_user != username:
1812
1929
  console.print(f"[yellow]⚠️ Managing studio for user: {target_user}[/yellow]")
@@ -1826,7 +1943,9 @@ def detach_studio(
1826
1943
  if target_user == username:
1827
1944
  console.print("[yellow]Your studio is not attached to any engine.[/yellow]")
1828
1945
  else:
1829
- console.print(f"[yellow]{target_user}'s studio is not attached to any engine.[/yellow]")
1946
+ console.print(
1947
+ f"[yellow]{target_user}'s studio is not attached to any engine.[/yellow]"
1948
+ )
1830
1949
  return
1831
1950
 
1832
1951
  console.print(f"Detaching studio from {studio.get('attached_vm_id')}...")
@@ -1842,24 +1961,30 @@ def detach_studio(
1842
1961
 
1843
1962
  @studio_app.command("delete")
1844
1963
  def delete_studio(
1845
- user: Optional[str] = typer.Option(None, "--user", "-u", help="Delete a different user's studio (admin only)"),
1964
+ user: Optional[str] = typer.Option(
1965
+ None, "--user", "-u", help="Delete a different user's studio (admin only)"
1966
+ ),
1846
1967
  ):
1847
1968
  """Delete your studio permanently."""
1848
1969
  username = check_aws_sso()
1849
-
1970
+
1850
1971
  # Use specified user if provided, otherwise use current user
1851
1972
  target_user = user if user else username
1852
-
1973
+
1853
1974
  # Extra warning when deleting another user's studio
1854
1975
  if target_user != username:
1855
- console.print(f"[red]⚠️ ADMIN ACTION: Deleting studio for user: {target_user}[/red]")
1976
+ console.print(
1977
+ f"[red]⚠️ ADMIN ACTION: Deleting studio for user: {target_user}[/red]"
1978
+ )
1856
1979
 
1857
1980
  studio = get_user_studio(target_user)
1858
1981
  if not studio:
1859
1982
  if target_user == username:
1860
1983
  console.print("[yellow]You don't have a studio to delete.[/yellow]")
1861
1984
  else:
1862
- console.print(f"[yellow]User {target_user} doesn't have a studio to delete.[/yellow]")
1985
+ console.print(
1986
+ f"[yellow]User {target_user} doesn't have a studio to delete.[/yellow]"
1987
+ )
1863
1988
  return
1864
1989
 
1865
1990
  console.print(
@@ -1870,7 +1995,11 @@ def delete_studio(
1870
1995
  console.print(f"Size: {studio['size_gb']}GB")
1871
1996
 
1872
1997
  # Multiple confirmations
1873
- if not Confirm.ask(f"\nAre you sure you want to delete {target_user}'s studio?" if target_user != username else "\nAre you sure you want to delete your studio?"):
1998
+ if not Confirm.ask(
1999
+ f"\nAre you sure you want to delete {target_user}'s studio?"
2000
+ if target_user != username
2001
+ else "\nAre you sure you want to delete your studio?"
2002
+ ):
1874
2003
  console.print("Deletion cancelled.")
1875
2004
  return
1876
2005
 
@@ -1942,7 +2071,7 @@ def list_studios(
1942
2071
  vm_id = studio["attached_vm_id"]
1943
2072
  engine_name = engines.get(vm_id, "unknown")
1944
2073
  attached_to = f"{engine_name} ({vm_id})"
1945
-
2074
+
1946
2075
  # Try to get disk usage if attached
1947
2076
  if studio["status"] == "in-use":
1948
2077
  usage = get_studio_disk_usage_via_ssm(vm_id, studio["user"])
@@ -1966,14 +2095,16 @@ def list_studios(
1966
2095
 
1967
2096
  @studio_app.command("reset")
1968
2097
  def reset_studio(
1969
- user: Optional[str] = typer.Option(None, "--user", "-u", help="Reset a different user's studio"),
2098
+ user: Optional[str] = typer.Option(
2099
+ None, "--user", "-u", help="Reset a different user's studio"
2100
+ ),
1970
2101
  ):
1971
2102
  """Reset a stuck studio (admin operation)."""
1972
2103
  username = check_aws_sso()
1973
-
2104
+
1974
2105
  # Use specified user if provided, otherwise use current user
1975
2106
  target_user = user if user else username
1976
-
2107
+
1977
2108
  # Add warning when resetting another user's studio
1978
2109
  if target_user != username:
1979
2110
  console.print(f"[yellow]⚠️ Resetting studio for user: {target_user}[/yellow]")
@@ -2044,14 +2175,16 @@ def reset_studio(
2044
2175
  @studio_app.command("resize")
2045
2176
  def resize_studio(
2046
2177
  size: int = typer.Option(..., "--size", "-s", help="New size in GB"),
2047
- user: Optional[str] = typer.Option(None, "--user", "-u", help="Resize a different user's studio (admin only)"),
2178
+ user: Optional[str] = typer.Option(
2179
+ None, "--user", "-u", help="Resize a different user's studio (admin only)"
2180
+ ),
2048
2181
  ):
2049
2182
  """Resize your studio volume (requires detachment)."""
2050
2183
  username = check_aws_sso()
2051
-
2184
+
2052
2185
  # Use specified user if provided, otherwise use current user
2053
2186
  target_user = user if user else username
2054
-
2187
+
2055
2188
  # Add warning when resizing another user's studio
2056
2189
  if target_user != username:
2057
2190
  console.print(f"[yellow]⚠️ Resizing studio for user: {target_user}[/yellow]")
@@ -2065,29 +2198,31 @@ def resize_studio(
2065
2198
  return
2066
2199
 
2067
2200
  current_size = studio["size_gb"]
2068
-
2201
+
2069
2202
  if size <= current_size:
2070
- console.print(f"[red]❌ New size ({size}GB) must be larger than current size ({current_size}GB)[/red]")
2203
+ console.print(
2204
+ f"[red]❌ New size ({size}GB) must be larger than current size ({current_size}GB)[/red]"
2205
+ )
2071
2206
  raise typer.Exit(1)
2072
2207
 
2073
2208
  # Check if studio is attached
2074
2209
  if studio["status"] == "in-use":
2075
2210
  console.print("[yellow]⚠️ Studio must be detached before resizing[/yellow]")
2076
2211
  console.print(f"Currently attached to: {studio.get('attached_vm_id')}")
2077
-
2212
+
2078
2213
  if not Confirm.ask("\nDetach studio and proceed with resize?"):
2079
2214
  console.print("Resize cancelled.")
2080
2215
  return
2081
-
2216
+
2082
2217
  # Detach the studio
2083
2218
  console.print("Detaching studio...")
2084
2219
  response = make_api_request("POST", f"/studios/{studio['studio_id']}/detach")
2085
2220
  if response.status_code != 200:
2086
2221
  console.print("[red]❌ Failed to detach studio[/red]")
2087
2222
  raise typer.Exit(1)
2088
-
2223
+
2089
2224
  console.print("[green]✓ Studio detached[/green]")
2090
-
2225
+
2091
2226
  # Wait a moment for detachment to complete
2092
2227
  time.sleep(5)
2093
2228
 
@@ -2095,68 +2230,79 @@ def resize_studio(
2095
2230
 
2096
2231
  # Call the resize API
2097
2232
  resize_response = make_api_request(
2098
- "POST",
2099
- f"/studios/{studio['studio_id']}/resize",
2100
- json_data={"size": size}
2233
+ "POST", f"/studios/{studio['studio_id']}/resize", json_data={"size": size}
2101
2234
  )
2102
-
2235
+
2103
2236
  if resize_response.status_code != 200:
2104
2237
  error = resize_response.json().get("error", "Unknown error")
2105
2238
  console.print(f"[red]❌ Failed to resize studio: {error}[/red]")
2106
2239
  raise typer.Exit(1)
2107
-
2240
+
2108
2241
  # Wait for volume modification to complete
2109
2242
  ec2 = boto3.client("ec2", region_name="us-east-1")
2110
2243
  console.print("Resizing volume...")
2111
-
2244
+
2112
2245
  # Track progress
2113
2246
  last_progress = 0
2114
-
2247
+
2115
2248
  while True:
2116
2249
  try:
2117
- mod_state = ec2.describe_volumes_modifications(VolumeIds=[studio["studio_id"]])
2250
+ mod_state = ec2.describe_volumes_modifications(
2251
+ VolumeIds=[studio["studio_id"]]
2252
+ )
2118
2253
  if not mod_state["VolumesModifications"]:
2119
2254
  break # Modification complete
2120
-
2255
+
2121
2256
  modification = mod_state["VolumesModifications"][0]
2122
2257
  state = modification["ModificationState"]
2123
2258
  progress = modification.get("Progress", 0)
2124
-
2259
+
2125
2260
  # Show progress updates only for the resize phase
2126
2261
  if state == "modifying" and progress > last_progress:
2127
2262
  console.print(f"[yellow]Progress: {progress}%[/yellow]")
2128
2263
  last_progress = progress
2129
-
2264
+
2130
2265
  # Exit as soon as optimization starts (resize is complete)
2131
2266
  if state == "optimizing":
2132
- console.print(f"[green]✓ Studio resized successfully to {size}GB![/green]")
2133
- console.print("[dim]AWS is optimizing the volume in the background (no action needed).[/dim]")
2267
+ console.print(
2268
+ f"[green] Studio resized successfully to {size}GB![/green]"
2269
+ )
2270
+ console.print(
2271
+ "[dim]AWS is optimizing the volume in the background (no action needed).[/dim]"
2272
+ )
2134
2273
  break
2135
-
2274
+
2136
2275
  if state == "completed":
2137
- console.print(f"[green]✓ Studio resized successfully to {size}GB![/green]")
2276
+ console.print(
2277
+ f"[green]✓ Studio resized successfully to {size}GB![/green]"
2278
+ )
2138
2279
  break
2139
2280
  elif state == "failed":
2140
2281
  console.print("[red]❌ Volume modification failed[/red]")
2141
2282
  raise typer.Exit(1)
2142
-
2283
+
2143
2284
  time.sleep(2) # Check more frequently for better UX
2144
-
2285
+
2145
2286
  except ClientError:
2146
2287
  # Modification might be complete
2147
2288
  console.print(f"[green]✓ Studio resized successfully to {size}GB![/green]")
2148
2289
  break
2149
-
2150
- console.print("\n[dim]The filesystem will be automatically expanded when you next attach the studio.[/dim]")
2290
+
2291
+ console.print(
2292
+ "\n[dim]The filesystem will be automatically expanded when you next attach the studio.[/dim]"
2293
+ )
2151
2294
  console.print(f"To attach: [cyan]dh studio attach <engine-name>[/cyan]")
2152
2295
 
2296
+
2153
2297
  # ================= Idle timeout command =================
2154
2298
 
2155
2299
 
2156
2300
  @engine_app.command("idle")
2157
2301
  def idle_timeout_cmd(
2158
2302
  name_or_id: str = typer.Argument(help="Engine name or instance ID"),
2159
- set: Optional[str] = typer.Option(None, "--set", "-s", help="New timeout (e.g., 2h30m, 45m)")
2303
+ set: Optional[str] = typer.Option(
2304
+ None, "--set", "-s", help="New timeout (e.g., 2h30m, 45m)"
2305
+ ),
2160
2306
  ):
2161
2307
  """Show or set the engine idle-detector timeout."""
2162
2308
  check_aws_sso()
@@ -2177,11 +2323,18 @@ def idle_timeout_cmd(
2177
2323
  resp = ssm.send_command(
2178
2324
  InstanceIds=[engine["instance_id"]],
2179
2325
  DocumentName="AWS-RunShellScript",
2180
- Parameters={"commands": ["grep -E '^IDLE_TIMEOUT_SECONDS=' /etc/engine.env || echo 'IDLE_TIMEOUT_SECONDS=1800'"], "executionTimeout": ["10"]},
2326
+ Parameters={
2327
+ "commands": [
2328
+ "grep -E '^IDLE_TIMEOUT_SECONDS=' /etc/engine.env || echo 'IDLE_TIMEOUT_SECONDS=1800'"
2329
+ ],
2330
+ "executionTimeout": ["10"],
2331
+ },
2181
2332
  )
2182
2333
  cid = resp["Command"]["CommandId"]
2183
2334
  time.sleep(1)
2184
- inv = ssm.get_command_invocation(CommandId=cid, InstanceId=engine["instance_id"])
2335
+ inv = ssm.get_command_invocation(
2336
+ CommandId=cid, InstanceId=engine["instance_id"]
2337
+ )
2185
2338
  if inv["Status"] == "Success":
2186
2339
  line = inv["StandardOutputContent"].strip()
2187
2340
  secs = int(line.split("=")[1]) if "=" in line else 1800
@@ -2219,8 +2372,10 @@ def idle_timeout_cmd(
2219
2372
  time.sleep(2)
2220
2373
  console.print(f"[green]✓ Idle timeout updated to {set}[/green]")
2221
2374
 
2375
+
2222
2376
  # Add this near the end, after the idle-timeout command
2223
2377
 
2378
+
2224
2379
  @engine_app.command("debug")
2225
2380
  def debug_engine(
2226
2381
  name_or_id: str = typer.Argument(help="Engine name or instance ID"),
@@ -2240,17 +2395,32 @@ def debug_engine(
2240
2395
  console.print(f"[bold]Debug info for {engine['name']}:[/bold]\n")
2241
2396
 
2242
2397
  ssm = boto3.client("ssm", region_name="us-east-1")
2243
-
2398
+
2244
2399
  # Check multiple files and systemd status
2245
2400
  checks = [
2246
- ("Stage file", "cat /opt/dayhoff/state/engine-init.stage 2>/dev/null || cat /var/run/engine-init.stage 2>/dev/null || echo 'MISSING'"),
2247
- ("Health file", "cat /opt/dayhoff/state/engine-health.json 2>/dev/null || cat /var/run/engine-health.json 2>/dev/null || echo 'MISSING'"),
2248
- ("Sentinel file", "ls -la /opt/dayhoff/first_boot_complete.sentinel 2>/dev/null || echo 'MISSING'"),
2249
- ("Setup service", "systemctl status setup-aws-vm.service --no-pager || echo 'Service not found'"),
2250
- ("Bootstrap log tail", "tail -20 /var/log/engine-setup.log 2>/dev/null || echo 'No log'"),
2401
+ (
2402
+ "Stage file",
2403
+ "cat /opt/dayhoff/state/engine-init.stage 2>/dev/null || cat /var/run/engine-init.stage 2>/dev/null || echo 'MISSING'",
2404
+ ),
2405
+ (
2406
+ "Health file",
2407
+ "cat /opt/dayhoff/state/engine-health.json 2>/dev/null || cat /var/run/engine-health.json 2>/dev/null || echo 'MISSING'",
2408
+ ),
2409
+ (
2410
+ "Sentinel file",
2411
+ "ls -la /opt/dayhoff/first_boot_complete.sentinel 2>/dev/null || echo 'MISSING'",
2412
+ ),
2413
+ (
2414
+ "Setup service",
2415
+ "systemctl status setup-aws-vm.service --no-pager || echo 'Service not found'",
2416
+ ),
2417
+ (
2418
+ "Bootstrap log tail",
2419
+ "tail -20 /var/log/engine-setup.log 2>/dev/null || echo 'No log'",
2420
+ ),
2251
2421
  ("Environment file", "cat /etc/engine.env 2>/dev/null || echo 'MISSING'"),
2252
2422
  ]
2253
-
2423
+
2254
2424
  for name, cmd in checks:
2255
2425
  try:
2256
2426
  resp = ssm.send_command(
@@ -2260,14 +2430,16 @@ def debug_engine(
2260
2430
  )
2261
2431
  cid = resp["Command"]["CommandId"]
2262
2432
  time.sleep(1)
2263
- inv = ssm.get_command_invocation(CommandId=cid, InstanceId=engine["instance_id"])
2264
-
2433
+ inv = ssm.get_command_invocation(
2434
+ CommandId=cid, InstanceId=engine["instance_id"]
2435
+ )
2436
+
2265
2437
  if inv["Status"] == "Success":
2266
2438
  output = inv["StandardOutputContent"].strip()
2267
2439
  console.print(f"[cyan]{name}:[/cyan]")
2268
2440
  console.print(f"[dim]{output}[/dim]\n")
2269
2441
  else:
2270
2442
  console.print(f"[cyan]{name}:[/cyan] [red]FAILED[/red]\n")
2271
-
2443
+
2272
2444
  except Exception as e:
2273
2445
  console.print(f"[cyan]{name}:[/cyan] [red]ERROR: {e}[/red]\n")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: dayhoff-tools
3
- Version: 1.5.2
3
+ Version: 1.5.3
4
4
  Summary: Common tools for all the repos at Dayhoff Labs
5
5
  Author: Daniel Martin-Alarcon
6
6
  Author-email: dma@dayhofflabs.com
@@ -3,7 +3,7 @@ dayhoff_tools/chemistry/standardizer.py,sha256=uMn7VwHnx02nc404eO6fRuS4rsl4dvSPf
3
3
  dayhoff_tools/chemistry/utils.py,sha256=jt-7JgF-GeeVC421acX-bobKbLU_X94KNOW24p_P-_M,2257
4
4
  dayhoff_tools/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  dayhoff_tools/cli/cloud_commands.py,sha256=33qcWLmq-FwEXMdL3F0OHm-5Stlh2r65CldyEZgQ1no,40904
6
- dayhoff_tools/cli/engine_commands.py,sha256=FvQTWb43x2Ns5rOW--ryfWkzzqvSzcpExDKyERY53Zk,87320
6
+ dayhoff_tools/cli/engine_commands.py,sha256=8iMAKny8tWEcNtV6l90F8jlW6jx7FPAruJrsuyHFa58,88985
7
7
  dayhoff_tools/cli/main.py,sha256=tRN7WCBHg6uyNp6rA54pKTCoVmBntta2i0Yas3bUpZ4,4853
8
8
  dayhoff_tools/cli/swarm_commands.py,sha256=5EyKj8yietvT5lfoz8Zx0iQvVaNgc3SJX1z2zQR6o6M,5614
9
9
  dayhoff_tools/cli/utility_commands.py,sha256=FRZTPrjsG_qmIIqoNxd1Q1vVkS_5w8aY33IrVYVNCLg,18131
@@ -27,7 +27,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
27
27
  dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
28
28
  dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
29
29
  dayhoff_tools/warehouse.py,sha256=fV3goH2cH1Y0oLpGERnu4p70P2JfByJHjBh_oMRv9C0,23134
30
- dayhoff_tools-1.5.2.dist-info/METADATA,sha256=sb8H_nJZYquFb8E_uvrj2xT3stsRS4x_uHdxQk7z15A,2824
31
- dayhoff_tools-1.5.2.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
32
- dayhoff_tools-1.5.2.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
33
- dayhoff_tools-1.5.2.dist-info/RECORD,,
30
+ dayhoff_tools-1.5.3.dist-info/METADATA,sha256=lrN8SkfIylBQrb-w5mEBzOZek3-fV-izpylkRkI-pdU,2824
31
+ dayhoff_tools-1.5.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
32
+ dayhoff_tools-1.5.3.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
33
+ dayhoff_tools-1.5.3.dist-info/RECORD,,