dayhoff-tools 1.10.0__py3-none-any.whl → 1.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. dayhoff_tools/cli/engine/__init__.py +1 -323
  2. dayhoff_tools/cli/engine/coffee.py +110 -0
  3. dayhoff_tools/cli/engine/config_ssh.py +113 -0
  4. dayhoff_tools/cli/engine/debug.py +79 -0
  5. dayhoff_tools/cli/engine/gami.py +160 -0
  6. dayhoff_tools/cli/engine/idle.py +148 -0
  7. dayhoff_tools/cli/engine/launch.py +101 -0
  8. dayhoff_tools/cli/engine/list.py +116 -0
  9. dayhoff_tools/cli/engine/repair.py +128 -0
  10. dayhoff_tools/cli/engine/resize.py +195 -0
  11. dayhoff_tools/cli/engine/ssh.py +62 -0
  12. dayhoff_tools/cli/engine/{engine_core.py → status.py} +6 -201
  13. dayhoff_tools/cli/engine_studio_commands.py +323 -0
  14. dayhoff_tools/cli/engine_studio_utils/__init__.py +1 -0
  15. dayhoff_tools/cli/engine_studio_utils/api_utils.py +47 -0
  16. dayhoff_tools/cli/engine_studio_utils/aws_utils.py +102 -0
  17. dayhoff_tools/cli/engine_studio_utils/constants.py +21 -0
  18. dayhoff_tools/cli/engine_studio_utils/formatting.py +210 -0
  19. dayhoff_tools/cli/engine_studio_utils/ssh_utils.py +141 -0
  20. dayhoff_tools/cli/main.py +1 -2
  21. dayhoff_tools/cli/studio/__init__.py +1 -0
  22. dayhoff_tools/cli/studio/attach.py +314 -0
  23. dayhoff_tools/cli/studio/create.py +48 -0
  24. dayhoff_tools/cli/studio/delete.py +71 -0
  25. dayhoff_tools/cli/studio/detach.py +56 -0
  26. dayhoff_tools/cli/studio/list.py +81 -0
  27. dayhoff_tools/cli/studio/reset.py +90 -0
  28. dayhoff_tools/cli/studio/resize.py +134 -0
  29. dayhoff_tools/cli/studio/status.py +78 -0
  30. {dayhoff_tools-1.10.0.dist-info → dayhoff_tools-1.10.1.dist-info}/METADATA +1 -1
  31. dayhoff_tools-1.10.1.dist-info/RECORD +61 -0
  32. dayhoff_tools/cli/engine/engine_maintenance.py +0 -431
  33. dayhoff_tools/cli/engine/engine_management.py +0 -505
  34. dayhoff_tools/cli/engine/shared.py +0 -501
  35. dayhoff_tools/cli/engine/studio_commands.py +0 -825
  36. dayhoff_tools-1.10.0.dist-info/RECORD +0 -39
  37. /dayhoff_tools/cli/engine/{engine_lifecycle.py → lifecycle.py} +0 -0
  38. {dayhoff_tools-1.10.0.dist-info → dayhoff_tools-1.10.1.dist-info}/WHEEL +0 -0
  39. {dayhoff_tools-1.10.0.dist-info → dayhoff_tools-1.10.1.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,160 @@
1
+ """Engine GAMI (Golden AMI) creation command."""
2
+
3
+ from datetime import datetime
4
+
5
+ import boto3
6
+ import typer
7
+ from botocore.exceptions import ClientError
8
+ from rich.progress import Progress, SpinnerColumn, TextColumn
9
+ from rich.prompt import Confirm
10
+
11
+ from ..engine_studio_utils.api_utils import make_api_request
12
+ from ..engine_studio_utils.aws_utils import check_aws_sso
13
+ from ..engine_studio_utils.constants import console
14
+ from ..engine_studio_utils.formatting import resolve_engine
15
+
16
+
17
# Typer command that snapshots a running, studio-free engine into a reusable
# "Golden AMI". The docstring below is user-facing (Typer renders a command's
# docstring as its --help text), so implementation notes live in comments.
def create_ami(
    name_or_id: str = typer.Argument(
        help="Engine name or instance ID to create AMI from"
    ),
):
    """Create a 'Golden AMI' from a running engine.

    This process is for creating a pre-warmed, standardized machine image
    that can be used to launch new engines more quickly.

    IMPORTANT:
    - The engine MUST have all studios detached before running this command.
    - This process will make the source engine unusable. You should
      plan to TERMINATE the engine after the AMI is created.
    """
    check_aws_sso()

    # check_ready=true requests the detailed listing; the attached-studio
    # check below reads the "studios" field from it.
    listing = make_api_request("GET", "/engines", params={"check_ready": "true"})
    if listing.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engine = resolve_engine(name_or_id, listing.json().get("engines", []))

    # --- Pre-flight checks ---

    # The engine has to be up: the cleanup script is delivered through SSM.
    is_running = engine["state"].lower() == "running"
    if not is_running:
        console.print(f"[red]❌ Engine '{engine['name']}' is not running.[/red]")
        console.print("Please start it before creating an AMI.")
        raise typer.Exit(1)

    # Refuse to snapshot while any studio is still attached.
    studios = engine.get("studios", [])
    if studios:
        console.print(
            f"[bold red]❌ Engine '{engine['name']}' has studios attached.[/bold red]"
        )
        console.print("Please detach all studios before creating an AMI:")
        for attached in studios:
            console.print(f" - {attached['user']} ({attached['studio_id']})")
        console.print("\nTo detach, run [bold]dh studio detach[/bold]")
        raise typer.Exit(1)

    # AMI identity: engine type plus today's date (local time, day resolution).
    engine_type = engine["engine_type"]
    date_stamp = datetime.now().strftime("%Y%m%d")
    ami_name = f"prewarmed-engine-{engine_type}-{date_stamp}"
    description = (
        "Amazon Linux 2023 with NVIDIA drivers, Docker, and pre-pulled "
        f"dev container image for {engine_type} engines"
    )

    console.print(f"Creating AMI from engine [cyan]{engine['name']}[/cyan]...")
    console.print(f"[bold]AMI Name:[/] {ami_name}")
    console.print(f"[bold]Description:[/] {description}")

    console.print(
        "\n[bold yellow]⚠️ Important:[/bold yellow]\n"
        "1. This process will run cleanup scripts on the engine.\n"
        "2. The source engine should be [bold]terminated[/bold] after the AMI is created.\n"
    )

    proceed = Confirm.ask("Continue with AMI creation?")
    if not proceed:
        raise typer.Exit()

    # The image is created with the EC2 client directly rather than through
    # the backend API.
    region = "us-east-1"
    ec2 = boto3.client("ec2", region_name=region)
    ssm = boto3.client("ssm", region_name=region)

    try:
        instance_id = engine["instance_id"]

        # Scrub first-boot state, temp files, logs and SSM registration data
        # so they are not baked into the image.
        console.print("Cleaning up instance for AMI creation...")
        ssm.send_command(
            InstanceIds=[instance_id],
            DocumentName="AWS-RunShellScript",
            Parameters={
                "commands": [
                    "sudo rm -f /opt/dayhoff/first_boot_complete.sentinel",
                    "history -c",
                    "sudo rm -rf /tmp/* /var/log/messages /var/log/cloud-init.log",
                    "sudo rm -rf /var/lib/amazon/ssm/* /etc/amazon/ssm/*",
                    # Stop agent last
                    "sleep 2 && sudo systemctl stop amazon-ssm-agent &",
                ],
                "executionTimeout": ["120"],
            },
        )

        # The command result is deliberately not awaited: the script stops
        # the SSM agent that would report it.
        console.print(
            "[dim]ℹ️ Cleanup command sent (status may show 'InProgress' as SSM agent stops)[/dim]"
        )

        tag_values = {
            "Environment": "dev",
            "Type": "golden-ami",
            "EngineType": engine_type,
            "Name": ami_name,
        }
        image_tags = [
            {"Key": tag_key, "Value": tag_value}
            for tag_key, tag_value in tag_values.items()
        ]

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            transient=True,
        ) as progress:
            spinner = progress.add_task(
                "Creating AMI (this will take several minutes)...", total=None
            )

            # NoReboot=False lets EC2 reboot the instance while imaging it.
            created = ec2.create_image(
                InstanceId=instance_id,
                Name=ami_name,
                Description=description,
                NoReboot=False,
                TagSpecifications=[{"ResourceType": "image", "Tags": image_tags}],
            )

            ami_id = created["ImageId"]
            progress.update(
                spinner,
                completed=True,
                description="[green]✓ AMI creation initiated![/green]",
            )

            console.print(f" [bold]AMI ID:[/] {ami_id}")
            console.print("\nThe AMI creation process will continue in the background.")
            console.print("You can monitor progress in the EC2 Console under 'AMIs'.")
            console.print(
                "\nOnce complete, update the AMI ID in [bold]terraform/environments/dev/variables.tf[/bold] "
                "and run [bold]terraform apply[/bold]."
            )
            console.print(
                f"\nRemember to [bold red]terminate the source engine '{engine['name']}'[/bold red] to save costs."
            )

    except ClientError as e:
        console.print(f"[red]❌ Failed to create AMI: {e}[/red]")
        raise typer.Exit(1)
@@ -0,0 +1,148 @@
1
+ """Engine idle timeout command."""
2
+
3
+ import re
4
+ import time
5
+ from typing import Optional
6
+
7
+ import boto3
8
+ import typer
9
+
10
+ from ..engine_studio_utils.api_utils import make_api_request
11
+ from ..engine_studio_utils.aws_utils import check_aws_sso
12
+ from ..engine_studio_utils.constants import console
13
+ from ..engine_studio_utils.formatting import resolve_engine
14
+
15
+
16
+ def idle_timeout_cmd(
17
+ name_or_id: str = typer.Argument(help="Engine name or instance ID"),
18
+ set: Optional[str] = typer.Option(
19
+ None, "--set", "-s", help="New timeout (e.g., 2h30m, 45m)"
20
+ ),
21
+ slack: Optional[str] = typer.Option(
22
+ None, "--slack", help="Set Slack notifications: none, default, all"
23
+ ),
24
+ ):
25
+ """Show or set engine idle-detector settings."""
26
+ check_aws_sso()
27
+
28
+ # Resolve engine
29
+ response = make_api_request("GET", "/engines")
30
+ if response.status_code != 200:
31
+ console.print("[red]❌ Failed to fetch engines[/red]")
32
+ raise typer.Exit(1)
33
+
34
+ engines = response.json().get("engines", [])
35
+ engine = resolve_engine(name_or_id, engines)
36
+
37
+ ssm = boto3.client("ssm", region_name="us-east-1")
38
+
39
+ # Handle slack notifications change
40
+ if slack:
41
+ slack = slack.lower()
42
+ if slack not in ["none", "default", "all"]:
43
+ console.print("[red]❌ Invalid slack option. Use: none, default, all[/red]")
44
+ raise typer.Exit(1)
45
+
46
+ console.print(f"Setting Slack notifications to [bold]{slack}[/bold]...")
47
+
48
+ if slack == "none":
49
+ settings = {
50
+ "SLACK_NOTIFY_WARNINGS": "false",
51
+ "SLACK_NOTIFY_IDLE_START": "false",
52
+ "SLACK_NOTIFY_IDLE_END": "false",
53
+ "SLACK_NOTIFY_SHUTDOWN": "false",
54
+ }
55
+ elif slack == "default":
56
+ settings = {
57
+ "SLACK_NOTIFY_WARNINGS": "true",
58
+ "SLACK_NOTIFY_IDLE_START": "false",
59
+ "SLACK_NOTIFY_IDLE_END": "false",
60
+ "SLACK_NOTIFY_SHUTDOWN": "true",
61
+ }
62
+ else: # all
63
+ settings = {
64
+ "SLACK_NOTIFY_WARNINGS": "true",
65
+ "SLACK_NOTIFY_IDLE_START": "true",
66
+ "SLACK_NOTIFY_IDLE_END": "true",
67
+ "SLACK_NOTIFY_SHUTDOWN": "true",
68
+ }
69
+
70
+ commands = []
71
+ for key, value in settings.items():
72
+ # Use a robust sed command that adds the line if it doesn't exist
73
+ commands.append(
74
+ f"grep -q '^{key}=' /etc/engine.env && sudo sed -i 's|^{key}=.*|{key}={value}|' /etc/engine.env || echo '{key}={value}' | sudo tee -a /etc/engine.env > /dev/null"
75
+ )
76
+
77
+ # Instead of restarting service, send SIGHUP to reload config
78
+ commands.append(
79
+ "sudo pkill -HUP -f engine-idle-detector.py || sudo systemctl restart engine-idle-detector.service"
80
+ )
81
+
82
+ resp = ssm.send_command(
83
+ InstanceIds=[engine["instance_id"]],
84
+ DocumentName="AWS-RunShellScript",
85
+ Parameters={"commands": commands, "executionTimeout": ["60"]},
86
+ )
87
+ cid = resp["Command"]["CommandId"]
88
+ time.sleep(2) # Give it a moment to process
89
+ console.print(f"[green]✓ Slack notifications updated to '{slack}'[/green]")
90
+ console.print("[dim]Note: Settings updated without resetting idle timer[/dim]")
91
+
92
+ # Handle setting new timeout value
93
+ if set is not None:
94
+ m = re.match(r"^(?:(\d+)h)?(?:(\d+)m)?$", set)
95
+ if not m:
96
+ console.print(
97
+ "[red]❌ Invalid duration format. Use e.g. 2h, 45m, 1h30m[/red]"
98
+ )
99
+ raise typer.Exit(1)
100
+ hours = int(m.group(1) or 0)
101
+ minutes = int(m.group(2) or 0)
102
+ seconds = hours * 3600 + minutes * 60
103
+ if seconds == 0:
104
+ console.print("[red]❌ Duration must be greater than zero[/red]")
105
+ raise typer.Exit(1)
106
+
107
+ console.print(f"Setting idle timeout to {set} ({seconds} seconds)…")
108
+
109
+ cmd = (
110
+ "sudo sed -i '/^IDLE_TIMEOUT_SECONDS=/d' /etc/engine.env && "
111
+ f"echo 'IDLE_TIMEOUT_SECONDS={seconds}' | sudo tee -a /etc/engine.env >/dev/null && "
112
+ "sudo systemctl restart engine-idle-detector.service"
113
+ )
114
+
115
+ resp = ssm.send_command(
116
+ InstanceIds=[engine["instance_id"]],
117
+ DocumentName="AWS-RunShellScript",
118
+ Parameters={"commands": [cmd], "executionTimeout": ["60"]},
119
+ )
120
+ cid = resp["Command"]["CommandId"]
121
+ time.sleep(2)
122
+ console.print(f"[green]✓ Idle timeout updated to {set}[/green]")
123
+
124
+ # If no action was specified, show current timeout
125
+ if set is None and slack is None:
126
+ # Show current timeout setting
127
+ resp = ssm.send_command(
128
+ InstanceIds=[engine["instance_id"]],
129
+ DocumentName="AWS-RunShellScript",
130
+ Parameters={
131
+ "commands": [
132
+ "grep -E '^IDLE_TIMEOUT_SECONDS=' /etc/engine.env || echo 'IDLE_TIMEOUT_SECONDS=1800'"
133
+ ],
134
+ "executionTimeout": ["10"],
135
+ },
136
+ )
137
+ cid = resp["Command"]["CommandId"]
138
+ time.sleep(1)
139
+ inv = ssm.get_command_invocation(
140
+ CommandId=cid, InstanceId=engine["instance_id"]
141
+ )
142
+ if inv["Status"] == "Success":
143
+ line = inv["StandardOutputContent"].strip()
144
+ secs = int(line.split("=")[1]) if "=" in line else 1800
145
+ console.print(f"Current idle timeout: {secs//60}m ({secs} seconds)")
146
+ else:
147
+ console.print("[red]❌ Could not retrieve idle timeout[/red]")
148
+ return
@@ -0,0 +1,101 @@
1
+ """Engine launch command."""
2
+
3
+ from typing import Any, Dict, Optional
4
+
5
+ import typer
6
+ from rich.progress import Progress, SpinnerColumn, TextColumn
7
+
8
+ from ..engine_studio_utils.api_utils import make_api_request
9
+ from ..engine_studio_utils.aws_utils import check_aws_sso
10
+ from ..engine_studio_utils.constants import HOURLY_COSTS, console
11
+
12
+
13
+ def launch_engine(
14
+ name: str = typer.Argument(help="Name for the new engine"),
15
+ engine_type: str = typer.Option(
16
+ "cpu",
17
+ "--type",
18
+ "-t",
19
+ help="Engine type: cpu, cpumax, t4, a10g, a100, 4_t4, 8_t4, 4_a10g, 8_a10g",
20
+ ),
21
+ user: Optional[str] = typer.Option(None, "--user", "-u", help="Override username"),
22
+ boot_disk_size: Optional[int] = typer.Option(
23
+ None,
24
+ "--size",
25
+ "-s",
26
+ help="Boot disk size in GB (default: 50GB, min: 20GB, max: 1000GB)",
27
+ ),
28
+ availability_zone: Optional[str] = typer.Option(
29
+ None,
30
+ "--az",
31
+ help="Prefer a specific Availability Zone (e.g., us-east-1b). If omitted the service will try all public subnets.",
32
+ ),
33
+ ):
34
+ """Launch a new engine instance."""
35
+ username = check_aws_sso()
36
+ if user:
37
+ username = user
38
+
39
+ # Validate engine type
40
+ valid_types = [
41
+ "cpu",
42
+ "cpumax",
43
+ "t4",
44
+ "a10g",
45
+ "a100",
46
+ "4_t4",
47
+ "8_t4",
48
+ "4_a10g",
49
+ "8_a10g",
50
+ ]
51
+ if engine_type not in valid_types:
52
+ console.print(f"[red]❌ Invalid engine type: {engine_type}[/red]")
53
+ console.print(f"Valid types: {', '.join(valid_types)}")
54
+ raise typer.Exit(1)
55
+
56
+ # Validate boot disk size
57
+ if boot_disk_size is not None:
58
+ if boot_disk_size < 20:
59
+ console.print("[red]❌ Boot disk size must be at least 20GB[/red]")
60
+ raise typer.Exit(1)
61
+ if boot_disk_size > 1000:
62
+ console.print("[red]❌ Boot disk size cannot exceed 1000GB[/red]")
63
+ raise typer.Exit(1)
64
+
65
+ cost = HOURLY_COSTS.get(engine_type, 0)
66
+ disk_info = f" with {boot_disk_size}GB boot disk" if boot_disk_size else ""
67
+ console.print(
68
+ f"Launching [cyan]{name}[/cyan] ({engine_type}){disk_info} for ${cost:.2f}/hour..."
69
+ )
70
+
71
+ with Progress(
72
+ SpinnerColumn(),
73
+ TextColumn("[progress.description]{task.description}"),
74
+ transient=True,
75
+ ) as progress:
76
+ progress.add_task("Creating engine...", total=None)
77
+
78
+ request_data: Dict[str, Any] = {
79
+ "name": name,
80
+ "user": username,
81
+ "engine_type": engine_type,
82
+ }
83
+ if boot_disk_size is not None:
84
+ request_data["boot_disk_size"] = boot_disk_size
85
+ if availability_zone:
86
+ request_data["availability_zone"] = availability_zone
87
+
88
+ response = make_api_request("POST", "/engines", json_data=request_data)
89
+
90
+ if response.status_code == 201:
91
+ data = response.json()
92
+ console.print(f"[green]✓ Engine launched successfully![/green]")
93
+ console.print(f"Instance ID: [cyan]{data['instance_id']}[/cyan]")
94
+ console.print(f"Type: {data['instance_type']} (${cost:.2f}/hour)")
95
+ if boot_disk_size:
96
+ console.print(f"Boot disk: {boot_disk_size}GB")
97
+ console.print("\nThe engine is initializing. This may take a few minutes.")
98
+ console.print(f"Check status with: [cyan]dh engine status {name}[/cyan]")
99
+ else:
100
+ error = response.json().get("error", "Unknown error")
101
+ console.print(f"[red]❌ Failed to launch engine: {error}[/red]")
@@ -0,0 +1,116 @@
1
+ """Engine list command."""
2
+
3
+ from datetime import datetime, timezone
4
+ from typing import Optional
5
+
6
+ import typer
7
+ from rich import box
8
+ from rich.table import Table
9
+
10
+ from ..engine_studio_utils.api_utils import make_api_request
11
+ from ..engine_studio_utils.aws_utils import _fetch_init_stages, check_aws_sso
12
+ from ..engine_studio_utils.constants import HOURLY_COSTS, console
13
+ from ..engine_studio_utils.formatting import (
14
+ format_duration,
15
+ format_status,
16
+ get_disk_usage_via_ssm,
17
+ parse_launch_time,
18
+ )
19
+
20
+
21
+ def list_engines(
22
+ user: Optional[str] = typer.Option(None, "--user", "-u", help="Filter by user"),
23
+ running_only: bool = typer.Option(
24
+ False, "--running", help="Show only running engines"
25
+ ),
26
+ stopped_only: bool = typer.Option(
27
+ False, "--stopped", help="Show only stopped engines"
28
+ ),
29
+ detailed: bool = typer.Option(
30
+ False, "--detailed", "-d", help="Show detailed status (slower)"
31
+ ),
32
+ ):
33
+ """List engines (shows all engines by default)."""
34
+ current_user = check_aws_sso()
35
+
36
+ params = {}
37
+ if user:
38
+ params["user"] = user
39
+ if detailed:
40
+ params["check_ready"] = "true"
41
+
42
+ response = make_api_request("GET", "/engines", params=params)
43
+
44
+ if response.status_code == 200:
45
+ data = response.json()
46
+ engines = data.get("engines", [])
47
+
48
+ # Filter by state if requested
49
+ if running_only:
50
+ engines = [e for e in engines if e["state"].lower() == "running"]
51
+ elif stopped_only:
52
+ engines = [e for e in engines if e["state"].lower() == "stopped"]
53
+
54
+ if not engines:
55
+ console.print("No engines found.")
56
+ return
57
+
58
+ # Only fetch detailed info if requested (slow)
59
+ stages_map = {}
60
+ if detailed:
61
+ stages_map = _fetch_init_stages([e["instance_id"] for e in engines])
62
+
63
+ # Create table
64
+ table = Table(title="Engines", box=box.ROUNDED)
65
+ table.add_column("Name", style="cyan")
66
+ table.add_column("Instance ID", style="dim")
67
+ table.add_column("Type")
68
+ table.add_column("User")
69
+ table.add_column("Status")
70
+ if detailed:
71
+ table.add_column("Disk Usage")
72
+ table.add_column("Uptime/Since")
73
+ table.add_column("$/hour", justify="right")
74
+
75
+ for engine in engines:
76
+ launch_time = parse_launch_time(engine["launch_time"])
77
+ uptime = datetime.now(timezone.utc) - launch_time
78
+ hourly_cost = HOURLY_COSTS.get(engine["engine_type"], 0)
79
+
80
+ if engine["state"].lower() == "running":
81
+ time_str = format_duration(uptime)
82
+ # Only get disk usage if detailed mode
83
+ if detailed:
84
+ disk_usage = get_disk_usage_via_ssm(engine["instance_id"]) or "-"
85
+ else:
86
+ disk_usage = None
87
+ else:
88
+ time_str = launch_time.strftime("%Y-%m-%d %H:%M")
89
+ disk_usage = "-" if detailed else None
90
+
91
+ row_data = [
92
+ engine["name"],
93
+ engine["instance_id"],
94
+ engine["engine_type"],
95
+ engine["user"],
96
+ format_status(engine["state"], engine.get("ready")),
97
+ ]
98
+ if detailed:
99
+ row_data.append(disk_usage)
100
+ row_data.extend(
101
+ [
102
+ time_str,
103
+ f"${hourly_cost:.2f}",
104
+ ]
105
+ )
106
+
107
+ table.add_row(*row_data)
108
+
109
+ console.print(table)
110
+ if not detailed and any(e["state"].lower() == "running" for e in engines):
111
+ console.print(
112
+ "\n[dim]Tip: Use --detailed to see disk usage and bootstrap status (slower)[/dim]"
113
+ )
114
+ else:
115
+ error = response.json().get("error", "Unknown error")
116
+ console.print(f"[red]❌ Failed to list engines: {error}[/red]")
@@ -0,0 +1,128 @@
1
+ """Engine repair command."""
2
+
3
+ import time
4
+
5
+ import boto3
6
+ import typer
7
+ from rich.progress import Progress, SpinnerColumn, TextColumn
8
+ from rich.prompt import Confirm
9
+
10
+ from ..engine_studio_utils.api_utils import make_api_request
11
+ from ..engine_studio_utils.aws_utils import check_aws_sso
12
+ from ..engine_studio_utils.constants import console
13
+ from ..engine_studio_utils.formatting import resolve_engine
14
+
15
+
16
+ def repair_engine(
17
+ name_or_id: str = typer.Argument(help="Engine name or instance ID"),
18
+ ):
19
+ """Repair an engine that's stuck in a bad state (e.g., after GAMI creation)."""
20
+ check_aws_sso()
21
+
22
+ # Get all engines to resolve name
23
+ response = make_api_request("GET", "/engines")
24
+ if response.status_code != 200:
25
+ console.print("[red]❌ Failed to fetch engines[/red]")
26
+ raise typer.Exit(1)
27
+
28
+ engines = response.json().get("engines", [])
29
+ engine = resolve_engine(name_or_id, engines)
30
+
31
+ if engine["state"].lower() != "running":
32
+ console.print(
33
+ f"[yellow]⚠️ Engine is {engine['state']}. Must be running to repair.[/yellow]"
34
+ )
35
+ if engine["state"].lower() == "stopped" and Confirm.ask(
36
+ "Start the engine first?"
37
+ ):
38
+ response = make_api_request(
39
+ "POST", f"/engines/{engine['instance_id']}/start"
40
+ )
41
+ if response.status_code != 200:
42
+ console.print("[red]❌ Failed to start engine[/red]")
43
+ raise typer.Exit(1)
44
+ console.print("[green]✓ Engine started[/green]")
45
+ console.print("Waiting for engine to become ready...")
46
+ time.sleep(30) # Give it time to boot
47
+ else:
48
+ raise typer.Exit(1)
49
+
50
+ console.print(f"[bold]Repairing engine [cyan]{engine['name']}[/cyan][/bold]")
51
+ console.print(
52
+ "[dim]This will restore bootstrap state and ensure all services are running[/dim]\n"
53
+ )
54
+
55
+ ssm = boto3.client("ssm", region_name="us-east-1")
56
+
57
+ # Repair commands
58
+ repair_commands = [
59
+ # Create necessary directories
60
+ "sudo mkdir -p /opt/dayhoff /opt/dayhoff/state /opt/dayhoff/scripts",
61
+ # Download scripts from S3 if missing
62
+ "source /etc/engine.env && sudo aws s3 sync s3://${VM_SCRIPTS_BUCKET}/ /opt/dayhoff/scripts/ --exclude '*' --include '*.sh' --quiet",
63
+ "sudo chmod +x /opt/dayhoff/scripts/*.sh 2>/dev/null || true",
64
+ # Restore bootstrap state
65
+ "sudo touch /opt/dayhoff/first_boot_complete.sentinel",
66
+ "echo 'finished' | sudo tee /opt/dayhoff/state/engine-init.stage > /dev/null",
67
+ # Ensure SSM agent is running
68
+ "sudo systemctl restart amazon-ssm-agent 2>/dev/null || true",
69
+ # Restart idle detector (service only)
70
+ "sudo systemctl restart engine-idle-detector.service 2>/dev/null || true",
71
+ # Report status
72
+ "echo '=== Repair Complete ===' && echo 'Sentinel: ' && ls -la /opt/dayhoff/first_boot_complete.sentinel",
73
+ "echo 'Stage: ' && cat /opt/dayhoff/state/engine-init.stage",
74
+ "echo 'Scripts: ' && ls /opt/dayhoff/scripts/*.sh 2>/dev/null | wc -l",
75
+ ]
76
+
77
+ try:
78
+ with Progress(
79
+ SpinnerColumn(),
80
+ TextColumn("[progress.description]{task.description}"),
81
+ transient=True,
82
+ ) as progress:
83
+ task = progress.add_task("Repairing engine...", total=None)
84
+
85
+ response = ssm.send_command(
86
+ InstanceIds=[engine["instance_id"]],
87
+ DocumentName="AWS-RunShellScript",
88
+ Parameters={
89
+ "commands": repair_commands,
90
+ "executionTimeout": ["60"],
91
+ },
92
+ )
93
+
94
+ command_id = response["Command"]["CommandId"]
95
+
96
+ # Wait for command
97
+ for _ in range(60):
98
+ time.sleep(1)
99
+ result = ssm.get_command_invocation(
100
+ CommandId=command_id,
101
+ InstanceId=engine["instance_id"],
102
+ )
103
+ if result["Status"] in ["Success", "Failed"]:
104
+ break
105
+
106
+ if result["Status"] == "Success":
107
+ output = result["StandardOutputContent"]
108
+ console.print("[green]✓ Engine repaired successfully![/green]\n")
109
+
110
+ # Show repair results
111
+ if "=== Repair Complete ===" in output:
112
+ repair_section = output.split("=== Repair Complete ===")[1].strip()
113
+ console.print("[bold]Repair Results:[/bold]")
114
+ console.print(repair_section)
115
+
116
+ console.print(
117
+ "\n[dim]You should now be able to attach studios to this engine.[/dim]"
118
+ )
119
+ else:
120
+ console.print(
121
+ f"[red]❌ Repair failed: {result.get('StandardErrorContent', 'Unknown error')}[/red]"
122
+ )
123
+ console.print(
124
+ "\n[yellow]Try running 'dh engine debug' for more information.[/yellow]"
125
+ )
126
+
127
+ except Exception as e:
128
+ console.print(f"[red]❌ Failed to repair engine: {e}[/red]")