dayhoff-tools 1.9.26__py3-none-any.whl → 1.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dayhoff_tools/cli/engine/__init__.py +1 -323
- dayhoff_tools/cli/engine/coffee.py +110 -0
- dayhoff_tools/cli/engine/config_ssh.py +113 -0
- dayhoff_tools/cli/engine/debug.py +79 -0
- dayhoff_tools/cli/engine/gami.py +160 -0
- dayhoff_tools/cli/engine/idle.py +148 -0
- dayhoff_tools/cli/engine/launch.py +101 -0
- dayhoff_tools/cli/engine/list.py +116 -0
- dayhoff_tools/cli/engine/repair.py +128 -0
- dayhoff_tools/cli/engine/resize.py +195 -0
- dayhoff_tools/cli/engine/ssh.py +62 -0
- dayhoff_tools/cli/engine/{engine_core.py → status.py} +6 -201
- dayhoff_tools/cli/engine_studio_commands.py +323 -0
- dayhoff_tools/cli/engine_studio_utils/__init__.py +1 -0
- dayhoff_tools/cli/engine_studio_utils/api_utils.py +47 -0
- dayhoff_tools/cli/engine_studio_utils/aws_utils.py +102 -0
- dayhoff_tools/cli/engine_studio_utils/constants.py +21 -0
- dayhoff_tools/cli/engine_studio_utils/formatting.py +210 -0
- dayhoff_tools/cli/engine_studio_utils/ssh_utils.py +141 -0
- dayhoff_tools/cli/main.py +1 -2
- dayhoff_tools/cli/studio/__init__.py +1 -0
- dayhoff_tools/cli/studio/attach.py +314 -0
- dayhoff_tools/cli/studio/create.py +48 -0
- dayhoff_tools/cli/studio/delete.py +71 -0
- dayhoff_tools/cli/studio/detach.py +56 -0
- dayhoff_tools/cli/studio/list.py +81 -0
- dayhoff_tools/cli/studio/reset.py +90 -0
- dayhoff_tools/cli/studio/resize.py +134 -0
- dayhoff_tools/cli/studio/status.py +78 -0
- {dayhoff_tools-1.9.26.dist-info → dayhoff_tools-1.10.1.dist-info}/METADATA +1 -1
- dayhoff_tools-1.10.1.dist-info/RECORD +61 -0
- dayhoff_tools/cli/engine/engine_maintenance.py +0 -431
- dayhoff_tools/cli/engine/engine_management.py +0 -505
- dayhoff_tools/cli/engine/shared.py +0 -501
- dayhoff_tools/cli/engine/studio_commands.py +0 -825
- dayhoff_tools-1.9.26.dist-info/RECORD +0 -39
- /dayhoff_tools/cli/engine/{engine_lifecycle.py → lifecycle.py} +0 -0
- {dayhoff_tools-1.9.26.dist-info → dayhoff_tools-1.10.1.dist-info}/WHEEL +0 -0
- {dayhoff_tools-1.9.26.dist-info → dayhoff_tools-1.10.1.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,160 @@
|
|
1
|
+
"""Engine GAMI (Golden AMI) creation command."""
|
2
|
+
|
3
|
+
from datetime import datetime
|
4
|
+
|
5
|
+
import boto3
|
6
|
+
import typer
|
7
|
+
from botocore.exceptions import ClientError
|
8
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn
|
9
|
+
from rich.prompt import Confirm
|
10
|
+
|
11
|
+
from ..engine_studio_utils.api_utils import make_api_request
|
12
|
+
from ..engine_studio_utils.aws_utils import check_aws_sso
|
13
|
+
from ..engine_studio_utils.constants import console
|
14
|
+
from ..engine_studio_utils.formatting import resolve_engine
|
15
|
+
|
16
|
+
|
17
|
+
def create_ami(
    name_or_id: str = typer.Argument(
        help="Engine name or instance ID to create AMI from"
    ),
):
    """Create a 'Golden AMI' from a running engine.

    This process is for creating a pre-warmed, standardized machine image
    that can be used to launch new engines more quickly.

    IMPORTANT:
    - The engine MUST have all studios detached before running this command.
    - This process will make the source engine unusable. You should
      plan to TERMINATE the engine after the AMI is created.
    """
    # NOTE: the docstring above is left untouched on purpose; Typer presumably
    # surfaces it as the command's --help text.
    check_aws_sso()

    # Request the detailed listing (check_ready=true) so that each engine
    # entry carries its attached-studio information.
    listing = make_api_request("GET", "/engines", params={"check_ready": "true"})
    if listing.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engine = resolve_engine(name_or_id, listing.json().get("engines", []))

    # --- Pre-flight checks ---

    # 1. The source engine has to be running.
    is_running = engine["state"].lower() == "running"
    if not is_running:
        console.print(f"[red]❌ Engine '{engine['name']}' is not running.[/red]")
        console.print("Please start it before creating an AMI.")
        raise typer.Exit(1)

    # 2. No studio may still be attached (taken from the detailed response).
    studios = engine.get("studios", [])
    if studios:
        console.print(
            f"[bold red]❌ Engine '{engine['name']}' has studios attached.[/bold red]"
        )
        console.print("Please detach all studios before creating an AMI:")
        for attached in studios:
            console.print(f" - {attached['user']} ({attached['studio_id']})")
        console.print("\nTo detach, run [bold]dh studio detach[/bold]")
        raise typer.Exit(1)

    # AMI name has day granularity: prewarmed-engine-<type>-<YYYYMMDD>.
    engine_type = engine["engine_type"]
    ami_name = f"prewarmed-engine-{engine_type}-{datetime.now():%Y%m%d}"
    description = (
        "Amazon Linux 2023 with NVIDIA drivers, Docker, and pre-pulled "
        f"dev container image for {engine_type} engines"
    )

    console.print(f"Creating AMI from engine [cyan]{engine['name']}[/cyan]...")
    console.print(f"[bold]AMI Name:[/] {ami_name}")
    console.print(f"[bold]Description:[/] {description}")

    console.print(
        "\n[bold yellow]⚠️ Important:[/bold yellow]\n"
        "1. This process will run cleanup scripts on the engine.\n"
        "2. The source engine should be [bold]terminated[/bold] after the AMI is created.\n"
    )

    proceed = Confirm.ask("Continue with AMI creation?")
    if not proceed:
        # Declining is not an error, hence the default (zero) exit code.
        raise typer.Exit()

    # Talk to EC2/SSM directly, as the backend logic is too complex.
    ec2_client = boto3.client("ec2", region_name="us-east-1")
    ssm_client = boto3.client("ssm", region_name="us-east-1")

    try:
        # Scrub per-instance state before the snapshot is taken.
        console.print("Cleaning up instance for AMI creation...")
        instance_id = engine["instance_id"]
        scrub_script = [
            "sudo rm -f /opt/dayhoff/first_boot_complete.sentinel",
            "history -c",
            "sudo rm -rf /tmp/* /var/log/messages /var/log/cloud-init.log",
            "sudo rm -rf /var/lib/amazon/ssm/* /etc/amazon/ssm/*",
            "sleep 2 && sudo systemctl stop amazon-ssm-agent &",  # Stop agent last
        ]

        # Fire-and-forget: the result is not awaited because the final
        # command stops the SSM agent that would report it.
        ssm_client.send_command(
            InstanceIds=[instance_id],
            DocumentName="AWS-RunShellScript",
            Parameters={"commands": scrub_script, "executionTimeout": ["120"]},
        )

        console.print(
            "[dim]ℹ️ Cleanup command sent (status may show 'InProgress' as SSM agent stops)[/dim]"
        )

        image_tags = [
            {"Key": tag_key, "Value": tag_value}
            for tag_key, tag_value in {
                "Environment": "dev",
                "Type": "golden-ami",
                "EngineType": engine_type,
                "Name": ami_name,
            }.items()
        ]

        # Kick off image creation; NoReboot=False lets EC2 reboot the
        # instance as part of taking the image.
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            transient=True,
        ) as spinner:
            spinner_task = spinner.add_task(
                "Creating AMI (this will take several minutes)...", total=None
            )

            image = ec2_client.create_image(
                InstanceId=instance_id,
                Name=ami_name,
                Description=description,
                NoReboot=False,
                TagSpecifications=[{"ResourceType": "image", "Tags": image_tags}],
            )

            ami_id = image["ImageId"]
            spinner.update(
                spinner_task,
                completed=True,
                description="[green]✓ AMI creation initiated![/green]",
            )

        console.print(f" [bold]AMI ID:[/] {ami_id}")
        console.print("\nThe AMI creation process will continue in the background.")
        console.print("You can monitor progress in the EC2 Console under 'AMIs'.")
        console.print(
            "\nOnce complete, update the AMI ID in [bold]terraform/environments/dev/variables.tf[/bold] "
            "and run [bold]terraform apply[/bold]."
        )
        console.print(
            f"\nRemember to [bold red]terminate the source engine '{engine['name']}'[/bold red] to save costs."
        )

    except ClientError as e:
        console.print(f"[red]❌ Failed to create AMI: {e}[/red]")
        raise typer.Exit(1)
|
@@ -0,0 +1,148 @@
|
|
1
|
+
"""Engine idle timeout command."""
|
2
|
+
|
3
|
+
import re
|
4
|
+
import time
|
5
|
+
from typing import Optional
|
6
|
+
|
7
|
+
import boto3
|
8
|
+
import typer
|
9
|
+
|
10
|
+
from ..engine_studio_utils.api_utils import make_api_request
|
11
|
+
from ..engine_studio_utils.aws_utils import check_aws_sso
|
12
|
+
from ..engine_studio_utils.constants import console
|
13
|
+
from ..engine_studio_utils.formatting import resolve_engine
|
14
|
+
|
15
|
+
|
16
|
+
def idle_timeout_cmd(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    set: Optional[str] = typer.Option(
        None, "--set", "-s", help="New timeout (e.g., 2h30m, 45m)"
    ),
    slack: Optional[str] = typer.Option(
        None, "--slack", help="Set Slack notifications: none, default, all"
    ),
):
    """Show or set engine idle-detector settings."""
    # Parameters:
    #   name_or_id: engine name or instance ID, resolved via resolve_engine().
    #   set:        new idle timeout such as "2h30m" or "45m". The name shadows
    #               the builtin but is kept because it is part of the signature.
    #   slack:      notification preset: "none", "default" or "all".
    # With neither option given, the current timeout is printed.
    # Exits with status 1 (typer.Exit) on invalid input or when the engine
    # does not confirm the change.
    check_aws_sso()

    # Resolve engine
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)
    instance_id = engine["instance_id"]

    slack_presets = {
        "none": {
            "SLACK_NOTIFY_WARNINGS": "false",
            "SLACK_NOTIFY_IDLE_START": "false",
            "SLACK_NOTIFY_IDLE_END": "false",
            "SLACK_NOTIFY_SHUTDOWN": "false",
        },
        "default": {
            "SLACK_NOTIFY_WARNINGS": "true",
            "SLACK_NOTIFY_IDLE_START": "false",
            "SLACK_NOTIFY_IDLE_END": "false",
            "SLACK_NOTIFY_SHUTDOWN": "true",
        },
        "all": {
            "SLACK_NOTIFY_WARNINGS": "true",
            "SLACK_NOTIFY_IDLE_START": "true",
            "SLACK_NOTIFY_IDLE_END": "true",
            "SLACK_NOTIFY_SHUTDOWN": "true",
        },
    }

    # Validate every option before changing anything on the engine, so a bad
    # --set value can no longer leave a half-applied --slack change behind.
    if slack is not None:
        slack = slack.lower()
        if slack not in slack_presets:
            console.print("[red]❌ Invalid slack option. Use: none, default, all[/red]")
            raise typer.Exit(1)

    seconds = None
    if set is not None:
        m = re.fullmatch(r"(?:(\d+)h)?(?:(\d+)m)?", set)
        if not m:
            console.print(
                "[red]❌ Invalid duration format. Use e.g. 2h, 45m, 1h30m[/red]"
            )
            raise typer.Exit(1)
        hours = int(m.group(1) or 0)
        minutes = int(m.group(2) or 0)
        seconds = hours * 3600 + minutes * 60
        if seconds == 0:
            console.print("[red]❌ Duration must be greater than zero[/red]")
            raise typer.Exit(1)

    ssm = boto3.client("ssm", region_name="us-east-1")

    # Handle slack notifications change
    if slack is not None:
        console.print(f"Setting Slack notifications to [bold]{slack}[/bold]...")

        # Replace the line in place when the key exists, otherwise append it.
        commands = [
            f"grep -q '^{key}=' /etc/engine.env && sudo sed -i 's|^{key}=.*|{key}={value}|' /etc/engine.env || echo '{key}={value}' | sudo tee -a /etc/engine.env > /dev/null"
            for key, value in slack_presets[slack].items()
        ]

        # Instead of restarting service, send SIGHUP to reload config
        commands.append(
            "sudo pkill -HUP -f engine-idle-detector.py || sudo systemctl restart engine-idle-detector.service"
        )

        inv = _run_idle_ssm_command(ssm, instance_id, commands, 60)
        if inv is None or inv["Status"] != "Success":
            console.print(
                f"[red]❌ Failed to update Slack notifications: {_idle_command_error(inv)}[/red]"
            )
            raise typer.Exit(1)
        console.print(f"[green]✓ Slack notifications updated to '{slack}'[/green]")
        console.print("[dim]Note: Settings updated without resetting idle timer[/dim]")

    # Handle setting new timeout value
    if seconds is not None:
        console.print(f"Setting idle timeout to {set} ({seconds} seconds)…")

        # Drop any existing entry, append the new one, then restart the
        # detector service.
        cmd = (
            "sudo sed -i '/^IDLE_TIMEOUT_SECONDS=/d' /etc/engine.env && "
            f"echo 'IDLE_TIMEOUT_SECONDS={seconds}' | sudo tee -a /etc/engine.env >/dev/null && "
            "sudo systemctl restart engine-idle-detector.service"
        )

        inv = _run_idle_ssm_command(ssm, instance_id, [cmd], 60)
        if inv is None or inv["Status"] != "Success":
            console.print(
                f"[red]❌ Failed to update idle timeout: {_idle_command_error(inv)}[/red]"
            )
            raise typer.Exit(1)
        console.print(f"[green]✓ Idle timeout updated to {set}[/green]")

    # If no action was specified, show current timeout
    if set is None and slack is None:
        inv = _run_idle_ssm_command(
            ssm,
            instance_id,
            [
                "grep -E '^IDLE_TIMEOUT_SECONDS=' /etc/engine.env || echo 'IDLE_TIMEOUT_SECONDS=1800'"
            ],
            10,
        )
        if inv is None or inv["Status"] != "Success":
            console.print("[red]❌ Could not retrieve idle timeout[/red]")
            raise typer.Exit(1)

        # Use the last non-empty output line; fall back to 1800 seconds when
        # the output carries no KEY=VALUE pair.
        lines = inv["StandardOutputContent"].strip().splitlines()
        line = lines[-1].strip() if lines else ""
        _, sep, value = line.partition("=")
        try:
            secs = int(value) if sep else 1800
        except ValueError:
            console.print(
                f"[red]❌ Could not parse idle timeout from engine output: {line!r}[/red]"
            )
            raise typer.Exit(1)
        console.print(f"Current idle timeout: {secs//60}m ({secs} seconds)")

    return


def _run_idle_ssm_command(ssm, instance_id, commands, timeout_seconds):
    """Run shell commands on the engine through SSM and wait for the outcome.

    Returns the final invocation dict once the command reaches a terminal
    status, or None if it has not done so shortly after ``timeout_seconds``.
    Raises typer.Exit(1) when the SSM API itself rejects a request.
    """
    terminal_statuses = {"Success", "Failed", "Cancelled", "TimedOut"}
    try:
        sent = ssm.send_command(
            InstanceIds=[instance_id],
            DocumentName="AWS-RunShellScript",
            Parameters={
                "commands": commands,
                "executionTimeout": [str(timeout_seconds)],
            },
        )
        command_id = sent["Command"]["CommandId"]

        # Small grace period on top of the remote execution timeout.
        deadline = time.monotonic() + timeout_seconds + 5
        while time.monotonic() < deadline:
            time.sleep(1)
            try:
                inv = ssm.get_command_invocation(
                    CommandId=command_id, InstanceId=instance_id
                )
            except ssm.exceptions.InvocationDoesNotExist:
                # The invocation may not be registered yet right after sending.
                continue
            if inv["Status"] in terminal_statuses:
                return inv
    except ssm.exceptions.ClientError as e:
        console.print(f"[red]❌ SSM request failed: {e}[/red]")
        raise typer.Exit(1)
    return None


def _idle_command_error(inv):
    """Return a short human-readable reason for an unsuccessful invocation."""
    if inv is None:
        return "command did not finish in time"
    stderr = (inv.get("StandardErrorContent") or "").strip()
    return stderr or f"status {inv['Status']}"
|
@@ -0,0 +1,101 @@
|
|
1
|
+
"""Engine launch command."""
|
2
|
+
|
3
|
+
from typing import Any, Dict, Optional
|
4
|
+
|
5
|
+
import typer
|
6
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn
|
7
|
+
|
8
|
+
from ..engine_studio_utils.api_utils import make_api_request
|
9
|
+
from ..engine_studio_utils.aws_utils import check_aws_sso
|
10
|
+
from ..engine_studio_utils.constants import HOURLY_COSTS, console
|
11
|
+
|
12
|
+
|
13
|
+
def launch_engine(
    name: str = typer.Argument(help="Name for the new engine"),
    engine_type: str = typer.Option(
        "cpu",
        "--type",
        "-t",
        help="Engine type: cpu, cpumax, t4, a10g, a100, 4_t4, 8_t4, 4_a10g, 8_a10g",
    ),
    user: Optional[str] = typer.Option(None, "--user", "-u", help="Override username"),
    boot_disk_size: Optional[int] = typer.Option(
        None,
        "--size",
        "-s",
        help="Boot disk size in GB (default: 50GB, min: 20GB, max: 1000GB)",
    ),
    availability_zone: Optional[str] = typer.Option(
        None,
        "--az",
        help="Prefer a specific Availability Zone (e.g., us-east-1b). If omitted the service will try all public subnets.",
    ),
):
    """Launch a new engine instance."""
    # Parameters:
    #   name:              name for the new engine.
    #   engine_type:       one of the types listed in valid_types below.
    #   user:              overrides the username returned by check_aws_sso().
    #   boot_disk_size:    optional boot disk size in GB, accepted range 20-1000.
    #   availability_zone: optional preferred AZ, forwarded to the API as-is.
    # Exits with status 1 (typer.Exit) on invalid input or when the API does
    # not answer 201.

    # The SSO check always runs, even when --user overrides the name.
    sso_username = check_aws_sso()
    username = user or sso_username

    # Validate engine type
    valid_types = [
        "cpu",
        "cpumax",
        "t4",
        "a10g",
        "a100",
        "4_t4",
        "8_t4",
        "4_a10g",
        "8_a10g",
    ]
    if engine_type not in valid_types:
        console.print(f"[red]❌ Invalid engine type: {engine_type}[/red]")
        console.print(f"Valid types: {', '.join(valid_types)}")
        raise typer.Exit(1)

    # Validate boot disk size
    if boot_disk_size is not None:
        if boot_disk_size < 20:
            console.print("[red]❌ Boot disk size must be at least 20GB[/red]")
            raise typer.Exit(1)
        if boot_disk_size > 1000:
            console.print("[red]❌ Boot disk size cannot exceed 1000GB[/red]")
            raise typer.Exit(1)

    cost = HOURLY_COSTS.get(engine_type, 0)
    disk_info = (
        f" with {boot_disk_size}GB boot disk" if boot_disk_size is not None else ""
    )
    console.print(
        f"Launching [cyan]{name}[/cyan] ({engine_type}){disk_info} for ${cost:.2f}/hour..."
    )

    # Optional fields are only sent when the caller supplied them.
    request_data: Dict[str, Any] = {
        "name": name,
        "user": username,
        "engine_type": engine_type,
    }
    if boot_disk_size is not None:
        request_data["boot_disk_size"] = boot_disk_size
    if availability_zone:
        request_data["availability_zone"] = availability_zone

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        transient=True,
    ) as progress:
        progress.add_task("Creating engine...", total=None)
        response = make_api_request("POST", "/engines", json_data=request_data)

    if response.status_code != 201:
        # The error body is not guaranteed to be a JSON object (presumably a
        # requests-style response; a gateway error page would raise here).
        try:
            error = response.json().get("error", "Unknown error")
        except (ValueError, AttributeError):
            error = f"HTTP {response.status_code}"
        console.print(f"[red]❌ Failed to launch engine: {error}[/red]")
        # Signal the failure through the exit code as well.
        raise typer.Exit(1)

    data = response.json()
    console.print("[green]✓ Engine launched successfully![/green]")
    console.print(f"Instance ID: [cyan]{data['instance_id']}[/cyan]")
    console.print(f"Type: {data['instance_type']} (${cost:.2f}/hour)")
    if boot_disk_size is not None:
        console.print(f"Boot disk: {boot_disk_size}GB")
    console.print("\nThe engine is initializing. This may take a few minutes.")
    console.print(f"Check status with: [cyan]dh engine status {name}[/cyan]")
|
@@ -0,0 +1,116 @@
|
|
1
|
+
"""Engine list command."""
|
2
|
+
|
3
|
+
from datetime import datetime, timezone
|
4
|
+
from typing import Optional
|
5
|
+
|
6
|
+
import typer
|
7
|
+
from rich import box
|
8
|
+
from rich.table import Table
|
9
|
+
|
10
|
+
from ..engine_studio_utils.api_utils import make_api_request
|
11
|
+
from ..engine_studio_utils.aws_utils import _fetch_init_stages, check_aws_sso
|
12
|
+
from ..engine_studio_utils.constants import HOURLY_COSTS, console
|
13
|
+
from ..engine_studio_utils.formatting import (
|
14
|
+
format_duration,
|
15
|
+
format_status,
|
16
|
+
get_disk_usage_via_ssm,
|
17
|
+
parse_launch_time,
|
18
|
+
)
|
19
|
+
|
20
|
+
|
21
|
+
def list_engines(
    user: Optional[str] = typer.Option(None, "--user", "-u", help="Filter by user"),
    running_only: bool = typer.Option(
        False, "--running", help="Show only running engines"
    ),
    stopped_only: bool = typer.Option(
        False, "--stopped", help="Show only stopped engines"
    ),
    detailed: bool = typer.Option(
        False, "--detailed", "-d", help="Show detailed status (slower)"
    ),
):
    """List engines (shows all engines by default)."""
    # Parameters:
    #   user:         forwarded to the API as the "user" query parameter.
    #   running_only: keep only engines whose state is "running".
    #   stopped_only: keep only engines whose state is "stopped"; ignored when
    #                 running_only is also set.
    #   detailed:     request check_ready data and add a disk-usage column
    #                 (one SSM lookup per running engine).
    # Exits with status 1 (typer.Exit) when the API does not answer 200.
    check_aws_sso()

    params = {}
    if user:
        params["user"] = user
    if detailed:
        params["check_ready"] = "true"

    response = make_api_request("GET", "/engines", params=params)

    if response.status_code != 200:
        # The error body is not guaranteed to be a JSON object (presumably a
        # requests-style response; a gateway error page would raise here).
        try:
            error = response.json().get("error", "Unknown error")
        except (ValueError, AttributeError):
            error = f"HTTP {response.status_code}"
        console.print(f"[red]❌ Failed to list engines: {error}[/red]")
        # Signal the failure through the exit code as well.
        raise typer.Exit(1)

    engines = response.json().get("engines", [])

    # Filter by state if requested (--running wins when both flags are given)
    if running_only:
        engines = [e for e in engines if e["state"].lower() == "running"]
    elif stopped_only:
        engines = [e for e in engines if e["state"].lower() == "stopped"]

    if not engines:
        console.print("No engines found.")
        return

    # Only fetch detailed info if requested (slow)
    stages_map = {}
    if detailed:
        # NOTE(review): the result is not consumed anywhere in this function;
        # the call is kept in case it is relied on elsewhere. Confirm and drop
        # it if it is a pure fetch.
        stages_map = _fetch_init_stages([e["instance_id"] for e in engines])

    # Create table
    table = Table(title="Engines", box=box.ROUNDED)
    table.add_column("Name", style="cyan")
    table.add_column("Instance ID", style="dim")
    table.add_column("Type")
    table.add_column("User")
    table.add_column("Status")
    if detailed:
        table.add_column("Disk Usage")
    table.add_column("Uptime/Since")
    table.add_column("$/hour", justify="right")

    for engine in engines:
        launch_time = parse_launch_time(engine["launch_time"])
        hourly_cost = HOURLY_COSTS.get(engine["engine_type"], 0)
        is_running = engine["state"].lower() == "running"

        if is_running:
            # Running engines show elapsed uptime.
            time_str = format_duration(datetime.now(timezone.utc) - launch_time)
        else:
            # Everything else shows the launch timestamp.
            time_str = launch_time.strftime("%Y-%m-%d %H:%M")

        row_data = [
            engine["name"],
            engine["instance_id"],
            engine["engine_type"],
            engine["user"],
            format_status(engine["state"], engine.get("ready")),
        ]
        if detailed:
            # Disk usage is only queried for running engines; "-" otherwise
            # or when the lookup returns nothing.
            disk_usage = (
                get_disk_usage_via_ssm(engine["instance_id"]) if is_running else None
            )
            row_data.append(disk_usage or "-")
        row_data.extend([time_str, f"${hourly_cost:.2f}"])

        table.add_row(*row_data)

    console.print(table)
    if not detailed and any(e["state"].lower() == "running" for e in engines):
        console.print(
            "\n[dim]Tip: Use --detailed to see disk usage and bootstrap status (slower)[/dim]"
        )
|
@@ -0,0 +1,128 @@
|
|
1
|
+
"""Engine repair command."""
|
2
|
+
|
3
|
+
import time
|
4
|
+
|
5
|
+
import boto3
|
6
|
+
import typer
|
7
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn
|
8
|
+
from rich.prompt import Confirm
|
9
|
+
|
10
|
+
from ..engine_studio_utils.api_utils import make_api_request
|
11
|
+
from ..engine_studio_utils.aws_utils import check_aws_sso
|
12
|
+
from ..engine_studio_utils.constants import console
|
13
|
+
from ..engine_studio_utils.formatting import resolve_engine
|
14
|
+
|
15
|
+
|
16
|
+
def repair_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
):
    """Repair an engine that's stuck in a bad state (e.g., after GAMI creation)."""
    # Parameters:
    #   name_or_id: engine name or instance ID, resolved via resolve_engine().
    # Exits with status 1 (typer.Exit) when the engine cannot be fetched or
    # started, or when the repair command fails or does not finish in time.
    check_aws_sso()

    # Get all engines to resolve name
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)
    instance_id = engine["instance_id"]

    if engine["state"].lower() != "running":
        console.print(
            f"[yellow]⚠️ Engine is {engine['state']}. Must be running to repair.[/yellow]"
        )
        if engine["state"].lower() == "stopped" and Confirm.ask(
            "Start the engine first?"
        ):
            response = make_api_request("POST", f"/engines/{instance_id}/start")
            if response.status_code != 200:
                console.print("[red]❌ Failed to start engine[/red]")
                raise typer.Exit(1)
            console.print("[green]✓ Engine started[/green]")
            console.print("Waiting for engine to become ready...")
            time.sleep(30)  # Give it time to boot
        else:
            raise typer.Exit(1)

    console.print(f"[bold]Repairing engine [cyan]{engine['name']}[/cyan][/bold]")
    console.print(
        "[dim]This will restore bootstrap state and ensure all services are running[/dim]\n"
    )

    ssm = boto3.client("ssm", region_name="us-east-1")

    # Repair commands
    repair_commands = [
        # Create necessary directories
        "sudo mkdir -p /opt/dayhoff /opt/dayhoff/state /opt/dayhoff/scripts",
        # Download scripts from S3 if missing
        "source /etc/engine.env && sudo aws s3 sync s3://${VM_SCRIPTS_BUCKET}/ /opt/dayhoff/scripts/ --exclude '*' --include '*.sh' --quiet",
        "sudo chmod +x /opt/dayhoff/scripts/*.sh 2>/dev/null || true",
        # Restore bootstrap state
        "sudo touch /opt/dayhoff/first_boot_complete.sentinel",
        "echo 'finished' | sudo tee /opt/dayhoff/state/engine-init.stage > /dev/null",
        # Ensure SSM agent is running
        "sudo systemctl restart amazon-ssm-agent 2>/dev/null || true",
        # Restart idle detector (service only)
        "sudo systemctl restart engine-idle-detector.service 2>/dev/null || true",
        # Report status
        "echo '=== Repair Complete ===' && echo 'Sentinel: ' && ls -la /opt/dayhoff/first_boot_complete.sentinel",
        "echo 'Stage: ' && cat /opt/dayhoff/state/engine-init.stage",
        "echo 'Scripts: ' && ls /opt/dayhoff/scripts/*.sh 2>/dev/null | wc -l",
    ]

    # Statuses after which polling can stop.
    terminal_statuses = {"Success", "Failed", "Cancelled", "TimedOut"}
    result = None

    # Only the AWS calls live inside the try block. typer.Exit derives from
    # RuntimeError, so raising it in here would be swallowed by the handler.
    try:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            transient=True,
        ) as progress:
            progress.add_task("Repairing engine...", total=None)

            response = ssm.send_command(
                InstanceIds=[instance_id],
                DocumentName="AWS-RunShellScript",
                Parameters={
                    "commands": repair_commands,
                    "executionTimeout": ["60"],
                },
            )

            command_id = response["Command"]["CommandId"]

            # Wait for command (up to ~60 seconds)
            for _ in range(60):
                time.sleep(1)
                try:
                    result = ssm.get_command_invocation(
                        CommandId=command_id,
                        InstanceId=instance_id,
                    )
                except ssm.exceptions.InvocationDoesNotExist:
                    # The invocation may not be registered yet right after
                    # sending; keep polling instead of failing the repair.
                    continue
                if result["Status"] in terminal_statuses:
                    break
    except Exception as e:
        # Top-level CLI boundary: report any AWS/transport error to the user.
        console.print(f"[red]❌ Failed to repair engine: {e}[/red]")
        raise typer.Exit(1) from e

    status = result["Status"] if result else None

    if status == "Success":
        output = result["StandardOutputContent"]
        console.print("[green]✓ Engine repaired successfully![/green]\n")

        # Show repair results
        if "=== Repair Complete ===" in output:
            repair_section = output.split("=== Repair Complete ===")[1].strip()
            console.print("[bold]Repair Results:[/bold]")
            console.print(repair_section)

        console.print(
            "\n[dim]You should now be able to attach studios to this engine.[/dim]"
        )
        return

    if status in terminal_statuses:
        stderr = result.get("StandardErrorContent") or "Unknown error"
        console.print(f"[red]❌ Repair failed: {stderr}[/red]")
    else:
        # Polling ran out before the command reached a terminal status.
        console.print(
            f"[red]❌ Repair did not finish within 60 seconds (last status: {status or 'unknown'})[/red]"
        )
    console.print(
        "\n[yellow]Try running 'dh engine debug' for more information.[/yellow]"
    )
    raise typer.Exit(1)
|