dayhoff-tools 1.9.26__py3-none-any.whl → 1.10.1__py3-none-any.whl
This diff reflects the contents of publicly released package versions as published to their respective registries, and is provided for informational purposes only.
- dayhoff_tools/cli/engine/__init__.py +1 -323
- dayhoff_tools/cli/engine/coffee.py +110 -0
- dayhoff_tools/cli/engine/config_ssh.py +113 -0
- dayhoff_tools/cli/engine/debug.py +79 -0
- dayhoff_tools/cli/engine/gami.py +160 -0
- dayhoff_tools/cli/engine/idle.py +148 -0
- dayhoff_tools/cli/engine/launch.py +101 -0
- dayhoff_tools/cli/engine/list.py +116 -0
- dayhoff_tools/cli/engine/repair.py +128 -0
- dayhoff_tools/cli/engine/resize.py +195 -0
- dayhoff_tools/cli/engine/ssh.py +62 -0
- dayhoff_tools/cli/engine/{engine_core.py → status.py} +6 -201
- dayhoff_tools/cli/engine_studio_commands.py +323 -0
- dayhoff_tools/cli/engine_studio_utils/__init__.py +1 -0
- dayhoff_tools/cli/engine_studio_utils/api_utils.py +47 -0
- dayhoff_tools/cli/engine_studio_utils/aws_utils.py +102 -0
- dayhoff_tools/cli/engine_studio_utils/constants.py +21 -0
- dayhoff_tools/cli/engine_studio_utils/formatting.py +210 -0
- dayhoff_tools/cli/engine_studio_utils/ssh_utils.py +141 -0
- dayhoff_tools/cli/main.py +1 -2
- dayhoff_tools/cli/studio/__init__.py +1 -0
- dayhoff_tools/cli/studio/attach.py +314 -0
- dayhoff_tools/cli/studio/create.py +48 -0
- dayhoff_tools/cli/studio/delete.py +71 -0
- dayhoff_tools/cli/studio/detach.py +56 -0
- dayhoff_tools/cli/studio/list.py +81 -0
- dayhoff_tools/cli/studio/reset.py +90 -0
- dayhoff_tools/cli/studio/resize.py +134 -0
- dayhoff_tools/cli/studio/status.py +78 -0
- {dayhoff_tools-1.9.26.dist-info → dayhoff_tools-1.10.1.dist-info}/METADATA +1 -1
- dayhoff_tools-1.10.1.dist-info/RECORD +61 -0
- dayhoff_tools/cli/engine/engine_maintenance.py +0 -431
- dayhoff_tools/cli/engine/engine_management.py +0 -505
- dayhoff_tools/cli/engine/shared.py +0 -501
- dayhoff_tools/cli/engine/studio_commands.py +0 -825
- dayhoff_tools-1.9.26.dist-info/RECORD +0 -39
- dayhoff_tools/cli/engine/{engine_lifecycle.py → lifecycle.py} +0 -0
- {dayhoff_tools-1.9.26.dist-info → dayhoff_tools-1.10.1.dist-info}/WHEEL +0 -0
- {dayhoff_tools-1.9.26.dist-info → dayhoff_tools-1.10.1.dist-info}/entry_points.txt +0 -0
dayhoff_tools/cli/engine_studio_utils/formatting.py
ADDED
@@ -0,0 +1,210 @@
+"""Display formatting utilities for engine and studio commands."""
+
+import time
+from datetime import datetime, timedelta, timezone
+from typing import Dict, List, Optional
+
+import boto3
+import typer
+from rich.prompt import IntPrompt
+
+from .constants import HOURLY_COSTS, console
+
+
+def format_duration(duration: timedelta) -> str:
+    """Format a duration as a human-readable string."""
+    total_seconds = int(duration.total_seconds())
+    hours = total_seconds // 3600
+    minutes = (total_seconds % 3600) // 60
+
+    if hours > 0:
+        return f"{hours}h {minutes}m"
+    else:
+        return f"{minutes}m"
+
+
+def parse_launch_time(launch_time_str: str) -> datetime:
+    """Parse launch time from API response."""
+    # Try different datetime formats
+    formats = [
+        "%Y-%m-%dT%H:%M:%S.%fZ",
+        "%Y-%m-%dT%H:%M:%SZ",
+        "%Y-%m-%dT%H:%M:%S%z",  # ISO format with timezone
+        "%Y-%m-%dT%H:%M:%S+00:00",  # Explicit UTC offset
+        "%Y-%m-%d %H:%M:%S",
+    ]
+
+    # First try parsing with fromisoformat for better timezone handling
+    try:
+        # Handle the ISO format properly
+        return datetime.fromisoformat(launch_time_str.replace("Z", "+00:00"))
+    except (ValueError, AttributeError):
+        pass
+
+    # Fallback to manual format parsing
+    for fmt in formats:
+        try:
+            parsed = datetime.strptime(launch_time_str, fmt)
+            # If no timezone info, assume UTC
+            if parsed.tzinfo is None:
+                parsed = parsed.replace(tzinfo=timezone.utc)
+            return parsed
+        except ValueError:
+            continue
+
+    # Fallback: assume it's recent
+    return datetime.now(timezone.utc)
+
+
+def format_status(state: str, ready: Optional[bool]) -> str:
+    """Format engine status with ready indicator."""
+    if state.lower() == "running":
+        if ready is True:
+            return "[green]Running ✓[/green]"
+        elif ready is False:
+            return "[yellow]Running ⚠ (Bootstrapping...)[/yellow]"
+        else:
+            return "[green]Running[/green]"
+    elif state.lower() == "stopped":
+        return "[dim]Stopped[/dim]"
+    elif state.lower() == "stopping":
+        return "[yellow]Stopping...[/yellow]"
+    elif state.lower() == "pending":
+        return "[yellow]Starting...[/yellow]"
+    else:
+        return state
+
+
+def resolve_engine(name_or_id: str, engines: List[Dict]) -> Dict:
+    """Resolve engine by name or ID with interactive selection."""
+    # Exact ID match
+    exact_id = [e for e in engines if e["instance_id"] == name_or_id]
+    if exact_id:
+        return exact_id[0]
+
+    # Exact name match
+    exact_name = [e for e in engines if e["name"] == name_or_id]
+    if len(exact_name) == 1:
+        return exact_name[0]
+
+    # Prefix matches
+    matches = [
+        e
+        for e in engines
+        if e["name"].startswith(name_or_id) or e["instance_id"].startswith(name_or_id)
+    ]
+
+    if len(matches) == 0:
+        console.print(f"[red]❌ No engine found matching '{name_or_id}'[/red]")
+        raise typer.Exit(1)
+    elif len(matches) == 1:
+        return matches[0]
+    else:
+        # Interactive selection
+        console.print(f"Multiple engines match '{name_or_id}':")
+        for i, engine in enumerate(matches, 1):
+            cost = HOURLY_COSTS.get(engine["engine_type"], 0)
+            console.print(
+                f" {i}. [cyan]{engine['name']}[/cyan] ({engine['instance_id']}) "
+                f"- {engine['engine_type']} - {engine['state']} - ${cost:.2f}/hr"
+            )
+
+        while True:
+            try:
+                choice = IntPrompt.ask(
+                    "Select engine",
+                    default=1,
+                    choices=[str(i) for i in range(1, len(matches) + 1)],
+                )
+                return matches[choice - 1]
+            except (ValueError, IndexError):
+                console.print("[red]Invalid selection, please try again[/red]")
+
+
+def get_disk_usage_via_ssm(instance_id: str) -> Optional[str]:
+    """Get disk usage for an engine via SSM.
+
+    Returns:
+        String like "17/50 GB" or None if failed
+    """
+    try:
+        ssm = boto3.client("ssm", region_name="us-east-1")
+
+        # Run df command to get disk usage
+        response = ssm.send_command(
+            InstanceIds=[instance_id],
+            DocumentName="AWS-RunShellScript",
+            Parameters={
+                "commands": [
+                    # Get root filesystem usage in GB
+                    'df -BG / | tail -1 | awk \'{gsub(/G/, "", $2); gsub(/G/, "", $3); print $3 "/" $2 " GB"}\''
+                ],
+                "executionTimeout": ["10"],
+            },
+        )
+
+        command_id = response["Command"]["CommandId"]
+
+        # Wait for command to complete (with timeout)
+        for _ in range(5):  # 5 second timeout
+            time.sleep(1)
+            result = ssm.get_command_invocation(
+                CommandId=command_id,
+                InstanceId=instance_id,
+            )
+            if result["Status"] in ["Success", "Failed"]:
+                break
+
+        if result["Status"] == "Success":
+            output = result["StandardOutputContent"].strip()
+            return output if output else None
+
+        return None
+
+    except Exception as e:
+        # logger.debug(f"Failed to get disk usage for {instance_id}: {e}")
+        return None
+
+
+def get_studio_disk_usage_via_ssm(instance_id: str, username: str) -> Optional[str]:
+    """Get disk usage for a studio via SSM.
+
+    Returns:
+        String like "333/500 GB" or None if failed
+    """
+    try:
+        ssm = boto3.client("ssm", region_name="us-east-1")
+
+        # Run df command to get studio disk usage
+        response = ssm.send_command(
+            InstanceIds=[instance_id],
+            DocumentName="AWS-RunShellScript",
+            Parameters={
+                "commands": [
+                    # Get studio filesystem usage in GB
+                    f'df -BG /studios/{username} 2>/dev/null | tail -1 | awk \'{{gsub(/G/, "", $2); gsub(/G/, "", $3); print $3 "/" $2 " GB"}}\''
+                ],
+                "executionTimeout": ["10"],
+            },
+        )
+
+        command_id = response["Command"]["CommandId"]
+
+        # Wait for command to complete (with timeout)
+        for _ in range(5):  # 5 second timeout
+            time.sleep(1)
+            result = ssm.get_command_invocation(
+                CommandId=command_id,
+                InstanceId=instance_id,
+            )
+            if result["Status"] in ["Success", "Failed"]:
+                break
+
+        if result["Status"] == "Success":
+            output = result["StandardOutputContent"].strip()
+            return output if output else None
+
+        return None
+
+    except Exception:
+        return None
dayhoff_tools/cli/engine_studio_utils/ssh_utils.py
ADDED
@@ -0,0 +1,141 @@
+"""SSH-related utilities for engine and studio commands."""
+
+import os
+import shutil
+import subprocess
+from pathlib import Path
+
+from .constants import SSH_MANAGED_COMMENT
+
+
+def get_ssh_public_key() -> str:
+    """Get the user's SSH public key.
+
+    Discovery order (container-friendly):
+    1) DHT_SSH_PUBLIC_KEY env var (direct key content)
+    2) DHT_SSH_PUBLIC_KEY_PATH env var (path to a .pub file)
+    3) ssh-agent via `ssh-add -L` (requires SSH_AUTH_SOCK)
+    4) Conventional files: ~/.ssh/id_ed25519.pub, ~/.ssh/id_rsa.pub
+
+    Raises:
+        FileNotFoundError: If no public key can be discovered.
+    """
+    # 1) Direct env var content
+    env_key = os.environ.get("DHT_SSH_PUBLIC_KEY")
+    if env_key and env_key.strip():
+        return env_key.strip()
+
+    # 2) Env var path
+    env_path = os.environ.get("DHT_SSH_PUBLIC_KEY_PATH")
+    if env_path:
+        p = Path(env_path).expanduser()
+        if p.is_file():
+            try:
+                return p.read_text().strip()
+            except Exception:
+                pass
+
+    # 3) Agent lookup (ssh-add -L)
+    try:
+        if shutil.which("ssh-add") is not None:
+            proc = subprocess.run(["ssh-add", "-L"], capture_output=True, text=True)
+            if proc.returncode == 0 and proc.stdout:
+                keys = [
+                    line.strip() for line in proc.stdout.splitlines() if line.strip()
+                ]
+                # Prefer ed25519, then rsa
+                for pref in ("ssh-ed25519", "ssh-rsa", "ecdsa-sha2-nistp256"):
+                    for k in keys:
+                        if k.startswith(pref + " "):
+                            return k
+                # Fallback to first key if types not matched
+                if keys:
+                    return keys[0]
+    except Exception:
+        pass
+
+    # 4) Conventional files
+    home = Path.home()
+    key_paths = [home / ".ssh" / "id_ed25519.pub", home / ".ssh" / "id_rsa.pub"]
+    for key_path in key_paths:
+        if key_path.is_file():
+            try:
+                return key_path.read_text().strip()
+            except Exception:
+                continue
+
+    raise FileNotFoundError(
+        "No SSH public key found. Please create one with 'ssh-keygen' first."
+    )
+
+
+def check_session_manager_plugin():
+    """Check if AWS Session Manager Plugin is available and warn if not."""
+    from .constants import console
+
+    if shutil.which("session-manager-plugin") is None:
+        console.print(
+            "[bold red]⚠️ AWS Session Manager Plugin not found![/bold red]\n"
+            "SSH connections to engines require the Session Manager Plugin.\n"
+            "Please install it following the setup guide:\n"
+            "[link]https://github.com/dayhofflabs/nutshell/blob/main/REFERENCE/setup_guides/new-laptop.md[/link]"
+        )
+        return False
+    return True
+
+
+def update_ssh_config_entry(
+    engine_name: str, instance_id: str, ssh_user: str, idle_timeout: int = 600
+):
+    """Add or update a single SSH config entry for the given SSH user.
+
+    Args:
+        engine_name: Host alias to write into ~/.ssh/config
+        instance_id: EC2 instance-id (used by the proxy command)
+        ssh_user: Username to place into the SSH stanza
+        idle_timeout: Idle timeout **in seconds** to pass to the SSM port-forward. 600 = 10 min.
+    """
+    config_path = Path.home() / ".ssh" / "config"
+    config_path.parent.mkdir(mode=0o700, exist_ok=True)
+
+    # Touch the file if it doesn't exist
+    if not config_path.exists():
+        config_path.touch(mode=0o600)
+
+    # Read existing config
+    content = config_path.read_text()
+    lines = content.splitlines() if content else []
+
+    # Remove any existing entry for this engine
+    new_lines = []
+    skip_until_next_host = False
+    for line in lines:
+        # Check if this is our managed host
+        if (
+            line.strip().startswith(f"Host {engine_name}")
+            and SSH_MANAGED_COMMENT in line
+        ):
+            skip_until_next_host = True
+        elif line.strip().startswith("Host ") and skip_until_next_host:
+            skip_until_next_host = False
+            # This is a different host entry, keep it
+            new_lines.append(line)
+        elif not skip_until_next_host:
+            new_lines.append(line)
+
+    # Add the new entry
+    if new_lines and new_lines[-1].strip():  # Add blank line if needed
+        new_lines.append("")
+
+    new_lines.extend(
+        [
+            f"Host {engine_name} {SSH_MANAGED_COMMENT}",
+            f" HostName {instance_id}",
+            f" User {ssh_user}",
+            f" ProxyCommand sh -c \"AWS_SSM_IDLE_TIMEOUT={idle_timeout} aws ssm start-session --target %h --document-name AWS-StartSSHSession --parameters 'portNumber=%p'\"",
+        ]
+    )
+
+    # Write back
+    config_path.write_text("\n".join(new_lines))
+    config_path.chmod(0o600)
dayhoff_tools/cli/main.py
CHANGED
@@ -4,9 +4,8 @@ import sys
 from importlib.metadata import PackageNotFoundError, version
 
 import typer
-
 from dayhoff_tools.cli.cloud_commands import aws_app, gcp_app
-from dayhoff_tools.cli.
+from dayhoff_tools.cli.engine_studio_commands import engine_app, studio_app
 from dayhoff_tools.cli.utility_commands import (
     build_and_upload_wheel,
     delete_local_branch,
dayhoff_tools/cli/studio/__init__.py
ADDED
@@ -0,0 +1 @@
+"""Studio management commands."""
dayhoff_tools/cli/studio/attach.py
ADDED
@@ -0,0 +1,314 @@
+"""Studio attach command."""
+
+import time
+from typing import Optional
+
+import typer
+from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
+from rich.prompt import Confirm, IntPrompt
+
+from ..engine_studio_utils.api_utils import get_user_studio, make_api_request
+from ..engine_studio_utils.aws_utils import check_aws_sso
+from ..engine_studio_utils.constants import console
+from ..engine_studio_utils.formatting import resolve_engine
+from ..engine_studio_utils.ssh_utils import (
+    check_session_manager_plugin,
+    get_ssh_public_key,
+    update_ssh_config_entry,
+)
+
+
+def attach_studio(
+    engine_name_or_id: str = typer.Argument(help="Engine name or instance ID"),
+    user: Optional[str] = typer.Option(
+        None, "--user", "-u", help="Attach a different user's studio (admin only)"
+    ),
+):
+    """Attach your studio to an engine."""
+    username = check_aws_sso()
+
+    # Check for Session Manager Plugin since we'll update SSH config
+    if not check_session_manager_plugin():
+        raise typer.Exit(1)
+
+    # Use specified user if provided, otherwise use current user
+    target_user = user if user else username
+
+    # Add confirmation when attaching another user's studio
+    if target_user != username:
+        console.print(f"[yellow]⚠️ Managing studio for user: {target_user}[/yellow]")
+        if not Confirm.ask(f"Are you sure you want to attach {target_user}'s studio?"):
+            console.print("Operation cancelled.")
+            return
+
+    # Get user's studio
+    studio = get_user_studio(target_user)
+    if not studio:
+        if target_user == username:
+            console.print("[yellow]You don't have a studio yet.[/yellow]")
+            if Confirm.ask("Would you like to create one now?"):
+                size = IntPrompt.ask("Studio size (GB)", default=50)
+                response = make_api_request(
+                    "POST",
+                    "/studios",
+                    json_data={"user": username, "size_gb": size},
+                )
+                if response.status_code != 201:
+                    console.print("[red]❌ Failed to create studio[/red]")
+                    raise typer.Exit(1)
+                studio = response.json()
+                studio["studio_id"] = studio["studio_id"]  # Normalize key
+            else:
+                raise typer.Exit(0)
+        else:
+            console.print(f"[red]❌ User {target_user} doesn't have a studio.[/red]")
+            raise typer.Exit(1)
+
+    # Check if already attached
+    if studio.get("status") == "in-use":
+        console.print(
+            f"[yellow]Studio is already attached to {studio.get('attached_vm_id')}[/yellow]"
+        )
+        if not Confirm.ask("Detach and reattach to new engine?"):
+            return
+        # Detach first
+        response = make_api_request("POST", f"/studios/{studio['studio_id']}/detach")
+        if response.status_code != 200:
+            console.print("[red]❌ Failed to detach studio[/red]")
+            raise typer.Exit(1)
+
+    # Get all engines to resolve name
+    response = make_api_request("GET", "/engines")
+    if response.status_code != 200:
+        console.print("[red]❌ Failed to fetch engines[/red]")
+        raise typer.Exit(1)
+
+    engines = response.json().get("engines", [])
+    engine = resolve_engine(engine_name_or_id, engines)
+
+    # Flag to track if we started the engine in this command (affects retry length)
+    engine_started_now: bool = False
+
+    if engine["state"].lower() != "running":
+        console.print(f"[yellow]⚠️ Engine is {engine['state']}[/yellow]")
+        if engine["state"].lower() == "stopped" and Confirm.ask(
+            "Start the engine first?"
+        ):
+            response = make_api_request(
+                "POST", f"/engines/{engine['instance_id']}/start"
+            )
+            if response.status_code != 200:
+                console.print("[red]❌ Failed to start engine[/red]")
+                raise typer.Exit(1)
+            console.print("[green]✓ Engine started[/green]")
+            # Mark that we booted the engine so attach loop gets extended retries
+            engine_started_now = True
+            # No further waiting here – attachment attempts below handle retry logic while the
+            # engine finishes booting.
+        else:
+            raise typer.Exit(1)
+
+    # Retrieve SSH public key (required for authorised_keys provisioning)
+    try:
+        public_key = get_ssh_public_key()
+    except FileNotFoundError as e:
+        console.print(f"[red]❌ {e}[/red]")
+        raise typer.Exit(1)
+
+    console.print(f"Attaching studio to engine [cyan]{engine['name']}[/cyan]...")
+
+    # Determine retry strategy based on whether we just started the engine
+    if engine_started_now:
+        max_attempts = 40  # About 7 minutes total with exponential backoff
+        base_delay = 8
+        max_delay = 20
+    else:
+        max_attempts = 15  # About 2 minutes total with exponential backoff
+        base_delay = 5
+        max_delay = 10
+
+    # Unified retry loop with exponential backoff
+    with Progress(
+        SpinnerColumn(),
+        TimeElapsedColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        transient=True,
+    ) as prog:
+        desc = (
+            "Attaching studio (engine is still booting)…"
+            if engine_started_now
+            else "Attaching studio…"
+        )
+        task = prog.add_task(desc, total=None)
+
+        consecutive_not_ready = 0
+        last_error = None
+
+        for attempt in range(max_attempts):
+            # Check if the attach already completed
+            if _is_studio_attached(studio["studio_id"], engine["instance_id"]):
+                success = True
+                break
+
+            success, error_msg = _attempt_studio_attach(
+                studio, engine, target_user, public_key
+            )
+
+            if success:
+                break  # success!
+
+            if error_msg:
+                # Fatal error – bubble up immediately
+                console.print(f"[red]❌ Failed to attach studio: {error_msg}[/red]")
+
+                # Suggest repair command if engine seems broken
+                if "not ready" in error_msg.lower() and attempt > 5:
+                    console.print(
+                        f"\n[yellow]Engine may be in a bad state. Try:[/yellow]"
+                    )
+                    console.print(f"[dim] dh engine repair {engine['name']}[/dim]")
+                return
+
+            # Track consecutive "not ready" responses
+            consecutive_not_ready += 1
+            last_error = "Engine not ready"
+
+            # Update progress display
+            if attempt % 3 == 0:
+                prog.update(
+                    task,
+                    description=f"{desc} attempt {attempt+1}/{max_attempts}",
+                )
+
+            # If engine seems stuck after many attempts, show a hint
+            if consecutive_not_ready > 10 and attempt == 10:
+                console.print(
+                    "[yellow]Engine is taking longer than expected to become ready.[/yellow]"
+                )
+                console.print(
+                    "[dim]This can happen after GAMI creation or if the engine is still bootstrapping.[/dim]"
+                )
+
+            # Exponential backoff with jitter
+            delay = min(base_delay * (1.5 ** min(attempt, 5)), max_delay)
+            delay += time.time() % 2  # Add 0-2 seconds of jitter
+            time.sleep(delay)
+
+        else:
+            # All attempts exhausted
+            console.print(
+                f"[yellow]Engine is not becoming ready after {max_attempts} attempts.[/yellow]"
+            )
+            if last_error:
+                console.print(f"[dim]Last issue: {last_error}[/dim]")
+            console.print("\n[yellow]You can try:[/yellow]")
+            console.print(
+                f" 1. Wait a minute and retry: [cyan]dh studio attach {engine['name']}[/cyan]"
+            )
+            console.print(
+                f" 2. Check engine status: [cyan]dh engine status {engine['name']}[/cyan]"
+            )
+            console.print(
+                f" 3. Repair the engine: [cyan]dh engine repair {engine['name']}[/cyan]"
+            )
+            return
+
+    # Successful attach path
+    console.print(f"[green]✓ Studio attached successfully![/green]")
+
+    # Update SSH config - use target_user for the connection
+    update_ssh_config_entry(engine["name"], engine["instance_id"], target_user)
+    console.print(f"[green]✓ SSH config updated[/green]")
+    console.print(f"\nConnect with: [cyan]ssh {engine['name']}[/cyan]")
+    console.print(f"Files are at: [cyan]/studios/{target_user}[/cyan]")
+
+
+def _is_studio_attached(target_studio_id: str, target_vm_id: str) -> bool:
+    """Check if a studio is attached to a specific VM."""
+    response = make_api_request("GET", "/studios")
+    if response.status_code != 200:
+        return False
+
+    studios = response.json().get("studios", [])
+    for studio in studios:
+        if (
+            studio["studio_id"] == target_studio_id
+            and studio.get("attached_vm_id") == target_vm_id
+            and studio.get("status") == "in-use"
+        ):
+            return True
+    return False
+
+
+def _attempt_studio_attach(studio, engine, target_user, public_key):
+    response = make_api_request(
+        "POST",
+        f"/studios/{studio['studio_id']}/attach",
+        json_data={
+            "vm_id": engine["instance_id"],
+            "user": target_user,
+            "public_key": public_key,
+        },
+    )
+
+    # Fast-path success
+    if response.status_code == 200:
+        return True, None
+
+    # Asynchronous path – API returned 202 Accepted and operation tracking ID
+    if response.status_code == 202:
+        # The operation status polling is broken in the Lambda, so we just
+        # wait and check if the studio is actually attached
+        time.sleep(5)  # Give the async operation a moment to start
+
+        # Check periodically if the studio is attached
+        for check in range(20):  # Check for up to 60 seconds
+            if _is_studio_attached(studio["studio_id"], engine["instance_id"]):
+                return True, None
+            time.sleep(3)
+
+        # If we get here, attachment didn't complete in reasonable time
+        return False, None  # Return None to trigger retry
+
+    # --- determine if we should retry ---
+    recoverable = False
+    error_text = response.json().get("error", "Unknown error")
+    err_msg = error_text.lower()
+
+    # Check for "Studio is not available (status: in-use)" which means it's already attached
+    if (
+        response.status_code == 400
+        and "not available" in err_msg
+        and "in-use" in err_msg
+    ):
+        # Studio is already attached somewhere - check if it's to THIS engine
+        if _is_studio_attached(studio["studio_id"], engine["instance_id"]):
+            return True, None  # It's attached to our target engine - success!
+        else:
+            return False, error_text  # It's attached elsewhere - fatal error
+
+    if response.status_code in (409, 503):
+        recoverable = True
+    else:
+        RECOVERABLE_PATTERNS = [
+            "not ready",
+            "still starting",
+            "initializing",
+            "failed to mount",
+            "device busy",
+            "pending",  # VM state pending
+        ]
+        FATAL_PATTERNS = [
+            "permission",
+        ]
+        if any(p in err_msg for p in FATAL_PATTERNS):
+            recoverable = False
+        elif any(p in err_msg for p in RECOVERABLE_PATTERNS):
+            recoverable = True
+
+    if not recoverable:
+        # fatal – abort immediately
+        return False, error_text
+
+    # recoverable – signal caller to retry without treating as error
+    return False, None