dayhoff-tools 1.9.26__py3-none-any.whl → 1.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. dayhoff_tools/cli/engine/__init__.py +1 -323
  2. dayhoff_tools/cli/engine/coffee.py +110 -0
  3. dayhoff_tools/cli/engine/config_ssh.py +113 -0
  4. dayhoff_tools/cli/engine/debug.py +79 -0
  5. dayhoff_tools/cli/engine/gami.py +160 -0
  6. dayhoff_tools/cli/engine/idle.py +148 -0
  7. dayhoff_tools/cli/engine/launch.py +101 -0
  8. dayhoff_tools/cli/engine/list.py +116 -0
  9. dayhoff_tools/cli/engine/repair.py +128 -0
  10. dayhoff_tools/cli/engine/resize.py +195 -0
  11. dayhoff_tools/cli/engine/ssh.py +62 -0
  12. dayhoff_tools/cli/engine/{engine_core.py → status.py} +6 -201
  13. dayhoff_tools/cli/engine_studio_commands.py +323 -0
  14. dayhoff_tools/cli/engine_studio_utils/__init__.py +1 -0
  15. dayhoff_tools/cli/engine_studio_utils/api_utils.py +47 -0
  16. dayhoff_tools/cli/engine_studio_utils/aws_utils.py +102 -0
  17. dayhoff_tools/cli/engine_studio_utils/constants.py +21 -0
  18. dayhoff_tools/cli/engine_studio_utils/formatting.py +210 -0
  19. dayhoff_tools/cli/engine_studio_utils/ssh_utils.py +141 -0
  20. dayhoff_tools/cli/main.py +1 -2
  21. dayhoff_tools/cli/studio/__init__.py +1 -0
  22. dayhoff_tools/cli/studio/attach.py +314 -0
  23. dayhoff_tools/cli/studio/create.py +48 -0
  24. dayhoff_tools/cli/studio/delete.py +71 -0
  25. dayhoff_tools/cli/studio/detach.py +56 -0
  26. dayhoff_tools/cli/studio/list.py +81 -0
  27. dayhoff_tools/cli/studio/reset.py +90 -0
  28. dayhoff_tools/cli/studio/resize.py +134 -0
  29. dayhoff_tools/cli/studio/status.py +78 -0
  30. {dayhoff_tools-1.9.26.dist-info → dayhoff_tools-1.10.1.dist-info}/METADATA +1 -1
  31. dayhoff_tools-1.10.1.dist-info/RECORD +61 -0
  32. dayhoff_tools/cli/engine/engine_maintenance.py +0 -431
  33. dayhoff_tools/cli/engine/engine_management.py +0 -505
  34. dayhoff_tools/cli/engine/shared.py +0 -501
  35. dayhoff_tools/cli/engine/studio_commands.py +0 -825
  36. dayhoff_tools-1.9.26.dist-info/RECORD +0 -39
  37. /dayhoff_tools/cli/engine/{engine_lifecycle.py → lifecycle.py} +0 -0
  38. {dayhoff_tools-1.9.26.dist-info → dayhoff_tools-1.10.1.dist-info}/WHEEL +0 -0
  39. {dayhoff_tools-1.9.26.dist-info → dayhoff_tools-1.10.1.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,210 @@
1
+ """Display formatting utilities for engine and studio commands."""
2
+
3
+ import time
4
+ from datetime import datetime, timedelta, timezone
5
+ from typing import Dict, List, Optional
6
+
7
+ import boto3
8
+ import typer
9
+ from rich.prompt import IntPrompt
10
+
11
+ from .constants import HOURLY_COSTS, console
12
+
13
+
14
def format_duration(duration: timedelta) -> str:
    """Format a duration as a compact human-readable string.

    Args:
        duration: Elapsed time to render.

    Returns:
        ``"<H>h <M>m"`` when at least one full hour has elapsed, otherwise
        ``"<M>m"``. Seconds are truncated. Negative durations are clamped
        to ``"0m"``.
    """
    # Clamp first: floor division on a negative value would otherwise yield
    # misleading output such as "59m" for a duration of -30 seconds.
    total_seconds = max(int(duration.total_seconds()), 0)
    hours, remainder = divmod(total_seconds, 3600)
    minutes = remainder // 60

    if hours > 0:
        return f"{hours}h {minutes}m"
    return f"{minutes}m"
24
+
25
+
26
def parse_launch_time(launch_time_str: str) -> datetime:
    """Parse a launch timestamp from an API response.

    ISO-8601 parsing via ``datetime.fromisoformat`` is tried first (with a
    ``Z`` suffix rewritten to ``+00:00``), followed by a list of explicit
    ``strptime`` formats.

    Args:
        launch_time_str: Timestamp text to parse.

    Returns:
        A timezone-aware ``datetime``. Values parsed without timezone
        information are assumed to be UTC. If the input cannot be parsed at
        all (including non-string input), the current UTC time is returned.
    """
    formats = (
        "%Y-%m-%dT%H:%M:%S.%fZ",
        "%Y-%m-%dT%H:%M:%SZ",
        "%Y-%m-%dT%H:%M:%S%z",  # ISO format with timezone
        "%Y-%m-%dT%H:%M:%S+00:00",  # Explicit UTC offset
        "%Y-%m-%d %H:%M:%S",
    )

    def _as_aware(value: datetime) -> datetime:
        # Naive results are assumed to be UTC so callers can always compare
        # against datetime.now(timezone.utc) without a TypeError.
        if value.tzinfo is None:
            return value.replace(tzinfo=timezone.utc)
        return value

    # First try fromisoformat for better timezone handling.
    try:
        return _as_aware(
            datetime.fromisoformat(launch_time_str.replace("Z", "+00:00"))
        )
    except (ValueError, AttributeError, TypeError):
        pass

    # Fallback to manual format parsing.
    for fmt in formats:
        try:
            return _as_aware(datetime.strptime(launch_time_str, fmt))
        except (ValueError, TypeError):
            continue

    # Last resort: treat the launch as having happened just now.
    return datetime.now(timezone.utc)
57
+
58
+
59
def format_status(state: str, ready: Optional[bool]) -> str:
    """Render an engine state as Rich markup, including a readiness hint.

    Args:
        state: Engine state name; compared case-insensitively.
        ready: Readiness flag for running engines. ``True`` adds a check
            mark, ``False`` marks the engine as bootstrapping, and any other
            value shows a plain "Running".

    Returns:
        A Rich-markup string for known states, or ``state`` unchanged when
        the state is not recognised.
    """
    normalized = state.lower()

    if normalized == "running":
        if ready is True:
            return "[green]Running ✓[/green]"
        if ready is False:
            return "[yellow]Running ⚠ (Bootstrapping...)[/yellow]"
        return "[green]Running[/green]"

    # States whose rendering does not depend on the readiness flag.
    static_markup = {
        "stopped": "[dim]Stopped[/dim]",
        "stopping": "[yellow]Stopping...[/yellow]",
        "pending": "[yellow]Starting...[/yellow]",
    }
    return static_markup.get(normalized, state)
76
+
77
+
78
def resolve_engine(name_or_id: str, engines: List[Dict]) -> Dict:
    """Pick the engine that ``name_or_id`` refers to.

    Resolution order: exact instance-ID match, then a unique exact name
    match, then prefix matches on name or instance ID. When several prefix
    matches remain, the user is prompted to choose one.

    Args:
        name_or_id: Engine name, instance ID, or a prefix of either.
        engines: Engine records; each is read via the ``"instance_id"`` and
            ``"name"`` keys (plus ``"engine_type"`` and ``"state"`` when the
            selection menu is shown).

    Returns:
        The selected engine record.

    Raises:
        typer.Exit: With code 1 when nothing matches.
    """
    # An exact instance-ID hit always wins.
    id_hits = [eng for eng in engines if eng["instance_id"] == name_or_id]
    if id_hits:
        return id_hits[0]

    # An exact name only wins when it is unambiguous.
    name_hits = [eng for eng in engines if eng["name"] == name_or_id]
    if len(name_hits) == 1:
        return name_hits[0]

    candidates = [
        eng
        for eng in engines
        if eng["name"].startswith(name_or_id)
        or eng["instance_id"].startswith(name_or_id)
    ]

    if not candidates:
        console.print(f"[red]❌ No engine found matching '{name_or_id}'[/red]")
        raise typer.Exit(1)

    if len(candidates) == 1:
        return candidates[0]

    # Several candidates: list them and let the user pick.
    console.print(f"Multiple engines match '{name_or_id}':")
    for position, candidate in enumerate(candidates, 1):
        hourly = HOURLY_COSTS.get(candidate["engine_type"], 0)
        console.print(
            f" {position}. [cyan]{candidate['name']}[/cyan] ({candidate['instance_id']}) "
            f"- {candidate['engine_type']} - {candidate['state']} - ${hourly:.2f}/hr"
        )

    valid_choices = [str(number) for number in range(1, len(candidates) + 1)]
    while True:
        try:
            picked = IntPrompt.ask(
                "Select engine",
                default=1,
                choices=valid_choices,
            )
            return candidates[picked - 1]
        except (ValueError, IndexError):
            console.print("[red]Invalid selection, please try again[/red]")
122
+
123
+
124
def get_disk_usage_via_ssm(instance_id: str) -> Optional[str]:
    """Get root-filesystem disk usage for an engine via SSM.

    Sends a ``df`` command through the ``AWS-RunShellScript`` document and
    polls once per second, up to five times, for the result. This is a
    best-effort lookup: any failure yields ``None`` rather than an exception.

    Args:
        instance_id: EC2 instance ID to run the command on.

    Returns:
        String like "17/50 GB" or None if failed
    """
    # Statuses after which the invocation result can no longer change.
    terminal_statuses = ("Success", "Failed", "Cancelled", "TimedOut")

    try:
        ssm = boto3.client("ssm", region_name="us-east-1")

        response = ssm.send_command(
            InstanceIds=[instance_id],
            DocumentName="AWS-RunShellScript",
            Parameters={
                "commands": [
                    # Get root filesystem usage in GB
                    'df -BG / | tail -1 | awk \'{gsub(/G/, "", $2); gsub(/G/, "", $3); print $3 "/" $2 " GB"}\''
                ],
                "executionTimeout": ["10"],
            },
        )
        command_id = response["Command"]["CommandId"]

        result = None
        for _ in range(5):  # roughly a 5 second budget
            time.sleep(1)
            try:
                result = ssm.get_command_invocation(
                    CommandId=command_id,
                    InstanceId=instance_id,
                )
            except ssm.exceptions.InvocationDoesNotExist:
                # The invocation may not be visible yet right after
                # send_command; keep polling instead of giving up.
                continue
            if result["Status"] in terminal_statuses:
                break

        if result is not None and result["Status"] == "Success":
            output = result["StandardOutputContent"].strip()
            return output or None

        return None

    except Exception:
        # Deliberate best-effort: disk usage is optional display data, so
        # AWS or credential errors must not break the calling command.
        return None
167
+
168
+
169
def get_studio_disk_usage_via_ssm(instance_id: str, username: str) -> Optional[str]:
    """Get disk usage for a user's studio mount via SSM.

    Runs ``df`` against ``/studios/<username>`` through the
    ``AWS-RunShellScript`` document and polls once per second, up to five
    times, for the result. This is a best-effort lookup: any failure yields
    ``None`` rather than an exception.

    Args:
        instance_id: EC2 instance ID the studio is attached to.
        username: Studio owner; used to build the mount path.

    Returns:
        String like "333/500 GB" or None if failed
    """
    import shlex

    # Statuses after which the invocation result can no longer change.
    terminal_statuses = ("Success", "Failed", "Cancelled", "TimedOut")

    # The path ends up inside a shell command executed on the instance, so
    # quote it to keep unusual usernames from being interpreted by the shell.
    studio_path = shlex.quote(f"/studios/{username}")

    try:
        ssm = boto3.client("ssm", region_name="us-east-1")

        response = ssm.send_command(
            InstanceIds=[instance_id],
            DocumentName="AWS-RunShellScript",
            Parameters={
                "commands": [
                    # Get studio filesystem usage in GB
                    f'df -BG {studio_path} 2>/dev/null | tail -1 | awk \'{{gsub(/G/, "", $2); gsub(/G/, "", $3); print $3 "/" $2 " GB"}}\''
                ],
                "executionTimeout": ["10"],
            },
        )
        command_id = response["Command"]["CommandId"]

        result = None
        for _ in range(5):  # roughly a 5 second budget
            time.sleep(1)
            try:
                result = ssm.get_command_invocation(
                    CommandId=command_id,
                    InstanceId=instance_id,
                )
            except ssm.exceptions.InvocationDoesNotExist:
                # The invocation may not be visible yet right after
                # send_command; keep polling instead of giving up.
                continue
            if result["Status"] in terminal_statuses:
                break

        if result is not None and result["Status"] == "Success":
            output = result["StandardOutputContent"].strip()
            return output or None

        return None

    except Exception:
        # Deliberate best-effort: disk usage is optional display data, so
        # AWS or credential errors must not break the calling command.
        return None
@@ -0,0 +1,141 @@
1
+ """SSH-related utilities for engine and studio commands."""
2
+
3
+ import os
4
+ import shutil
5
+ import subprocess
6
+ from pathlib import Path
7
+
8
+ from .constants import SSH_MANAGED_COMMENT
9
+
10
+
11
def get_ssh_public_key() -> str:
    """Get the user's SSH public key.

    Discovery order (container-friendly):
    1) DHT_SSH_PUBLIC_KEY env var (direct key content)
    2) DHT_SSH_PUBLIC_KEY_PATH env var (path to a .pub file)
    3) ssh-agent via `ssh-add -L` (requires SSH_AUTH_SOCK)
    4) Conventional files: ~/.ssh/id_ed25519.pub, ~/.ssh/id_rsa.pub

    A source that is missing, unreadable, or empty is skipped and the next
    one is tried, so an empty string is never returned.

    Returns:
        The public key text with surrounding whitespace removed.

    Raises:
        FileNotFoundError: If no public key can be discovered.
    """

    def _read_key(path: Path) -> str:
        """Return the stripped content of ``path``, or "" if unusable."""
        if not path.is_file():
            return ""
        try:
            return path.read_text().strip()
        except (OSError, ValueError):
            # Unreadable or undecodable file: treat as "no key here".
            return ""

    # 1) Direct env var content
    env_key = os.environ.get("DHT_SSH_PUBLIC_KEY")
    if env_key and env_key.strip():
        return env_key.strip()

    # 2) Env var path
    env_path = os.environ.get("DHT_SSH_PUBLIC_KEY_PATH")
    if env_path:
        key = _read_key(Path(env_path).expanduser())
        if key:
            return key

    # 3) Agent lookup (ssh-add -L)
    try:
        if shutil.which("ssh-add") is not None:
            proc = subprocess.run(["ssh-add", "-L"], capture_output=True, text=True)
            if proc.returncode == 0 and proc.stdout:
                keys = [
                    line.strip() for line in proc.stdout.splitlines() if line.strip()
                ]
                # Prefer ed25519, then rsa, then ecdsa
                for pref in ("ssh-ed25519", "ssh-rsa", "ecdsa-sha2-nistp256"):
                    for k in keys:
                        if k.startswith(pref + " "):
                            return k
                # Fallback to first key if types not matched
                if keys:
                    return keys[0]
    except Exception:
        # Agent lookup is best-effort; fall through to the key files.
        pass

    # 4) Conventional files
    home = Path.home()
    for key_path in (home / ".ssh" / "id_ed25519.pub", home / ".ssh" / "id_rsa.pub"):
        key = _read_key(key_path)
        if key:
            return key

    raise FileNotFoundError(
        "No SSH public key found. Please create one with 'ssh-keygen' first."
    )
70
+
71
+
72
def check_session_manager_plugin():
    """Report whether the AWS Session Manager Plugin is on PATH.

    Prints an installation hint to the console when the
    ``session-manager-plugin`` executable cannot be found.

    Returns:
        True if the plugin executable was found, False otherwise.
    """
    from .constants import console

    plugin_location = shutil.which("session-manager-plugin")
    if plugin_location is not None:
        return True

    warning = (
        "[bold red]⚠️ AWS Session Manager Plugin not found![/bold red]\n"
        "SSH connections to engines require the Session Manager Plugin.\n"
        "Please install it following the setup guide:\n"
        "[link]https://github.com/dayhofflabs/nutshell/blob/main/REFERENCE/setup_guides/new-laptop.md[/link]"
    )
    console.print(warning)
    return False
85
+
86
+
87
def update_ssh_config_entry(
    engine_name: str, instance_id: str, ssh_user: str, idle_timeout: int = 600
):
    """Add or update a single SSH config entry for the given SSH user.

    Any existing managed stanza whose host alias is exactly ``engine_name``
    is removed and a fresh stanza is appended to the end of ~/.ssh/config.
    Stanzas without the managed marker, and managed stanzas for other
    aliases (including ones that merely share a prefix with
    ``engine_name``), are left untouched.

    Args:
        engine_name: Host alias to write into ~/.ssh/config
        instance_id: EC2 instance-id (used by the proxy command)
        ssh_user: Username to place into the SSH stanza
        idle_timeout: Idle timeout **in seconds** to pass to the SSM port-forward. 600 = 10 min.
    """
    config_path = Path.home() / ".ssh" / "config"
    config_path.parent.mkdir(mode=0o700, exist_ok=True)

    # Touch the file if it doesn't exist
    if not config_path.exists():
        config_path.touch(mode=0o600)

    existing_lines = config_path.read_text().splitlines()

    # Drop the managed stanza for this engine, i.e. everything from its
    # "Host" line up to (but excluding) the next "Host" line.
    kept_lines = []
    skipping = False
    for line in existing_lines:
        tokens = line.split()
        if tokens and tokens[0] == "Host":
            # Compare the alias token exactly: a prefix test would also
            # delete entries such as "foobar" when updating "foo".
            skipping = (
                len(tokens) > 1
                and tokens[1] == engine_name
                and SSH_MANAGED_COMMENT in line
            )
        if not skipping:
            kept_lines.append(line)

    # Separate the new stanza from preceding content with a blank line.
    if kept_lines and kept_lines[-1].strip():
        kept_lines.append("")

    kept_lines.extend(
        [
            f"Host {engine_name} {SSH_MANAGED_COMMENT}",
            f" HostName {instance_id}",
            f" User {ssh_user}",
            f" ProxyCommand sh -c \"AWS_SSM_IDLE_TIMEOUT={idle_timeout} aws ssm start-session --target %h --document-name AWS-StartSSHSession --parameters 'portNumber=%p'\"",
        ]
    )

    # Write back, ending the file with a newline so later appends by other
    # tools start on a fresh line.
    config_path.write_text("\n".join(kept_lines) + "\n")
    config_path.chmod(0o600)
dayhoff_tools/cli/main.py CHANGED
@@ -4,9 +4,8 @@ import sys
4
4
  from importlib.metadata import PackageNotFoundError, version
5
5
 
6
6
  import typer
7
-
8
7
  from dayhoff_tools.cli.cloud_commands import aws_app, gcp_app
9
- from dayhoff_tools.cli.engine import engine_app, studio_app
8
+ from dayhoff_tools.cli.engine_studio_commands import engine_app, studio_app
10
9
  from dayhoff_tools.cli.utility_commands import (
11
10
  build_and_upload_wheel,
12
11
  delete_local_branch,
@@ -0,0 +1 @@
1
+ """Studio management commands."""
@@ -0,0 +1,314 @@
1
+ """Studio attach command."""
2
+
3
+ import time
4
+ from typing import Optional
5
+
6
+ import typer
7
+ from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
8
+ from rich.prompt import Confirm, IntPrompt
9
+
10
+ from ..engine_studio_utils.api_utils import get_user_studio, make_api_request
11
+ from ..engine_studio_utils.aws_utils import check_aws_sso
12
+ from ..engine_studio_utils.constants import console
13
+ from ..engine_studio_utils.formatting import resolve_engine
14
+ from ..engine_studio_utils.ssh_utils import (
15
+ check_session_manager_plugin,
16
+ get_ssh_public_key,
17
+ update_ssh_config_entry,
18
+ )
19
+
20
+
21
def attach_studio(
    engine_name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Attach a different user's studio (admin only)"
    ),
):
    """Attach your studio to an engine."""
    # Flow: authenticate -> locate (or offer to create) the studio -> detach
    # it if it is in use -> resolve and optionally start the engine -> retry
    # the attach call until it succeeds, fails fatally, or runs out of
    # attempts -> write the SSH config entry.
    # Declining a confirmation prompt ends the command normally; a failed
    # attach exits with code 1 so scripts can detect it.
    username = check_aws_sso()

    # Check for Session Manager Plugin since we'll update SSH config
    if not check_session_manager_plugin():
        raise typer.Exit(1)

    # Use specified user if provided, otherwise use current user
    target_user = user if user else username

    # Add confirmation when attaching another user's studio
    if target_user != username:
        console.print(f"[yellow]⚠️ Managing studio for user: {target_user}[/yellow]")
        if not Confirm.ask(f"Are you sure you want to attach {target_user}'s studio?"):
            console.print("Operation cancelled.")
            return

    # Get user's studio
    studio = get_user_studio(target_user)
    if not studio:
        if target_user != username:
            console.print(f"[red]❌ User {target_user} doesn't have a studio.[/red]")
            raise typer.Exit(1)

        console.print("[yellow]You don't have a studio yet.[/yellow]")
        if not Confirm.ask("Would you like to create one now?"):
            raise typer.Exit(0)

        size = IntPrompt.ask("Studio size (GB)", default=50)
        response = make_api_request(
            "POST",
            "/studios",
            json_data={"user": username, "size_gb": size},
        )
        if response.status_code != 201:
            console.print("[red]❌ Failed to create studio[/red]")
            raise typer.Exit(1)
        studio = response.json()

    # Check if already attached
    if studio.get("status") == "in-use":
        console.print(
            f"[yellow]Studio is already attached to {studio.get('attached_vm_id')}[/yellow]"
        )
        if not Confirm.ask("Detach and reattach to new engine?"):
            return
        # Detach first
        response = make_api_request("POST", f"/studios/{studio['studio_id']}/detach")
        if response.status_code != 200:
            console.print("[red]❌ Failed to detach studio[/red]")
            raise typer.Exit(1)

    # Get all engines to resolve name
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(engine_name_or_id, engines)

    # Flag to track if we started the engine in this command (affects retry length)
    engine_started_now: bool = False

    if engine["state"].lower() != "running":
        console.print(f"[yellow]⚠️ Engine is {engine['state']}[/yellow]")
        if engine["state"].lower() == "stopped" and Confirm.ask(
            "Start the engine first?"
        ):
            response = make_api_request(
                "POST", f"/engines/{engine['instance_id']}/start"
            )
            if response.status_code != 200:
                console.print("[red]❌ Failed to start engine[/red]")
                raise typer.Exit(1)
            console.print("[green]✓ Engine started[/green]")
            # Mark that we booted the engine so attach loop gets extended retries.
            # No further waiting here – attachment attempts below handle retry
            # logic while the engine finishes booting.
            engine_started_now = True
        else:
            raise typer.Exit(1)

    # Retrieve SSH public key (required for authorised_keys provisioning)
    try:
        public_key = get_ssh_public_key()
    except FileNotFoundError as e:
        console.print(f"[red]❌ {e}[/red]")
        raise typer.Exit(1)

    console.print(f"Attaching studio to engine [cyan]{engine['name']}[/cyan]...")

    # A freshly started engine gets more attempts and longer delays because
    # it is still booting.
    if engine_started_now:
        max_attempts = 40
        base_delay = 8
        max_delay = 20
    else:
        max_attempts = 15
        base_delay = 5
        max_delay = 10

    # Unified retry loop with capped exponential backoff
    with Progress(
        SpinnerColumn(),
        TimeElapsedColumn(),
        TextColumn("[progress.description]{task.description}"),
        transient=True,
    ) as prog:
        desc = (
            "Attaching studio (engine is still booting)…"
            if engine_started_now
            else "Attaching studio…"
        )
        task = prog.add_task(desc, total=None)

        consecutive_not_ready = 0
        last_error = None

        for attempt in range(max_attempts):
            # Check if the attach already completed (e.g. an earlier
            # asynchronous request finished in the meantime)
            if _is_studio_attached(studio["studio_id"], engine["instance_id"]):
                break

            attached, error_msg = _attempt_studio_attach(
                studio, engine, target_user, public_key
            )

            if attached:
                break

            if error_msg:
                # Fatal error – stop retrying and report failure
                console.print(f"[red]❌ Failed to attach studio: {error_msg}[/red]")

                # Suggest repair command if engine seems broken
                if "not ready" in error_msg.lower() and attempt > 5:
                    console.print(
                        "\n[yellow]Engine may be in a bad state. Try:[/yellow]"
                    )
                    console.print(f"[dim] dh engine repair {engine['name']}[/dim]")
                raise typer.Exit(1)

            # Track consecutive "not ready" responses
            consecutive_not_ready += 1
            last_error = "Engine not ready"

            # Update progress display
            if attempt % 3 == 0:
                prog.update(
                    task,
                    description=f"{desc} attempt {attempt+1}/{max_attempts}",
                )

            # If engine seems stuck after many attempts, show a hint
            if consecutive_not_ready > 10 and attempt == 10:
                console.print(
                    "[yellow]Engine is taking longer than expected to become ready.[/yellow]"
                )
                console.print(
                    "[dim]This can happen after GAMI creation or if the engine is still bootstrapping.[/dim]"
                )

            # Exponential backoff (growth capped after 5 attempts) with jitter
            delay = min(base_delay * (1.5 ** min(attempt, 5)), max_delay)
            delay += time.time() % 2  # Add 0-2 seconds of jitter
            time.sleep(delay)

        else:
            # All attempts exhausted without a successful attach
            console.print(
                f"[yellow]Engine is not becoming ready after {max_attempts} attempts.[/yellow]"
            )
            if last_error:
                console.print(f"[dim]Last issue: {last_error}[/dim]")
            console.print("\n[yellow]You can try:[/yellow]")
            console.print(
                f" 1. Wait a minute and retry: [cyan]dh studio attach {engine['name']}[/cyan]"
            )
            console.print(
                f" 2. Check engine status: [cyan]dh engine status {engine['name']}[/cyan]"
            )
            console.print(
                f" 3. Repair the engine: [cyan]dh engine repair {engine['name']}[/cyan]"
            )
            raise typer.Exit(1)

    # Successful attach path
    console.print("[green]✓ Studio attached successfully![/green]")

    # Update SSH config - use target_user for the connection
    update_ssh_config_entry(engine["name"], engine["instance_id"], target_user)
    console.print("[green]✓ SSH config updated[/green]")
    console.print(f"\nConnect with: [cyan]ssh {engine['name']}[/cyan]")
    console.print(f"Files are at: [cyan]/studios/{target_user}[/cyan]")
224
+
225
+
226
def _is_studio_attached(target_studio_id: str, target_vm_id: str) -> bool:
    """Tell whether the given studio is currently attached to the given VM.

    Fetches the studio listing from the API and looks for an entry with the
    matching studio ID, the matching ``attached_vm_id``, and status
    ``"in-use"``.

    Args:
        target_studio_id: ID of the studio to look for.
        target_vm_id: Instance ID the studio is expected to be attached to.

    Returns:
        True when such an entry exists; False otherwise, including when the
        listing request does not return HTTP 200.
    """
    listing = make_api_request("GET", "/studios")
    if listing.status_code != 200:
        return False

    return any(
        entry["studio_id"] == target_studio_id
        and entry.get("attached_vm_id") == target_vm_id
        and entry.get("status") == "in-use"
        for entry in listing.json().get("studios", [])
    )
241
+
242
+
243
def _attempt_studio_attach(studio, engine, target_user, public_key):
    """Send one attach request and classify the outcome.

    Args:
        studio: Studio record; ``studio["studio_id"]`` is read.
        engine: Engine record; ``engine["instance_id"]`` is read.
        target_user: User whose studio is being attached.
        public_key: SSH public key sent along with the request.

    Returns:
        A ``(success, error_message)`` tuple:

        * ``(True, None)`` - the studio is attached.
        * ``(False, None)`` - recoverable condition; the caller should retry.
        * ``(False, "<message>")`` - fatal error; the caller should abort.
    """
    response = make_api_request(
        "POST",
        f"/studios/{studio['studio_id']}/attach",
        json_data={
            "vm_id": engine["instance_id"],
            "user": target_user,
            "public_key": public_key,
        },
    )

    # Fast-path success
    if response.status_code == 200:
        return True, None

    # Asynchronous path – API returned 202 Accepted
    if response.status_code == 202:
        # The operation status polling is broken in the Lambda, so we just
        # wait and check if the studio is actually attached
        time.sleep(5)  # Give the async operation a moment to start

        # Check every 3 seconds, for up to 60 seconds
        for _ in range(20):
            if _is_studio_attached(studio["studio_id"], engine["instance_id"]):
                return True, None
            time.sleep(3)

        # Attachment didn't complete in reasonable time; let the caller retry
        return False, None

    # --- determine if we should retry ---
    # Error bodies are not guaranteed to be JSON objects with a string
    # "error" field, so parse defensively rather than crash.
    try:
        payload = response.json()
    except ValueError:
        payload = None
    raw_error = payload.get("error") if isinstance(payload, dict) else None
    error_text = str(raw_error) if raw_error else "Unknown error"
    err_msg = error_text.lower()

    # "Studio is not available (status: in-use)" means it's already attached
    if (
        response.status_code == 400
        and "not available" in err_msg
        and "in-use" in err_msg
    ):
        # Attached to THIS engine counts as success; elsewhere is fatal
        if _is_studio_attached(studio["studio_id"], engine["instance_id"]):
            return True, None
        return False, error_text

    if response.status_code in (409, 503):
        recoverable = True
    else:
        recoverable_patterns = (
            "not ready",
            "still starting",
            "initializing",
            "failed to mount",
            "device busy",
            "pending",  # VM state pending
        )
        fatal_patterns = ("permission",)
        # A fatal pattern wins even if a recoverable one also matches.
        recoverable = not any(p in err_msg for p in fatal_patterns) and any(
            p in err_msg for p in recoverable_patterns
        )

    if not recoverable:
        # fatal – abort immediately
        return False, error_text

    # recoverable – signal caller to retry without treating as error
    return False, None