dayhoff-tools 1.9.9__py3-none-any.whl → 1.9.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dayhoff_tools/cli/engine/__init__.py +49 -0
- dayhoff_tools/cli/engine/engine_core.py +739 -0
- dayhoff_tools/cli/engine/engine_lifecycle.py +136 -0
- dayhoff_tools/cli/engine/engine_maintenance.py +377 -0
- dayhoff_tools/cli/engine/engine_management.py +505 -0
- dayhoff_tools/cli/engine/shared.py +501 -0
- dayhoff_tools/cli/engine/studio_commands.py +825 -0
- dayhoff_tools/cli/main.py +1 -1
- {dayhoff_tools-1.9.9.dist-info → dayhoff_tools-1.9.10.dist-info}/METADATA +1 -1
- {dayhoff_tools-1.9.9.dist-info → dayhoff_tools-1.9.10.dist-info}/RECORD +12 -6
- dayhoff_tools/cli/engine_commands.py +0 -3013
- {dayhoff_tools-1.9.9.dist-info → dayhoff_tools-1.9.10.dist-info}/WHEEL +0 -0
- {dayhoff_tools-1.9.9.dist-info → dayhoff_tools-1.9.10.dist-info}/entry_points.txt +0 -0
@@ -1,3013 +0,0 @@
|
|
1
|
-
"""Engine and Studio management commands for DHT CLI."""
|
2
|
-
|
3
|
-
import json
|
4
|
-
import os
|
5
|
-
import re
|
6
|
-
import shutil
|
7
|
-
import subprocess
|
8
|
-
import sys
|
9
|
-
import time
|
10
|
-
from datetime import datetime, timedelta, timezone
|
11
|
-
from pathlib import Path
|
12
|
-
from typing import Any, Dict, List, Optional, Tuple
|
13
|
-
|
14
|
-
import boto3
|
15
|
-
import requests
|
16
|
-
import typer
|
17
|
-
from botocore.exceptions import ClientError, NoCredentialsError
|
18
|
-
from rich import box
|
19
|
-
from rich.console import Console
|
20
|
-
from rich.panel import Panel
|
21
|
-
from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
|
22
|
-
from rich.prompt import Confirm, IntPrompt, Prompt
|
23
|
-
from rich.table import Table
|
24
|
-
|
25
|
-
# Initialize Typer apps
# `engine_app` and `studio_app` are mounted as sub-commands of the DHT CLI
# (see "dh engine ..." / "dh studio ..." usage strings below).
engine_app = typer.Typer(help="Manage compute engines for development.")
studio_app = typer.Typer(help="Manage persistent development studios.")

# Shared Rich console used by every command for formatted terminal output.
console = Console()

# Cost information
# Approximate on-demand $/hour for each engine-type alias; the backing EC2
# instance type is noted inline. Used for display only, not billing.
HOURLY_COSTS = {
    "cpu": 0.50,  # r6i.2xlarge
    "cpumax": 2.02,  # r7i.8xlarge
    "t4": 0.75,  # g4dn.2xlarge
    "a10g": 1.50,  # g5.2xlarge
    "a100": 21.96,  # p4d.24xlarge
    "4_t4": 3.91,  # g4dn.12xlarge
    "8_t4": 7.83,  # g4dn.metal
    "4_a10g": 6.24,  # g5.12xlarge
    "8_a10g": 16.29,  # g5.48xlarge
}

# SSH config management
# Marker appended to Host stanzas so `dh engine` can find and rewrite only
# the ~/.ssh/config entries it owns (see update_ssh_config_entry).
SSH_MANAGED_COMMENT = "# Managed by dh engine"

# --------------------------------------------------------------------------------
# Bootstrap stage helpers
# --------------------------------------------------------------------------------
|
50
|
-
|
51
|
-
|
52
|
-
def _colour_stage(stage: str) -> str:
|
53
|
-
"""Return colourised stage name for table output."""
|
54
|
-
if not stage:
|
55
|
-
return "[dim]-[/dim]"
|
56
|
-
low = stage.lower()
|
57
|
-
if low.startswith("error"):
|
58
|
-
return f"[red]{stage}[/red]"
|
59
|
-
if low == "finished":
|
60
|
-
return f"[green]{stage}[/green]"
|
61
|
-
return f"[yellow]{stage}[/yellow]"
|
62
|
-
|
63
|
-
|
64
|
-
def _fetch_init_stages(instance_ids: List[str]) -> Dict[str, str]:
|
65
|
-
"""Fetch DayhoffInitStage tag for many instances in one call."""
|
66
|
-
if not instance_ids:
|
67
|
-
return {}
|
68
|
-
ec2 = boto3.client("ec2", region_name="us-east-1")
|
69
|
-
stages: Dict[str, str] = {}
|
70
|
-
try:
|
71
|
-
paginator = ec2.get_paginator("describe_instances")
|
72
|
-
for page in paginator.paginate(InstanceIds=instance_ids):
|
73
|
-
for res in page["Reservations"]:
|
74
|
-
for inst in res["Instances"]:
|
75
|
-
iid = inst["InstanceId"]
|
76
|
-
tag_val = next(
|
77
|
-
(
|
78
|
-
t["Value"]
|
79
|
-
for t in inst.get("Tags", [])
|
80
|
-
if t["Key"] == "DayhoffInitStage"
|
81
|
-
),
|
82
|
-
None,
|
83
|
-
)
|
84
|
-
if tag_val:
|
85
|
-
stages[iid] = tag_val
|
86
|
-
except Exception:
|
87
|
-
pass # best-effort
|
88
|
-
return stages
|
89
|
-
|
90
|
-
|
91
|
-
def check_aws_sso() -> str:
    """Check AWS SSO status and return username.

    Calls STS GetCallerIdentity; on credential failure it offers to run
    `aws sso login` interactively and retries, otherwise exits the CLI.

    Returns:
        The SSO username parsed from the caller's assumed-role ARN, or the
        suffix of the UserId for non-SSO credentials.

    Raises:
        typer.Exit: If the user is not logged in and login is declined
            or fails.
    """
    try:
        sts = boto3.client("sts")
        identity = sts.get_caller_identity()
        # Parse username from assumed role ARN
        # Format: arn:aws:sts::123456789012:assumed-role/AWSReservedSSO_DeveloperAccess_xxxx/username
        arn = identity["Arn"]
        if "assumed-role" in arn:
            username = arn.split("/")[-1]
            return username
        else:
            # Fallback for other auth methods
            return identity["UserId"].split(":")[-1]
    except (NoCredentialsError, ClientError):
        console.print("[red]❌ Not logged in to AWS SSO[/red]")
        console.print("Please run: [cyan]aws sso login[/cyan]")
        if Confirm.ask("Would you like to login now?"):
            try:
                result = subprocess.run(
                    ["aws", "sso", "login"],
                    capture_output=True,
                    text=True,
                    check=True,
                )
                # NOTE(review): check=True already raises on non-zero exit,
                # so this returncode test only ever sees 0 — confirm intent.
                if result.returncode == 0:
                    console.print("[green]✓ Successfully logged in![/green]")
                    # Re-check identity now that credentials should exist.
                    return check_aws_sso()
            except subprocess.CalledProcessError as e:
                console.print(f"[red]Login failed: {e}[/red]")
        # Reached when login is declined or failed.
        raise typer.Exit(1)
|
122
|
-
|
123
|
-
|
124
|
-
def get_api_url() -> str:
    """Get Studio Manager API URL from SSM Parameter Store.

    Reads the well-known parameter `/dev/studio-manager/api-url` in
    us-east-1.

    Returns:
        The API base URL string.

    Raises:
        typer.Exit: If the parameter is missing or any other SSM client
            error occurs.
    """
    ssm = boto3.client("ssm", region_name="us-east-1")
    try:
        response = ssm.get_parameter(Name="/dev/studio-manager/api-url")
        return response["Parameter"]["Value"]
    except ClientError as e:
        # Distinguish "not deployed yet" from other AWS errors for the user.
        if e.response["Error"]["Code"] == "ParameterNotFound":
            console.print(
                "[red]❌ API URL parameter not found in SSM Parameter Store[/red]"
            )
            console.print(
                "Please ensure the Studio Manager infrastructure is deployed."
            )
        else:
            console.print(f"[red]❌ Error retrieving API URL: {e}[/red]")
        raise typer.Exit(1)
|
141
|
-
|
142
|
-
|
143
|
-
def make_api_request(
    method: str,
    endpoint: str,
    json_data: Optional[Dict] = None,
    params: Optional[Dict] = None,
) -> requests.Response:
    """Make an API request with error handling.

    Args:
        method: HTTP verb — "GET", "POST", or "DELETE".
        endpoint: Path appended to the Studio Manager base URL (leading "/").
        json_data: JSON body, used only for POST.
        params: Query parameters, used only for GET.

    Returns:
        The raw requests.Response; callers inspect status_code themselves.

    Raises:
        ValueError: If `method` is not one of the supported verbs.
        typer.Exit: If the request fails at the transport level.
    """
    api_url = get_api_url()
    url = f"{api_url}{endpoint}"
    # requests has NO default timeout: without one, an unresponsive server
    # hangs the CLI forever. 30s covers slow cold-starts of the API.
    timeout = 30

    try:
        if method == "GET":
            response = requests.get(url, params=params, timeout=timeout)
        elif method == "POST":
            response = requests.post(url, json=json_data, timeout=timeout)
        elif method == "DELETE":
            response = requests.delete(url, timeout=timeout)
        else:
            raise ValueError(f"Unsupported HTTP method: {method}")

        return response
    except requests.exceptions.RequestException as e:
        console.print(f"[red]❌ API request failed: {e}[/red]")
        raise typer.Exit(1)
|
167
|
-
|
168
|
-
|
169
|
-
def format_duration(duration: timedelta) -> str:
    """Format a duration as a human-readable string."""
    total = int(duration.total_seconds())
    hours, remainder = divmod(total, 3600)
    minutes = remainder // 60
    # Omit the hour component entirely when the duration is under an hour.
    return f"{hours}h {minutes}m" if hours > 0 else f"{minutes}m"
|
179
|
-
|
180
|
-
|
181
|
-
def get_disk_usage_via_ssm(instance_id: str) -> Optional[str]:
    """Get disk usage for an engine via SSM.

    Sends an AWS-RunShellScript command that runs `df` on the root
    filesystem of the instance, then polls the invocation for up to
    ~5 seconds.

    Args:
        instance_id: EC2 instance id (must be reachable by SSM).

    Returns:
        String like "17/50 GB" or None if failed
    """
    try:
        ssm = boto3.client("ssm", region_name="us-east-1")

        # Run df command to get disk usage
        response = ssm.send_command(
            InstanceIds=[instance_id],
            DocumentName="AWS-RunShellScript",
            Parameters={
                "commands": [
                    # Get root filesystem usage in GB ("used/total GB")
                    'df -BG / | tail -1 | awk \'{gsub(/G/, "", $2); gsub(/G/, "", $3); print $3 "/" $2 " GB"}\''
                ],
                "executionTimeout": ["10"],
            },
        )

        command_id = response["Command"]["CommandId"]

        # Wait for command to complete (with timeout)
        for _ in range(5):  # 5 second timeout
            time.sleep(1)
            result = ssm.get_command_invocation(
                CommandId=command_id,
                InstanceId=instance_id,
            )
            if result["Status"] in ["Success", "Failed"]:
                break

        if result["Status"] == "Success":
            output = result["StandardOutputContent"].strip()
            return output if output else None

        return None

    except Exception as e:
        # Best-effort helper: any failure (throttling, offline SSM agent,
        # timeout) is reported to the caller as "no data".
        # logger.debug(f"Failed to get disk usage for {instance_id}: {e}") # Original code had this line commented out
        return None
|
224
|
-
|
225
|
-
|
226
|
-
def get_studio_disk_usage_via_ssm(instance_id: str, username: str) -> Optional[str]:
    """Get disk usage for a studio via SSM.

    Like get_disk_usage_via_ssm, but reports the per-user studio mount
    at /studios/<username> instead of the root filesystem.

    Args:
        instance_id: EC2 instance id hosting the studio mount.
        username: Studio owner; selects the /studios/<username> mount point.

    Returns:
        String like "333/500 GB" or None if failed
    """
    try:
        ssm = boto3.client("ssm", region_name="us-east-1")

        # Run df command to get studio disk usage
        response = ssm.send_command(
            InstanceIds=[instance_id],
            DocumentName="AWS-RunShellScript",
            Parameters={
                "commands": [
                    # Get studio filesystem usage in GB; stderr suppressed so
                    # a missing mount yields empty output rather than noise.
                    f'df -BG /studios/{username} 2>/dev/null | tail -1 | awk \'{{gsub(/G/, "", $2); gsub(/G/, "", $3); print $3 "/" $2 " GB"}}\''
                ],
                "executionTimeout": ["10"],
            },
        )

        command_id = response["Command"]["CommandId"]

        # Wait for command to complete (with timeout)
        for _ in range(5):  # 5 second timeout
            time.sleep(1)
            result = ssm.get_command_invocation(
                CommandId=command_id,
                InstanceId=instance_id,
            )
            if result["Status"] in ["Success", "Failed"]:
                break

        if result["Status"] == "Success":
            output = result["StandardOutputContent"].strip()
            return output if output else None

        return None

    except Exception:
        # Best-effort helper: swallow all failures and report "no data".
        return None
|
268
|
-
|
269
|
-
|
270
|
-
def parse_launch_time(launch_time_str: str) -> datetime:
    """Parse launch time from API response."""
    # Preferred path: fromisoformat handles offsets robustly; normalise a
    # trailing "Z" into an explicit UTC offset first.
    try:
        return datetime.fromisoformat(launch_time_str.replace("Z", "+00:00"))
    except (ValueError, AttributeError):
        pass

    # Fallback: walk a list of known formats; naive results are assumed UTC.
    candidate_formats = (
        "%Y-%m-%dT%H:%M:%S.%fZ",
        "%Y-%m-%dT%H:%M:%SZ",
        "%Y-%m-%dT%H:%M:%S%z",  # ISO format with timezone
        "%Y-%m-%dT%H:%M:%S+00:00",  # Explicit UTC offset
        "%Y-%m-%d %H:%M:%S",
    )
    for fmt in candidate_formats:
        try:
            parsed = datetime.strptime(launch_time_str, fmt)
        except ValueError:
            continue
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    # Last resort for unparseable input: assume the launch was just now.
    return datetime.now(timezone.utc)
|
301
|
-
|
302
|
-
|
303
|
-
def format_status(state: str, ready: Optional[bool]) -> str:
    """Format engine status with ready indicator."""
    normalized = state.lower()
    if normalized == "running":
        # `ready` is tri-state: True (bootstrapped), False (still
        # bootstrapping), None (unknown — shown without an indicator).
        if ready is True:
            return "[green]Running ✓[/green]"
        if ready is False:
            return "[yellow]Running ⚠ (Bootstrapping...)[/yellow]"
        return "[green]Running[/green]"
    fixed_states = {
        "stopped": "[dim]Stopped[/dim]",
        "stopping": "[yellow]Stopping...[/yellow]",
        "pending": "[yellow]Starting...[/yellow]",
    }
    # Unknown states pass through unmodified.
    return fixed_states.get(normalized, state)
|
320
|
-
|
321
|
-
|
322
|
-
def resolve_engine(name_or_id: str, engines: List[Dict]) -> Dict:
    """Resolve engine by name or ID with interactive selection."""
    # 1) An exact instance-id match wins outright.
    for candidate in engines:
        if candidate["instance_id"] == name_or_id:
            return candidate

    # 2) A unique exact name match.
    named = [e for e in engines if e["name"] == name_or_id]
    if len(named) == 1:
        return named[0]

    # 3) Prefix match against either the name or the instance id.
    matches = [
        e
        for e in engines
        if e["name"].startswith(name_or_id) or e["instance_id"].startswith(name_or_id)
    ]

    if not matches:
        console.print(f"[red]❌ No engine found matching '{name_or_id}'[/red]")
        raise typer.Exit(1)
    if len(matches) == 1:
        return matches[0]

    # 4) Ambiguous: list the candidates and let the user pick one.
    console.print(f"Multiple engines match '{name_or_id}':")
    for idx, engine in enumerate(matches, 1):
        cost = HOURLY_COSTS.get(engine["engine_type"], 0)
        console.print(
            f" {idx}. [cyan]{engine['name']}[/cyan] ({engine['instance_id']}) "
            f"- {engine['engine_type']} - {engine['state']} - ${cost:.2f}/hr"
        )

    while True:
        try:
            choice = IntPrompt.ask(
                "Select engine",
                default=1,
                choices=[str(i) for i in range(1, len(matches) + 1)],
            )
            return matches[choice - 1]
        except (ValueError, IndexError):
            console.print("[red]Invalid selection, please try again[/red]")
|
366
|
-
|
367
|
-
|
368
|
-
def get_ssh_public_key() -> str:
    """Get the user's SSH public key.

    Discovery order (container-friendly):
    1) DHT_SSH_PUBLIC_KEY env var (direct key content)
    2) DHT_SSH_PUBLIC_KEY_PATH env var (path to a .pub file)
    3) ssh-agent via `ssh-add -L` (requires SSH_AUTH_SOCK)
    4) Conventional files: ~/.ssh/id_ed25519.pub, ~/.ssh/id_rsa.pub

    Raises:
        FileNotFoundError: If no public key can be discovered.
    """
    # 1) Key content supplied directly via the environment.
    direct = os.environ.get("DHT_SSH_PUBLIC_KEY")
    if direct and direct.strip():
        return direct.strip()

    # 2) Path to a .pub file supplied via the environment.
    path_override = os.environ.get("DHT_SSH_PUBLIC_KEY_PATH")
    if path_override:
        pub_file = Path(path_override).expanduser()
        if pub_file.is_file():
            try:
                return pub_file.read_text().strip()
            except Exception:
                pass

    # 3) Ask the ssh-agent, preferring stronger key types first.
    try:
        if shutil.which("ssh-add") is not None:
            proc = subprocess.run(["ssh-add", "-L"], capture_output=True, text=True)
            if proc.returncode == 0 and proc.stdout:
                agent_keys = [
                    line.strip() for line in proc.stdout.splitlines() if line.strip()
                ]
                for key_type in ("ssh-ed25519", "ssh-rsa", "ecdsa-sha2-nistp256"):
                    for key in agent_keys:
                        if key.startswith(key_type + " "):
                            return key
                # No preferred type matched: fall back to the first key.
                if agent_keys:
                    return agent_keys[0]
    except Exception:
        pass

    # 4) Conventional key files in ~/.ssh.
    for candidate in (
        Path.home() / ".ssh" / "id_ed25519.pub",
        Path.home() / ".ssh" / "id_rsa.pub",
    ):
        if candidate.is_file():
            try:
                return candidate.read_text().strip()
            except Exception:
                continue

    raise FileNotFoundError(
        "No SSH public key found. Please create one with 'ssh-keygen' first."
    )
|
427
|
-
|
428
|
-
|
429
|
-
def check_session_manager_plugin():
    """Check if AWS Session Manager Plugin is available and warn if not."""
    plugin_present = shutil.which("session-manager-plugin") is not None
    if not plugin_present:
        # SSH to engines is tunnelled through SSM, so the plugin is required.
        console.print(
            "[bold red]⚠️ AWS Session Manager Plugin not found![/bold red]\n"
            "SSH connections to engines require the Session Manager Plugin.\n"
            "Please install it following the setup guide:\n"
            "[link]https://github.com/dayhofflabs/nutshell/blob/main/REFERENCE/setup_guides/new-laptop.md[/link]"
        )
    return plugin_present
|
440
|
-
|
441
|
-
|
442
|
-
def update_ssh_config_entry(
    engine_name: str, instance_id: str, ssh_user: str, idle_timeout: int = 600
):
    """Add or update a single SSH config entry for the given SSH user.

    Rewrites ~/.ssh/config in place: any managed stanza for this engine is
    removed, then a fresh stanza (marked with SSH_MANAGED_COMMENT) is
    appended. Unmanaged entries are never touched.

    Args:
        engine_name: Host alias to write into ~/.ssh/config
        instance_id: EC2 instance-id (used by the proxy command)
        ssh_user: Username to place into the SSH stanza
        idle_timeout: Idle timeout **in seconds** to pass to the SSM port-forward. 600 = 10 min.
    """
    config_path = Path.home() / ".ssh" / "config"
    config_path.parent.mkdir(mode=0o700, exist_ok=True)

    # Touch the file if it doesn't exist
    if not config_path.exists():
        config_path.touch(mode=0o600)

    # Read existing config
    content = config_path.read_text()
    lines = content.splitlines() if content else []

    def _is_managed_entry_for_engine(line: str) -> bool:
        """True if this line opens OUR managed stanza for this exact alias."""
        if SSH_MANAGED_COMMENT not in line:
            return False
        # BUG FIX: the previous prefix check (startswith(f"Host {engine_name}"))
        # also matched longer aliases, so removing "foo" clobbered "foobar".
        # Compare the host token exactly instead.
        tokens = line.strip().split()
        return len(tokens) >= 2 and tokens[0] == "Host" and tokens[1] == engine_name

    # Remove any existing entry for this engine
    new_lines = []
    skip_until_next_host = False
    for line in lines:
        if _is_managed_entry_for_engine(line):
            skip_until_next_host = True
        elif line.strip().startswith("Host ") and skip_until_next_host:
            skip_until_next_host = False
            # This is a different host entry, keep it
            new_lines.append(line)
        elif not skip_until_next_host:
            new_lines.append(line)

    # Add the new entry, separated from existing content by a blank line
    if new_lines and new_lines[-1].strip():
        new_lines.append("")

    new_lines.extend(
        [
            f"Host {engine_name} {SSH_MANAGED_COMMENT}",
            f" HostName {instance_id}",
            f" User {ssh_user}",
            f" ProxyCommand sh -c \"AWS_SSM_IDLE_TIMEOUT={idle_timeout} aws ssm start-session --target %h --document-name AWS-StartSSHSession --parameters 'portNumber=%p'\"",
        ]
    )

    # Write back with owner-only permissions (SSH refuses group/world access).
    config_path.write_text("\n".join(new_lines))
    config_path.chmod(0o600)
|
497
|
-
|
498
|
-
|
499
|
-
# ==================== ENGINE COMMANDS ====================
|
500
|
-
|
501
|
-
|
502
|
-
@engine_app.command("launch")
def launch_engine(
    name: str = typer.Argument(help="Name for the new engine"),
    engine_type: str = typer.Option(
        "cpu",
        "--type",
        "-t",
        help="Engine type: cpu, cpumax, t4, a10g, a100, 4_t4, 8_t4, 4_a10g, 8_a10g",
    ),
    user: Optional[str] = typer.Option(None, "--user", "-u", help="Override username"),
    boot_disk_size: Optional[int] = typer.Option(
        None,
        "--size",
        "-s",
        help="Boot disk size in GB (default: 50GB, min: 20GB, max: 1000GB)",
    ),
    availability_zone: Optional[str] = typer.Option(
        None,
        "--az",
        help="Prefer a specific Availability Zone (e.g., us-east-1b). If omitted the service will try all public subnets.",
    ),
):
    """Launch a new engine instance."""
    # Requires an active AWS SSO session; the SSO username becomes the
    # engine's owner unless --user overrides it.
    username = check_aws_sso()
    if user:
        username = user

    # Validate engine type
    # NOTE(review): presumably should stay in sync with HOURLY_COSTS keys.
    valid_types = [
        "cpu",
        "cpumax",
        "t4",
        "a10g",
        "a100",
        "4_t4",
        "8_t4",
        "4_a10g",
        "8_a10g",
    ]
    if engine_type not in valid_types:
        console.print(f"[red]❌ Invalid engine type: {engine_type}[/red]")
        console.print(f"Valid types: {', '.join(valid_types)}")
        raise typer.Exit(1)

    # Validate boot disk size
    if boot_disk_size is not None:
        if boot_disk_size < 20:
            console.print("[red]❌ Boot disk size must be at least 20GB[/red]")
            raise typer.Exit(1)
        if boot_disk_size > 1000:
            console.print("[red]❌ Boot disk size cannot exceed 1000GB[/red]")
            raise typer.Exit(1)

    # Unknown types cost $0 in the display; validation above prevents that.
    cost = HOURLY_COSTS.get(engine_type, 0)
    disk_info = f" with {boot_disk_size}GB boot disk" if boot_disk_size else ""
    console.print(
        f"Launching [cyan]{name}[/cyan] ({engine_type}){disk_info} for ${cost:.2f}/hour..."
    )

    # Show a transient spinner while the Studio Manager API creates the VM.
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        transient=True,
    ) as progress:
        progress.add_task("Creating engine...", total=None)

        request_data: Dict[str, Any] = {
            "name": name,
            "user": username,
            "engine_type": engine_type,
        }
        # Optional fields are only sent when the user supplied them.
        if boot_disk_size is not None:
            request_data["boot_disk_size"] = boot_disk_size
        if availability_zone:
            request_data["availability_zone"] = availability_zone

        response = make_api_request("POST", "/engines", json_data=request_data)

    if response.status_code == 201:
        data = response.json()
        console.print(f"[green]✓ Engine launched successfully![/green]")
        console.print(f"Instance ID: [cyan]{data['instance_id']}[/cyan]")
        console.print(f"Type: {data['instance_type']} (${cost:.2f}/hour)")
        if boot_disk_size:
            console.print(f"Boot disk: {boot_disk_size}GB")
        console.print("\nThe engine is initializing. This may take a few minutes.")
        console.print(f"Check status with: [cyan]dh engine status {name}[/cyan]")
    else:
        error = response.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to launch engine: {error}[/red]")
|
592
|
-
|
593
|
-
|
594
|
-
@engine_app.command("list")
def list_engines(
    user: Optional[str] = typer.Option(None, "--user", "-u", help="Filter by user"),
    running_only: bool = typer.Option(
        False, "--running", help="Show only running engines"
    ),
    stopped_only: bool = typer.Option(
        False, "--stopped", help="Show only stopped engines"
    ),
    detailed: bool = typer.Option(
        False, "--detailed", "-d", help="Show detailed status (slower)"
    ),
):
    """List engines (shows all engines by default)."""
    # Return value unused; the call enforces that the user is logged in.
    current_user = check_aws_sso()

    params = {}
    if user:
        params["user"] = user
    if detailed:
        # Asks the API to also probe bootstrap readiness (slower).
        params["check_ready"] = "true"

    response = make_api_request("GET", "/engines", params=params)

    if response.status_code == 200:
        data = response.json()
        engines = data.get("engines", [])

        # Filter by state if requested (--running / --stopped are exclusive;
        # --running wins when both are passed)
        if running_only:
            engines = [e for e in engines if e["state"].lower() == "running"]
        elif stopped_only:
            engines = [e for e in engines if e["state"].lower() == "stopped"]

        if not engines:
            console.print("No engines found.")
            return

        # Only fetch detailed info if requested (slow)
        # NOTE(review): stages_map is fetched but never rendered below —
        # bootstrap progress only surfaces via format_status's ready flag.
        stages_map = {}
        if detailed:
            stages_map = _fetch_init_stages([e["instance_id"] for e in engines])

        # Create table; the "Disk Usage" column only exists in detailed mode.
        table = Table(title="Engines", box=box.ROUNDED)
        table.add_column("Name", style="cyan")
        table.add_column("Instance ID", style="dim")
        table.add_column("Type")
        table.add_column("User")
        table.add_column("Status")
        if detailed:
            table.add_column("Disk Usage")
        table.add_column("Uptime/Since")
        table.add_column("$/hour", justify="right")

        for engine in engines:
            launch_time = parse_launch_time(engine["launch_time"])
            uptime = datetime.now(timezone.utc) - launch_time
            hourly_cost = HOURLY_COSTS.get(engine["engine_type"], 0)

            if engine["state"].lower() == "running":
                # Running engines show elapsed uptime ...
                time_str = format_duration(uptime)
                # Only get disk usage if detailed mode (one SSM call each)
                if detailed:
                    disk_usage = get_disk_usage_via_ssm(engine["instance_id"]) or "-"
                else:
                    disk_usage = None
            else:
                # ... stopped ones show the launch timestamp instead.
                time_str = launch_time.strftime("%Y-%m-%d %H:%M")
                disk_usage = "-" if detailed else None

            row_data = [
                engine["name"],
                engine["instance_id"],
                engine["engine_type"],
                engine["user"],
                format_status(engine["state"], engine.get("ready")),
            ]
            if detailed:
                row_data.append(disk_usage)
            row_data.extend(
                [
                    time_str,
                    f"${hourly_cost:.2f}",
                ]
            )

            table.add_row(*row_data)

        console.print(table)
        if not detailed and any(e["state"].lower() == "running" for e in engines):
            console.print(
                "\n[dim]Tip: Use --detailed to see disk usage and bootstrap status (slower)[/dim]"
            )
    else:
        error = response.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to list engines: {error}[/red]")
|
691
|
-
|
692
|
-
|
693
|
-
@engine_app.command("status")
|
694
|
-
def engine_status(
|
695
|
-
name_or_id: str = typer.Argument(help="Engine name or instance ID"),
|
696
|
-
detailed: bool = typer.Option(False, "--detailed", "-d", help="Show detailed status (slower)"),
|
697
|
-
show_log: bool = typer.Option(False, "--show-log", help="Show bootstrap log (requires --detailed)"),
|
698
|
-
):
|
699
|
-
"""Show engine status and information."""
|
700
|
-
check_aws_sso()
|
701
|
-
|
702
|
-
# Get all engines to resolve name
|
703
|
-
response = make_api_request("GET", "/engines")
|
704
|
-
if response.status_code != 200:
|
705
|
-
console.print("[red]❌ Failed to fetch engines[/red]")
|
706
|
-
raise typer.Exit(1)
|
707
|
-
|
708
|
-
engines = response.json().get("engines", [])
|
709
|
-
engine = resolve_engine(name_or_id, engines)
|
710
|
-
|
711
|
-
# Fast status display (default)
|
712
|
-
if not detailed:
|
713
|
-
# Fetch idle status via SSM with longer timeout
|
714
|
-
ssm = boto3.client("ssm", region_name="us-east-1")
|
715
|
-
idle_data = None # Use None to indicate no data received
|
716
|
-
|
717
|
-
if engine["state"].lower() == "running":
|
718
|
-
try:
|
719
|
-
resp = ssm.send_command(
|
720
|
-
InstanceIds=[engine["instance_id"]],
|
721
|
-
DocumentName="AWS-RunShellScript",
|
722
|
-
Parameters={
|
723
|
-
"commands": [
|
724
|
-
"cat /var/run/idle-detector/last_state.json 2>/dev/null || echo '{}'"
|
725
|
-
],
|
726
|
-
"executionTimeout": ["10"],
|
727
|
-
},
|
728
|
-
)
|
729
|
-
cid = resp["Command"]["CommandId"]
|
730
|
-
|
731
|
-
# Wait up to 3 seconds for result
|
732
|
-
for _ in range(6): # 6 * 0.5 = 3 seconds
|
733
|
-
time.sleep(0.5)
|
734
|
-
inv = ssm.get_command_invocation(
|
735
|
-
CommandId=cid, InstanceId=engine["instance_id"]
|
736
|
-
)
|
737
|
-
if inv["Status"] in ["Success", "Failed"]:
|
738
|
-
break
|
739
|
-
|
740
|
-
if inv["Status"] == "Success":
|
741
|
-
content = inv["StandardOutputContent"].strip()
|
742
|
-
if content and content != "{}":
|
743
|
-
idle_data = json.loads(content)
|
744
|
-
else:
|
745
|
-
idle_data = {} # Empty response but SSM worked
|
746
|
-
except Exception:
|
747
|
-
idle_data = None # SSM failed
|
748
|
-
|
749
|
-
# Determine running state display
|
750
|
-
running_state = engine["state"].lower()
|
751
|
-
if running_state == "running":
|
752
|
-
run_disp = "[green]Running[/green]"
|
753
|
-
elif running_state == "pending":
|
754
|
-
run_disp = "[yellow]Starting...[/yellow]"
|
755
|
-
elif running_state == "stopping":
|
756
|
-
run_disp = "[yellow]Stopping...[/yellow]"
|
757
|
-
elif running_state == "stopped":
|
758
|
-
run_disp = "[dim]Stopped[/dim]"
|
759
|
-
else:
|
760
|
-
run_disp = engine["state"].capitalize()
|
761
|
-
|
762
|
-
# Determine idle/active status
|
763
|
-
idle_disp = ""
|
764
|
-
if running_state == "running":
|
765
|
-
if idle_data is None:
|
766
|
-
# SSM failed - we don't know the status
|
767
|
-
idle_disp = " [dim]N/A[/dim]"
|
768
|
-
elif not idle_data:
|
769
|
-
# Empty data - likely very early in boot
|
770
|
-
idle_disp = " [dim]N/A[/dim]"
|
771
|
-
else:
|
772
|
-
# We have data
|
773
|
-
is_idle = idle_data.get("idle", False)
|
774
|
-
timeout_sec = idle_data.get("timeout_sec")
|
775
|
-
idle_seconds = idle_data.get("idle_seconds", 0) if is_idle else 0
|
776
|
-
|
777
|
-
if is_idle:
|
778
|
-
if isinstance(timeout_sec, int) and isinstance(idle_seconds, int):
|
779
|
-
remaining = max(0, timeout_sec - idle_seconds)
|
780
|
-
remaining_mins = remaining // 60
|
781
|
-
if remaining_mins == 0:
|
782
|
-
idle_disp = f" [yellow]Idle {idle_seconds//60}m/{timeout_sec//60}m: [red]<1m[/red] left[/yellow]"
|
783
|
-
else:
|
784
|
-
idle_disp = f" [yellow]Idle {idle_seconds//60}m/{timeout_sec//60}m: [red]{remaining_mins}m[/red] left[/yellow]"
|
785
|
-
else:
|
786
|
-
idle_disp = " [yellow]Idle ?/?[/yellow]"
|
787
|
-
else:
|
788
|
-
# Actively not idle
|
789
|
-
idle_disp = " [green]Active[/green]"
|
790
|
-
|
791
|
-
# Build status lines - minimal info for fast view
|
792
|
-
status_lines = [
|
793
|
-
f"[blue]{engine['name']}[/blue] {run_disp}{idle_disp}",
|
794
|
-
]
|
795
|
-
|
796
|
-
# Add activity sensors if we have idle data
|
797
|
-
if idle_data and idle_data.get("reasons"):
|
798
|
-
status_lines.append("") # blank line before sensors
|
799
|
-
|
800
|
-
sensor_map = {
|
801
|
-
"CoffeeLockSensor": ("☕", "Coffee"),
|
802
|
-
"ActiveLoginSensor": ("🐚", "SSH"),
|
803
|
-
"IDEConnectionSensor": ("🖥 ", "IDE"),
|
804
|
-
"DockerWorkloadSensor": ("🐳", "Docker"),
|
805
|
-
}
|
806
|
-
|
807
|
-
for r in idle_data.get("reasons", []):
|
808
|
-
sensor = r.get("sensor", "Unknown")
|
809
|
-
active = r.get("active", False)
|
810
|
-
icon, label = sensor_map.get(sensor, ("?", sensor))
|
811
|
-
status_str = "[green]YES[/green]" if active else "[dim]nope[/dim]"
|
812
|
-
status_lines.append(f" {icon} {label:6} {status_str}")
|
813
|
-
|
814
|
-
# Display in a nice panel
|
815
|
-
console.print(
|
816
|
-
Panel("\n".join(status_lines), title="Engine Status", border_style="blue")
|
817
|
-
)
|
818
|
-
return # Exit early for fast status
|
819
|
-
|
820
|
-
# Get detailed engine status including idle detector info (for --detailed mode)
|
821
|
-
response = make_api_request("GET", f"/engines/{engine['instance_id']}")
|
822
|
-
if response.status_code != 200:
|
823
|
-
console.print("[red]❌ Failed to fetch engine details[/red]")
|
824
|
-
raise typer.Exit(1)
|
825
|
-
|
826
|
-
engine_details = response.json()
|
827
|
-
engine = engine_details.get("engine", engine) # Use detailed info if available
|
828
|
-
idle_detector = engine_details.get("idle_detector", {}) or {}
|
829
|
-
attached_studios = engine_details.get("attached_studios", [])
|
830
|
-
|
831
|
-
# Calculate costs
|
832
|
-
launch_time = parse_launch_time(engine["launch_time"])
|
833
|
-
uptime = datetime.now(timezone.utc) - launch_time
|
834
|
-
hourly_cost = HOURLY_COSTS.get(engine["engine_type"], 0)
|
835
|
-
# total_cost intentionally not shown in status view
|
836
|
-
|
837
|
-
stages_map = _fetch_init_stages([engine["instance_id"]])
|
838
|
-
stage_val = stages_map.get(engine["instance_id"], "-")
|
839
|
-
|
840
|
-
# Try to fetch actual boot time via SSM (best-effort)
|
841
|
-
boot_time_str: Optional[str] = None
|
842
|
-
try:
|
843
|
-
if engine["state"].lower() == "running":
|
844
|
-
ssm = boto3.client("ssm", region_name="us-east-1")
|
845
|
-
resp = ssm.send_command(
|
846
|
-
InstanceIds=[engine["instance_id"]],
|
847
|
-
DocumentName="AWS-RunShellScript",
|
848
|
-
Parameters={
|
849
|
-
"commands": ["uptime -s || who -b | awk '{print $3\" \"$4}'"]
|
850
|
-
},
|
851
|
-
)
|
852
|
-
cid = resp["Command"]["CommandId"]
|
853
|
-
time.sleep(1)
|
854
|
-
inv = ssm.get_command_invocation(
|
855
|
-
CommandId=cid, InstanceId=engine["instance_id"]
|
856
|
-
)
|
857
|
-
if inv.get("Status") == "Success":
|
858
|
-
boot_time_str = (
|
859
|
-
(inv.get("StandardOutputContent") or "").strip().splitlines()[0]
|
860
|
-
if inv.get("StandardOutputContent")
|
861
|
-
else None
|
862
|
-
)
|
863
|
-
except Exception:
|
864
|
-
boot_time_str = None
|
865
|
-
|
866
|
-
started_line = (
|
867
|
-
f"[bold]Started:[/bold] {boot_time_str} ({format_duration(uptime)} ago)"
|
868
|
-
if boot_time_str
|
869
|
-
else f"[bold]Started:[/bold] {launch_time.strftime('%Y-%m-%d %H:%M:%S')} ({format_duration(uptime)} ago)"
|
870
|
-
)
|
871
|
-
|
872
|
-
# ---------------- Front-loaded summary ----------------
|
873
|
-
running_state = engine["state"].lower()
|
874
|
-
if running_state == "running":
|
875
|
-
run_disp = "[green]Running[/green]"
|
876
|
-
elif running_state == "pending":
|
877
|
-
run_disp = "[yellow]Starting...[/yellow]"
|
878
|
-
elif running_state == "stopping":
|
879
|
-
run_disp = "[yellow]Stopping...[/yellow]"
|
880
|
-
elif running_state == "stopped":
|
881
|
-
run_disp = "[dim]Stopped[/dim]"
|
882
|
-
else:
|
883
|
-
run_disp = engine["state"].capitalize()
|
884
|
-
|
885
|
-
# Compose Active/Idle header with extra detail when idle
|
886
|
-
def _compute_active_disp(idle_info: Dict[str, Any]) -> str:
|
887
|
-
# If we don't have idle info or it's explicitly unavailable, show N/A
|
888
|
-
if not idle_info or idle_info.get("available") == False:
|
889
|
-
return "[dim]N/A[/dim]"
|
890
|
-
|
891
|
-
if idle_info.get("status") == "active":
|
892
|
-
return "[green]Active[/green]"
|
893
|
-
if running_state in ("stopped", "stopping"):
|
894
|
-
return "[dim]N/A[/dim]"
|
895
|
-
|
896
|
-
# If idle, show time/threshold with time remaining if available
|
897
|
-
if idle_info.get("status") == "idle":
|
898
|
-
idle_seconds_v = idle_info.get("idle_seconds")
|
899
|
-
thresh_v = idle_info.get("idle_threshold")
|
900
|
-
if isinstance(idle_seconds_v, (int, float)) and isinstance(thresh_v, (int, float)):
|
901
|
-
remaining = max(0, int(thresh_v) - int(idle_seconds_v))
|
902
|
-
remaining_mins = remaining // 60
|
903
|
-
if remaining_mins == 0:
|
904
|
-
return f"[yellow]Idle {int(idle_seconds_v)//60}m/{int(thresh_v)//60}m: [red]<1m[/red] left[/yellow]"
|
905
|
-
else:
|
906
|
-
return f"[yellow]Idle {int(idle_seconds_v)//60}m/{int(thresh_v)//60}m: [red]{remaining_mins}m[/red] left[/yellow]"
|
907
|
-
elif isinstance(thresh_v, (int, float)):
|
908
|
-
return f"[yellow]Idle ?/{int(thresh_v)//60}m[/yellow]"
|
909
|
-
else:
|
910
|
-
return "[yellow]Idle ?/?[/yellow]"
|
911
|
-
|
912
|
-
# Default to N/A if we can't determine status
|
913
|
-
return "[dim]N/A[/dim]"
|
914
|
-
|
915
|
-
active_disp = _compute_active_disp(idle_detector)
|
916
|
-
|
917
|
-
top_lines = [
|
918
|
-
f"[blue]{engine['name']}[/blue] {run_disp} {active_disp}\n",
|
919
|
-
]
|
920
|
-
|
921
|
-
# Studios summary next, with studio name in purple/magenta
|
922
|
-
studios_line = None
|
923
|
-
if attached_studios:
|
924
|
-
stu_texts = [
|
925
|
-
f"[magenta]{s.get('user', 'studio')}[/magenta] ({s.get('studio_id', 'unknown')})"
|
926
|
-
for s in attached_studios
|
927
|
-
]
|
928
|
-
studios_line = "Studios: " + ", ".join(stu_texts)
|
929
|
-
top_lines.append(studios_line)
|
930
|
-
|
931
|
-
# Paragraph break
|
932
|
-
top_lines.append("")
|
933
|
-
|
934
|
-
# ---------------- Details block (white/default) ----------------
|
935
|
-
status_lines = [
|
936
|
-
f"Name: {engine['name']}",
|
937
|
-
f"Instance: {engine['instance_id']}",
|
938
|
-
f"Type: {engine['engine_type']} ({engine['instance_type']})",
|
939
|
-
f"Status: {engine['state']}",
|
940
|
-
f"User: {engine['user']}",
|
941
|
-
f"IP: {engine.get('public_ip', 'N/A')}",
|
942
|
-
started_line,
|
943
|
-
f"$/hour: ${hourly_cost:.2f}",
|
944
|
-
]
|
945
|
-
|
946
|
-
# Disk usage (like list --detailed)
|
947
|
-
if engine["state"].lower() == "running":
|
948
|
-
disk_usage = get_disk_usage_via_ssm(engine["instance_id"]) or "-"
|
949
|
-
status_lines.append(f"Disk: {disk_usage}")
|
950
|
-
|
951
|
-
# Idle timeout (show even when not idle) - but only if we have data
|
952
|
-
if idle_detector.get("available"):
|
953
|
-
idle_threshold_secs: Optional[int] = None
|
954
|
-
# Prefer value from idle detector overlay if present
|
955
|
-
try:
|
956
|
-
if isinstance(idle_detector.get("idle_threshold"), (int, float)):
|
957
|
-
idle_threshold_secs = int(idle_detector["idle_threshold"])
|
958
|
-
except Exception:
|
959
|
-
idle_threshold_secs = None
|
960
|
-
|
961
|
-
if idle_threshold_secs is None and engine["state"].lower() == "running":
|
962
|
-
# Fallback: read /etc/engine.env via SSM
|
963
|
-
try:
|
964
|
-
ssm = boto3.client("ssm", region_name="us-east-1")
|
965
|
-
resp = ssm.send_command(
|
966
|
-
InstanceIds=[engine["instance_id"]],
|
967
|
-
DocumentName="AWS-RunShellScript",
|
968
|
-
Parameters={
|
969
|
-
"commands": [
|
970
|
-
"grep -E '^IDLE_TIMEOUT_SECONDS=' /etc/engine.env | cut -d'=' -f2 || echo '?'",
|
971
|
-
],
|
972
|
-
"executionTimeout": ["5"],
|
973
|
-
},
|
974
|
-
)
|
975
|
-
cid = resp["Command"]["CommandId"]
|
976
|
-
time.sleep(1)
|
977
|
-
inv = ssm.get_command_invocation(
|
978
|
-
CommandId=cid, InstanceId=engine["instance_id"]
|
979
|
-
)
|
980
|
-
if inv.get("Status") == "Success":
|
981
|
-
out = (inv.get("StandardOutputContent") or "").strip()
|
982
|
-
if out and out != "?" and out.isdigit():
|
983
|
-
idle_threshold_secs = int(out)
|
984
|
-
except Exception:
|
985
|
-
idle_threshold_secs = None
|
986
|
-
|
987
|
-
if idle_threshold_secs is not None:
|
988
|
-
status_lines.append(
|
989
|
-
f"Idle timeout: {idle_threshold_secs//60}m ({idle_threshold_secs}s)"
|
990
|
-
)
|
991
|
-
else:
|
992
|
-
status_lines.append("Idle timeout: unknown")
|
993
|
-
else:
|
994
|
-
# No idle detector data available
|
995
|
-
status_lines.append("Idle timeout: N/A")
|
996
|
-
|
997
|
-
# Health report (only if bootstrap finished)
|
998
|
-
if stage_val == "finished":
|
999
|
-
try:
|
1000
|
-
ssm = boto3.client("ssm", region_name="us-east-1")
|
1001
|
-
res = ssm.send_command(
|
1002
|
-
InstanceIds=[engine["instance_id"]],
|
1003
|
-
DocumentName="AWS-RunShellScript",
|
1004
|
-
Parameters={
|
1005
|
-
"commands": [
|
1006
|
-
"cat /opt/dayhoff/state/engine-health.json 2>/dev/null || cat /var/run/engine-health.json 2>/dev/null || true"
|
1007
|
-
],
|
1008
|
-
"executionTimeout": ["10"],
|
1009
|
-
},
|
1010
|
-
)
|
1011
|
-
cid = res["Command"]["CommandId"]
|
1012
|
-
time.sleep(1)
|
1013
|
-
inv = ssm.get_command_invocation(
|
1014
|
-
CommandId=cid, InstanceId=engine["instance_id"]
|
1015
|
-
)
|
1016
|
-
if inv["Status"] == "Success":
|
1017
|
-
import json as _json
|
1018
|
-
|
1019
|
-
health = _json.loads(inv["StandardOutputContent"].strip() or "{}")
|
1020
|
-
status_lines.append("")
|
1021
|
-
status_lines.append("[bold]Health:[/bold]")
|
1022
|
-
status_lines.append(
|
1023
|
-
f" • GPU Drivers: {'OK' if health.get('drivers_ok') else 'MISSING'}"
|
1024
|
-
)
|
1025
|
-
idle_stat = health.get("idle_detector_service") or health.get(
|
1026
|
-
"idle_detector_timer", "unknown"
|
1027
|
-
)
|
1028
|
-
status_lines.append(f" • Idle Detector: {idle_stat}")
|
1029
|
-
except Exception:
|
1030
|
-
pass
|
1031
|
-
|
1032
|
-
# Try to enrich/fallback idle-detector details from on-engine summary file via SSM
|
1033
|
-
def _fetch_idle_summary_via_ssm(instance_id: str) -> Optional[Dict]:
|
1034
|
-
try:
|
1035
|
-
ssm = boto3.client("ssm", region_name="us-east-1")
|
1036
|
-
res = ssm.send_command(
|
1037
|
-
InstanceIds=[instance_id],
|
1038
|
-
DocumentName="AWS-RunShellScript",
|
1039
|
-
Parameters={
|
1040
|
-
"commands": [
|
1041
|
-
"cat /var/run/idle-detector/last_state.json 2>/dev/null || true",
|
1042
|
-
],
|
1043
|
-
"executionTimeout": ["5"],
|
1044
|
-
},
|
1045
|
-
)
|
1046
|
-
cid = res["Command"]["CommandId"]
|
1047
|
-
# Wait up to 2 seconds for SSM command to complete (was 1 second)
|
1048
|
-
for _ in range(4): # 4 * 0.5 = 2 seconds
|
1049
|
-
time.sleep(0.5)
|
1050
|
-
inv = ssm.get_command_invocation(CommandId=cid, InstanceId=instance_id)
|
1051
|
-
if inv["Status"] in ["Success", "Failed"]:
|
1052
|
-
break
|
1053
|
-
if inv["Status"] != "Success":
|
1054
|
-
return None
|
1055
|
-
content = inv["StandardOutputContent"].strip()
|
1056
|
-
if not content:
|
1057
|
-
return None
|
1058
|
-
data = json.loads(content)
|
1059
|
-
# Convert last_state schema (new or old) to idle_detector schema used by CLI output
|
1060
|
-
idle_info: Dict[str, Any] = {"available": True}
|
1061
|
-
|
1062
|
-
# Active/idle
|
1063
|
-
idle_flag = bool(data.get("idle", False))
|
1064
|
-
idle_info["status"] = "idle" if idle_flag else "active"
|
1065
|
-
|
1066
|
-
# Threshold and elapsed
|
1067
|
-
if isinstance(data.get("timeout_sec"), (int, float)):
|
1068
|
-
idle_info["idle_threshold"] = int(data["timeout_sec"]) # seconds
|
1069
|
-
if isinstance(data.get("idle_seconds"), (int, float)):
|
1070
|
-
idle_info["idle_seconds"] = int(data["idle_seconds"])
|
1071
|
-
|
1072
|
-
# Keep raw reasons for sensor display when available (new schema)
|
1073
|
-
if isinstance(data.get("reasons"), list):
|
1074
|
-
idle_info["_reasons_raw"] = data["reasons"]
|
1075
|
-
else:
|
1076
|
-
# Fallback: synthesize reasons from the old forensics layout
|
1077
|
-
f_all = data.get("forensics", {}) or {}
|
1078
|
-
synthesized = []
|
1079
|
-
|
1080
|
-
def _mk(sensor_name: str, key: str):
|
1081
|
-
entry = f_all.get(key, {}) or {}
|
1082
|
-
synthesized.append(
|
1083
|
-
{
|
1084
|
-
"sensor": sensor_name,
|
1085
|
-
"active": bool(entry.get("active", False)),
|
1086
|
-
"reason": entry.get("reason", ""),
|
1087
|
-
"forensic": entry.get("forensic", {}),
|
1088
|
-
}
|
1089
|
-
)
|
1090
|
-
|
1091
|
-
_mk("CoffeeLockSensor", "coffee")
|
1092
|
-
_mk("ActiveLoginSensor", "ssh")
|
1093
|
-
_mk("IDEConnectionSensor", "ide")
|
1094
|
-
_mk("DockerWorkloadSensor", "docker")
|
1095
|
-
idle_info["_reasons_raw"] = synthesized
|
1096
|
-
|
1097
|
-
# Derive details from sensors
|
1098
|
-
for r in idle_info.get("_reasons_raw", []):
|
1099
|
-
if not r.get("active"):
|
1100
|
-
continue
|
1101
|
-
sensor = (r.get("sensor") or "").lower()
|
1102
|
-
forensic = r.get("forensic") or {}
|
1103
|
-
if sensor == "ideconnectionsensor":
|
1104
|
-
# Prefer unique_pid_count written by new detector
|
1105
|
-
cnt = forensic.get("unique_pid_count")
|
1106
|
-
if not isinstance(cnt, int):
|
1107
|
-
cnt = forensic.get("matches")
|
1108
|
-
if isinstance(cnt, int):
|
1109
|
-
idle_info["ide_connections"] = {"connection_count": cnt}
|
1110
|
-
else:
|
1111
|
-
idle_info["ide_connections"] = {"connection_count": 1}
|
1112
|
-
elif sensor == "coffeelocksensor":
|
1113
|
-
rem = forensic.get("remaining_sec")
|
1114
|
-
if isinstance(rem, (int, float)) and rem > 0:
|
1115
|
-
idle_info["coffee_lock"] = format_duration(
|
1116
|
-
timedelta(seconds=int(rem))
|
1117
|
-
)
|
1118
|
-
elif sensor == "activeloginsensor":
|
1119
|
-
sess = {
|
1120
|
-
"tty": forensic.get("tty", "pts/?"),
|
1121
|
-
"pid": forensic.get("pid", "?"),
|
1122
|
-
"idle_time": forensic.get("idle_sec", 0),
|
1123
|
-
"from_ip": forensic.get("remote_addr", "unknown"),
|
1124
|
-
}
|
1125
|
-
idle_info.setdefault("ssh_sessions", []).append(sess)
|
1126
|
-
return idle_info
|
1127
|
-
except Exception:
|
1128
|
-
return None
|
1129
|
-
|
1130
|
-
# Always try to enrich from on-engine summary (fast, best-effort)
|
1131
|
-
overlay = _fetch_idle_summary_via_ssm(engine["instance_id"])
|
1132
|
-
if overlay:
|
1133
|
-
# If API didn't indicate availability, replace entirely; otherwise fill gaps
|
1134
|
-
if not idle_detector.get("available"):
|
1135
|
-
idle_detector = overlay
|
1136
|
-
else:
|
1137
|
-
for k, v in overlay.items():
|
1138
|
-
idle_detector.setdefault(k, v)
|
1139
|
-
else:
|
1140
|
-
# SSM failed - mark as unavailable if we don't have good data
|
1141
|
-
if not idle_detector.get("available"):
|
1142
|
-
idle_detector = {"available": False} # Mark as unavailable
|
1143
|
-
|
1144
|
-
# Recompute header display with latest data
|
1145
|
-
active_disp = _compute_active_disp(idle_detector)
|
1146
|
-
top_lines[0] = f"[blue]{engine['name']}[/blue] {run_disp} {active_disp}\n"
|
1147
|
-
|
1148
|
-
# Activity Sensors (show all with YES/no)
|
1149
|
-
if idle_detector.get("available"):
|
1150
|
-
status_lines.append("")
|
1151
|
-
status_lines.append("[bold]Activity Sensors:[/bold]")
|
1152
|
-
reasons_raw = idle_detector.get("_reasons_raw", []) or []
|
1153
|
-
by_sensor: Dict[str, Dict[str, Any]] = {}
|
1154
|
-
for r in reasons_raw:
|
1155
|
-
nm = r.get("sensor")
|
1156
|
-
if nm:
|
1157
|
-
by_sensor[nm] = r
|
1158
|
-
|
1159
|
-
def _sensor_line(label: str, key: str, emoji: str) -> str:
|
1160
|
-
r = by_sensor.get(key, {})
|
1161
|
-
active = bool(r.get("active"))
|
1162
|
-
reason_txt = r.get("reason") or ("" if not active else "active")
|
1163
|
-
flag = "[green]YES[/green]" if active else "[dim]nope[/dim]"
|
1164
|
-
return (
|
1165
|
-
f" {emoji} {label}: {flag} {('- ' + reason_txt) if reason_txt else ''}"
|
1166
|
-
)
|
1167
|
-
|
1168
|
-
status_lines.append(_sensor_line("Coffee", "CoffeeLockSensor", "☕"))
|
1169
|
-
status_lines.append(_sensor_line("Shell ", "ActiveLoginSensor", "🐚"))
|
1170
|
-
status_lines.append(_sensor_line(" IDE ", "IDEConnectionSensor", "🖥"))
|
1171
|
-
status_lines.append(_sensor_line("Docker", "DockerWorkloadSensor", "🐳"))
|
1172
|
-
|
1173
|
-
# Recompute display with latest idle detector data
|
1174
|
-
active_disp = _compute_active_disp(idle_detector)
|
1175
|
-
# Rewrite top header line (index 0) to include updated display
|
1176
|
-
top_lines[0] = f"[blue]{engine['name']}[/blue] {run_disp} {active_disp}\n"
|
1177
|
-
|
1178
|
-
# Combine top summary and details
|
1179
|
-
all_lines = top_lines + status_lines
|
1180
|
-
console.print(
|
1181
|
-
Panel("\n".join(all_lines), title="Engine Status", border_style="blue")
|
1182
|
-
)
|
1183
|
-
|
1184
|
-
if show_log:
|
1185
|
-
if not detailed:
|
1186
|
-
console.print("[yellow]Note: --show-log requires --detailed flag[/yellow]")
|
1187
|
-
return
|
1188
|
-
console.print("\n[bold]Bootstrap Log:[/bold]")
|
1189
|
-
try:
|
1190
|
-
ssm = boto3.client("ssm", region_name="us-east-1")
|
1191
|
-
resp = ssm.send_command(
|
1192
|
-
InstanceIds=[engine["instance_id"]],
|
1193
|
-
DocumentName="AWS-RunShellScript",
|
1194
|
-
Parameters={
|
1195
|
-
"commands": [
|
1196
|
-
"cat /var/log/engine-setup.log 2>/dev/null || echo 'No setup log found'"
|
1197
|
-
],
|
1198
|
-
"executionTimeout": ["15"],
|
1199
|
-
},
|
1200
|
-
)
|
1201
|
-
cid = resp["Command"]["CommandId"]
|
1202
|
-
time.sleep(2)
|
1203
|
-
inv = ssm.get_command_invocation(
|
1204
|
-
CommandId=cid, InstanceId=engine["instance_id"]
|
1205
|
-
)
|
1206
|
-
if inv["Status"] == "Success":
|
1207
|
-
log_content = inv["StandardOutputContent"].strip()
|
1208
|
-
if log_content:
|
1209
|
-
console.print(f"[dim]{log_content}[/dim]")
|
1210
|
-
else:
|
1211
|
-
console.print("[yellow]No bootstrap log available[/yellow]")
|
1212
|
-
else:
|
1213
|
-
console.print("[red]❌ Could not retrieve bootstrap log[/red]")
|
1214
|
-
except Exception as e:
|
1215
|
-
console.print(f"[red]❌ Error fetching log: {e}[/red]")
|
1216
|
-
|
1217
|
-
|
1218
|
-
@engine_app.command("stop")
def stop_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    force: bool = typer.Option(
        False, "--force", "-f", help="Force stop and detach all studios"
    ),
):
    """Stop an engine."""
    check_aws_sso()

    # Resolve the target engine from the full fleet listing.
    listing = make_api_request("GET", "/engines")
    if listing.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    target = resolve_engine(name_or_id, listing.json().get("engines", []))

    console.print(f"Stopping engine [cyan]{target['name']}[/cyan]...")

    def _request_stop(detach: bool):
        # Ask the API to stop the instance, optionally detaching studios first.
        return make_api_request(
            "POST",
            f"/engines/{target['instance_id']}/stop",
            json_data={"detach_studios": detach},
        )

    # First attempt honours the --force flag directly.
    result = _request_stop(force)

    if result.status_code == 409 and not force:
        # Conflict: studios are still attached — list them and ask the user.
        attached = result.json().get("attached_studios", [])

        console.print("\n[yellow]⚠️ This engine has attached studios:[/yellow]")
        for studio in attached:
            console.print(f" • {studio['user']} ({studio['studio_id']})")

        if not Confirm.ask("\nDetach all studios and stop the engine?"):
            console.print("Stop cancelled.")
            return
        result = _request_stop(True)

    if result.status_code == 200:
        console.print(f"[green]✓ Engine stopped successfully![/green]")
    else:
        error = result.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to stop engine: {error}[/red]")
|
1270
|
-
|
1271
|
-
|
1272
|
-
@engine_app.command("start")
def start_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
):
    """Start a stopped engine."""
    check_aws_sso()

    # Look up the engine by name or instance ID via the fleet listing.
    listing = make_api_request("GET", "/engines")
    if listing.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    target = resolve_engine(name_or_id, listing.json().get("engines", []))

    console.print(f"Starting engine [cyan]{target['name']}[/cyan]...")

    result = make_api_request("POST", f"/engines/{target['instance_id']}/start")

    if result.status_code != 200:
        error = result.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to start engine: {error}[/red]")
        return

    # Success: report the (possibly still-pending) public IP.
    payload = result.json()
    console.print(f"[green]✓ Engine started successfully![/green]")
    console.print(f"New public IP: {payload.get('public_ip', 'Pending...')}")
|
1299
|
-
|
1300
|
-
|
1301
|
-
@engine_app.command("terminate")
def terminate_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
):
    """Permanently terminate an engine."""
    check_aws_sso()

    # Resolve the engine from the fleet listing.
    listing = make_api_request("GET", "/engines")
    if listing.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    target = resolve_engine(name_or_id, listing.json().get("engines", []))

    # Estimate the session cost from uptime and the per-type hourly rate.
    started_at = parse_launch_time(target["launch_time"])
    elapsed = datetime.now(timezone.utc) - started_at
    rate = HOURLY_COSTS.get(target["engine_type"], 0)
    session_cost = rate * (elapsed.total_seconds() / 3600)

    console.print(
        f"\n[yellow]⚠️ This will permanently terminate engine '{target['name']}'[/yellow]"
    )
    console.print(f"Total cost for this session: ${session_cost:.2f}")

    if not Confirm.ask("\nAre you sure you want to terminate this engine?"):
        console.print("Termination cancelled.")
        return

    result = make_api_request("DELETE", f"/engines/{target['instance_id']}")

    if result.status_code == 200:
        console.print(f"[green]✓ Engine terminated successfully![/green]")
    else:
        error = result.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to terminate engine: {error}[/red]")
|
1339
|
-
|
1340
|
-
|
1341
|
-
@engine_app.command("ssh")
def ssh_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    admin: bool = typer.Option(
        False, "--admin", help="Connect as ec2-user instead of the engine owner user"
    ),
    idle_timeout: int = typer.Option(
        600,
        "--idle-timeout",
        help="Idle timeout (seconds) for the SSM port-forward (0 = disable)",
    ),
):
    """Connect to an engine via SSH.

    By default the CLI connects using the engine's owner username (the same one stored in the `User` tag).
    Pass `--admin` to connect with the underlying [`ec2-user`] account for break-glass or debugging.
    """
    username = check_aws_sso()

    # SSH-over-SSM requires the local Session Manager Plugin.
    if not check_session_manager_plugin():
        raise typer.Exit(1)

    # Resolve the engine from the fleet listing.
    listing = make_api_request("GET", "/engines")
    if listing.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    target = resolve_engine(name_or_id, listing.json().get("engines", []))

    if target["state"].lower() != "running":
        console.print(f"[red]❌ Engine is not running (state: {target['state']})[/red]")
        raise typer.Exit(1)

    # --admin selects the break-glass ec2-user account; default is the caller.
    ssh_user = "ec2-user" if admin else username

    console.print(
        f"Updating SSH config for [cyan]{target['name']}[/cyan] (user: {ssh_user})..."
    )
    update_ssh_config_entry(
        target["name"], target["instance_id"], ssh_user, idle_timeout
    )

    # Hand control to the interactive ssh session.
    console.print(f"[green]✓ Connecting to {target['name']}...[/green]")
    subprocess.run(["ssh", target["name"]])
|
1391
|
-
|
1392
|
-
|
1393
|
-
def _strip_managed_ssh_entries(lines: List[str]) -> List[str]:
    """Return *lines* with all DHT-managed SSH config entries removed.

    A managed entry starts at a ``Host`` line tagged with
    ``SSH_MANAGED_COMMENT`` and extends until the next ``Host`` line.  The
    blank spacer line written immediately before each managed entry is
    removed as well, so repeated runs of ``config-ssh`` do not accumulate
    blank lines in the user's config.
    """
    kept: List[str] = []
    skipping = False
    for line in lines:
        if SSH_MANAGED_COMMENT in line:
            # Start of a managed entry: drop this line and its blank spacer.
            skipping = True
            if kept and not kept[-1].strip():
                kept.pop()
            continue
        if skipping:
            if line.strip().startswith("Host "):
                # First unmanaged Host line ends the managed entry.
                skipping = False
                kept.append(line)
            continue
        kept.append(line)
    return kept


@engine_app.command("config-ssh")
def config_ssh(
    clean: bool = typer.Option(False, "--clean", help="Remove all managed entries"),
    all_engines: bool = typer.Option(
        False, "--all", "-a", help="Include all engines from all users"
    ),
    admin: bool = typer.Option(
        False,
        "--admin",
        help="Generate entries that use ec2-user instead of per-engine owner user",
    ),
):
    """Update SSH config with available engines."""
    username = check_aws_sso()

    # Only check for Session Manager Plugin if we're not just cleaning
    if not clean and not check_session_manager_plugin():
        raise typer.Exit(1)

    if clean:
        console.print("Removing all managed SSH entries...")
    elif all_engines:
        console.print("Updating SSH config with all running engines...")
    else:
        console.print(
            f"Updating SSH config with running engines for [cyan]{username}[/cyan] and [cyan]shared[/cyan]..."
        )

    # Fetch all engines from the API.
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    running_engines = [e for e in engines if e["state"].lower() == "running"]

    # Unless --all was given, restrict to the caller's and shared engines.
    if not all_engines:
        running_engines = [
            e for e in running_engines if e["user"] == username or e["user"] == "shared"
        ]

    # Read the existing config (it may not exist yet).
    config_path = Path.home() / ".ssh" / "config"
    config_path.parent.mkdir(mode=0o700, exist_ok=True)
    lines = config_path.read_text().splitlines() if config_path.exists() else []

    # Remove old managed entries before (optionally) regenerating them.
    new_lines = _strip_managed_ssh_entries(lines)

    # Add new entries if not cleaning
    if not clean:
        for engine in running_engines:
            # Determine ssh user based on --admin flag
            ssh_user = "ec2-user" if admin else username
            new_lines.extend(
                [
                    "",
                    f"Host {engine['name']} {SSH_MANAGED_COMMENT}",
                    f" HostName {engine['instance_id']}",
                    f" User {ssh_user}",
                    f" ProxyCommand sh -c \"AWS_SSM_IDLE_TIMEOUT=600 aws ssm start-session --target %h --document-name AWS-StartSSHSession --parameters 'portNumber=%p'\"",
                ]
            )

    # Write back with owner-only permissions.
    config_path.write_text("\n".join(new_lines))
    config_path.chmod(0o600)

    if clean:
        console.print("[green]✓ Removed all managed SSH entries[/green]")
        return

    console.print(
        f"[green]✓ Updated SSH config with {len(running_engines)} engines[/green]"
    )
    for engine in running_engines:
        user_display = (
            f"[dim]({engine['user']})[/dim]" if engine["user"] != username else ""
        )
        console.print(
            f" • {engine['name']} → {engine['instance_id']} {user_display}"
        )
|
1495
|
-
|
1496
|
-
|
1497
|
-
@engine_app.command("coffee")
def coffee(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    duration: str = typer.Argument("4h", help="Duration (e.g., 2h, 30m, 2h30m)"),
    cancel: bool = typer.Option(
        False, "--cancel", help="Cancel existing coffee lock instead of extending"
    ),
):
    """Pour ☕ for an engine: keeps it awake for the given duration (or cancel)."""
    check_aws_sso()

    seconds_total = 0
    if not cancel:
        # Full-match the whole argument so trailing garbage (e.g. "30m2h",
        # "4hx") is rejected instead of being silently truncated — the old
        # unanchored re.match would parse "30m2h" as just 30 minutes.
        match = re.fullmatch(r"(?:(\d+)h)?(?:(\d+)m)?", duration)
        if not match or (not match.group(1) and not match.group(2)):
            console.print(f"[red]❌ Invalid duration format: {duration}[/red]")
            console.print("Use format like: 4h, 30m, 2h30m")
            raise typer.Exit(1)

        hours = int(match.group(1) or 0)
        minutes = int(match.group(2) or 0)
        seconds_total = (hours * 60 + minutes) * 60
        if seconds_total == 0:
            console.print("[red]❌ Duration must be greater than zero[/red]")
            raise typer.Exit(1)

    # Get all engines to resolve name
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    if engine["state"].lower() != "running":
        console.print(f"[red]❌ Engine is not running (state: {engine['state']})[/red]")
        raise typer.Exit(1)

    if cancel:
        console.print(f"Cancelling coffee for [cyan]{engine['name']}[/cyan]…")
    else:
        console.print(
            f"Pouring coffee for [cyan]{engine['name']}[/cyan] for {duration}…"
        )

    # Use SSM to run the engine coffee command
    ssm = boto3.client("ssm", region_name="us-east-1")
    try:
        response = ssm.send_command(
            InstanceIds=[engine["instance_id"]],
            DocumentName="AWS-RunShellScript",
            Parameters={
                "commands": [
                    (
                        "/usr/local/bin/engine-coffee --cancel"
                        if cancel
                        else f"/usr/local/bin/engine-coffee {seconds_total}"
                    )
                ],
                "executionTimeout": ["60"],
            },
        )

        command_id = response["Command"]["CommandId"]

        # Poll until the command reaches a terminal state (max ~10s).
        for _ in range(10):
            time.sleep(1)
            result = ssm.get_command_invocation(
                CommandId=command_id,
                InstanceId=engine["instance_id"],
            )
            if result["Status"] in ["Success", "Failed"]:
                break

        if result["Status"] == "Success":
            if cancel:
                console.print(
                    "[green]✓ Coffee cancelled – auto-shutdown re-enabled[/green]"
                )
            else:
                console.print(f"[green]✓ Coffee poured for {duration}[/green]")
                console.print(
                    "\n[dim]Note: Detached Docker containers (except dev containers) will also keep the engine awake.[/dim]"
                )
                console.print(
                    "[dim]Use coffee for nohup operations or other background tasks.[/dim]"
                )
        else:
            console.print(
                f"[red]❌ Failed to manage coffee: {result.get('StatusDetails', 'Unknown error')}[/red]"
            )

    except ClientError as e:
        console.print(f"[red]❌ Failed to manage coffee: {e}[/red]")
|
1595
|
-
|
1596
|
-
|
1597
|
-
@engine_app.command("resize")
def resize_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    size: int = typer.Option(..., "--size", "-s", help="New size in GB"),
    online: bool = typer.Option(
        False,
        "--online",
        help="Resize while running (requires manual filesystem expansion)",
    ),
    force: bool = typer.Option(
        False, "--force", "-f", help="Force resize and detach all studios"
    ),
):
    """Resize an engine's boot disk.

    Grows the root EBS volume to ``size`` GB; shrinking is rejected because
    EBS only supports increasing a volume's size. By default (offline mode)
    a running engine is stopped first, the volume is resized via the backend
    API, and the engine is started again (the filesystem is expanded on
    boot). With ``--online`` the volume is resized in place and the user
    must expand the partition/filesystem manually. With ``--force`` any
    attached studios are detached without prompting.
    """
    check_aws_sso()

    # Get all engines to resolve name
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    # Get current volume info to validate size
    ec2 = boto3.client("ec2", region_name="us-east-1")

    try:
        # Get instance details to find root volume
        instance_info = ec2.describe_instances(InstanceIds=[engine["instance_id"]])
        instance = instance_info["Reservations"][0]["Instances"][0]

        # Find root volume by matching the instance's root device name
        # against its block-device mappings.
        root_device = instance.get("RootDeviceName", "/dev/xvda")
        root_volume_id = None

        for bdm in instance.get("BlockDeviceMappings", []):
            if bdm["DeviceName"] == root_device:
                root_volume_id = bdm["Ebs"]["VolumeId"]
                break

        if not root_volume_id:
            console.print("[red]❌ Could not find root volume[/red]")
            raise typer.Exit(1)

        # Get current volume size (EBS volumes can only grow, so we must
        # reject requests that are not strictly larger).
        volumes = ec2.describe_volumes(VolumeIds=[root_volume_id])
        current_size = volumes["Volumes"][0]["Size"]

        if size <= current_size:
            console.print(
                f"[red]❌ New size ({size}GB) must be larger than current size ({current_size}GB)[/red]"
            )
            raise typer.Exit(1)

        console.print(
            f"[yellow]Resizing engine boot disk from {current_size}GB to {size}GB[/yellow]"
        )

        # Check if we need to stop the instance (offline mode only).
        if not online and engine["state"].lower() == "running":
            console.print("Stopping engine for offline resize...")
            stop_response = make_api_request(
                "POST",
                f"/engines/{engine['instance_id']}/stop",
                json_data={"detach_studios": False},
            )
            if stop_response.status_code != 200:
                console.print("[red]❌ Failed to stop engine[/red]")
                raise typer.Exit(1)

            # Wait for instance to stop
            console.print("Waiting for engine to stop...")
            waiter = ec2.get_waiter("instance_stopped")
            waiter.wait(InstanceIds=[engine["instance_id"]])
            console.print("[green]✓ Engine stopped[/green]")

        # Call the resize API
        console.print("Resizing volume...")
        resize_response = make_api_request(
            "POST",
            f"/engines/{engine['instance_id']}/resize",
            json_data={"size": size, "detach_studios": force},
        )

        if resize_response.status_code == 409 and not force:
            # Engine has attached studios — the backend refuses to resize
            # until they are detached, so ask the user interactively.
            data = resize_response.json()
            attached_studios = data.get("attached_studios", [])

            console.print("\n[yellow]⚠️ This engine has attached studios:[/yellow]")
            for studio in attached_studios:
                console.print(f" • {studio['user']} ({studio['studio_id']})")

            if Confirm.ask("\nDetach all studios and resize the engine?"):
                resize_response = make_api_request(
                    "POST",
                    f"/engines/{engine['instance_id']}/resize",
                    json_data={"size": size, "detach_studios": True},
                )
            else:
                # NOTE(review): if the engine was stopped above for an
                # offline resize, cancelling here leaves it stopped.
                console.print("Resize cancelled.")
                return

        if resize_response.status_code != 200:
            error = resize_response.json().get("error", "Unknown error")
            console.print(f"[red]❌ Failed to resize engine: {error}[/red]")
            raise typer.Exit(1)

        # Check if studios were detached
        data = resize_response.json()
        detached_studios = data.get("detached_studios", 0)
        if detached_studios > 0:
            console.print(
                f"[green]✓ Detached {detached_studios} studio(s) before resize[/green]"
            )

        # Wait for modification to complete. EBS volume modifications go
        # through modifying → optimizing → completed; the new capacity is
        # usable as soon as the "optimizing" phase begins.
        console.print("Waiting for volume modification to complete...")
        while True:
            mod_state = ec2.describe_volumes_modifications(VolumeIds=[root_volume_id])
            if not mod_state["VolumesModifications"]:
                break  # Modification complete

            modification = mod_state["VolumesModifications"][0]
            state = modification["ModificationState"]
            progress = modification.get("Progress", 0)

            # Show progress updates only for the resize phase
            if state == "modifying":
                console.print(f"[yellow]Progress: {progress}%[/yellow]")

            # Exit as soon as optimization starts (resize is complete)
            if state == "optimizing":
                console.print("[green]✓ Volume resized successfully[/green]")
                console.print(
                    "[dim]AWS is optimizing the volume in the background (no action needed).[/dim]"
                )
                break

            if state == "completed":
                console.print("[green]✓ Volume resized successfully[/green]")
                break
            elif state == "failed":
                console.print("[red]❌ Volume modification failed[/red]")
                raise typer.Exit(1)

            time.sleep(2)  # Check more frequently for better UX

        # If offline resize, start the instance back up. engine["state"] is
        # the state captured *before* we stopped the instance, so this fires
        # exactly when we stopped a previously-running engine.
        if not online and engine["state"].lower() == "running":
            console.print("Starting engine back up...")
            start_response = make_api_request(
                "POST", f"/engines/{engine['instance_id']}/start"
            )
            if start_response.status_code != 200:
                console.print(
                    "[yellow]⚠️ Failed to restart engine automatically[/yellow]"
                )
                console.print(
                    f"Please start it manually: [cyan]dh engine start {engine['name']}[/cyan]"
                )
            else:
                console.print("[green]✓ Engine started[/green]")
                console.print("The filesystem will be automatically expanded on boot.")

        elif online and engine["state"].lower() == "running":
            # Online resize leaves the guest filesystem at its old size;
            # walk the user through expanding it by hand.
            console.print(
                "\n[yellow]⚠️ Online resize complete. You must now expand the filesystem:[/yellow]"
            )
            console.print(f"1. SSH into the engine: [cyan]ssh {engine['name']}[/cyan]")
            console.print("2. Find the root device: [cyan]lsblk[/cyan]")
            console.print(
                "3. Expand the partition: [cyan]sudo growpart /dev/nvme0n1 1[/cyan] (adjust device name as needed)"
            )
            console.print("4. Expand the filesystem: [cyan]sudo xfs_growfs /[/cyan]")

    except ClientError as e:
        console.print(f"[red]❌ Failed to resize engine: {e}[/red]")
        raise typer.Exit(1)
|
1778
|
-
|
1779
|
-
|
1780
|
-
@engine_app.command("gami")
def create_ami(
    name_or_id: str = typer.Argument(
        help="Engine name or instance ID to create AMI from"
    ),
):
    """Create a 'Golden AMI' from a running engine.

    This process is for creating a pre-warmed, standardized machine image
    that can be used to launch new engines more quickly.

    IMPORTANT:
    - The engine MUST have all studios detached before running this command.
    - This process will make the source engine unusable. You should
      plan to TERMINATE the engine after the AMI is created.
    """
    check_aws_sso()

    # Get all engines to resolve name and check status
    # We pass check_ready=True to get attached studio info
    response = make_api_request("GET", "/engines", params={"check_ready": "true"})
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    # --- Pre-flight checks ---

    # 1. Check if engine is running (create_image needs a live instance;
    #    the SSM cleanup below also requires a running agent)
    if engine["state"].lower() != "running":
        console.print(f"[red]❌ Engine '{engine['name']}' is not running.[/red]")
        console.print("Please start it before creating an AMI.")
        raise typer.Exit(1)

    # 2. Check for attached studios from the detailed API response
    attached_studios = engine.get("studios", [])
    if attached_studios:
        console.print(
            f"[bold red]❌ Engine '{engine['name']}' has studios attached.[/bold red]"
        )
        console.print("Please detach all studios before creating an AMI:")
        for studio in attached_studios:
            console.print(f" - {studio['user']} ({studio['studio_id']})")
        console.print("\nTo detach, run [bold]dh studio detach[/bold]")
        raise typer.Exit(1)

    # Construct AMI name and description (date-stamped per engine type)
    ami_name = (
        f"prewarmed-engine-{engine['engine_type']}-{datetime.now().strftime('%Y%m%d')}"
    )
    description = (
        f"Amazon Linux 2023 with NVIDIA drivers, Docker, and pre-pulled "
        f"dev container image for {engine['engine_type']} engines"
    )

    console.print(f"Creating AMI from engine [cyan]{engine['name']}[/cyan]...")
    console.print(f"[bold]AMI Name:[/] {ami_name}")
    console.print(f"[bold]Description:[/] {description}")

    console.print(
        "\n[bold yellow]⚠️ Important:[/bold yellow]\n"
        "1. This process will run cleanup scripts on the engine.\n"
        "2. The source engine should be [bold]terminated[/bold] after the AMI is created.\n"
    )

    if not Confirm.ask("Continue with AMI creation?"):
        raise typer.Exit()

    # Create AMI using EC2 client directly, as the backend logic is too complex
    ec2 = boto3.client("ec2", region_name="us-east-1")
    ssm = boto3.client("ssm", region_name="us-east-1")

    try:
        # Clean up instance state before snapshotting: remove the first-boot
        # sentinel (so clones re-run first-boot setup), shell history, temp
        # files, logs, and SSM identity (so clones register as new managed
        # instances).
        console.print("Cleaning up instance for AMI creation...")
        cleanup_commands = [
            "sudo rm -f /opt/dayhoff/first_boot_complete.sentinel",
            "history -c",
            "sudo rm -rf /tmp/* /var/log/messages /var/log/cloud-init.log",
            "sudo rm -rf /var/lib/amazon/ssm/* /etc/amazon/ssm/*",
            "sleep 2 && sudo systemctl stop amazon-ssm-agent &",  # Stop agent last
        ]

        # The invocation result is intentionally not polled: stopping the
        # SSM agent as the final step prevents a clean status report.
        cleanup_response = ssm.send_command(
            InstanceIds=[engine["instance_id"]],
            DocumentName="AWS-RunShellScript",
            Parameters={"commands": cleanup_commands, "executionTimeout": ["120"]},
        )

        # Acknowledge that the SSM command might be in progress as the agent shuts down
        console.print(
            "[dim]ℹ️ Cleanup command sent (status may show 'InProgress' as SSM agent stops)[/dim]"
        )

        # Create the AMI (NoReboot=False: the instance is rebooted for a
        # consistent filesystem snapshot).
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            transient=True,
        ) as progress:
            task = progress.add_task(
                "Creating AMI (this will take several minutes)...", total=None
            )

            response = ec2.create_image(
                InstanceId=engine["instance_id"],
                Name=ami_name,
                Description=description,
                NoReboot=False,
                TagSpecifications=[
                    {
                        "ResourceType": "image",
                        "Tags": [
                            {"Key": "Environment", "Value": "dev"},
                            {"Key": "Type", "Value": "golden-ami"},
                            {"Key": "EngineType", "Value": engine["engine_type"]},
                            {"Key": "Name", "Value": ami_name},
                        ],
                    }
                ],
            )

            ami_id = response["ImageId"]
            progress.update(
                task,
                completed=True,
                description=f"[green]✓ AMI creation initiated![/green]",
            )

        console.print(f" [bold]AMI ID:[/] {ami_id}")
        console.print("\nThe AMI creation process will continue in the background.")
        console.print("You can monitor progress in the EC2 Console under 'AMIs'.")
        console.print(
            "\nOnce complete, update the AMI ID in [bold]terraform/environments/dev/variables.tf[/bold] "
            "and run [bold]terraform apply[/bold]."
        )
        console.print(
            f"\nRemember to [bold red]terminate the source engine '{engine['name']}'[/bold red] to save costs."
        )

    except ClientError as e:
        console.print(f"[red]❌ Failed to create AMI: {e}[/red]")
        raise typer.Exit(1)
|
1925
|
-
|
1926
|
-
|
1927
|
-
# ==================== STUDIO COMMANDS ====================
|
1928
|
-
|
1929
|
-
|
1930
|
-
def get_user_studio(username: str) -> Optional[Dict]:
    """Return the studio record owned by ``username``.

    Returns None when the API call fails or the user has no studio.
    If the backend ever reports multiple studios for one user, the
    first one listed wins.
    """
    listing = make_api_request("GET", "/studios")
    if listing.status_code != 200:
        return None

    for record in listing.json().get("studios", []):
        if record["user"] == username:
            return record
    return None
|
1940
|
-
|
1941
|
-
|
1942
|
-
@studio_app.command("create")
def create_studio(
    size_gb: int = typer.Option(50, "--size", "-s", help="Studio size in GB"),
):
    """Create a new studio for the current user."""
    username = check_aws_sso()

    # Each user gets at most one studio; bail out early if one exists.
    existing = get_user_studio(username)
    if existing:
        console.print(
            f"[yellow]You already have a studio: {existing['studio_id']}[/yellow]"
        )
        return

    console.print(f"Creating {size_gb}GB studio for user [cyan]{username}[/cyan]...")

    # Show a transient spinner while the backend provisions the volume.
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        transient=True,
    ) as spinner:
        spinner.add_task("Creating studio volume...", total=None)
        create_resp = make_api_request(
            "POST",
            "/studios",
            json_data={"user": username, "size_gb": size_gb},
        )

    if create_resp.status_code != 201:
        reason = create_resp.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to create studio: {reason}[/red]")
        return

    payload = create_resp.json()
    console.print("[green]✓ Studio created successfully![/green]")
    console.print(f"Studio ID: [cyan]{payload['studio_id']}[/cyan]")
    console.print(f"Size: {payload['size_gb']}GB")
    console.print("\nNext step: [cyan]dh studio attach <engine-name>[/cyan]")
|
1981
|
-
|
1982
|
-
|
1983
|
-
@studio_app.command("status")
def studio_status(
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Check status for a different user (admin only)"
    ),
):
    """Show status of your studio.

    Prints a Rich panel with the studio's ID, owner, lifecycle status,
    size, and creation date. When the studio is attached, the owning
    engine's instance ID (and, if resolvable, its name) is included.
    """
    username = check_aws_sso()

    # Use specified user if provided, otherwise use current user
    target_user = user if user else username

    # Add warning when checking another user's studio
    if target_user != username:
        console.print(
            f"[yellow]⚠️ Checking studio status for user: {target_user}[/yellow]"
        )

    studio = get_user_studio(target_user)
    if not studio:
        if target_user == username:
            console.print("[yellow]You don't have a studio yet.[/yellow]")
            console.print("Create one with: [cyan]dh studio create[/cyan]")
        else:
            console.print(f"[yellow]User {target_user} doesn't have a studio.[/yellow]")
        return

    # Create status panel
    # Format status with colors: the API's "in-use" is shown to users as
    # "attached"; transitional states are yellow, idle states green.
    status = studio["status"]
    if status == "in-use":
        status_display = "[bright_blue]attached[/bright_blue]"
    elif status in ["attaching", "detaching"]:
        status_display = f"[yellow]{status}[/yellow]"
    else:
        status_display = f"[green]{status}[/green]"

    status_lines = [
        f"[bold]Studio ID:[/bold] {studio['studio_id']}",
        f"[bold]User:[/bold] {studio['user']}",
        f"[bold]Status:[/bold] {status_display}",
        f"[bold]Size:[/bold] {studio['size_gb']}GB",
        f"[bold]Created:[/bold] {studio['creation_date']}",
    ]

    if studio.get("attached_vm_id"):
        status_lines.append(f"[bold]Attached to:[/bold] {studio['attached_vm_id']}")

        # Try to get engine details (best effort — a failed lookup simply
        # omits the engine name line).
        response = make_api_request("GET", "/engines")
        if response.status_code == 200:
            engines = response.json().get("engines", [])
            attached_engine = next(
                (e for e in engines if e["instance_id"] == studio["attached_vm_id"]),
                None,
            )
            if attached_engine:
                status_lines.append(
                    f"[bold]Engine Name:[/bold] {attached_engine['name']}"
                )

    panel = Panel(
        "\n".join(status_lines),
        title="Studio Details",
        border_style="blue",
    )
    console.print(panel)
|
2050
|
-
|
2051
|
-
|
2052
|
-
def _is_studio_attached(target_studio_id: str, target_vm_id: str) -> bool:
    """Return True when the given studio already shows as attached to the VM.

    Using this extra check lets us stop the outer retry loop as soon as the
    asynchronous attach operation actually finishes, even in the unlikely
    event that the operation-tracking DynamoDB record is not yet updated.
    """

    def attached_here(record) -> bool:
        # A studio counts as attached only when both its status and its
        # target VM match.
        return (
            record.get("status") == "in-use"
            and record.get("attached_vm_id") == target_vm_id
        )

    # First try the per-studio endpoint – fastest.
    direct_resp = make_api_request("GET", f"/studios/{target_studio_id}")
    if direct_resp.status_code == 200 and attached_here(direct_resp.json()):
        return True

    # Fallback: list + filter (covers edge-cases where the direct endpoint
    # is slower to update IAM/APIGW mapping than the list endpoint).
    list_resp = make_api_request("GET", "/studios")
    if list_resp.status_code == 200:
        return any(
            record.get("studio_id") == target_studio_id and attached_here(record)
            for record in list_resp.json().get("studios", [])
        )

    return False
|
2080
|
-
|
2081
|
-
|
2082
|
-
@studio_app.command("attach")
def attach_studio(
    engine_name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Attach a different user's studio (admin only)"
    ),
):
    """Attach your studio to an engine.

    Offers to create a studio if the user has none, detaches it first if it
    is attached elsewhere, starts a stopped engine on request, and then
    retries the attach with exponential backoff until the backend reports
    the studio attached. On success the local SSH config is updated so the
    engine can be reached as ``ssh <engine-name>``.
    """
    username = check_aws_sso()

    # Check for Session Manager Plugin since we'll update SSH config
    if not check_session_manager_plugin():
        raise typer.Exit(1)

    # Use specified user if provided, otherwise use current user
    target_user = user if user else username

    # Add confirmation when attaching another user's studio
    if target_user != username:
        console.print(f"[yellow]⚠️ Managing studio for user: {target_user}[/yellow]")
        if not Confirm.ask(f"Are you sure you want to attach {target_user}'s studio?"):
            console.print("Operation cancelled.")
            return

    # Get user's studio
    studio = get_user_studio(target_user)
    if not studio:
        if target_user == username:
            console.print("[yellow]You don't have a studio yet.[/yellow]")
            if Confirm.ask("Would you like to create one now?"):
                size = IntPrompt.ask("Studio size (GB)", default=50)
                response = make_api_request(
                    "POST",
                    "/studios",
                    json_data={"user": username, "size_gb": size},
                )
                if response.status_code != 201:
                    console.print("[red]❌ Failed to create studio[/red]")
                    raise typer.Exit(1)
                studio = response.json()
                # NOTE(review): this self-assignment is a no-op kept from the
                # original; the create response already uses "studio_id".
                studio["studio_id"] = studio["studio_id"]  # Normalize key
            else:
                raise typer.Exit(0)
        else:
            console.print(f"[red]❌ User {target_user} doesn't have a studio.[/red]")
            raise typer.Exit(1)

    # Check if already attached — offer to detach and move it.
    if studio.get("status") == "in-use":
        console.print(
            f"[yellow]Studio is already attached to {studio.get('attached_vm_id')}[/yellow]"
        )
        if not Confirm.ask("Detach and reattach to new engine?"):
            return
        # Detach first
        response = make_api_request("POST", f"/studios/{studio['studio_id']}/detach")
        if response.status_code != 200:
            console.print("[red]❌ Failed to detach studio[/red]")
            raise typer.Exit(1)

    # Get all engines to resolve name
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(engine_name_or_id, engines)

    # Flag to track if we started the engine in this command (affects retry length)
    engine_started_now: bool = False

    if engine["state"].lower() != "running":
        console.print(f"[yellow]⚠️ Engine is {engine['state']}[/yellow]")
        if engine["state"].lower() == "stopped" and Confirm.ask(
            "Start the engine first?"
        ):
            response = make_api_request(
                "POST", f"/engines/{engine['instance_id']}/start"
            )
            if response.status_code != 200:
                console.print("[red]❌ Failed to start engine[/red]")
                raise typer.Exit(1)
            console.print("[green]✓ Engine started[/green]")
            # Mark that we booted the engine so attach loop gets extended retries
            engine_started_now = True
            # No further waiting here – attachment attempts below handle retry logic while the
            # engine finishes booting.
        else:
            raise typer.Exit(1)

    # Retrieve SSH public key (required for authorised_keys provisioning)
    try:
        public_key = get_ssh_public_key()
    except FileNotFoundError as e:
        console.print(f"[red]❌ {e}[/red]")
        raise typer.Exit(1)

    console.print(f"Attaching studio to engine [cyan]{engine['name']}[/cyan]...")

    # Determine retry strategy based on whether we just started the engine
    if engine_started_now:
        max_attempts = 40  # About 7 minutes total with exponential backoff
        base_delay = 8
        max_delay = 20
    else:
        max_attempts = 15  # About 2 minutes total with exponential backoff
        base_delay = 5
        max_delay = 10

    # Unified retry loop with exponential backoff
    with Progress(
        SpinnerColumn(),
        TimeElapsedColumn(),
        TextColumn("[progress.description]{task.description}"),
        transient=True,
    ) as prog:
        desc = (
            "Attaching studio (engine is still booting)…"
            if engine_started_now
            else "Attaching studio…"
        )
        task = prog.add_task(desc, total=None)

        consecutive_not_ready = 0
        last_error = None

        for attempt in range(max_attempts):
            # Check if the attach already completed (covers async completion
            # between iterations)
            if _is_studio_attached(studio["studio_id"], engine["instance_id"]):
                success = True
                break

            success, error_msg = _attempt_studio_attach(
                studio, engine, target_user, public_key
            )

            if success:
                break  # success!

            if error_msg:
                # Fatal error – bubble up immediately
                console.print(f"[red]❌ Failed to attach studio: {error_msg}[/red]")

                # Suggest repair command if engine seems broken
                if "not ready" in error_msg.lower() and attempt > 5:
                    console.print(
                        f"\n[yellow]Engine may be in a bad state. Try:[/yellow]"
                    )
                    console.print(f"[dim]  dh engine repair {engine['name']}[/dim]")
                return

            # Track consecutive "not ready" responses
            consecutive_not_ready += 1
            last_error = "Engine not ready"

            # Update progress display (every third attempt to avoid churn)
            if attempt % 3 == 0:
                prog.update(
                    task,
                    description=f"{desc} attempt {attempt+1}/{max_attempts}",
                )

            # If engine seems stuck after many attempts, show a hint
            if consecutive_not_ready > 10 and attempt == 10:
                console.print(
                    "[yellow]Engine is taking longer than expected to become ready.[/yellow]"
                )
                console.print(
                    "[dim]This can happen after GAMI creation or if the engine is still bootstrapping.[/dim]"
                )

            # Exponential backoff with jitter (growth capped after 5 attempts,
            # delay capped at max_delay)
            delay = min(base_delay * (1.5 ** min(attempt, 5)), max_delay)
            delay += time.time() % 2  # Add 0-2 seconds of jitter
            time.sleep(delay)

        else:
            # All attempts exhausted (for/else: loop finished without break)
            console.print(
                f"[yellow]Engine is not becoming ready after {max_attempts} attempts.[/yellow]"
            )
            if last_error:
                console.print(f"[dim]Last issue: {last_error}[/dim]")
            console.print("\n[yellow]You can try:[/yellow]")
            console.print(
                f" 1. Wait a minute and retry: [cyan]dh studio attach {engine['name']}[/cyan]"
            )
            console.print(
                f" 2. Check engine status: [cyan]dh engine status {engine['name']}[/cyan]"
            )
            console.print(
                f" 3. Repair the engine: [cyan]dh engine repair {engine['name']}[/cyan]"
            )
            return

    # Successful attach path
    console.print(f"[green]✓ Studio attached successfully![/green]")

    # Update SSH config - use target_user for the connection
    update_ssh_config_entry(engine["name"], engine["instance_id"], target_user)
    console.print(f"[green]✓ SSH config updated[/green]")
    console.print(f"\nConnect with: [cyan]ssh {engine['name']}[/cyan]")
    console.print(f"Files are at: [cyan]/studios/{target_user}[/cyan]")
|
2286
|
-
|
2287
|
-
|
2288
|
-
def _attempt_studio_attach(studio, engine, target_user, public_key):
    """Make one attach attempt and classify the outcome for the retry loop.

    Args:
        studio: Studio record (must contain "studio_id").
        engine: Engine record (must contain "instance_id").
        target_user: User the studio belongs to.
        public_key: SSH public key to provision on the engine.

    Returns:
        Tuple[bool, Optional[str]]:
        - (True, None): studio is attached to the target engine.
        - (False, None): recoverable condition — caller should retry.
        - (False, message): fatal error — caller should abort and surface it.
    """
    response = make_api_request(
        "POST",
        f"/studios/{studio['studio_id']}/attach",
        json_data={
            "vm_id": engine["instance_id"],
            "user": target_user,
            "public_key": public_key,
        },
    )

    # Fast-path success
    if response.status_code == 200:
        return True, None

    # Asynchronous path – API returned 202 Accepted and operation tracking ID
    if response.status_code == 202:
        # The operation status polling is broken in the Lambda, so we just
        # wait and check if the studio is actually attached
        time.sleep(5)  # Give the async operation a moment to start

        # Check periodically if the studio is attached
        for check in range(20):  # Check for up to 60 seconds
            if _is_studio_attached(studio["studio_id"], engine["instance_id"]):
                return True, None
            time.sleep(3)

        # If we get here, attachment didn't complete in reasonable time
        return False, None  # Return None to trigger retry

    # --- determine if we should retry ---
    recoverable = False
    # Guard against non-JSON error bodies (e.g. an API-gateway HTML error
    # page) so a malformed response is treated as an error message rather
    # than crashing the retry loop.
    try:
        error_text = response.json().get("error", "Unknown error")
    except ValueError:
        error_text = f"Unknown error (HTTP {response.status_code})"
    err_msg = error_text.lower()

    # Check for "Studio is not available (status: in-use)" which means it's already attached
    if (
        response.status_code == 400
        and "not available" in err_msg
        and "in-use" in err_msg
    ):
        # Studio is already attached somewhere - check if it's to THIS engine
        if _is_studio_attached(studio["studio_id"], engine["instance_id"]):
            return True, None  # It's attached to our target engine - success!
        else:
            return False, error_text  # It's attached elsewhere - fatal error

    if response.status_code in (409, 503):
        # Conflict / service-unavailable are transient by definition.
        recoverable = True
    else:
        RECOVERABLE_PATTERNS = [
            "not ready",
            "still starting",
            "initializing",
            "failed to mount",
            "device busy",
            "pending",  # VM state pending
        ]
        FATAL_PATTERNS = [
            "permission",
        ]
        # Fatal patterns win over recoverable ones when both match.
        if any(p in err_msg for p in FATAL_PATTERNS):
            recoverable = False
        elif any(p in err_msg for p in RECOVERABLE_PATTERNS):
            recoverable = True

    if not recoverable:
        # fatal – abort immediately
        return False, error_text

    # recoverable – signal caller to retry without treating as error
    return False, None
|
2360
|
-
|
2361
|
-
|
2362
|
-
# Note: _poll_operation was removed because the Lambda's operation tracking is broken.
|
2363
|
-
# We now use _is_studio_attached() to check if the studio is actually attached instead.
|
2364
|
-
|
2365
|
-
|
2366
|
-
@studio_app.command("detach")
def detach_studio(
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Detach a different user's studio (admin only)"
    ),
):
    """Detach your studio from its current engine."""
    username = check_aws_sso()

    # Operate on the requested user's studio, defaulting to the caller's own.
    target_user = user if user else username
    acting_on_self = target_user == username

    # Detaching someone else's studio is an admin action — confirm it.
    if not acting_on_self:
        console.print(f"[yellow]⚠️ Managing studio for user: {target_user}[/yellow]")
        if not Confirm.ask(f"Are you sure you want to detach {target_user}'s studio?"):
            console.print("Operation cancelled.")
            return

    studio = get_user_studio(target_user)
    if not studio:
        if acting_on_self:
            console.print("[yellow]You don't have a studio.[/yellow]")
        else:
            console.print(f"[yellow]User {target_user} doesn't have a studio.[/yellow]")
        return

    # Nothing to do unless the studio is currently attached.
    if studio.get("status") != "in-use":
        if acting_on_self:
            console.print("[yellow]Your studio is not attached to any engine.[/yellow]")
        else:
            console.print(
                f"[yellow]{target_user}'s studio is not attached to any engine.[/yellow]"
            )
        return

    console.print(f"Detaching studio from {studio.get('attached_vm_id')}...")

    detach_resp = make_api_request("POST", f"/studios/{studio['studio_id']}/detach")

    if detach_resp.status_code == 200:
        console.print("[green]✓ Studio detached successfully![/green]")
    else:
        reason = detach_resp.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to detach studio: {reason}[/red]")
|
2411
|
-
|
2412
|
-
|
2413
|
-
@studio_app.command("delete")
def delete_studio(
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Delete a different user's studio (admin only)"
    ),
):
    """Delete your studio permanently."""
    username = check_aws_sso()
    target_user = user or username
    acting_on_self = target_user == username

    # Make admin deletions visually loud before anything else happens.
    if not acting_on_self:
        console.print(
            f"[red]⚠️ ADMIN ACTION: Deleting studio for user: {target_user}[/red]"
        )

    studio = get_user_studio(target_user)
    if not studio:
        if acting_on_self:
            console.print("[yellow]You don't have a studio to delete.[/yellow]")
        else:
            console.print(
                f"[yellow]User {target_user} doesn't have a studio to delete.[/yellow]"
            )
        return

    # Show what is about to be destroyed.
    console.print(
        "[red]⚠️ WARNING: This will permanently delete the studio and all data![/red]"
    )
    console.print(f"Studio ID: {studio['studio_id']}")
    console.print(f"User: {target_user}")
    console.print(f"Size: {studio['size_gb']}GB")

    # Three-step confirmation: yes/no, cannot-undo, then a typed "DELETE".
    first_question = (
        f"\nAre you sure you want to delete {target_user}'s studio?"
        if not acting_on_self
        else "\nAre you sure you want to delete your studio?"
    )
    if not Confirm.ask(first_question):
        console.print("Deletion cancelled.")
        return

    if not Confirm.ask("[red]This action cannot be undone. Continue?[/red]"):
        console.print("Deletion cancelled.")
        return

    if Prompt.ask('Type "DELETE" to confirm permanent deletion') != "DELETE":
        console.print("Deletion cancelled.")
        return

    response = make_api_request("DELETE", f"/studios/{studio['studio_id']}")

    if response.status_code == 200:
        console.print(f"[green]✓ Studio deleted successfully![/green]")
    else:
        console.print(
            f"[red]❌ Failed to delete studio: {response.json().get('error', 'Unknown error')}[/red]"
        )
@studio_app.command("list")
def list_studios(
    all_users: bool = typer.Option(
        False, "--all", "-a", help="Show all users' studios"
    ),
):
    """List studios.

    By default only the current user's studios are shown; pass --all/-a to
    include every user's studio, as the option help has always advertised.
    """
    username = check_aws_sso()

    response = make_api_request("GET", "/studios")

    if response.status_code == 200:
        studios = response.json().get("studios", [])

        # Bug fix: the --all flag was previously accepted but ignored, so
        # every studio was always listed. Honor the documented default of
        # showing only the caller's studios.
        if not all_users:
            studios = [s for s in studios if s.get("user") == username]

        if not studios:
            console.print("No studios found.")
            return

        # Get all engines so attached_vm_id can be shown with a friendly name.
        engines_response = make_api_request("GET", "/engines")
        engines = {}
        if engines_response.status_code == 200:
            for engine in engines_response.json().get("engines", []):
                engines[engine["instance_id"]] = engine["name"]

        table = Table(title="Studios", box=box.ROUNDED)
        table.add_column("Studio ID", style="cyan")
        table.add_column("User")
        table.add_column("Status")
        table.add_column("Size", justify="right")
        table.add_column("Disk Usage", justify="right")
        table.add_column("Attached To")

        for studio in studios:
            # Map backend status values onto friendlier display labels.
            if studio["status"] == "in-use":
                status_display = "[bright_blue]attached[/bright_blue]"
            elif studio["status"] in ["attaching", "detaching"]:
                status_display = "[yellow]" + studio["status"] + "[/yellow]"
            else:
                status_display = "[green]available[/green]"

            attached_to = "-"
            disk_usage = "?/?"
            if studio.get("attached_vm_id"):
                vm_id = studio["attached_vm_id"]
                engine_name = engines.get(vm_id, "unknown")
                attached_to = f"{engine_name} ({vm_id})"

                # Disk usage is only queryable (via SSM) while attached.
                if studio["status"] == "in-use":
                    usage = get_studio_disk_usage_via_ssm(vm_id, studio["user"])
                    if usage:
                        disk_usage = usage

            table.add_row(
                studio["studio_id"],
                studio["user"],
                status_display,
                f"{studio['size_gb']}GB",
                disk_usage,
                attached_to,
            )

        console.print(table)
    else:
        error = response.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to list studios: {error}[/red]")
@studio_app.command("reset")
def reset_studio(
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Reset a different user's studio"
    ),
):
    """Reset a stuck studio (admin operation).

    Bypasses the Studio Manager API and writes directly to EC2/DynamoDB:
    force-detaches the EBS volume if it is still attached, then marks the
    studio record "available" with no attached VM. Use when a studio is
    wedged in "attaching"/"in-use" and the normal detach path fails.
    """
    username = check_aws_sso()

    # Use specified user if provided, otherwise use current user
    target_user = user if user else username

    # Add warning when resetting another user's studio
    if target_user != username:
        console.print(f"[yellow]⚠️ Resetting studio for user: {target_user}[/yellow]")

    studio = get_user_studio(target_user)
    if not studio:
        if target_user == username:
            console.print("[yellow]You don't have a studio.[/yellow]")
        else:
            console.print(f"[yellow]User {target_user} doesn't have a studio.[/yellow]")
        return

    console.print(f"[yellow]⚠️ This will force-reset the studio state[/yellow]")
    console.print(f"Current status: {studio['status']}")
    if studio.get("attached_vm_id"):
        console.print(f"Listed as attached to: {studio['attached_vm_id']}")

    if not Confirm.ask("\nReset studio state?"):
        console.print("Reset cancelled.")
        return

    # Direct DynamoDB update
    console.print("Resetting studio state...")

    dynamodb = boto3.resource("dynamodb", region_name="us-east-1")
    table = dynamodb.Table("dev-studios")

    try:
        # Check if volume is actually attached.
        # NOTE: studio_id doubles as the EBS VolumeId here — the studio's
        # backing volume is looked up directly by that ID.
        ec2 = boto3.client("ec2", region_name="us-east-1")
        volumes = ec2.describe_volumes(VolumeIds=[studio["studio_id"]])

        if volumes["Volumes"]:
            volume = volumes["Volumes"][0]
            attachments = volume.get("Attachments", [])
            if attachments:
                console.print(
                    f"[red]Volume is still attached to {attachments[0]['InstanceId']}![/red]"
                )
                # Force=True detaches even if the instance hasn't unmounted
                # the filesystem; the waiter blocks until EC2 reports the
                # volume "available" again.
                if Confirm.ask("Force-detach the volume?"):
                    ec2.detach_volume(
                        VolumeId=studio["studio_id"],
                        InstanceId=attachments[0]["InstanceId"],
                        Force=True,
                    )
                    console.print("Waiting for volume to detach...")
                    waiter = ec2.get_waiter("volume_available")
                    waiter.wait(VolumeIds=[studio["studio_id"]])

        # Reset in DynamoDB – align attribute names with Studio Manager backend.
        # "Status" is a DynamoDB reserved word, hence the #st alias; None
        # values are stored as the DynamoDB NULL type by the resource API.
        table.update_item(
            Key={"StudioID": studio["studio_id"]},
            UpdateExpression="SET #st = :status, AttachedVMID = :vm_id, AttachedDevice = :device",
            ExpressionAttributeNames={"#st": "Status"},
            ExpressionAttributeValues={
                ":status": "available",
                ":vm_id": None,
                ":device": None,
            },
        )

        console.print(f"[green]✓ Studio reset to available state![/green]")

    except ClientError as e:
        # Covers describe/detach/wait/update failures alike; the record may
        # be left partially reset if the failure happened mid-sequence.
        console.print(f"[red]❌ Failed to reset studio: {e}[/red]")
@studio_app.command("resize")
def resize_studio(
    size: int = typer.Option(..., "--size", "-s", help="New size in GB"),
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Resize a different user's studio (admin only)"
    ),
):
    """Resize your studio volume (requires detachment).

    Grow-only: the new size must exceed the current size (EBS volumes
    cannot shrink). If the studio is attached, it is detached first (with
    confirmation). After the resize API call, this polls EC2's volume
    modification status and returns as soon as the size change itself is
    done — the background "optimizing" phase needs no user action, and the
    filesystem is expanded on next attach.
    """
    username = check_aws_sso()

    # Use specified user if provided, otherwise use current user
    target_user = user if user else username

    # Add warning when resizing another user's studio
    if target_user != username:
        console.print(f"[yellow]⚠️ Resizing studio for user: {target_user}[/yellow]")

    studio = get_user_studio(target_user)
    if not studio:
        if target_user == username:
            console.print("[yellow]You don't have a studio yet.[/yellow]")
        else:
            console.print(f"[yellow]User {target_user} doesn't have a studio.[/yellow]")
        return

    current_size = studio["size_gb"]

    if size <= current_size:
        console.print(
            f"[red]❌ New size ({size}GB) must be larger than current size ({current_size}GB)[/red]"
        )
        raise typer.Exit(1)

    # Check if studio is attached — EBS modification requires it detached here.
    if studio["status"] == "in-use":
        console.print("[yellow]⚠️ Studio must be detached before resizing[/yellow]")
        console.print(f"Currently attached to: {studio.get('attached_vm_id')}")

        if not Confirm.ask("\nDetach studio and proceed with resize?"):
            console.print("Resize cancelled.")
            return

        # Detach the studio
        console.print("Detaching studio...")
        response = make_api_request("POST", f"/studios/{studio['studio_id']}/detach")
        if response.status_code != 200:
            console.print("[red]❌ Failed to detach studio[/red]")
            raise typer.Exit(1)

        console.print("[green]✓ Studio detached[/green]")

        # Wait a moment for detachment to complete
        # (fixed delay — the detach API returns before EC2 finishes).
        time.sleep(5)

    console.print(f"[yellow]Resizing studio from {current_size}GB to {size}GB[/yellow]")

    # Call the resize API
    resize_response = make_api_request(
        "POST", f"/studios/{studio['studio_id']}/resize", json_data={"size": size}
    )

    if resize_response.status_code != 200:
        error = resize_response.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to resize studio: {error}[/red]")
        raise typer.Exit(1)

    # Wait for volume modification to complete.
    # studio_id doubles as the EBS VolumeId for the modification query.
    ec2 = boto3.client("ec2", region_name="us-east-1")
    console.print("Resizing volume...")

    # Track progress so each percentage is printed at most once.
    last_progress = 0

    while True:
        try:
            mod_state = ec2.describe_volumes_modifications(
                VolumeIds=[studio["studio_id"]]
            )
            if not mod_state["VolumesModifications"]:
                break  # Modification complete

            modification = mod_state["VolumesModifications"][0]
            state = modification["ModificationState"]
            progress = modification.get("Progress", 0)

            # Show progress updates only for the resize phase
            if state == "modifying" and progress > last_progress:
                console.print(f"[yellow]Progress: {progress}%[/yellow]")
                last_progress = progress

            # Exit as soon as optimization starts (resize is complete)
            if state == "optimizing":
                console.print(
                    f"[green]✓ Studio resized successfully to {size}GB![/green]"
                )
                console.print(
                    "[dim]AWS is optimizing the volume in the background (no action needed).[/dim]"
                )
                break

            if state == "completed":
                console.print(
                    f"[green]✓ Studio resized successfully to {size}GB![/green]"
                )
                break
            elif state == "failed":
                console.print("[red]❌ Volume modification failed[/red]")
                raise typer.Exit(1)

            time.sleep(2)  # Check more frequently for better UX

        except ClientError:
            # Modification might be complete — describe_volumes_modifications
            # can error once the modification record expires.
            console.print(f"[green]✓ Studio resized successfully to {size}GB![/green]")
            break

    console.print(
        "\n[dim]The filesystem will be automatically expanded when you next attach the studio.[/dim]"
    )
    console.print(f"To attach: [cyan]dh studio attach <engine-name>[/cyan]")
# ================= Idle timeout command =================
|
2749
|
-
|
2750
|
-
|
2751
|
-
@engine_app.command("idle")
def idle_timeout_cmd(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    set: Optional[str] = typer.Option(
        None, "--set", "-s", help="New timeout (e.g., 2h30m, 45m)"
    ),
):
    """Show or set the engine idle-detector timeout.

    Without --set, reads IDLE_TIMEOUT_SECONDS from /etc/engine.env on the
    engine (via SSM) and prints it. With --set, rewrites that entry and
    restarts the idle-detector service, then waits for the SSM command to
    reach a terminal state and reports failure honestly instead of
    assuming success.
    """
    check_aws_sso()

    # Resolve engine name/ID against the API inventory.
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    ssm = boto3.client("ssm", region_name="us-east-1")

    def _wait_for_command(command_id: str, attempts: int) -> dict:
        """Poll SSM once per second until the command finishes or we give up."""
        inv: dict = {}
        for _ in range(attempts):
            time.sleep(1)
            inv = ssm.get_command_invocation(
                CommandId=command_id, InstanceId=engine["instance_id"]
            )
            if inv["Status"] in ("Success", "Failed", "Cancelled", "TimedOut"):
                break
        return inv

    if set is None:
        # --- show current timeout setting ---
        resp = ssm.send_command(
            InstanceIds=[engine["instance_id"]],
            DocumentName="AWS-RunShellScript",
            Parameters={
                "commands": [
                    "grep -E '^IDLE_TIMEOUT_SECONDS=' /etc/engine.env || echo 'IDLE_TIMEOUT_SECONDS=1800'"
                ],
                "executionTimeout": ["10"],
            },
        )
        inv = _wait_for_command(resp["Command"]["CommandId"], attempts=10)
        if inv.get("Status") == "Success":
            line = inv["StandardOutputContent"].strip()
            # Bug fix: a malformed line previously crashed on int(); fall
            # back to the 1800s default instead.
            try:
                secs = int(line.split("=", 1)[1])
            except (IndexError, ValueError):
                secs = 1800
            console.print(f"Current idle timeout: {secs//60}m ({secs} seconds)")
        else:
            console.print("[red]❌ Could not retrieve idle timeout[/red]")
        return

    # ----- set new value -----
    m = re.match(r"^(?:(\d+)h)?(?:(\d+)m)?$", set)
    if not m:
        console.print("[red]❌ Invalid duration format. Use e.g. 2h, 45m, 1h30m[/red]")
        raise typer.Exit(1)
    hours = int(m.group(1) or 0)
    minutes = int(m.group(2) or 0)
    seconds = hours * 3600 + minutes * 60
    if seconds == 0:
        console.print("[red]❌ Duration must be greater than zero[/red]")
        raise typer.Exit(1)

    console.print(f"Setting idle timeout to {set} ({seconds} seconds)…")

    # Drop any existing IDLE_TIMEOUT_SECONDS line, append the new value,
    # then restart the detector so it re-reads /etc/engine.env.
    cmd = (
        "sudo sed -i '/^IDLE_TIMEOUT_SECONDS=/d' /etc/engine.env && "
        f"echo 'IDLE_TIMEOUT_SECONDS={seconds}' | sudo tee -a /etc/engine.env >/dev/null && "
        "sudo systemctl restart engine-idle-detector.service"
    )

    resp = ssm.send_command(
        InstanceIds=[engine["instance_id"]],
        DocumentName="AWS-RunShellScript",
        Parameters={"commands": [cmd], "executionTimeout": ["60"]},
    )
    # Bug fix: the old code slept 2 seconds and unconditionally printed
    # success. Verify the SSM command actually completed.
    inv = _wait_for_command(resp["Command"]["CommandId"], attempts=30)
    if inv.get("Status") == "Success":
        console.print(f"[green]✓ Idle timeout updated to {set}[/green]")
    else:
        status = inv.get("Status", "Unknown")
        console.print(
            f"[red]❌ Failed to update idle timeout (status: {status})[/red]"
        )
        raise typer.Exit(1)
# Add this near the end, after the idle-timeout command
|
2829
|
-
|
2830
|
-
|
2831
|
-
@engine_app.command("debug")
def debug_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
):
    """Debug engine bootstrap status and files."""
    check_aws_sso()

    # Resolve the target engine from the API inventory.
    engines_resp = make_api_request("GET", "/engines")
    if engines_resp.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engine = resolve_engine(name_or_id, engines_resp.json().get("engines", []))

    console.print(f"[bold]Debug info for {engine['name']}:[/bold]\n")

    ssm = boto3.client("ssm", region_name="us-east-1")
    instance_id = engine["instance_id"]

    # (label, shell command) pairs. Each command falls back through legacy
    # paths and echoes 'MISSING' rather than failing outright.
    checks = [
        (
            "Stage file",
            "cat /opt/dayhoff/state/engine-init.stage 2>/dev/null || cat /var/run/engine-init.stage 2>/dev/null || echo 'MISSING'",
        ),
        (
            "Health file",
            "cat /opt/dayhoff/state/engine-health.json 2>/dev/null || cat /var/run/engine-health.json 2>/dev/null || echo 'MISSING'",
        ),
        (
            "Sentinel file",
            "ls -la /opt/dayhoff/first_boot_complete.sentinel 2>/dev/null || echo 'MISSING'",
        ),
        (
            "Setup service",
            "systemctl status setup-aws-vm.service --no-pager || echo 'Service not found'",
        ),
        (
            "Bootstrap log tail",
            "tail -20 /var/log/engine-setup.log 2>/dev/null || echo 'No log'",
        ),
        ("Environment file", "cat /etc/engine.env 2>/dev/null || echo 'MISSING'"),
    ]

    for label, shell_cmd in checks:
        try:
            sent = ssm.send_command(
                InstanceIds=[instance_id],
                DocumentName="AWS-RunShellScript",
                Parameters={"commands": [shell_cmd], "executionTimeout": ["10"]},
            )
            # Brief pause before fetching the invocation result.
            time.sleep(1)
            invocation = ssm.get_command_invocation(
                CommandId=sent["Command"]["CommandId"], InstanceId=instance_id
            )

            if invocation["Status"] == "Success":
                body = invocation["StandardOutputContent"].strip()
                console.print(f"[cyan]{label}:[/cyan]")
                console.print(f"[dim]{body}[/dim]\n")
            else:
                console.print(f"[cyan]{label}:[/cyan] [red]FAILED[/red]\n")

        except Exception as e:
            # Keep going — one failed check shouldn't hide the rest.
            console.print(f"[cyan]{label}:[/cyan] [red]ERROR: {e}[/red]\n")
@engine_app.command("repair")
def repair_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
):
    """Repair an engine that's stuck in a bad state (e.g., after GAMI creation).

    Runs a fixed SSM shell sequence on the engine that recreates the
    /opt/dayhoff directory tree, re-syncs bootstrap scripts from S3,
    restores the first-boot sentinel and init-stage marker, and restarts
    the SSM agent and idle-detector service. The engine must be running;
    a stopped engine can optionally be started first.
    """
    check_aws_sso()

    # Get all engines to resolve name
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    if engine["state"].lower() != "running":
        console.print(
            f"[yellow]⚠️ Engine is {engine['state']}. Must be running to repair.[/yellow]"
        )
        # Offer to start a stopped engine; any other non-running state aborts.
        if engine["state"].lower() == "stopped" and Confirm.ask(
            "Start the engine first?"
        ):
            response = make_api_request(
                "POST", f"/engines/{engine['instance_id']}/start"
            )
            if response.status_code != 200:
                console.print("[red]❌ Failed to start engine[/red]")
                raise typer.Exit(1)
            console.print("[green]✓ Engine started[/green]")
            console.print("Waiting for engine to become ready...")
            time.sleep(30)  # Give it time to boot (fixed delay, no readiness probe)
        else:
            raise typer.Exit(1)

    console.print(f"[bold]Repairing engine [cyan]{engine['name']}[/cyan][/bold]")
    console.print(
        "[dim]This will restore bootstrap state and ensure all services are running[/dim]\n"
    )

    ssm = boto3.client("ssm", region_name="us-east-1")

    # Repair commands — executed in order by AWS-RunShellScript.
    repair_commands = [
        # Create necessary directories
        "sudo mkdir -p /opt/dayhoff /opt/dayhoff/state /opt/dayhoff/scripts",
        # Download scripts from S3 if missing (bucket name comes from /etc/engine.env)
        "source /etc/engine.env && sudo aws s3 sync s3://${VM_SCRIPTS_BUCKET}/ /opt/dayhoff/scripts/ --exclude '*' --include '*.sh' --quiet",
        "sudo chmod +x /opt/dayhoff/scripts/*.sh 2>/dev/null || true",
        # Restore bootstrap state
        "sudo touch /opt/dayhoff/first_boot_complete.sentinel",
        "echo 'finished' | sudo tee /opt/dayhoff/state/engine-init.stage > /dev/null",
        # Ensure SSM agent is running
        "sudo systemctl restart amazon-ssm-agent 2>/dev/null || true",
        # Restart idle detector (service only)
        "sudo systemctl restart engine-idle-detector.service 2>/dev/null || true",
        # Report status — the '=== Repair Complete ===' marker is parsed below
        "echo '=== Repair Complete ===' && echo 'Sentinel: ' && ls -la /opt/dayhoff/first_boot_complete.sentinel",
        "echo 'Stage: ' && cat /opt/dayhoff/state/engine-init.stage",
        "echo 'Scripts: ' && ls /opt/dayhoff/scripts/*.sh 2>/dev/null | wc -l",
    ]

    try:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            transient=True,
        ) as progress:
            task = progress.add_task("Repairing engine...", total=None)

            response = ssm.send_command(
                InstanceIds=[engine["instance_id"]],
                DocumentName="AWS-RunShellScript",
                Parameters={
                    "commands": repair_commands,
                    "executionTimeout": ["60"],
                },
            )

            command_id = response["Command"]["CommandId"]

            # Wait for command — poll once per second, up to 60s.
            # NOTE(review): statuses like "Cancelled"/"TimedOut" are not in
            # the break list, so those cases run the full 60s before the
            # else-branch below reports failure.
            for _ in range(60):
                time.sleep(1)
                result = ssm.get_command_invocation(
                    CommandId=command_id,
                    InstanceId=engine["instance_id"],
                )
                if result["Status"] in ["Success", "Failed"]:
                    break

            if result["Status"] == "Success":
                output = result["StandardOutputContent"]
                console.print("[green]✓ Engine repaired successfully![/green]\n")

                # Show repair results (everything after the marker echoed above)
                if "=== Repair Complete ===" in output:
                    repair_section = output.split("=== Repair Complete ===")[1].strip()
                    console.print("[bold]Repair Results:[/bold]")
                    console.print(repair_section)

                console.print(
                    "\n[dim]You should now be able to attach studios to this engine.[/dim]"
                )
            else:
                console.print(
                    f"[red]❌ Repair failed: {result.get('StandardErrorContent', 'Unknown error')}[/red]"
                )
                console.print(
                    "\n[yellow]Try running 'dh engine debug' for more information.[/yellow]"
                )

    except Exception as e:
        # Broad catch: SSM/network errors surface here as a single message.
        console.print(f"[red]❌ Failed to repair engine: {e}[/red]")