dayhoff-tools 1.9.9__py3-none-any.whl → 1.9.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3013 +0,0 @@
1
- """Engine and Studio management commands for DHT CLI."""
2
-
3
- import json
4
- import os
5
- import re
6
- import shutil
7
- import subprocess
8
- import sys
9
- import time
10
- from datetime import datetime, timedelta, timezone
11
- from pathlib import Path
12
- from typing import Any, Dict, List, Optional, Tuple
13
-
14
- import boto3
15
- import requests
16
- import typer
17
- from botocore.exceptions import ClientError, NoCredentialsError
18
- from rich import box
19
- from rich.console import Console
20
- from rich.panel import Panel
21
- from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
22
- from rich.prompt import Confirm, IntPrompt, Prompt
23
- from rich.table import Table
24
-
25
# Initialize Typer apps
# These sub-applications are mounted onto the top-level `dh` CLI elsewhere.
engine_app = typer.Typer(help="Manage compute engines for development.")
studio_app = typer.Typer(help="Manage persistent development studios.")

# Shared Rich console used by every command in this module for output.
console = Console()

# Cost information
# Approximate on-demand hourly cost (USD) per engine type; each trailing
# comment names the underlying EC2 instance type the rate was taken from.
HOURLY_COSTS = {
    "cpu": 0.50,  # r6i.2xlarge
    "cpumax": 2.02,  # r7i.8xlarge
    "t4": 0.75,  # g4dn.2xlarge
    "a10g": 1.50,  # g5.2xlarge
    "a100": 21.96,  # p4d.24xlarge
    "4_t4": 3.91,  # g4dn.12xlarge
    "8_t4": 7.83,  # g4dn.metal
    "4_a10g": 6.24,  # g5.12xlarge
    "8_a10g": 16.29,  # g5.48xlarge
}

# SSH config management
# Sentinel appended to `Host` lines so that entries written by this tool can
# be located and replaced later without disturbing user-authored entries.
SSH_MANAGED_COMMENT = "# Managed by dh engine"
46
-
47
- # --------------------------------------------------------------------------------
48
- # Bootstrap stage helpers
49
- # --------------------------------------------------------------------------------
50
-
51
-
52
- def _colour_stage(stage: str) -> str:
53
- """Return colourised stage name for table output."""
54
- if not stage:
55
- return "[dim]-[/dim]"
56
- low = stage.lower()
57
- if low.startswith("error"):
58
- return f"[red]{stage}[/red]"
59
- if low == "finished":
60
- return f"[green]{stage}[/green]"
61
- return f"[yellow]{stage}[/yellow]"
62
-
63
-
64
def _fetch_init_stages(instance_ids: List[str]) -> Dict[str, str]:
    """Look up the DayhoffInitStage tag for a batch of instances at once.

    Best-effort: any AWS error yields a partial (possibly empty) mapping
    rather than an exception.
    """
    stages: Dict[str, str] = {}
    if not instance_ids:
        return stages
    ec2 = boto3.client("ec2", region_name="us-east-1")
    try:
        pages = ec2.get_paginator("describe_instances").paginate(
            InstanceIds=instance_ids
        )
        for page in pages:
            for reservation in page["Reservations"]:
                for instance in reservation["Instances"]:
                    for tag in instance.get("Tags", []):
                        if tag["Key"] == "DayhoffInitStage":
                            # Only record non-empty stage values.
                            if tag["Value"]:
                                stages[instance["InstanceId"]] = tag["Value"]
                            break
    except Exception:
        pass  # best-effort
    return stages
89
-
90
-
91
def check_aws_sso() -> str:
    """Verify AWS SSO credentials and return the caller's username.

    The username is parsed from the STS caller-identity ARN, which for SSO
    sessions has the form:
    ``arn:aws:sts::123456789012:assumed-role/AWSReservedSSO_DeveloperAccess_xxxx/username``

    If no valid credentials are found, the user is offered an interactive
    ``aws sso login`` and, on success, the check is retried.

    Raises:
        typer.Exit: If the user declines to log in, or the login fails.
    """
    try:
        sts = boto3.client("sts")
        identity = sts.get_caller_identity()
        arn = identity["Arn"]
        if "assumed-role" in arn:
            # SSO path: username is the last ARN path component.
            return arn.split("/")[-1]
        # Fallback for other auth methods (e.g. IAM user / federated).
        return identity["UserId"].split(":")[-1]
    except (NoCredentialsError, ClientError):
        console.print("[red]❌ Not logged in to AWS SSO[/red]")
        console.print("Please run: [cyan]aws sso login[/cyan]")
        if Confirm.ask("Would you like to login now?"):
            try:
                # Fix: do NOT capture output here. `aws sso login` is
                # interactive — it must be able to print the verification
                # URL/device code and open the browser. Capturing
                # stdout/stderr hid that prompt from the user.
                subprocess.run(["aws", "sso", "login"], check=True)
                console.print("[green]✓ Successfully logged in![/green]")
                # Retry the identity check with the fresh credentials.
                return check_aws_sso()
            except subprocess.CalledProcessError as e:
                console.print(f"[red]Login failed: {e}[/red]")
        # Fix: previously a declined login fell off the end and returned
        # None despite the `-> str` annotation, breaking callers.
        raise typer.Exit(1)
122
-
123
-
124
def get_api_url() -> str:
    """Fetch the Studio Manager API base URL from SSM Parameter Store.

    Raises:
        typer.Exit: If the parameter is missing or cannot be read.
    """
    ssm = boto3.client("ssm", region_name="us-east-1")
    try:
        param = ssm.get_parameter(Name="/dev/studio-manager/api-url")
    except ClientError as e:
        if e.response["Error"]["Code"] == "ParameterNotFound":
            console.print(
                "[red]❌ API URL parameter not found in SSM Parameter Store[/red]"
            )
            console.print(
                "Please ensure the Studio Manager infrastructure is deployed."
            )
        else:
            console.print(f"[red]❌ Error retrieving API URL: {e}[/red]")
        raise typer.Exit(1)
    return param["Parameter"]["Value"]
141
-
142
-
143
def make_api_request(
    method: str,
    endpoint: str,
    json_data: Optional[Dict] = None,
    params: Optional[Dict] = None,
    timeout: float = 30.0,
) -> requests.Response:
    """Make a Studio Manager API request with uniform error handling.

    Args:
        method: HTTP method; one of "GET", "POST", or "DELETE".
        endpoint: Path appended to the base API URL (e.g. "/engines").
        json_data: JSON body for POST requests.
        params: Query parameters for GET requests.
        timeout: Per-request timeout in seconds. Added so a hung API can no
            longer block the CLI forever; a timeout surfaces as the same
            RequestException handling as any other transport failure.

    Returns:
        The raw requests.Response; HTTP status handling is left to callers.

    Raises:
        ValueError: If an unsupported HTTP method is given.
        typer.Exit: If the request fails at the transport level.
    """
    api_url = get_api_url()
    url = f"{api_url}{endpoint}"

    try:
        if method == "GET":
            return requests.get(url, params=params, timeout=timeout)
        if method == "POST":
            return requests.post(url, json=json_data, timeout=timeout)
        if method == "DELETE":
            return requests.delete(url, timeout=timeout)
        raise ValueError(f"Unsupported HTTP method: {method}")
    except requests.exceptions.RequestException as e:
        console.print(f"[red]❌ API request failed: {e}[/red]")
        raise typer.Exit(1)
167
-
168
-
169
def format_duration(duration: timedelta) -> str:
    """Render a timedelta as a compact string: "Xh Ym", or "Ym" under an hour."""
    total_seconds = int(duration.total_seconds())
    hours, remainder = divmod(total_seconds, 3600)
    minutes = remainder // 60
    return f"{hours}h {minutes}m" if hours > 0 else f"{minutes}m"
179
-
180
-
181
def get_disk_usage_via_ssm(instance_id: str) -> Optional[str]:
    """Query root-filesystem usage on an engine via SSM RunCommand.

    Returns:
        String like "17/50 GB" or None if failed
    """
    try:
        ssm = boto3.client("ssm", region_name="us-east-1")

        # Ask the instance for root filesystem usage, reported in GB.
        send_resp = ssm.send_command(
            InstanceIds=[instance_id],
            DocumentName="AWS-RunShellScript",
            Parameters={
                "commands": [
                    'df -BG / | tail -1 | awk \'{gsub(/G/, "", $2); gsub(/G/, "", $3); print $3 "/" $2 " GB"}\''
                ],
                "executionTimeout": ["10"],
            },
        )
        command_id = send_resp["Command"]["CommandId"]

        # Poll for up to ~5 seconds for the command to finish.
        for _ in range(5):
            time.sleep(1)
            invocation = ssm.get_command_invocation(
                CommandId=command_id,
                InstanceId=instance_id,
            )
            if invocation["Status"] in ("Success", "Failed"):
                break

        if invocation["Status"] != "Success":
            return None
        output = invocation["StandardOutputContent"].strip()
        return output or None

    except Exception:
        # Best-effort: callers render "-" when usage is unavailable.
        return None
224
-
225
-
226
def get_studio_disk_usage_via_ssm(instance_id: str, username: str) -> Optional[str]:
    """Query a studio volume's disk usage on an engine via SSM RunCommand.

    Returns:
        String like "333/500 GB" or None if failed
    """
    try:
        ssm = boto3.client("ssm", region_name="us-east-1")

        # Ask the instance for the studio mount's usage, reported in GB.
        send_resp = ssm.send_command(
            InstanceIds=[instance_id],
            DocumentName="AWS-RunShellScript",
            Parameters={
                "commands": [
                    f'df -BG /studios/{username} 2>/dev/null | tail -1 | awk \'{{gsub(/G/, "", $2); gsub(/G/, "", $3); print $3 "/" $2 " GB"}}\''
                ],
                "executionTimeout": ["10"],
            },
        )
        command_id = send_resp["Command"]["CommandId"]

        # Poll for up to ~5 seconds for the command to finish.
        for _ in range(5):
            time.sleep(1)
            invocation = ssm.get_command_invocation(
                CommandId=command_id,
                InstanceId=instance_id,
            )
            if invocation["Status"] in ("Success", "Failed"):
                break

        if invocation["Status"] != "Success":
            return None
        output = invocation["StandardOutputContent"].strip()
        return output or None

    except Exception:
        # Best-effort: callers render "-" when usage is unavailable.
        return None
268
-
269
-
270
def parse_launch_time(launch_time_str: str) -> datetime:
    """Parse a launch-time string from the API into a datetime.

    Tries ISO-8601 via fromisoformat first (normalising a trailing "Z"),
    then a set of known legacy formats (naive timestamps are assumed UTC);
    if nothing matches, falls back to "now" so uptime math stays sane.
    """
    # Preferred path: fromisoformat handles offsets correctly.
    try:
        return datetime.fromisoformat(launch_time_str.replace("Z", "+00:00"))
    except (ValueError, AttributeError):
        pass

    legacy_formats = (
        "%Y-%m-%dT%H:%M:%S.%fZ",
        "%Y-%m-%dT%H:%M:%SZ",
        "%Y-%m-%dT%H:%M:%S%z",  # ISO format with timezone
        "%Y-%m-%dT%H:%M:%S+00:00",  # Explicit UTC offset
        "%Y-%m-%d %H:%M:%S",
    )
    for fmt in legacy_formats:
        try:
            parsed = datetime.strptime(launch_time_str, fmt)
        except ValueError:
            continue
        # Timestamps without timezone info are assumed to be UTC.
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    # Fallback: assume it's recent.
    return datetime.now(timezone.utc)
301
-
302
-
303
def format_status(state: str, ready: Optional[bool]) -> str:
    """Render an engine state (plus bootstrap readiness) as Rich markup.

    `ready` only matters for running engines: True shows a checkmark,
    False shows a bootstrapping warning, None shows plain "Running".
    Unknown states are returned verbatim.
    """
    normalized = state.lower()
    if normalized == "running":
        if ready is True:
            return "[green]Running ✓[/green]"
        if ready is False:
            return "[yellow]Running ⚠ (Bootstrapping...)[/yellow]"
        return "[green]Running[/green]"
    static_states = {
        "stopped": "[dim]Stopped[/dim]",
        "stopping": "[yellow]Stopping...[/yellow]",
        "pending": "[yellow]Starting...[/yellow]",
    }
    return static_states.get(normalized, state)
320
-
321
-
322
def resolve_engine(name_or_id: str, engines: List[Dict]) -> Dict:
    """Find a single engine by name or instance ID, prompting on ambiguity.

    Resolution order: exact instance-ID match, then a unique exact name
    match, then prefix matches against either field (with interactive
    selection when several engines qualify).

    Raises:
        typer.Exit: If nothing matches.
    """
    # Exact instance-ID match always wins.
    for engine in engines:
        if engine["instance_id"] == name_or_id:
            return engine

    # A unique exact-name match is next; duplicates fall through to the
    # prefix search (which will include them) and interactive selection.
    named = [e for e in engines if e["name"] == name_or_id]
    if len(named) == 1:
        return named[0]

    matches = [
        e
        for e in engines
        if e["name"].startswith(name_or_id)
        or e["instance_id"].startswith(name_or_id)
    ]

    if not matches:
        console.print(f"[red]❌ No engine found matching '{name_or_id}'[/red]")
        raise typer.Exit(1)
    if len(matches) == 1:
        return matches[0]

    # Several candidates: let the user pick one.
    console.print(f"Multiple engines match '{name_or_id}':")
    for idx, engine in enumerate(matches, 1):
        rate = HOURLY_COSTS.get(engine["engine_type"], 0)
        console.print(
            f" {idx}. [cyan]{engine['name']}[/cyan] ({engine['instance_id']}) "
            f"- {engine['engine_type']} - {engine['state']} - ${rate:.2f}/hr"
        )

    while True:
        try:
            choice = IntPrompt.ask(
                "Select engine",
                default=1,
                choices=[str(i) for i in range(1, len(matches) + 1)],
            )
            return matches[choice - 1]
        except (ValueError, IndexError):
            console.print("[red]Invalid selection, please try again[/red]")
366
-
367
-
368
def get_ssh_public_key() -> str:
    """Locate the user's SSH public key (container-friendly discovery).

    Checks, in order:
      1. DHT_SSH_PUBLIC_KEY env var (direct key content)
      2. DHT_SSH_PUBLIC_KEY_PATH env var (path to a .pub file)
      3. ssh-agent via `ssh-add -L` (requires SSH_AUTH_SOCK), preferring
         ed25519, then rsa, then ecdsa, then any listed key
      4. Conventional files: ~/.ssh/id_ed25519.pub, ~/.ssh/id_rsa.pub

    Raises:
        FileNotFoundError: If no public key can be discovered.
    """
    # 1) Key content supplied directly via the environment.
    direct = os.environ.get("DHT_SSH_PUBLIC_KEY")
    if direct and direct.strip():
        return direct.strip()

    # 2) Path to a .pub file supplied via the environment.
    path_var = os.environ.get("DHT_SSH_PUBLIC_KEY_PATH")
    if path_var:
        pub_path = Path(path_var).expanduser()
        if pub_path.is_file():
            try:
                return pub_path.read_text().strip()
            except Exception:
                pass

    # 3) Ask the ssh-agent, preferring stronger/common key types first.
    try:
        if shutil.which("ssh-add") is not None:
            proc = subprocess.run(["ssh-add", "-L"], capture_output=True, text=True)
            if proc.returncode == 0 and proc.stdout:
                agent_keys = [
                    line.strip() for line in proc.stdout.splitlines() if line.strip()
                ]
                for key_type in ("ssh-ed25519", "ssh-rsa", "ecdsa-sha2-nistp256"):
                    for key in agent_keys:
                        if key.startswith(key_type + " "):
                            return key
                # No preferred type matched; take whatever the agent has.
                if agent_keys:
                    return agent_keys[0]
    except Exception:
        pass

    # 4) Conventional key files in ~/.ssh.
    home = Path.home()
    for candidate in (home / ".ssh" / "id_ed25519.pub", home / ".ssh" / "id_rsa.pub"):
        if candidate.is_file():
            try:
                return candidate.read_text().strip()
            except Exception:
                continue

    raise FileNotFoundError(
        "No SSH public key found. Please create one with 'ssh-keygen' first."
    )
427
-
428
-
429
def check_session_manager_plugin():
    """Return True if the AWS Session Manager Plugin is on PATH; warn and return False otherwise."""
    if shutil.which("session-manager-plugin") is not None:
        return True
    # Without the plugin, SSM-proxied SSH connections cannot work.
    console.print(
        "[bold red]⚠️ AWS Session Manager Plugin not found![/bold red]\n"
        "SSH connections to engines require the Session Manager Plugin.\n"
        "Please install it following the setup guide:\n"
        "[link]https://github.com/dayhofflabs/nutshell/blob/main/REFERENCE/setup_guides/new-laptop.md[/link]"
    )
    return False
440
-
441
-
442
def update_ssh_config_entry(
    engine_name: str, instance_id: str, ssh_user: str, idle_timeout: int = 600
):
    """Add or update a single SSH config entry for the given SSH user.

    Rewrites ~/.ssh/config in place: any previous entry for this engine
    that carries the managed-comment sentinel is removed, then a fresh
    Host stanza (proxied through SSM) is appended.

    Args:
        engine_name: Host alias to write into ~/.ssh/config
        instance_id: EC2 instance-id (used by the proxy command)
        ssh_user: Username to place into the SSH stanza
        idle_timeout: Idle timeout **in seconds** to pass to the SSM port-forward. 600 = 10 min.
    """
    config_path = Path.home() / ".ssh" / "config"
    # 0o700 on ~/.ssh and 0o600 on config are required by OpenSSH.
    config_path.parent.mkdir(mode=0o700, exist_ok=True)

    # Touch the file if it doesn't exist
    if not config_path.exists():
        config_path.touch(mode=0o600)

    # Read existing config
    content = config_path.read_text()
    lines = content.splitlines() if content else []

    # Remove any existing entry for this engine. This is a small line
    # state machine: once we see our managed "Host <name>" line we drop
    # every following line until the next "Host " header.
    new_lines = []
    skip_until_next_host = False
    for line in lines:
        # Check if this is our managed host
        if (
            line.strip().startswith(f"Host {engine_name}")
            and SSH_MANAGED_COMMENT in line
        ):
            skip_until_next_host = True
        elif line.strip().startswith("Host ") and skip_until_next_host:
            skip_until_next_host = False
            # This is a different host entry, keep it
            new_lines.append(line)
        elif not skip_until_next_host:
            new_lines.append(line)

    # Add the new entry
    if new_lines and new_lines[-1].strip():  # Add blank line if needed
        new_lines.append("")

    # The ProxyCommand tunnels SSH over an SSM session, so no public IP
    # or open port 22 is needed on the instance.
    new_lines.extend(
        [
            f"Host {engine_name} {SSH_MANAGED_COMMENT}",
            f"    HostName {instance_id}",
            f"    User {ssh_user}",
            f"    ProxyCommand sh -c \"AWS_SSM_IDLE_TIMEOUT={idle_timeout} aws ssm start-session --target %h --document-name AWS-StartSSHSession --parameters 'portNumber=%p'\"",
        ]
    )

    # Write back
    config_path.write_text("\n".join(new_lines))
    config_path.chmod(0o600)
497
-
498
-
499
- # ==================== ENGINE COMMANDS ====================
500
-
501
-
502
@engine_app.command("launch")
def launch_engine(
    name: str = typer.Argument(help="Name for the new engine"),
    engine_type: str = typer.Option(
        "cpu",
        "--type",
        "-t",
        help="Engine type: cpu, cpumax, t4, a10g, a100, 4_t4, 8_t4, 4_a10g, 8_a10g",
    ),
    user: Optional[str] = typer.Option(None, "--user", "-u", help="Override username"),
    boot_disk_size: Optional[int] = typer.Option(
        None,
        "--size",
        "-s",
        help="Boot disk size in GB (default: 50GB, min: 20GB, max: 1000GB)",
    ),
    availability_zone: Optional[str] = typer.Option(
        None,
        "--az",
        help="Prefer a specific Availability Zone (e.g., us-east-1b). If omitted the service will try all public subnets.",
    ),
):
    """Launch a new engine instance.

    Validates the requested type and disk size locally, then POSTs to the
    Studio Manager API, which performs the actual EC2 launch.
    """
    # Requires a valid AWS SSO session; also supplies the default username.
    username = check_aws_sso()
    if user:
        username = user

    # Validate engine type
    valid_types = [
        "cpu",
        "cpumax",
        "t4",
        "a10g",
        "a100",
        "4_t4",
        "8_t4",
        "4_a10g",
        "8_a10g",
    ]
    if engine_type not in valid_types:
        console.print(f"[red]❌ Invalid engine type: {engine_type}[/red]")
        console.print(f"Valid types: {', '.join(valid_types)}")
        raise typer.Exit(1)

    # Validate boot disk size (service-side limits mirrored client-side).
    if boot_disk_size is not None:
        if boot_disk_size < 20:
            console.print("[red]❌ Boot disk size must be at least 20GB[/red]")
            raise typer.Exit(1)
        if boot_disk_size > 1000:
            console.print("[red]❌ Boot disk size cannot exceed 1000GB[/red]")
            raise typer.Exit(1)

    cost = HOURLY_COSTS.get(engine_type, 0)
    disk_info = f" with {boot_disk_size}GB boot disk" if boot_disk_size else ""
    console.print(
        f"Launching [cyan]{name}[/cyan] ({engine_type}){disk_info} for ${cost:.2f}/hour..."
    )

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        transient=True,
    ) as progress:
        progress.add_task("Creating engine...", total=None)

        # Optional fields are only included when explicitly requested.
        request_data: Dict[str, Any] = {
            "name": name,
            "user": username,
            "engine_type": engine_type,
        }
        if boot_disk_size is not None:
            request_data["boot_disk_size"] = boot_disk_size
        if availability_zone:
            request_data["availability_zone"] = availability_zone

        response = make_api_request("POST", "/engines", json_data=request_data)

    if response.status_code == 201:
        data = response.json()
        console.print(f"[green]✓ Engine launched successfully![/green]")
        console.print(f"Instance ID: [cyan]{data['instance_id']}[/cyan]")
        console.print(f"Type: {data['instance_type']} (${cost:.2f}/hour)")
        if boot_disk_size:
            console.print(f"Boot disk: {boot_disk_size}GB")
        console.print("\nThe engine is initializing. This may take a few minutes.")
        console.print(f"Check status with: [cyan]dh engine status {name}[/cyan]")
    else:
        # NOTE(review): failure prints an error but exits 0 — confirm
        # whether a non-zero exit code is wanted for scripting.
        error = response.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to launch engine: {error}[/red]")
592
-
593
-
594
@engine_app.command("list")
def list_engines(
    user: Optional[str] = typer.Option(None, "--user", "-u", help="Filter by user"),
    running_only: bool = typer.Option(
        False, "--running", help="Show only running engines"
    ),
    stopped_only: bool = typer.Option(
        False, "--stopped", help="Show only stopped engines"
    ),
    detailed: bool = typer.Option(
        False, "--detailed", "-d", help="Show detailed status (slower)"
    ),
):
    """List engines (shows all engines by default).

    With --detailed, the API is asked to verify readiness, and per-engine
    disk usage is fetched over SSM (one extra round-trip per running
    engine, hence "slower").
    """
    current_user = check_aws_sso()

    params = {}
    if user:
        params["user"] = user
    if detailed:
        # Asks the service to probe bootstrap readiness per engine.
        params["check_ready"] = "true"

    response = make_api_request("GET", "/engines", params=params)

    if response.status_code == 200:
        data = response.json()
        engines = data.get("engines", [])

        # Filter by state if requested (flags are mutually exclusive in
        # effect: --running wins if both are given).
        if running_only:
            engines = [e for e in engines if e["state"].lower() == "running"]
        elif stopped_only:
            engines = [e for e in engines if e["state"].lower() == "stopped"]

        if not engines:
            console.print("No engines found.")
            return

        # Only fetch detailed info if requested (slow)
        # NOTE(review): stages_map is fetched here but never rendered in
        # the table below — confirm whether a bootstrap-stage column was
        # dropped intentionally.
        stages_map = {}
        if detailed:
            stages_map = _fetch_init_stages([e["instance_id"] for e in engines])

        # Create table
        table = Table(title="Engines", box=box.ROUNDED)
        table.add_column("Name", style="cyan")
        table.add_column("Instance ID", style="dim")
        table.add_column("Type")
        table.add_column("User")
        table.add_column("Status")
        if detailed:
            table.add_column("Disk Usage")
        table.add_column("Uptime/Since")
        table.add_column("$/hour", justify="right")

        for engine in engines:
            launch_time = parse_launch_time(engine["launch_time"])
            uptime = datetime.now(timezone.utc) - launch_time
            hourly_cost = HOURLY_COSTS.get(engine["engine_type"], 0)

            if engine["state"].lower() == "running":
                # Running engines show elapsed uptime; others show the
                # original launch timestamp.
                time_str = format_duration(uptime)
                # Only get disk usage if detailed mode
                if detailed:
                    disk_usage = get_disk_usage_via_ssm(engine["instance_id"]) or "-"
                else:
                    disk_usage = None
            else:
                time_str = launch_time.strftime("%Y-%m-%d %H:%M")
                disk_usage = "-" if detailed else None

            row_data = [
                engine["name"],
                engine["instance_id"],
                engine["engine_type"],
                engine["user"],
                format_status(engine["state"], engine.get("ready")),
            ]
            if detailed:
                row_data.append(disk_usage)
            row_data.extend(
                [
                    time_str,
                    f"${hourly_cost:.2f}",
                ]
            )

            table.add_row(*row_data)

        console.print(table)
        if not detailed and any(e["state"].lower() == "running" for e in engines):
            console.print(
                "\n[dim]Tip: Use --detailed to see disk usage and bootstrap status (slower)[/dim]"
            )
    else:
        error = response.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to list engines: {error}[/red]")
691
-
692
-
693
- @engine_app.command("status")
694
- def engine_status(
695
- name_or_id: str = typer.Argument(help="Engine name or instance ID"),
696
- detailed: bool = typer.Option(False, "--detailed", "-d", help="Show detailed status (slower)"),
697
- show_log: bool = typer.Option(False, "--show-log", help="Show bootstrap log (requires --detailed)"),
698
- ):
699
- """Show engine status and information."""
700
- check_aws_sso()
701
-
702
- # Get all engines to resolve name
703
- response = make_api_request("GET", "/engines")
704
- if response.status_code != 200:
705
- console.print("[red]❌ Failed to fetch engines[/red]")
706
- raise typer.Exit(1)
707
-
708
- engines = response.json().get("engines", [])
709
- engine = resolve_engine(name_or_id, engines)
710
-
711
- # Fast status display (default)
712
- if not detailed:
713
- # Fetch idle status via SSM with longer timeout
714
- ssm = boto3.client("ssm", region_name="us-east-1")
715
- idle_data = None # Use None to indicate no data received
716
-
717
- if engine["state"].lower() == "running":
718
- try:
719
- resp = ssm.send_command(
720
- InstanceIds=[engine["instance_id"]],
721
- DocumentName="AWS-RunShellScript",
722
- Parameters={
723
- "commands": [
724
- "cat /var/run/idle-detector/last_state.json 2>/dev/null || echo '{}'"
725
- ],
726
- "executionTimeout": ["10"],
727
- },
728
- )
729
- cid = resp["Command"]["CommandId"]
730
-
731
- # Wait up to 3 seconds for result
732
- for _ in range(6): # 6 * 0.5 = 3 seconds
733
- time.sleep(0.5)
734
- inv = ssm.get_command_invocation(
735
- CommandId=cid, InstanceId=engine["instance_id"]
736
- )
737
- if inv["Status"] in ["Success", "Failed"]:
738
- break
739
-
740
- if inv["Status"] == "Success":
741
- content = inv["StandardOutputContent"].strip()
742
- if content and content != "{}":
743
- idle_data = json.loads(content)
744
- else:
745
- idle_data = {} # Empty response but SSM worked
746
- except Exception:
747
- idle_data = None # SSM failed
748
-
749
- # Determine running state display
750
- running_state = engine["state"].lower()
751
- if running_state == "running":
752
- run_disp = "[green]Running[/green]"
753
- elif running_state == "pending":
754
- run_disp = "[yellow]Starting...[/yellow]"
755
- elif running_state == "stopping":
756
- run_disp = "[yellow]Stopping...[/yellow]"
757
- elif running_state == "stopped":
758
- run_disp = "[dim]Stopped[/dim]"
759
- else:
760
- run_disp = engine["state"].capitalize()
761
-
762
- # Determine idle/active status
763
- idle_disp = ""
764
- if running_state == "running":
765
- if idle_data is None:
766
- # SSM failed - we don't know the status
767
- idle_disp = " [dim]N/A[/dim]"
768
- elif not idle_data:
769
- # Empty data - likely very early in boot
770
- idle_disp = " [dim]N/A[/dim]"
771
- else:
772
- # We have data
773
- is_idle = idle_data.get("idle", False)
774
- timeout_sec = idle_data.get("timeout_sec")
775
- idle_seconds = idle_data.get("idle_seconds", 0) if is_idle else 0
776
-
777
- if is_idle:
778
- if isinstance(timeout_sec, int) and isinstance(idle_seconds, int):
779
- remaining = max(0, timeout_sec - idle_seconds)
780
- remaining_mins = remaining // 60
781
- if remaining_mins == 0:
782
- idle_disp = f" [yellow]Idle {idle_seconds//60}m/{timeout_sec//60}m: [red]<1m[/red] left[/yellow]"
783
- else:
784
- idle_disp = f" [yellow]Idle {idle_seconds//60}m/{timeout_sec//60}m: [red]{remaining_mins}m[/red] left[/yellow]"
785
- else:
786
- idle_disp = " [yellow]Idle ?/?[/yellow]"
787
- else:
788
- # Actively not idle
789
- idle_disp = " [green]Active[/green]"
790
-
791
- # Build status lines - minimal info for fast view
792
- status_lines = [
793
- f"[blue]{engine['name']}[/blue] {run_disp}{idle_disp}",
794
- ]
795
-
796
- # Add activity sensors if we have idle data
797
- if idle_data and idle_data.get("reasons"):
798
- status_lines.append("") # blank line before sensors
799
-
800
- sensor_map = {
801
- "CoffeeLockSensor": ("☕", "Coffee"),
802
- "ActiveLoginSensor": ("🐚", "SSH"),
803
- "IDEConnectionSensor": ("🖥 ", "IDE"),
804
- "DockerWorkloadSensor": ("🐳", "Docker"),
805
- }
806
-
807
- for r in idle_data.get("reasons", []):
808
- sensor = r.get("sensor", "Unknown")
809
- active = r.get("active", False)
810
- icon, label = sensor_map.get(sensor, ("?", sensor))
811
- status_str = "[green]YES[/green]" if active else "[dim]nope[/dim]"
812
- status_lines.append(f" {icon} {label:6} {status_str}")
813
-
814
- # Display in a nice panel
815
- console.print(
816
- Panel("\n".join(status_lines), title="Engine Status", border_style="blue")
817
- )
818
- return # Exit early for fast status
819
-
820
- # Get detailed engine status including idle detector info (for --detailed mode)
821
- response = make_api_request("GET", f"/engines/{engine['instance_id']}")
822
- if response.status_code != 200:
823
- console.print("[red]❌ Failed to fetch engine details[/red]")
824
- raise typer.Exit(1)
825
-
826
- engine_details = response.json()
827
- engine = engine_details.get("engine", engine) # Use detailed info if available
828
- idle_detector = engine_details.get("idle_detector", {}) or {}
829
- attached_studios = engine_details.get("attached_studios", [])
830
-
831
- # Calculate costs
832
- launch_time = parse_launch_time(engine["launch_time"])
833
- uptime = datetime.now(timezone.utc) - launch_time
834
- hourly_cost = HOURLY_COSTS.get(engine["engine_type"], 0)
835
- # total_cost intentionally not shown in status view
836
-
837
- stages_map = _fetch_init_stages([engine["instance_id"]])
838
- stage_val = stages_map.get(engine["instance_id"], "-")
839
-
840
- # Try to fetch actual boot time via SSM (best-effort)
841
- boot_time_str: Optional[str] = None
842
- try:
843
- if engine["state"].lower() == "running":
844
- ssm = boto3.client("ssm", region_name="us-east-1")
845
- resp = ssm.send_command(
846
- InstanceIds=[engine["instance_id"]],
847
- DocumentName="AWS-RunShellScript",
848
- Parameters={
849
- "commands": ["uptime -s || who -b | awk '{print $3\" \"$4}'"]
850
- },
851
- )
852
- cid = resp["Command"]["CommandId"]
853
- time.sleep(1)
854
- inv = ssm.get_command_invocation(
855
- CommandId=cid, InstanceId=engine["instance_id"]
856
- )
857
- if inv.get("Status") == "Success":
858
- boot_time_str = (
859
- (inv.get("StandardOutputContent") or "").strip().splitlines()[0]
860
- if inv.get("StandardOutputContent")
861
- else None
862
- )
863
- except Exception:
864
- boot_time_str = None
865
-
866
- started_line = (
867
- f"[bold]Started:[/bold] {boot_time_str} ({format_duration(uptime)} ago)"
868
- if boot_time_str
869
- else f"[bold]Started:[/bold] {launch_time.strftime('%Y-%m-%d %H:%M:%S')} ({format_duration(uptime)} ago)"
870
- )
871
-
872
- # ---------------- Front-loaded summary ----------------
873
- running_state = engine["state"].lower()
874
- if running_state == "running":
875
- run_disp = "[green]Running[/green]"
876
- elif running_state == "pending":
877
- run_disp = "[yellow]Starting...[/yellow]"
878
- elif running_state == "stopping":
879
- run_disp = "[yellow]Stopping...[/yellow]"
880
- elif running_state == "stopped":
881
- run_disp = "[dim]Stopped[/dim]"
882
- else:
883
- run_disp = engine["state"].capitalize()
884
-
885
- # Compose Active/Idle header with extra detail when idle
886
- def _compute_active_disp(idle_info: Dict[str, Any]) -> str:
887
- # If we don't have idle info or it's explicitly unavailable, show N/A
888
- if not idle_info or idle_info.get("available") == False:
889
- return "[dim]N/A[/dim]"
890
-
891
- if idle_info.get("status") == "active":
892
- return "[green]Active[/green]"
893
- if running_state in ("stopped", "stopping"):
894
- return "[dim]N/A[/dim]"
895
-
896
- # If idle, show time/threshold with time remaining if available
897
- if idle_info.get("status") == "idle":
898
- idle_seconds_v = idle_info.get("idle_seconds")
899
- thresh_v = idle_info.get("idle_threshold")
900
- if isinstance(idle_seconds_v, (int, float)) and isinstance(thresh_v, (int, float)):
901
- remaining = max(0, int(thresh_v) - int(idle_seconds_v))
902
- remaining_mins = remaining // 60
903
- if remaining_mins == 0:
904
- return f"[yellow]Idle {int(idle_seconds_v)//60}m/{int(thresh_v)//60}m: [red]<1m[/red] left[/yellow]"
905
- else:
906
- return f"[yellow]Idle {int(idle_seconds_v)//60}m/{int(thresh_v)//60}m: [red]{remaining_mins}m[/red] left[/yellow]"
907
- elif isinstance(thresh_v, (int, float)):
908
- return f"[yellow]Idle ?/{int(thresh_v)//60}m[/yellow]"
909
- else:
910
- return "[yellow]Idle ?/?[/yellow]"
911
-
912
- # Default to N/A if we can't determine status
913
- return "[dim]N/A[/dim]"
914
-
915
- active_disp = _compute_active_disp(idle_detector)
916
-
917
- top_lines = [
918
- f"[blue]{engine['name']}[/blue] {run_disp} {active_disp}\n",
919
- ]
920
-
921
- # Studios summary next, with studio name in purple/magenta
922
- studios_line = None
923
- if attached_studios:
924
- stu_texts = [
925
- f"[magenta]{s.get('user', 'studio')}[/magenta] ({s.get('studio_id', 'unknown')})"
926
- for s in attached_studios
927
- ]
928
- studios_line = "Studios: " + ", ".join(stu_texts)
929
- top_lines.append(studios_line)
930
-
931
- # Paragraph break
932
- top_lines.append("")
933
-
934
- # ---------------- Details block (white/default) ----------------
935
- status_lines = [
936
- f"Name: {engine['name']}",
937
- f"Instance: {engine['instance_id']}",
938
- f"Type: {engine['engine_type']} ({engine['instance_type']})",
939
- f"Status: {engine['state']}",
940
- f"User: {engine['user']}",
941
- f"IP: {engine.get('public_ip', 'N/A')}",
942
- started_line,
943
- f"$/hour: ${hourly_cost:.2f}",
944
- ]
945
-
946
- # Disk usage (like list --detailed)
947
- if engine["state"].lower() == "running":
948
- disk_usage = get_disk_usage_via_ssm(engine["instance_id"]) or "-"
949
- status_lines.append(f"Disk: {disk_usage}")
950
-
951
- # Idle timeout (show even when not idle) - but only if we have data
952
- if idle_detector.get("available"):
953
- idle_threshold_secs: Optional[int] = None
954
- # Prefer value from idle detector overlay if present
955
- try:
956
- if isinstance(idle_detector.get("idle_threshold"), (int, float)):
957
- idle_threshold_secs = int(idle_detector["idle_threshold"])
958
- except Exception:
959
- idle_threshold_secs = None
960
-
961
- if idle_threshold_secs is None and engine["state"].lower() == "running":
962
- # Fallback: read /etc/engine.env via SSM
963
- try:
964
- ssm = boto3.client("ssm", region_name="us-east-1")
965
- resp = ssm.send_command(
966
- InstanceIds=[engine["instance_id"]],
967
- DocumentName="AWS-RunShellScript",
968
- Parameters={
969
- "commands": [
970
- "grep -E '^IDLE_TIMEOUT_SECONDS=' /etc/engine.env | cut -d'=' -f2 || echo '?'",
971
- ],
972
- "executionTimeout": ["5"],
973
- },
974
- )
975
- cid = resp["Command"]["CommandId"]
976
- time.sleep(1)
977
- inv = ssm.get_command_invocation(
978
- CommandId=cid, InstanceId=engine["instance_id"]
979
- )
980
- if inv.get("Status") == "Success":
981
- out = (inv.get("StandardOutputContent") or "").strip()
982
- if out and out != "?" and out.isdigit():
983
- idle_threshold_secs = int(out)
984
- except Exception:
985
- idle_threshold_secs = None
986
-
987
- if idle_threshold_secs is not None:
988
- status_lines.append(
989
- f"Idle timeout: {idle_threshold_secs//60}m ({idle_threshold_secs}s)"
990
- )
991
- else:
992
- status_lines.append("Idle timeout: unknown")
993
- else:
994
- # No idle detector data available
995
- status_lines.append("Idle timeout: N/A")
996
-
997
- # Health report (only if bootstrap finished)
998
- if stage_val == "finished":
999
- try:
1000
- ssm = boto3.client("ssm", region_name="us-east-1")
1001
- res = ssm.send_command(
1002
- InstanceIds=[engine["instance_id"]],
1003
- DocumentName="AWS-RunShellScript",
1004
- Parameters={
1005
- "commands": [
1006
- "cat /opt/dayhoff/state/engine-health.json 2>/dev/null || cat /var/run/engine-health.json 2>/dev/null || true"
1007
- ],
1008
- "executionTimeout": ["10"],
1009
- },
1010
- )
1011
- cid = res["Command"]["CommandId"]
1012
- time.sleep(1)
1013
- inv = ssm.get_command_invocation(
1014
- CommandId=cid, InstanceId=engine["instance_id"]
1015
- )
1016
- if inv["Status"] == "Success":
1017
- import json as _json
1018
-
1019
- health = _json.loads(inv["StandardOutputContent"].strip() or "{}")
1020
- status_lines.append("")
1021
- status_lines.append("[bold]Health:[/bold]")
1022
- status_lines.append(
1023
- f" • GPU Drivers: {'OK' if health.get('drivers_ok') else 'MISSING'}"
1024
- )
1025
- idle_stat = health.get("idle_detector_service") or health.get(
1026
- "idle_detector_timer", "unknown"
1027
- )
1028
- status_lines.append(f" • Idle Detector: {idle_stat}")
1029
- except Exception:
1030
- pass
1031
-
1032
- # Try to enrich/fallback idle-detector details from on-engine summary file via SSM
1033
    def _fetch_idle_summary_via_ssm(instance_id: str) -> Optional[Dict]:
        """Fetch and normalize the on-engine idle-detector state via SSM.

        Reads /var/run/idle-detector/last_state.json on the instance and
        converts it (new or old schema) into the ``idle_detector`` dict shape
        used by the CLI output. Best-effort: returns None on any failure
        (SSM error, timeout, empty file, bad JSON) rather than raising.
        """
        try:
            ssm = boto3.client("ssm", region_name="us-east-1")
            res = ssm.send_command(
                InstanceIds=[instance_id],
                DocumentName="AWS-RunShellScript",
                Parameters={
                    "commands": [
                        # `|| true` keeps the command exit status 0 even when
                        # the state file does not exist yet.
                        "cat /var/run/idle-detector/last_state.json 2>/dev/null || true",
                    ],
                    "executionTimeout": ["5"],
                },
            )
            cid = res["Command"]["CommandId"]
            # Wait up to 2 seconds for SSM command to complete (was 1 second)
            for _ in range(4):  # 4 * 0.5 = 2 seconds
                time.sleep(0.5)
                inv = ssm.get_command_invocation(CommandId=cid, InstanceId=instance_id)
                if inv["Status"] in ["Success", "Failed"]:
                    break
            if inv["Status"] != "Success":
                return None
            content = inv["StandardOutputContent"].strip()
            if not content:
                return None
            data = json.loads(content)
            # Convert last_state schema (new or old) to idle_detector schema used by CLI output
            idle_info: Dict[str, Any] = {"available": True}

            # Active/idle
            idle_flag = bool(data.get("idle", False))
            idle_info["status"] = "idle" if idle_flag else "active"

            # Threshold and elapsed
            if isinstance(data.get("timeout_sec"), (int, float)):
                idle_info["idle_threshold"] = int(data["timeout_sec"])  # seconds
            if isinstance(data.get("idle_seconds"), (int, float)):
                idle_info["idle_seconds"] = int(data["idle_seconds"])

            # Keep raw reasons for sensor display when available (new schema)
            if isinstance(data.get("reasons"), list):
                idle_info["_reasons_raw"] = data["reasons"]
            else:
                # Fallback: synthesize reasons from the old forensics layout
                f_all = data.get("forensics", {}) or {}
                synthesized = []

                def _mk(sensor_name: str, key: str):
                    # Map one old-schema forensics entry to the new reason shape.
                    entry = f_all.get(key, {}) or {}
                    synthesized.append(
                        {
                            "sensor": sensor_name,
                            "active": bool(entry.get("active", False)),
                            "reason": entry.get("reason", ""),
                            "forensic": entry.get("forensic", {}),
                        }
                    )

                _mk("CoffeeLockSensor", "coffee")
                _mk("ActiveLoginSensor", "ssh")
                _mk("IDEConnectionSensor", "ide")
                _mk("DockerWorkloadSensor", "docker")
                idle_info["_reasons_raw"] = synthesized

            # Derive details from sensors (only active ones contribute)
            for r in idle_info.get("_reasons_raw", []):
                if not r.get("active"):
                    continue
                sensor = (r.get("sensor") or "").lower()
                forensic = r.get("forensic") or {}
                if sensor == "ideconnectionsensor":
                    # Prefer unique_pid_count written by new detector
                    cnt = forensic.get("unique_pid_count")
                    if not isinstance(cnt, int):
                        cnt = forensic.get("matches")
                    if isinstance(cnt, int):
                        idle_info["ide_connections"] = {"connection_count": cnt}
                    else:
                        # Sensor is active but gave no count; assume one connection.
                        idle_info["ide_connections"] = {"connection_count": 1}
                elif sensor == "coffeelocksensor":
                    rem = forensic.get("remaining_sec")
                    if isinstance(rem, (int, float)) and rem > 0:
                        idle_info["coffee_lock"] = format_duration(
                            timedelta(seconds=int(rem))
                        )
                elif sensor == "activeloginsensor":
                    sess = {
                        "tty": forensic.get("tty", "pts/?"),
                        "pid": forensic.get("pid", "?"),
                        "idle_time": forensic.get("idle_sec", 0),
                        "from_ip": forensic.get("remote_addr", "unknown"),
                    }
                    idle_info.setdefault("ssh_sessions", []).append(sess)
            return idle_info
        except Exception:
            # Best-effort overlay: any failure just means "no extra detail".
            return None
1129
-
1130
- # Always try to enrich from on-engine summary (fast, best-effort)
1131
- overlay = _fetch_idle_summary_via_ssm(engine["instance_id"])
1132
- if overlay:
1133
- # If API didn't indicate availability, replace entirely; otherwise fill gaps
1134
- if not idle_detector.get("available"):
1135
- idle_detector = overlay
1136
- else:
1137
- for k, v in overlay.items():
1138
- idle_detector.setdefault(k, v)
1139
- else:
1140
- # SSM failed - mark as unavailable if we don't have good data
1141
- if not idle_detector.get("available"):
1142
- idle_detector = {"available": False} # Mark as unavailable
1143
-
1144
- # Recompute header display with latest data
1145
- active_disp = _compute_active_disp(idle_detector)
1146
- top_lines[0] = f"[blue]{engine['name']}[/blue] {run_disp} {active_disp}\n"
1147
-
1148
- # Activity Sensors (show all with YES/no)
1149
- if idle_detector.get("available"):
1150
- status_lines.append("")
1151
- status_lines.append("[bold]Activity Sensors:[/bold]")
1152
- reasons_raw = idle_detector.get("_reasons_raw", []) or []
1153
- by_sensor: Dict[str, Dict[str, Any]] = {}
1154
- for r in reasons_raw:
1155
- nm = r.get("sensor")
1156
- if nm:
1157
- by_sensor[nm] = r
1158
-
1159
- def _sensor_line(label: str, key: str, emoji: str) -> str:
1160
- r = by_sensor.get(key, {})
1161
- active = bool(r.get("active"))
1162
- reason_txt = r.get("reason") or ("" if not active else "active")
1163
- flag = "[green]YES[/green]" if active else "[dim]nope[/dim]"
1164
- return (
1165
- f" {emoji} {label}: {flag} {('- ' + reason_txt) if reason_txt else ''}"
1166
- )
1167
-
1168
- status_lines.append(_sensor_line("Coffee", "CoffeeLockSensor", "☕"))
1169
- status_lines.append(_sensor_line("Shell ", "ActiveLoginSensor", "🐚"))
1170
- status_lines.append(_sensor_line(" IDE ", "IDEConnectionSensor", "🖥"))
1171
- status_lines.append(_sensor_line("Docker", "DockerWorkloadSensor", "🐳"))
1172
-
1173
- # Recompute display with latest idle detector data
1174
- active_disp = _compute_active_disp(idle_detector)
1175
- # Rewrite top header line (index 0) to include updated display
1176
- top_lines[0] = f"[blue]{engine['name']}[/blue] {run_disp} {active_disp}\n"
1177
-
1178
- # Combine top summary and details
1179
- all_lines = top_lines + status_lines
1180
- console.print(
1181
- Panel("\n".join(all_lines), title="Engine Status", border_style="blue")
1182
- )
1183
-
1184
- if show_log:
1185
- if not detailed:
1186
- console.print("[yellow]Note: --show-log requires --detailed flag[/yellow]")
1187
- return
1188
- console.print("\n[bold]Bootstrap Log:[/bold]")
1189
- try:
1190
- ssm = boto3.client("ssm", region_name="us-east-1")
1191
- resp = ssm.send_command(
1192
- InstanceIds=[engine["instance_id"]],
1193
- DocumentName="AWS-RunShellScript",
1194
- Parameters={
1195
- "commands": [
1196
- "cat /var/log/engine-setup.log 2>/dev/null || echo 'No setup log found'"
1197
- ],
1198
- "executionTimeout": ["15"],
1199
- },
1200
- )
1201
- cid = resp["Command"]["CommandId"]
1202
- time.sleep(2)
1203
- inv = ssm.get_command_invocation(
1204
- CommandId=cid, InstanceId=engine["instance_id"]
1205
- )
1206
- if inv["Status"] == "Success":
1207
- log_content = inv["StandardOutputContent"].strip()
1208
- if log_content:
1209
- console.print(f"[dim]{log_content}[/dim]")
1210
- else:
1211
- console.print("[yellow]No bootstrap log available[/yellow]")
1212
- else:
1213
- console.print("[red]❌ Could not retrieve bootstrap log[/red]")
1214
- except Exception as e:
1215
- console.print(f"[red]❌ Error fetching log: {e}[/red]")
1216
-
1217
-
1218
@engine_app.command("stop")
def stop_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    force: bool = typer.Option(
        False, "--force", "-f", help="Force stop and detach all studios"
    ),
):
    """Stop an engine.

    If the engine has attached studios and --force was not given, the attached
    studios are listed and the user is asked to confirm detaching them before
    the stop is retried with detach_studios=True.
    """
    check_aws_sso()

    # Fetch all engines so the name/ID argument can be resolved to an instance.
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    console.print(f"Stopping engine [cyan]{engine['name']}[/cyan]...")

    # First attempt: only detach studios up front when --force was given.
    response = make_api_request(
        "POST",
        f"/engines/{engine['instance_id']}/stop",
        json_data={"detach_studios": force},
    )

    if response.status_code == 409 and not force:
        # 409 means the engine has attached studios; ask before detaching.
        data = response.json()
        attached_studios = data.get("attached_studios", [])

        console.print("\n[yellow]⚠️ This engine has attached studios:[/yellow]")
        for studio in attached_studios:
            console.print(f" • {studio['user']} ({studio['studio_id']})")

        if Confirm.ask("\nDetach all studios and stop the engine?"):
            response = make_api_request(
                "POST",
                f"/engines/{engine['instance_id']}/stop",
                json_data={"detach_studios": True},
            )
        else:
            console.print("Stop cancelled.")
            return

    if response.status_code == 200:
        # Plain literal: the previous f-string had no placeholders.
        console.print("[green]✓ Engine stopped successfully![/green]")
    else:
        # Best-effort error extraction; the body may not be valid JSON.
        try:
            error = response.json().get("error", "Unknown error")
        except Exception:
            error = "Unknown error"
        console.print(f"[red]❌ Failed to stop engine: {error}[/red]")
1270
-
1271
-
1272
@engine_app.command("start")
def start_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
):
    """Start a stopped engine.

    Resolves the name/ID to an instance, requests a start via the API, and
    prints the new public IP (which changes across stop/start cycles).
    """
    check_aws_sso()

    # Fetch all engines so the name/ID argument can be resolved to an instance.
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    console.print(f"Starting engine [cyan]{engine['name']}[/cyan]...")

    response = make_api_request("POST", f"/engines/{engine['instance_id']}/start")

    if response.status_code == 200:
        data = response.json()
        # Plain literal: the previous f-string had no placeholders.
        console.print("[green]✓ Engine started successfully![/green]")
        # The IP may not be assigned yet right after the start call.
        console.print(f"New public IP: {data.get('public_ip', 'Pending...')}")
    else:
        # Best-effort error extraction; the body may not be valid JSON.
        try:
            error = response.json().get("error", "Unknown error")
        except Exception:
            error = "Unknown error"
        console.print(f"[red]❌ Failed to start engine: {error}[/red]")
1298
- console.print(f"[red]❌ Failed to start engine: {error}[/red]")
1299
-
1300
-
1301
@engine_app.command("terminate")
def terminate_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
):
    """Permanently terminate an engine.

    Shows the estimated cost of the current session (hourly rate x uptime)
    and asks for confirmation before issuing the DELETE request.
    """
    check_aws_sso()

    # Fetch all engines so the name/ID argument can be resolved to an instance.
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    # Estimate session cost: hourly rate for the engine type times uptime.
    # Unknown engine types fall back to $0/hour.
    launch_time = parse_launch_time(engine["launch_time"])
    uptime = datetime.now(timezone.utc) - launch_time
    hourly_cost = HOURLY_COSTS.get(engine["engine_type"], 0)
    total_cost = hourly_cost * (uptime.total_seconds() / 3600)

    console.print(
        f"\n[yellow]⚠️ This will permanently terminate engine '{engine['name']}'[/yellow]"
    )
    console.print(f"Total cost for this session: ${total_cost:.2f}")

    if not Confirm.ask("\nAre you sure you want to terminate this engine?"):
        console.print("Termination cancelled.")
        return

    response = make_api_request("DELETE", f"/engines/{engine['instance_id']}")

    if response.status_code == 200:
        # Plain literal: the previous f-string had no placeholders.
        console.print("[green]✓ Engine terminated successfully![/green]")
    else:
        # Best-effort error extraction; the body may not be valid JSON.
        try:
            error = response.json().get("error", "Unknown error")
        except Exception:
            error = "Unknown error"
        console.print(f"[red]❌ Failed to terminate engine: {error}[/red]")
1338
- console.print(f"[red]❌ Failed to terminate engine: {error}[/red]")
1339
-
1340
-
1341
@engine_app.command("ssh")
def ssh_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    admin: bool = typer.Option(
        False, "--admin", help="Connect as ec2-user instead of the engine owner user"
    ),
    idle_timeout: int = typer.Option(
        600,
        "--idle-timeout",
        help="Idle timeout (seconds) for the SSM port-forward (0 = disable)",
    ),
):
    """Connect to an engine via SSH.

    By default the CLI connects using the engine's owner username (the same one stored in the `User` tag).
    Pass `--admin` to connect with the underlying [`ec2-user`] account for break-glass or debugging.
    """
    username = check_aws_sso()

    # SSM port-forwarding requires the Session Manager plugin locally.
    if not check_session_manager_plugin():
        raise typer.Exit(1)

    # Resolve the name/ID argument against the full engine list.
    engines_resp = make_api_request("GET", "/engines")
    if engines_resp.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engine = resolve_engine(name_or_id, engines_resp.json().get("engines", []))

    # Only running engines are reachable over SSM.
    if engine["state"].lower() != "running":
        console.print(f"[red]❌ Engine is not running (state: {engine['state']})[/red]")
        raise typer.Exit(1)

    # --admin overrides the per-engine owner login.
    login_user = "ec2-user" if admin else username

    # Refresh the managed ~/.ssh/config entry, then hand off to ssh.
    console.print(
        f"Updating SSH config for [cyan]{engine['name']}[/cyan] (user: {login_user})..."
    )
    update_ssh_config_entry(
        engine["name"], engine["instance_id"], login_user, idle_timeout
    )

    console.print(f"[green]✓ Connecting to {engine['name']}...[/green]")
    subprocess.run(["ssh", engine["name"]])
1391
-
1392
-
1393
def _strip_managed_ssh_entries(lines: List[str]) -> List[str]:
    """Return *lines* with every previously-managed Host entry removed.

    A managed entry starts at a "Host" line tagged with SSH_MANAGED_COMMENT and
    extends up to (not including) the next "Host" line.  The blank separator
    line written before each managed entry is also dropped, so repeated runs
    of `dh engine config-ssh` no longer accumulate empty lines.
    """
    kept: List[str] = []
    skipping = False
    for line in lines:
        if SSH_MANAGED_COMMENT in line:
            # Start of a managed entry (managed Host lines carry the comment).
            skipping = True
            # Drop the blank separator we inserted before this entry.
            if kept and not kept[-1].strip():
                kept.pop()
        elif line.strip().startswith("Host ") and skipping:
            # First unmanaged Host after a managed entry: resume copying.
            skipping = False
            kept.append(line)
        elif not skipping:
            kept.append(line)
    return kept


@engine_app.command("config-ssh")
def config_ssh(
    clean: bool = typer.Option(False, "--clean", help="Remove all managed entries"),
    all_engines: bool = typer.Option(
        False, "--all", "-a", help="Include all engines from all users"
    ),
    admin: bool = typer.Option(
        False,
        "--admin",
        help="Generate entries that use ec2-user instead of per-engine owner user",
    ),
):
    """Update SSH config with available engines."""
    username = check_aws_sso()

    # Only check for Session Manager Plugin if we're not just cleaning
    if not clean and not check_session_manager_plugin():
        raise typer.Exit(1)

    if clean:
        console.print("Removing all managed SSH entries...")
    elif all_engines:
        console.print("Updating SSH config with all running engines...")
    else:
        console.print(
            f"Updating SSH config with running engines for [cyan]{username}[/cyan] and [cyan]shared[/cyan]..."
        )

    # Get all engines; only running ones get SSH entries.
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    running_engines = [e for e in engines if e["state"].lower() == "running"]

    # Unless --all was given, limit to the caller's engines plus shared ones.
    if not all_engines:
        running_engines = [
            e for e in running_engines if e["user"] == username or e["user"] == "shared"
        ]

    # Read existing config (ensure ~/.ssh exists with owner-only permissions).
    config_path = Path.home() / ".ssh" / "config"
    config_path.parent.mkdir(mode=0o700, exist_ok=True)
    lines = config_path.read_text().splitlines() if config_path.exists() else []

    # Remove entries written by a previous run before adding fresh ones.
    new_lines = _strip_managed_ssh_entries(lines)

    # Add new entries if not cleaning
    if not clean:
        # Loop-invariant: the login user depends only on --admin, not the engine.
        ssh_user = "ec2-user" if admin else username
        for engine in running_engines:
            new_lines.extend(
                [
                    "",
                    f"Host {engine['name']} {SSH_MANAGED_COMMENT}",
                    f" HostName {engine['instance_id']}",
                    f" User {ssh_user}",
                    f" ProxyCommand sh -c \"AWS_SSM_IDLE_TIMEOUT=600 aws ssm start-session --target %h --document-name AWS-StartSSHSession --parameters 'portNumber=%p'\"",
                ]
            )

    # Write back with owner-only permissions (ssh refuses group/world-readable).
    config_path.write_text("\n".join(new_lines))
    config_path.chmod(0o600)

    if clean:
        console.print("[green]✓ Removed all managed SSH entries[/green]")
    else:
        console.print(
            f"[green]✓ Updated SSH config with {len(running_engines)} engines[/green]"
        )
        for engine in running_engines:
            user_display = (
                f"[dim]({engine['user']})[/dim]" if engine["user"] != username else ""
            )
            console.print(
                f" • {engine['name']} → {engine['instance_id']} {user_display}"
            )
1495
-
1496
-
1497
@engine_app.command("coffee")
def coffee(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    duration: str = typer.Argument("4h", help="Duration (e.g., 2h, 30m, 2h30m)"),
    cancel: bool = typer.Option(
        False, "--cancel", help="Cancel existing coffee lock instead of extending"
    ),
):
    """Pour ☕ for an engine: keeps it awake for the given duration (or cancel)."""
    check_aws_sso()

    seconds_total = 0
    if not cancel:
        # Accept "4h", "30m", or "2h30m". fullmatch (instead of the previous
        # match) rejects trailing garbage such as "4hx" or "2h30s".
        match = re.fullmatch(r"(?:(\d+)h)?(?:(\d+)m)?", duration)
        if not match or (not match.group(1) and not match.group(2)):
            console.print(f"[red]❌ Invalid duration format: {duration}[/red]")
            console.print("Use format like: 4h, 30m, 2h30m")
            raise typer.Exit(1)

        hours = int(match.group(1) or 0)
        minutes = int(match.group(2) or 0)
        seconds_total = (hours * 60 + minutes) * 60
        if seconds_total == 0:
            console.print("[red]❌ Duration must be greater than zero[/red]")
            raise typer.Exit(1)

    # Fetch all engines so the name/ID argument can be resolved to an instance.
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    # The coffee lock lives on the instance, so it must be running.
    if engine["state"].lower() != "running":
        console.print(f"[red]❌ Engine is not running (state: {engine['state']})[/red]")
        raise typer.Exit(1)

    if cancel:
        console.print(f"Cancelling coffee for [cyan]{engine['name']}[/cyan]…")
    else:
        console.print(
            f"Pouring coffee for [cyan]{engine['name']}[/cyan] for {duration}…"
        )

    # Use SSM to run the engine coffee command on the instance itself.
    ssm = boto3.client("ssm", region_name="us-east-1")
    try:
        response = ssm.send_command(
            InstanceIds=[engine["instance_id"]],
            DocumentName="AWS-RunShellScript",
            Parameters={
                "commands": [
                    (
                        "/usr/local/bin/engine-coffee --cancel"
                        if cancel
                        else f"/usr/local/bin/engine-coffee {seconds_total}"
                    )
                ],
                "executionTimeout": ["60"],
            },
        )

        command_id = response["Command"]["CommandId"]

        # Poll up to ~10 seconds for the command to reach a terminal state.
        result = None
        for _ in range(10):
            time.sleep(1)
            result = ssm.get_command_invocation(
                CommandId=command_id,
                InstanceId=engine["instance_id"],
            )
            if result["Status"] in ["Success", "Failed"]:
                break

        if result is not None and result["Status"] == "Success":
            if cancel:
                console.print(
                    "[green]✓ Coffee cancelled – auto-shutdown re-enabled[/green]"
                )
            else:
                console.print(f"[green]✓ Coffee poured for {duration}[/green]")
                console.print(
                    "\n[dim]Note: Detached Docker containers (except dev containers) will also keep the engine awake.[/dim]"
                )
                console.print(
                    "[dim]Use coffee for nohup operations or other background tasks.[/dim]"
                )
        else:
            # Covers both explicit failure and a command still in progress
            # after the polling window.
            details = (
                result.get("StatusDetails", "Unknown error")
                if result is not None
                else "Unknown error"
            )
            console.print(f"[red]❌ Failed to manage coffee: {details}[/red]")

    except ClientError as e:
        console.print(f"[red]❌ Failed to manage coffee: {e}[/red]")
1595
-
1596
-
1597
@engine_app.command("resize")
def resize_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    size: int = typer.Option(..., "--size", "-s", help="New size in GB"),
    online: bool = typer.Option(
        False,
        "--online",
        help="Resize while running (requires manual filesystem expansion)",
    ),
    force: bool = typer.Option(
        False, "--force", "-f", help="Force resize and detach all studios"
    ),
):
    """Resize an engine's boot disk.

    EBS volumes can only grow, so ``size`` must be strictly larger than the
    current root-volume size.  By default (offline) the engine is stopped
    first and restarted afterwards, and the filesystem is expanded on boot;
    with ``--online`` the volume is grown live and the user must expand the
    partition/filesystem manually (instructions are printed at the end).
    """
    check_aws_sso()

    # Get all engines to resolve name
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    # Get current volume info to validate size
    ec2 = boto3.client("ec2", region_name="us-east-1")

    try:
        # Get instance details to find root volume
        instance_info = ec2.describe_instances(InstanceIds=[engine["instance_id"]])
        instance = instance_info["Reservations"][0]["Instances"][0]

        # Find root volume by matching the root device name in the block
        # device mappings (defaults to /dev/xvda if EC2 omits it).
        root_device = instance.get("RootDeviceName", "/dev/xvda")
        root_volume_id = None

        for bdm in instance.get("BlockDeviceMappings", []):
            if bdm["DeviceName"] == root_device:
                root_volume_id = bdm["Ebs"]["VolumeId"]
                break

        if not root_volume_id:
            console.print("[red]❌ Could not find root volume[/red]")
            raise typer.Exit(1)

        # Get current volume size
        volumes = ec2.describe_volumes(VolumeIds=[root_volume_id])
        current_size = volumes["Volumes"][0]["Size"]

        # EBS volumes can only be grown, never shrunk.
        if size <= current_size:
            console.print(
                f"[red]❌ New size ({size}GB) must be larger than current size ({current_size}GB)[/red]"
            )
            raise typer.Exit(1)

        console.print(
            f"[yellow]Resizing engine boot disk from {current_size}GB to {size}GB[/yellow]"
        )

        # Check if we need to stop the instance.  NOTE: engine["state"] is the
        # state captured before this stop, and is re-checked below to decide
        # whether to restart the engine after the resize.
        if not online and engine["state"].lower() == "running":
            console.print("Stopping engine for offline resize...")
            stop_response = make_api_request(
                "POST",
                f"/engines/{engine['instance_id']}/stop",
                json_data={"detach_studios": False},
            )
            if stop_response.status_code != 200:
                console.print("[red]❌ Failed to stop engine[/red]")
                raise typer.Exit(1)

            # Wait for instance to stop
            console.print("Waiting for engine to stop...")
            waiter = ec2.get_waiter("instance_stopped")
            waiter.wait(InstanceIds=[engine["instance_id"]])
            console.print("[green]✓ Engine stopped[/green]")

        # Call the resize API
        console.print("Resizing volume...")
        resize_response = make_api_request(
            "POST",
            f"/engines/{engine['instance_id']}/resize",
            json_data={"size": size, "detach_studios": force},
        )

        if resize_response.status_code == 409 and not force:
            # Engine has attached studios; list them and ask before detaching.
            data = resize_response.json()
            attached_studios = data.get("attached_studios", [])

            console.print("\n[yellow]⚠️ This engine has attached studios:[/yellow]")
            for studio in attached_studios:
                console.print(f" • {studio['user']} ({studio['studio_id']})")

            if Confirm.ask("\nDetach all studios and resize the engine?"):
                resize_response = make_api_request(
                    "POST",
                    f"/engines/{engine['instance_id']}/resize",
                    json_data={"size": size, "detach_studios": True},
                )
            else:
                console.print("Resize cancelled.")
                return

        if resize_response.status_code != 200:
            error = resize_response.json().get("error", "Unknown error")
            console.print(f"[red]❌ Failed to resize engine: {error}[/red]")
            raise typer.Exit(1)

        # Check if studios were detached
        data = resize_response.json()
        detached_studios = data.get("detached_studios", 0)
        if detached_studios > 0:
            console.print(
                f"[green]✓ Detached {detached_studios} studio(s) before resize[/green]"
            )

        # Wait for modification to complete.  The EBS modification goes
        # modifying -> optimizing -> completed; the resize itself is done once
        # "optimizing" starts, so we exit early at that point.
        console.print("Waiting for volume modification to complete...")
        while True:
            mod_state = ec2.describe_volumes_modifications(VolumeIds=[root_volume_id])
            if not mod_state["VolumesModifications"]:
                break  # Modification complete

            modification = mod_state["VolumesModifications"][0]
            state = modification["ModificationState"]
            progress = modification.get("Progress", 0)

            # Show progress updates only for the resize phase
            if state == "modifying":
                console.print(f"[yellow]Progress: {progress}%[/yellow]")

            # Exit as soon as optimization starts (resize is complete)
            if state == "optimizing":
                console.print("[green]✓ Volume resized successfully[/green]")
                console.print(
                    "[dim]AWS is optimizing the volume in the background (no action needed).[/dim]"
                )
                break

            if state == "completed":
                console.print("[green]✓ Volume resized successfully[/green]")
                break
            elif state == "failed":
                console.print("[red]❌ Volume modification failed[/red]")
                raise typer.Exit(1)

            time.sleep(2)  # Check more frequently for better UX

        # If offline resize, start the instance back up.  This uses the state
        # recorded BEFORE the stop above, so only engines that were running
        # when the command started get restarted.
        if not online and engine["state"].lower() == "running":
            console.print("Starting engine back up...")
            start_response = make_api_request(
                "POST", f"/engines/{engine['instance_id']}/start"
            )
            if start_response.status_code != 200:
                console.print(
                    "[yellow]⚠️ Failed to restart engine automatically[/yellow]"
                )
                console.print(
                    f"Please start it manually: [cyan]dh engine start {engine['name']}[/cyan]"
                )
            else:
                console.print("[green]✓ Engine started[/green]")
                console.print("The filesystem will be automatically expanded on boot.")

        elif online and engine["state"].lower() == "running":
            # Online resize grows only the block device; the user must grow the
            # partition and filesystem by hand.
            console.print(
                "\n[yellow]⚠️ Online resize complete. You must now expand the filesystem:[/yellow]"
            )
            console.print(f"1. SSH into the engine: [cyan]ssh {engine['name']}[/cyan]")
            console.print("2. Find the root device: [cyan]lsblk[/cyan]")
            console.print(
                "3. Expand the partition: [cyan]sudo growpart /dev/nvme0n1 1[/cyan] (adjust device name as needed)"
            )
            console.print("4. Expand the filesystem: [cyan]sudo xfs_growfs /[/cyan]")

    except ClientError as e:
        console.print(f"[red]❌ Failed to resize engine: {e}[/red]")
        raise typer.Exit(1)
1778
-
1779
-
1780
@engine_app.command("gami")
def create_ami(
    name_or_id: str = typer.Argument(
        help="Engine name or instance ID to create AMI from"
    ),
):
    """Create a 'Golden AMI' from a running engine.

    This process is for creating a pre-warmed, standardized machine image
    that can be used to launch new engines more quickly.

    IMPORTANT:
    - The engine MUST have all studios detached before running this command.
    - This process will make the source engine unusable. You should
      plan to TERMINATE the engine after the AMI is created.
    """
    check_aws_sso()

    # Get all engines to resolve name and check status.
    # check_ready=True makes the API include attached-studio details,
    # which we need for the pre-flight checks below.
    response = make_api_request("GET", "/engines", params={"check_ready": "true"})
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    # --- Pre-flight checks ---

    # 1. The engine must be running so the cleanup SSM commands can execute.
    if engine["state"].lower() != "running":
        console.print(f"[red]❌ Engine '{engine['name']}' is not running.[/red]")
        console.print("Please start it before creating an AMI.")
        raise typer.Exit(1)

    # 2. Studios must be detached: snapshotting with a studio volume attached
    #    would bake user data into the shared image.
    attached_studios = engine.get("studios", [])
    if attached_studios:
        console.print(
            f"[bold red]❌ Engine '{engine['name']}' has studios attached.[/bold red]"
        )
        console.print("Please detach all studios before creating an AMI:")
        for studio in attached_studios:
            console.print(f" - {studio['user']} ({studio['studio_id']})")
        console.print("\nTo detach, run [bold]dh studio detach[/bold]")
        raise typer.Exit(1)

    # Construct AMI name and description (date-stamped, per engine type).
    ami_name = (
        f"prewarmed-engine-{engine['engine_type']}-{datetime.now().strftime('%Y%m%d')}"
    )
    description = (
        f"Amazon Linux 2023 with NVIDIA drivers, Docker, and pre-pulled "
        f"dev container image for {engine['engine_type']} engines"
    )

    console.print(f"Creating AMI from engine [cyan]{engine['name']}[/cyan]...")
    console.print(f"[bold]AMI Name:[/] {ami_name}")
    console.print(f"[bold]Description:[/] {description}")

    console.print(
        "\n[bold yellow]⚠️ Important:[/bold yellow]\n"
        "1. This process will run cleanup scripts on the engine.\n"
        "2. The source engine should be [bold]terminated[/bold] after the AMI is created.\n"
    )

    if not Confirm.ask("Continue with AMI creation?"):
        raise typer.Exit()

    # Create AMI using EC2 client directly, as the backend logic is too complex
    ec2 = boto3.client("ec2", region_name="us-east-1")
    ssm = boto3.client("ssm", region_name="us-east-1")

    try:
        # Clean up instance state before snapshotting so the image doesn't
        # carry over per-instance artifacts (sentinels, logs, SSM identity).
        console.print("Cleaning up instance for AMI creation...")
        cleanup_commands = [
            "sudo rm -f /opt/dayhoff/first_boot_complete.sentinel",
            "history -c",
            "sudo rm -rf /tmp/* /var/log/messages /var/log/cloud-init.log",
            "sudo rm -rf /var/lib/amazon/ssm/* /etc/amazon/ssm/*",
            "sleep 2 && sudo systemctl stop amazon-ssm-agent &",  # Stop agent last
        ]

        # Fire-and-forget: the send_command result was previously bound to an
        # unused local; we never poll it because the agent stops mid-command.
        ssm.send_command(
            InstanceIds=[engine["instance_id"]],
            DocumentName="AWS-RunShellScript",
            Parameters={"commands": cleanup_commands, "executionTimeout": ["120"]},
        )

        # Acknowledge that the SSM command might be in progress as the agent shuts down
        console.print(
            "[dim]ℹ️ Cleanup command sent (status may show 'InProgress' as SSM agent stops)[/dim]"
        )

        # Create the AMI. NoReboot=False: AWS reboots the instance for a
        # filesystem-consistent snapshot.
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            transient=True,
        ) as progress:
            task = progress.add_task(
                "Creating AMI (this will take several minutes)...", total=None
            )

            response = ec2.create_image(
                InstanceId=engine["instance_id"],
                Name=ami_name,
                Description=description,
                NoReboot=False,
                TagSpecifications=[
                    {
                        "ResourceType": "image",
                        "Tags": [
                            {"Key": "Environment", "Value": "dev"},
                            {"Key": "Type", "Value": "golden-ami"},
                            {"Key": "EngineType", "Value": engine["engine_type"]},
                            {"Key": "Name", "Value": ami_name},
                        ],
                    }
                ],
            )

            ami_id = response["ImageId"]
            progress.update(
                task,
                completed=True,
                # Plain string: the original used an f-string with no placeholders.
                description="[green]✓ AMI creation initiated![/green]",
            )

        console.print(f" [bold]AMI ID:[/] {ami_id}")
        console.print("\nThe AMI creation process will continue in the background.")
        console.print("You can monitor progress in the EC2 Console under 'AMIs'.")
        console.print(
            "\nOnce complete, update the AMI ID in [bold]terraform/environments/dev/variables.tf[/bold] "
            "and run [bold]terraform apply[/bold]."
        )
        console.print(
            f"\nRemember to [bold red]terminate the source engine '{engine['name']}'[/bold red] to save costs."
        )

    except ClientError as e:
        console.print(f"[red]❌ Failed to create AMI: {e}[/red]")
        raise typer.Exit(1)
1925
-
1926
-
1927
- # ==================== STUDIO COMMANDS ====================
1928
-
1929
-
1930
def get_user_studio(username: str) -> Optional[Dict]:
    """Return the studio record owned by *username*, or None.

    None is also returned when the studio listing API call fails, so
    callers cannot distinguish "no studio" from "API error" here.
    """
    resp = make_api_request("GET", "/studios")
    if resp.status_code != 200:
        return None

    # Return the first record owned by this user (users have at most one).
    for record in resp.json().get("studios", []):
        if record["user"] == username:
            return record
    return None
1940
-
1941
-
1942
@studio_app.command("create")
def create_studio(
    size_gb: int = typer.Option(50, "--size", "-s", help="Studio size in GB"),
):
    """Create a new studio (persistent EBS-backed volume) for the current user.

    A user may own at most one studio; if one already exists this is a no-op
    that prints the existing studio's ID.
    """
    username = check_aws_sso()

    # One studio per user: bail out early if one already exists.
    existing = get_user_studio(username)
    if existing:
        console.print(
            f"[yellow]You already have a studio: {existing['studio_id']}[/yellow]"
        )
        return

    console.print(f"Creating {size_gb}GB studio for user [cyan]{username}[/cyan]...")

    # Spinner only while the (synchronous) creation request is in flight.
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        transient=True,
    ) as progress:
        progress.add_task("Creating studio volume...", total=None)

        response = make_api_request(
            "POST",
            "/studios",
            json_data={"user": username, "size_gb": size_gb},
        )

    # Report outside the Progress context so the spinner is cleared first.
    if response.status_code == 201:
        data = response.json()
        # Plain strings below: the originals were f-strings with no placeholders.
        console.print("[green]✓ Studio created successfully![/green]")
        console.print(f"Studio ID: [cyan]{data['studio_id']}[/cyan]")
        console.print(f"Size: {data['size_gb']}GB")
        console.print("\nNext step: [cyan]dh studio attach <engine-name>[/cyan]")
    else:
        error = response.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to create studio: {error}[/red]")
1981
-
1982
-
1983
@studio_app.command("status")
def studio_status(
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Check status for a different user (admin only)"
    ),
):
    """Show status of your studio."""
    username = check_aws_sso()

    # Admins may inspect another user's studio via --user.
    target_user = user or username
    if target_user != username:
        console.print(
            f"[yellow]⚠️ Checking studio status for user: {target_user}[/yellow]"
        )

    studio = get_user_studio(target_user)
    if not studio:
        if target_user == username:
            console.print("[yellow]You don't have a studio yet.[/yellow]")
            console.print("Create one with: [cyan]dh studio create[/cyan]")
        else:
            console.print(f"[yellow]User {target_user} doesn't have a studio.[/yellow]")
        return

    # Colorize the lifecycle state ("in-use" is surfaced as "attached").
    status = studio["status"]
    if status == "in-use":
        status_display = "[bright_blue]attached[/bright_blue]"
    elif status in ("attaching", "detaching"):
        status_display = f"[yellow]{status}[/yellow]"
    else:
        status_display = f"[green]{status}[/green]"

    detail_lines = [
        f"[bold]Studio ID:[/bold] {studio['studio_id']}",
        f"[bold]User:[/bold] {studio['user']}",
        f"[bold]Status:[/bold] {status_display}",
        f"[bold]Size:[/bold] {studio['size_gb']}GB",
        f"[bold]Created:[/bold] {studio['creation_date']}",
    ]

    if studio.get("attached_vm_id"):
        detail_lines.append(f"[bold]Attached to:[/bold] {studio['attached_vm_id']}")

        # Best-effort: resolve the instance ID to a friendly engine name.
        response = make_api_request("GET", "/engines")
        if response.status_code == 200:
            for candidate in response.json().get("engines", []):
                if candidate["instance_id"] == studio["attached_vm_id"]:
                    detail_lines.append(
                        f"[bold]Engine Name:[/bold] {candidate['name']}"
                    )
                    break

    console.print(
        Panel(
            "\n".join(detail_lines),
            title="Studio Details",
            border_style="blue",
        )
    )
2050
-
2051
-
2052
def _is_studio_attached(target_studio_id: str, target_vm_id: str) -> bool:
    """Return True when the given studio already shows as attached to the VM.

    Using this extra check lets us stop the outer retry loop as soon as the
    asynchronous attach operation actually finishes, even in the unlikely
    event that the operation-tracking DynamoDB record is not yet updated.
    """

    def _attached_here(record) -> bool:
        # A record counts only when it is both in-use AND on the target VM.
        return (
            record.get("status") == "in-use"
            and record.get("attached_vm_id") == target_vm_id
        )

    # First try the per-studio endpoint – fastest.
    direct = make_api_request("GET", f"/studios/{target_studio_id}")
    if direct.status_code == 200 and _attached_here(direct.json()):
        return True

    # Fallback: list + filter (covers edge-cases where the direct endpoint
    # is slower to update IAM/APIGW mapping than the list endpoint).
    listing = make_api_request("GET", "/studios")
    if listing.status_code == 200:
        return any(
            record.get("studio_id") == target_studio_id and _attached_here(record)
            for record in listing.json().get("studios", [])
        )

    return False
2080
-
2081
-
2082
@studio_app.command("attach")
def attach_studio(
    engine_name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Attach a different user's studio (admin only)"
    ),
):
    """Attach your studio to an engine.

    Handles the full flow: optional studio creation, detach-and-reattach if
    already attached elsewhere, starting a stopped engine, and a retry loop
    with exponential backoff while the engine finishes booting.
    """
    username = check_aws_sso()

    # Check for Session Manager Plugin since we'll update SSH config
    if not check_session_manager_plugin():
        raise typer.Exit(1)

    # Use specified user if provided, otherwise use current user
    target_user = user if user else username

    # Add confirmation when attaching another user's studio
    if target_user != username:
        console.print(f"[yellow]⚠️ Managing studio for user: {target_user}[/yellow]")
        if not Confirm.ask(f"Are you sure you want to attach {target_user}'s studio?"):
            console.print("Operation cancelled.")
            return

    # Get user's studio; offer to create one on the fly for the current user.
    studio = get_user_studio(target_user)
    if not studio:
        if target_user == username:
            console.print("[yellow]You don't have a studio yet.[/yellow]")
            if Confirm.ask("Would you like to create one now?"):
                size = IntPrompt.ask("Studio size (GB)", default=50)
                response = make_api_request(
                    "POST",
                    "/studios",
                    json_data={"user": username, "size_gb": size},
                )
                if response.status_code != 201:
                    console.print("[red]❌ Failed to create studio[/red]")
                    raise typer.Exit(1)
                # The creation response body is the studio record itself.
                # (A previous no-op "normalize key" self-assignment was removed.)
                studio = response.json()
            else:
                raise typer.Exit(0)
        else:
            console.print(f"[red]❌ User {target_user} doesn't have a studio.[/red]")
            raise typer.Exit(1)

    # If already attached somewhere, offer to move it (detach first).
    if studio.get("status") == "in-use":
        console.print(
            f"[yellow]Studio is already attached to {studio.get('attached_vm_id')}[/yellow]"
        )
        if not Confirm.ask("Detach and reattach to new engine?"):
            return
        response = make_api_request("POST", f"/studios/{studio['studio_id']}/detach")
        if response.status_code != 200:
            console.print("[red]❌ Failed to detach studio[/red]")
            raise typer.Exit(1)

    # Get all engines to resolve name
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(engine_name_or_id, engines)

    # Flag to track if we started the engine in this command (affects retry length)
    engine_started_now: bool = False

    if engine["state"].lower() != "running":
        console.print(f"[yellow]⚠️ Engine is {engine['state']}[/yellow]")
        if engine["state"].lower() == "stopped" and Confirm.ask(
            "Start the engine first?"
        ):
            response = make_api_request(
                "POST", f"/engines/{engine['instance_id']}/start"
            )
            if response.status_code != 200:
                console.print("[red]❌ Failed to start engine[/red]")
                raise typer.Exit(1)
            console.print("[green]✓ Engine started[/green]")
            # Mark that we booted the engine so attach loop gets extended retries.
            # No further waiting here – attachment attempts below handle retry
            # logic while the engine finishes booting.
            engine_started_now = True
        else:
            raise typer.Exit(1)

    # Retrieve SSH public key (required for authorised_keys provisioning)
    try:
        public_key = get_ssh_public_key()
    except FileNotFoundError as e:
        console.print(f"[red]❌ {e}[/red]")
        raise typer.Exit(1)

    console.print(f"Attaching studio to engine [cyan]{engine['name']}[/cyan]...")

    # Determine retry strategy based on whether we just started the engine.
    if engine_started_now:
        max_attempts = 40  # About 7 minutes total with exponential backoff
        base_delay = 8
        max_delay = 20
    else:
        max_attempts = 15  # About 2 minutes total with exponential backoff
        base_delay = 5
        max_delay = 10

    # Unified retry loop with exponential backoff
    with Progress(
        SpinnerColumn(),
        TimeElapsedColumn(),
        TextColumn("[progress.description]{task.description}"),
        transient=True,
    ) as prog:
        desc = (
            "Attaching studio (engine is still booting)…"
            if engine_started_now
            else "Attaching studio…"
        )
        task = prog.add_task(desc, total=None)

        consecutive_not_ready = 0
        last_error = None

        for attempt in range(max_attempts):
            # Check if the attach already completed (e.g. a previous async
            # request finished while we were sleeping).
            if _is_studio_attached(studio["studio_id"], engine["instance_id"]):
                success = True
                break

            success, error_msg = _attempt_studio_attach(
                studio, engine, target_user, public_key
            )

            if success:
                break  # success!

            if error_msg:
                # Fatal error – bubble up immediately
                console.print(f"[red]❌ Failed to attach studio: {error_msg}[/red]")

                # Suggest repair command if engine seems broken
                if "not ready" in error_msg.lower() and attempt > 5:
                    console.print(
                        "\n[yellow]Engine may be in a bad state. Try:[/yellow]"
                    )
                    console.print(f"[dim] dh engine repair {engine['name']}[/dim]")
                return

            # Track consecutive "not ready" responses
            consecutive_not_ready += 1
            last_error = "Engine not ready"

            # Update progress display every few attempts
            if attempt % 3 == 0:
                prog.update(
                    task,
                    description=f"{desc} attempt {attempt+1}/{max_attempts}",
                )

            # If engine seems stuck after many attempts, show a hint
            if consecutive_not_ready > 10 and attempt == 10:
                console.print(
                    "[yellow]Engine is taking longer than expected to become ready.[/yellow]"
                )
                console.print(
                    "[dim]This can happen after GAMI creation or if the engine is still bootstrapping.[/dim]"
                )

            # Exponential backoff with jitter
            delay = min(base_delay * (1.5 ** min(attempt, 5)), max_delay)
            delay += time.time() % 2  # Add 0-2 seconds of jitter
            time.sleep(delay)

        else:
            # All attempts exhausted
            console.print(
                f"[yellow]Engine is not becoming ready after {max_attempts} attempts.[/yellow]"
            )
            if last_error:
                console.print(f"[dim]Last issue: {last_error}[/dim]")
            console.print("\n[yellow]You can try:[/yellow]")
            console.print(
                f" 1. Wait a minute and retry: [cyan]dh studio attach {engine['name']}[/cyan]"
            )
            console.print(
                f" 2. Check engine status: [cyan]dh engine status {engine['name']}[/cyan]"
            )
            console.print(
                f" 3. Repair the engine: [cyan]dh engine repair {engine['name']}[/cyan]"
            )
            return

    # Successful attach path
    console.print("[green]✓ Studio attached successfully![/green]")

    # Update SSH config - use target_user for the connection
    update_ssh_config_entry(engine["name"], engine["instance_id"], target_user)
    console.print("[green]✓ SSH config updated[/green]")
    console.print(f"\nConnect with: [cyan]ssh {engine['name']}[/cyan]")
    console.print(f"Files are at: [cyan]/studios/{target_user}[/cyan]")
2286
-
2287
-
2288
def _attempt_studio_attach(studio, engine, target_user, public_key):
    """Make one attach request and classify the outcome.

    Returns a ``(success, fatal_error)`` pair:
      * ``(True, None)``  – the studio is attached to the target engine.
      * ``(False, None)`` – transient problem; the caller should retry.
      * ``(False, msg)``  – fatal error; the caller should abort with *msg*.
    """
    resp = make_api_request(
        "POST",
        f"/studios/{studio['studio_id']}/attach",
        json_data={
            "vm_id": engine["instance_id"],
            "user": target_user,
            "public_key": public_key,
        },
    )

    # Fast-path success
    if resp.status_code == 200:
        return True, None

    # Asynchronous path – API returned 202 Accepted and operation tracking ID.
    # The operation status polling is broken in the Lambda, so we just wait
    # and check whether the studio actually shows up as attached.
    if resp.status_code == 202:
        time.sleep(5)  # Give the async operation a moment to start
        for _ in range(20):  # Check for up to 60 seconds
            if _is_studio_attached(studio["studio_id"], engine["instance_id"]):
                return True, None
            time.sleep(3)
        # Didn't complete in reasonable time – let the outer loop retry.
        return False, None

    # --- classify the error response ---
    error_text = resp.json().get("error", "Unknown error")
    err_msg = error_text.lower()

    # "Studio is not available (status: in-use)" means it's already attached
    # somewhere – success if it's THIS engine, fatal otherwise.
    if (
        resp.status_code == 400
        and "not available" in err_msg
        and "in-use" in err_msg
    ):
        if _is_studio_attached(studio["studio_id"], engine["instance_id"]):
            return True, None
        return False, error_text

    # Conflict / service-unavailable are always worth retrying.
    if resp.status_code in (409, 503):
        return False, None

    FATAL_PATTERNS = ("permission",)
    RECOVERABLE_PATTERNS = (
        "not ready",
        "still starting",
        "initializing",
        "failed to mount",
        "device busy",
        "pending",  # VM state pending
    )

    # Fatal substrings win over recoverable ones.
    if any(pattern in err_msg for pattern in FATAL_PATTERNS):
        return False, error_text
    if any(pattern in err_msg for pattern in RECOVERABLE_PATTERNS):
        return False, None

    # Unrecognized error – treat as fatal and abort immediately.
    return False, error_text
2360
-
2361
-
2362
- # Note: _poll_operation was removed because the Lambda's operation tracking is broken.
2363
- # We now use _is_studio_attached() to check if the studio is actually attached instead.
2364
-
2365
-
2366
@studio_app.command("detach")
def detach_studio(
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Detach a different user's studio (admin only)"
    ),
):
    """Detach your studio from its current engine."""
    username = check_aws_sso()

    # Admins may act on another user's studio via --user.
    target_user = user or username
    acting_on_self = target_user == username

    # Require explicit confirmation for the admin path.
    if not acting_on_self:
        console.print(f"[yellow]⚠️ Managing studio for user: {target_user}[/yellow]")
        if not Confirm.ask(f"Are you sure you want to detach {target_user}'s studio?"):
            console.print("Operation cancelled.")
            return

    studio = get_user_studio(target_user)
    if not studio:
        if acting_on_self:
            console.print("[yellow]You don't have a studio.[/yellow]")
        else:
            console.print(f"[yellow]User {target_user} doesn't have a studio.[/yellow]")
        return

    # Nothing to do unless the studio is currently attached.
    if studio.get("status") != "in-use":
        if acting_on_self:
            console.print("[yellow]Your studio is not attached to any engine.[/yellow]")
        else:
            console.print(
                f"[yellow]{target_user}'s studio is not attached to any engine.[/yellow]"
            )
        return

    console.print(f"Detaching studio from {studio.get('attached_vm_id')}...")

    response = make_api_request("POST", f"/studios/{studio['studio_id']}/detach")
    if response.status_code == 200:
        console.print("[green]✓ Studio detached successfully![/green]")
    else:
        error = response.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to detach studio: {error}[/red]")
2411
-
2412
-
2413
@studio_app.command("delete")
def delete_studio(
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Delete a different user's studio (admin only)"
    ),
):
    """Delete your studio permanently."""
    username = check_aws_sso()

    # Admins may target another user's studio via --user.
    target_user = user or username
    is_own_studio = target_user == username

    # Extra warning when deleting another user's studio.
    if not is_own_studio:
        console.print(
            f"[red]⚠️ ADMIN ACTION: Deleting studio for user: {target_user}[/red]"
        )

    studio = get_user_studio(target_user)
    if not studio:
        if is_own_studio:
            console.print("[yellow]You don't have a studio to delete.[/yellow]")
        else:
            console.print(
                f"[yellow]User {target_user} doesn't have a studio to delete.[/yellow]"
            )
        return

    console.print(
        "[red]⚠️ WARNING: This will permanently delete the studio and all data![/red]"
    )
    console.print(f"Studio ID: {studio['studio_id']}")
    console.print(f"User: {target_user}")
    console.print(f"Size: {studio['size_gb']}GB")

    # Three-step confirmation: question, irreversibility prompt, typed token.
    first_question = (
        f"\nAre you sure you want to delete {target_user}'s studio?"
        if not is_own_studio
        else "\nAre you sure you want to delete your studio?"
    )
    if not Confirm.ask(first_question):
        console.print("Deletion cancelled.")
        return

    if not Confirm.ask("[red]This action cannot be undone. Continue?[/red]"):
        console.print("Deletion cancelled.")
        return

    if Prompt.ask('Type "DELETE" to confirm permanent deletion') != "DELETE":
        console.print("Deletion cancelled.")
        return

    response = make_api_request("DELETE", f"/studios/{studio['studio_id']}")
    if response.status_code == 200:
        console.print("[green]✓ Studio deleted successfully![/green]")
    else:
        error = response.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to delete studio: {error}[/red]")
2473
-
2474
-
2475
@studio_app.command("list")
def list_studios(
    all_users: bool = typer.Option(
        False, "--all", "-a", help="Show all users' studios"
    ),
):
    """List studios.

    By default only the current user's studios are shown; pass --all/-a to
    list every user's studio.
    """
    username = check_aws_sso()

    response = make_api_request("GET", "/studios")

    if response.status_code != 200:
        error = response.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to list studios: {error}[/red]")
        return

    studios = response.json().get("studios", [])

    # Bug fix: the --all flag was previously accepted but ignored, so the
    # command always listed every user's studio. Honor the documented default.
    if not all_users:
        studios = [s for s in studios if s["user"] == username]

    if not studios:
        console.print("No studios found.")
        return

    # Get all engines to map instance IDs to names
    engines_response = make_api_request("GET", "/engines")
    engines = {}
    if engines_response.status_code == 200:
        for engine in engines_response.json().get("engines", []):
            engines[engine["instance_id"]] = engine["name"]

    # Create table
    table = Table(title="Studios", box=box.ROUNDED)
    table.add_column("Studio ID", style="cyan")
    table.add_column("User")
    table.add_column("Status")
    table.add_column("Size", justify="right")
    table.add_column("Disk Usage", justify="right")
    table.add_column("Attached To")

    for studio in studios:
        # Surface "in-use" as "attached"; transitional states in yellow.
        if studio["status"] == "in-use":
            status_display = "[bright_blue]attached[/bright_blue]"
        elif studio["status"] in ["attaching", "detaching"]:
            status_display = "[yellow]" + studio["status"] + "[/yellow]"
        else:
            status_display = "[green]available[/green]"

        # Format attached engine info
        attached_to = "-"
        disk_usage = "?/?"
        if studio.get("attached_vm_id"):
            vm_id = studio["attached_vm_id"]
            engine_name = engines.get(vm_id, "unknown")
            attached_to = f"{engine_name} ({vm_id})"

            # Try to get disk usage if attached (best-effort via SSM).
            if studio["status"] == "in-use":
                usage = get_studio_disk_usage_via_ssm(vm_id, studio["user"])
                if usage:
                    disk_usage = usage

        table.add_row(
            studio["studio_id"],
            studio["user"],
            status_display,
            f"{studio['size_gb']}GB",
            disk_usage,
            attached_to,
        )

    console.print(table)
2545
-
2546
-
2547
@studio_app.command("reset")
def reset_studio(
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Reset a different user's studio"
    ),
):
    """Reset a stuck studio (admin operation).

    Bypasses the studio API entirely: optionally force-detaches the EBS
    volume via EC2, then writes the record directly in DynamoDB to mark the
    studio "available". Use when the normal attach/detach flow wedged the
    studio in a transitional state.
    """
    username = check_aws_sso()

    # Use specified user if provided, otherwise use current user
    target_user = user if user else username

    # Add warning when resetting another user's studio
    if target_user != username:
        console.print(f"[yellow]⚠️ Resetting studio for user: {target_user}[/yellow]")

    studio = get_user_studio(target_user)
    if not studio:
        if target_user == username:
            console.print("[yellow]You don't have a studio.[/yellow]")
        else:
            console.print(f"[yellow]User {target_user} doesn't have a studio.[/yellow]")
        return

    # Show current state so the operator can judge whether a reset is safe.
    console.print(f"[yellow]⚠️ This will force-reset the studio state[/yellow]")
    console.print(f"Current status: {studio['status']}")
    if studio.get("attached_vm_id"):
        console.print(f"Listed as attached to: {studio['attached_vm_id']}")

    if not Confirm.ask("\nReset studio state?"):
        console.print("Reset cancelled.")
        return

    # Direct DynamoDB update
    console.print("Resetting studio state...")

    dynamodb = boto3.resource("dynamodb", region_name="us-east-1")
    table = dynamodb.Table("dev-studios")

    try:
        # Check if volume is actually attached.
        # NOTE: studio_id doubles as the EBS volume ID here — the
        # describe_volumes call below relies on that convention.
        ec2 = boto3.client("ec2", region_name="us-east-1")
        volumes = ec2.describe_volumes(VolumeIds=[studio["studio_id"]])

        if volumes["Volumes"]:
            volume = volumes["Volumes"][0]
            attachments = volume.get("Attachments", [])
            if attachments:
                console.print(
                    f"[red]Volume is still attached to {attachments[0]['InstanceId']}![/red]"
                )
                # Force=True rips the volume away without the guest OS
                # unmounting it — data loss is possible; hence the prompt.
                if Confirm.ask("Force-detach the volume?"):
                    ec2.detach_volume(
                        VolumeId=studio["studio_id"],
                        InstanceId=attachments[0]["InstanceId"],
                        Force=True,
                    )
                    # Block until EC2 reports the volume free, so the
                    # DynamoDB record below doesn't lie about availability.
                    console.print("Waiting for volume to detach...")
                    waiter = ec2.get_waiter("volume_available")
                    waiter.wait(VolumeIds=[studio["studio_id"]])

        # Reset in DynamoDB – align attribute names with Studio Manager backend.
        # "#st" alias is needed because Status is a DynamoDB reserved word;
        # the None values are stored as DynamoDB NULLs.
        table.update_item(
            Key={"StudioID": studio["studio_id"]},
            UpdateExpression="SET #st = :status, AttachedVMID = :vm_id, AttachedDevice = :device",
            ExpressionAttributeNames={"#st": "Status"},
            ExpressionAttributeValues={
                ":status": "available",
                ":vm_id": None,
                ":device": None,
            },
        )

        console.print(f"[green]✓ Studio reset to available state![/green]")

    except ClientError as e:
        console.print(f"[red]❌ Failed to reset studio: {e}[/red]")
2624
-
2625
-
2626
@studio_app.command("resize")
def resize_studio(
    size: int = typer.Option(..., "--size", "-s", help="New size in GB"),
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Resize a different user's studio (admin only)"
    ),
) -> None:
    """Resize your studio volume (requires detachment).

    Grows the EBS volume backing the studio to ``size`` GB via the studio API,
    then polls EC2 ``DescribeVolumesModifications`` until the size change
    itself completes (the AWS "optimizing" phase runs in the background and
    needs no action). If the studio is attached, the user is prompted to
    detach it first. The filesystem is expanded on the next attach, not here.

    Args:
        size: New volume size in GB; must be strictly larger than the current
            size (EBS volumes can grow but never shrink).
        user: Resize another user's studio (admin use); defaults to the caller.
    """
    username = check_aws_sso()

    # Use specified user if provided, otherwise use current user
    target_user = user if user else username

    # Add warning when resizing another user's studio
    if target_user != username:
        console.print(f"[yellow]⚠️ Resizing studio for user: {target_user}[/yellow]")

    studio = get_user_studio(target_user)
    if not studio:
        if target_user == username:
            console.print("[yellow]You don't have a studio yet.[/yellow]")
        else:
            console.print(f"[yellow]User {target_user} doesn't have a studio.[/yellow]")
        return

    current_size = studio["size_gb"]

    # EBS only supports growing a volume; reject equal or smaller targets.
    if size <= current_size:
        console.print(
            f"[red]❌ New size ({size}GB) must be larger than current size ({current_size}GB)[/red]"
        )
        raise typer.Exit(1)

    # Check if studio is attached; a volume cannot be resized safely in-use here.
    if studio["status"] == "in-use":
        console.print("[yellow]⚠️ Studio must be detached before resizing[/yellow]")
        console.print(f"Currently attached to: {studio.get('attached_vm_id')}")

        if not Confirm.ask("\nDetach studio and proceed with resize?"):
            console.print("Resize cancelled.")
            return

        # Detach the studio
        console.print("Detaching studio...")
        response = make_api_request("POST", f"/studios/{studio['studio_id']}/detach")
        if response.status_code != 200:
            console.print("[red]❌ Failed to detach studio[/red]")
            raise typer.Exit(1)

        console.print("[green]✓ Studio detached[/green]")

        # Wait a moment for detachment to complete
        time.sleep(5)

    console.print(f"[yellow]Resizing studio from {current_size}GB to {size}GB[/yellow]")

    # Call the resize API (the studio_id doubles as the EBS volume ID below).
    resize_response = make_api_request(
        "POST", f"/studios/{studio['studio_id']}/resize", json_data={"size": size}
    )

    if resize_response.status_code != 200:
        error = resize_response.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to resize studio: {error}[/red]")
        raise typer.Exit(1)

    # Wait for volume modification to complete
    ec2 = boto3.client("ec2", region_name="us-east-1")
    console.print("Resizing volume...")

    # Track progress so we only print when the percentage advances.
    last_progress = 0

    # Poll DescribeVolumesModifications until the resize phase is done.
    # NOTE(review): this loop has no upper bound; it relies on AWS eventually
    # reporting completed/optimizing/failed (or dropping the record).
    while True:
        try:
            mod_state = ec2.describe_volumes_modifications(
                VolumeIds=[studio["studio_id"]]
            )
            if not mod_state["VolumesModifications"]:
                break  # Modification complete

            modification = mod_state["VolumesModifications"][0]
            state = modification["ModificationState"]
            progress = modification.get("Progress", 0)

            # Show progress updates only for the resize phase
            if state == "modifying" and progress > last_progress:
                console.print(f"[yellow]Progress: {progress}%[/yellow]")
                last_progress = progress

            # Exit as soon as optimization starts (resize is complete)
            if state == "optimizing":
                console.print(
                    f"[green]✓ Studio resized successfully to {size}GB![/green]"
                )
                console.print(
                    "[dim]AWS is optimizing the volume in the background (no action needed).[/dim]"
                )
                break

            if state == "completed":
                console.print(
                    f"[green]✓ Studio resized successfully to {size}GB![/green]"
                )
                break
            elif state == "failed":
                console.print("[red]❌ Volume modification failed[/red]")
                raise typer.Exit(1)

            time.sleep(2)  # Check more frequently for better UX

        except ClientError:
            # Modification record no longer queryable — assume it completed.
            console.print(f"[green]✓ Studio resized successfully to {size}GB![/green]")
            break

    console.print(
        "\n[dim]The filesystem will be automatically expanded when you next attach the studio.[/dim]"
    )
    console.print(f"To attach: [cyan]dh studio attach <engine-name>[/cyan]")
2746
-
2747
-
2748
- # ================= Idle timeout command =================
2749
-
2750
-
2751
def _parse_idle_duration(spec: str) -> int:
    """Parse a duration like '2h', '45m', or '1h30m' into seconds.

    Returns -1 when the string does not match the ``<N>h<N>m`` format at all,
    and 0 when it matches but specifies a zero-length duration; callers report
    these two cases with different error messages.
    """
    m = re.match(r"^(?:(\d+)h)?(?:(\d+)m)?$", spec)
    if not m:
        return -1
    return int(m.group(1) or 0) * 3600 + int(m.group(2) or 0) * 60


@engine_app.command("idle")
def idle_timeout_cmd(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    set: Optional[str] = typer.Option(
        None, "--set", "-s", help="New timeout (e.g., 2h30m, 45m)"
    ),
) -> None:
    """Show or set the engine idle-detector timeout.

    Without ``--set``, reads IDLE_TIMEOUT_SECONDS from /etc/engine.env on the
    engine (default 1800s when absent). With ``--set``, rewrites that entry
    and restarts the idle-detector service, then verifies the SSM command
    actually succeeded before reporting success.
    """
    check_aws_sso()

    # Resolve engine name or instance ID against the API's engine list.
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    ssm = boto3.client("ssm", region_name="us-east-1")

    if set is None:
        # Show current timeout setting (grep falls back to the 1800s default).
        resp = ssm.send_command(
            InstanceIds=[engine["instance_id"]],
            DocumentName="AWS-RunShellScript",
            Parameters={
                "commands": [
                    "grep -E '^IDLE_TIMEOUT_SECONDS=' /etc/engine.env || echo 'IDLE_TIMEOUT_SECONDS=1800'"
                ],
                "executionTimeout": ["10"],
            },
        )
        cid = resp["Command"]["CommandId"]
        time.sleep(1)
        inv = ssm.get_command_invocation(
            CommandId=cid, InstanceId=engine["instance_id"]
        )
        if inv["Status"] == "Success":
            line = inv["StandardOutputContent"].strip()
            secs = int(line.split("=")[1]) if "=" in line else 1800
            console.print(f"Current idle timeout: {secs//60}m ({secs} seconds)")
        else:
            console.print("[red]❌ Could not retrieve idle timeout[/red]")
        return

    # ----- set new value -----
    seconds = _parse_idle_duration(set)
    if seconds < 0:
        console.print("[red]❌ Invalid duration format. Use e.g. 2h, 45m, 1h30m[/red]")
        raise typer.Exit(1)
    if seconds == 0:
        console.print("[red]❌ Duration must be greater than zero[/red]")
        raise typer.Exit(1)

    console.print(f"Setting idle timeout to {set} ({seconds} seconds)…")

    # Remove any existing entry, append the new one, and restart the detector
    # so the new timeout takes effect immediately.
    cmd = (
        "sudo sed -i '/^IDLE_TIMEOUT_SECONDS=/d' /etc/engine.env && "
        f"echo 'IDLE_TIMEOUT_SECONDS={seconds}' | sudo tee -a /etc/engine.env >/dev/null && "
        "sudo systemctl restart engine-idle-detector.service"
    )

    resp = ssm.send_command(
        InstanceIds=[engine["instance_id"]],
        DocumentName="AWS-RunShellScript",
        Parameters={"commands": [cmd], "executionTimeout": ["60"]},
    )
    cid = resp["Command"]["CommandId"]

    # Fix: the previous version printed success unconditionally after a fixed
    # 2s sleep. Poll the invocation until it reaches a terminal state instead.
    status = "Pending"
    for _ in range(30):
        time.sleep(1)
        try:
            inv = ssm.get_command_invocation(
                CommandId=cid, InstanceId=engine["instance_id"]
            )
        except ClientError:
            continue  # Invocation may not be registered yet right after send
        status = inv["Status"]
        if status in ("Success", "Failed", "Cancelled", "TimedOut"):
            break

    if status == "Success":
        console.print(f"[green]✓ Idle timeout updated to {set}[/green]")
    else:
        console.print(
            f"[red]❌ Failed to update idle timeout (status: {status})[/red]"
        )
        raise typer.Exit(1)
2826
-
2827
-
2828
# Diagnostics and repair commands (kept after the idle-timeout command).
2829
-
2830
-
2831
@engine_app.command("debug")
def debug_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
):
    """Debug engine bootstrap status and files.

    Runs a fixed series of read-only shell commands on the engine via SSM
    (stage/health/sentinel files, setup service status, bootstrap log tail,
    environment file) and prints each command's output, or FAILED/ERROR when
    a command cannot be executed.
    """
    check_aws_sso()

    # Look up the engine by name or instance ID via the engines API.
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engine = resolve_engine(name_or_id, response.json().get("engines", []))

    console.print(f"[bold]Debug info for {engine['name']}:[/bold]\n")

    ssm = boto3.client("ssm", region_name="us-east-1")

    # Each entry pairs a human-readable label with a shell command that dumps
    # one piece of bootstrap state (falling back to a marker when absent).
    checks = [
        (
            "Stage file",
            "cat /opt/dayhoff/state/engine-init.stage 2>/dev/null || cat /var/run/engine-init.stage 2>/dev/null || echo 'MISSING'",
        ),
        (
            "Health file",
            "cat /opt/dayhoff/state/engine-health.json 2>/dev/null || cat /var/run/engine-health.json 2>/dev/null || echo 'MISSING'",
        ),
        (
            "Sentinel file",
            "ls -la /opt/dayhoff/first_boot_complete.sentinel 2>/dev/null || echo 'MISSING'",
        ),
        (
            "Setup service",
            "systemctl status setup-aws-vm.service --no-pager || echo 'Service not found'",
        ),
        (
            "Bootstrap log tail",
            "tail -20 /var/log/engine-setup.log 2>/dev/null || echo 'No log'",
        ),
        ("Environment file", "cat /etc/engine.env 2>/dev/null || echo 'MISSING'"),
    ]

    instance_id = engine["instance_id"]
    for label, shell_cmd in checks:
        try:
            sent = ssm.send_command(
                InstanceIds=[instance_id],
                DocumentName="AWS-RunShellScript",
                Parameters={"commands": [shell_cmd], "executionTimeout": ["10"]},
            )
            # Give SSM a beat to register the invocation before querying it.
            time.sleep(1)
            invocation = ssm.get_command_invocation(
                CommandId=sent["Command"]["CommandId"], InstanceId=instance_id
            )

            if invocation["Status"] != "Success":
                console.print(f"[cyan]{label}:[/cyan] [red]FAILED[/red]\n")
                continue

            output = invocation["StandardOutputContent"].strip()
            console.print(f"[cyan]{label}:[/cyan]")
            console.print(f"[dim]{output}[/dim]\n")

        except Exception as e:
            console.print(f"[cyan]{label}:[/cyan] [red]ERROR: {e}[/red]\n")
2898
-
2899
-
2900
@engine_app.command("repair")
def repair_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
):
    """Repair an engine that's stuck in a bad state (e.g., after GAMI creation).

    Re-creates the bootstrap directory layout, re-syncs helper scripts from
    S3, restores the first-boot sentinel and stage markers, and restarts the
    SSM agent and idle detector. The engine must be running; a stopped engine
    can optionally be started first.
    """
    check_aws_sso()

    # Get all engines to resolve name
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    if engine["state"].lower() != "running":
        console.print(
            f"[yellow]⚠️ Engine is {engine['state']}. Must be running to repair.[/yellow]"
        )
        if engine["state"].lower() == "stopped" and Confirm.ask(
            "Start the engine first?"
        ):
            response = make_api_request(
                "POST", f"/engines/{engine['instance_id']}/start"
            )
            if response.status_code != 200:
                console.print("[red]❌ Failed to start engine[/red]")
                raise typer.Exit(1)
            console.print("[green]✓ Engine started[/green]")
            console.print("Waiting for engine to become ready...")
            time.sleep(30)  # Give it time to boot
        else:
            raise typer.Exit(1)

    console.print(f"[bold]Repairing engine [cyan]{engine['name']}[/cyan][/bold]")
    console.print(
        "[dim]This will restore bootstrap state and ensure all services are running[/dim]\n"
    )

    ssm = boto3.client("ssm", region_name="us-east-1")

    # Repair commands, run in order by AWS-RunShellScript.
    repair_commands = [
        # Create necessary directories
        "sudo mkdir -p /opt/dayhoff /opt/dayhoff/state /opt/dayhoff/scripts",
        # Download scripts from S3 if missing
        "source /etc/engine.env && sudo aws s3 sync s3://${VM_SCRIPTS_BUCKET}/ /opt/dayhoff/scripts/ --exclude '*' --include '*.sh' --quiet",
        "sudo chmod +x /opt/dayhoff/scripts/*.sh 2>/dev/null || true",
        # Restore bootstrap state
        "sudo touch /opt/dayhoff/first_boot_complete.sentinel",
        "echo 'finished' | sudo tee /opt/dayhoff/state/engine-init.stage > /dev/null",
        # Ensure SSM agent is running
        "sudo systemctl restart amazon-ssm-agent 2>/dev/null || true",
        # Restart idle detector (service only)
        "sudo systemctl restart engine-idle-detector.service 2>/dev/null || true",
        # Report status
        "echo '=== Repair Complete ===' && echo 'Sentinel: ' && ls -la /opt/dayhoff/first_boot_complete.sentinel",
        "echo 'Stage: ' && cat /opt/dayhoff/state/engine-init.stage",
        "echo 'Scripts: ' && ls /opt/dayhoff/scripts/*.sh 2>/dev/null | wc -l",
    ]

    try:
        result: Optional[Dict[str, Any]] = None
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            transient=True,
        ) as progress:
            progress.add_task("Repairing engine...", total=None)

            response = ssm.send_command(
                InstanceIds=[engine["instance_id"]],
                DocumentName="AWS-RunShellScript",
                Parameters={
                    "commands": repair_commands,
                    "executionTimeout": ["60"],
                },
            )

            command_id = response["Command"]["CommandId"]

            # Poll until the invocation reaches a *terminal* state. The
            # previous check only recognized Success/Failed, so Cancelled or
            # TimedOut commands spun for the full minute and were then
            # reported with a non-terminal status.
            terminal_states = {"Success", "Failed", "Cancelled", "TimedOut"}
            for _ in range(60):
                time.sleep(1)
                result = ssm.get_command_invocation(
                    CommandId=command_id,
                    InstanceId=engine["instance_id"],
                )
                if result["Status"] in terminal_states:
                    break

        if result is not None and result["Status"] == "Success":
            output = result["StandardOutputContent"]
            console.print("[green]✓ Engine repaired successfully![/green]\n")

            # Show repair results (everything after the marker echoed above).
            if "=== Repair Complete ===" in output:
                repair_section = output.split("=== Repair Complete ===")[1].strip()
                console.print("[bold]Repair Results:[/bold]")
                console.print(repair_section)

            console.print(
                "\n[dim]You should now be able to attach studios to this engine.[/dim]"
            )
        else:
            # Prefer stderr from the instance; fall back to the SSM status
            # (e.g. TimedOut, or InProgress if polling was exhausted).
            detail = "Unknown error"
            if result is not None:
                detail = (
                    result.get("StandardErrorContent")
                    or f"command status: {result['Status']}"
                )
            console.print(f"[red]❌ Repair failed: {detail}[/red]")
            console.print(
                "\n[yellow]Try running 'dh engine debug' for more information.[/yellow]"
            )

    except Exception as e:
        console.print(f"[red]❌ Failed to repair engine: {e}[/red]")