dayhoff-tools 1.9.8__py3-none-any.whl → 1.9.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3012 +0,0 @@
1
- """Engine and Studio management commands for DHT CLI."""
2
-
3
- import json
4
- import os
5
- import re
6
- import shutil
7
- import subprocess
8
- import sys
9
- import time
10
- from datetime import datetime, timedelta, timezone
11
- from pathlib import Path
12
- from typing import Any, Dict, List, Optional, Tuple
13
-
14
- import boto3
15
- import requests
16
- import typer
17
- from botocore.exceptions import ClientError, NoCredentialsError
18
- from rich import box
19
- from rich.console import Console
20
- from rich.panel import Panel
21
- from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
22
- from rich.prompt import Confirm, IntPrompt, Prompt
23
- from rich.table import Table
24
-
25
# Typer sub-applications that the main DHT CLI mounts as the
# `dh engine ...` and `dh studio ...` command groups.
engine_app = typer.Typer(help="Manage compute engines for development.")
studio_app = typer.Typer(help="Manage persistent development studios.")

# Shared Rich console for all user-facing output in this module.
console = Console()

# Approximate on-demand hourly cost (USD) for each engine type, keyed by
# the short type name used throughout the CLI; each entry's comment names
# the underlying EC2 instance type.
HOURLY_COSTS = {
    "cpu": 0.50,  # r6i.2xlarge
    "cpumax": 2.02,  # r7i.8xlarge
    "t4": 0.75,  # g4dn.2xlarge
    "a10g": 1.50,  # g5.2xlarge
    "a100": 21.96,  # p4d.24xlarge
    "4_t4": 3.91,  # g4dn.12xlarge
    "8_t4": 7.83,  # g4dn.metal
    "4_a10g": 6.24,  # g5.12xlarge
    "8_a10g": 16.29,  # g5.48xlarge
}

# Marker appended to every Host line this tool writes into ~/.ssh/config,
# so previously-managed stanzas can be found and replaced on update.
SSH_MANAGED_COMMENT = "# Managed by dh engine"
46
-
47
- # --------------------------------------------------------------------------------
48
- # Bootstrap stage helpers
49
- # --------------------------------------------------------------------------------
50
-
51
-
52
- def _colour_stage(stage: str) -> str:
53
- """Return colourised stage name for table output."""
54
- if not stage:
55
- return "[dim]-[/dim]"
56
- low = stage.lower()
57
- if low.startswith("error"):
58
- return f"[red]{stage}[/red]"
59
- if low == "finished":
60
- return f"[green]{stage}[/green]"
61
- return f"[yellow]{stage}[/yellow]"
62
-
63
-
64
def _fetch_init_stages(instance_ids: List[str]) -> Dict[str, str]:
    """Fetch the DayhoffInitStage tag for many instances in one batched call.

    Returns a mapping of instance-id -> stage value.  Instances without
    the tag (or with an empty value) are omitted.  Any AWS failure is
    swallowed, since this lookup is best-effort.
    """
    if not instance_ids:
        return {}
    stages: Dict[str, str] = {}
    ec2 = boto3.client("ec2", region_name="us-east-1")
    try:
        pages = ec2.get_paginator("describe_instances").paginate(
            InstanceIds=instance_ids
        )
        for page in pages:
            for reservation in page["Reservations"]:
                for instance in reservation["Instances"]:
                    for tag in instance.get("Tags", []):
                        if tag["Key"] == "DayhoffInitStage":
                            # Only record non-empty stage values.
                            if tag["Value"]:
                                stages[instance["InstanceId"]] = tag["Value"]
                            break
    except Exception:
        pass  # best-effort
    return stages
89
-
90
-
91
def check_aws_sso() -> str:
    """Check AWS SSO status and return the caller's username.

    Resolves identity via STS.  For SSO sessions the username is the last
    path segment of the assumed-role ARN; for other auth methods the
    UserId suffix is used.  When credentials are missing or expired,
    offers to run `aws sso login` and, on success, re-checks recursively.

    Returns:
        The resolved username.

    Raises:
        typer.Exit: If the user declines to log in or the login fails.
    """
    try:
        sts = boto3.client("sts")
        identity = sts.get_caller_identity()
        # Parse username from assumed role ARN
        # Format: arn:aws:sts::123456789012:assumed-role/AWSReservedSSO_DeveloperAccess_xxxx/username
        arn = identity["Arn"]
        if "assumed-role" in arn:
            username = arn.split("/")[-1]
            return username
        else:
            # Fallback for other auth methods
            return identity["UserId"].split(":")[-1]
    except (NoCredentialsError, ClientError):
        console.print("[red]❌ Not logged in to AWS SSO[/red]")
        console.print("Please run: [cyan]aws sso login[/cyan]")
        if Confirm.ask("Would you like to login now?"):
            try:
                result = subprocess.run(
                    ["aws", "sso", "login"],
                    capture_output=True,
                    text=True,
                    check=True,
                )
                # check=True means a non-zero exit already raised, so this
                # branch runs only on success.
                if result.returncode == 0:
                    console.print("[green]✓ Successfully logged in![/green]")
                    # Re-resolve identity now that fresh credentials exist.
                    return check_aws_sso()
            except subprocess.CalledProcessError as e:
                console.print(f"[red]Login failed: {e}[/red]")
        # Reached when login was declined or failed.
        raise typer.Exit(1)
122
-
123
-
124
def get_api_url() -> str:
    """Look up the Studio Manager API base URL in SSM Parameter Store.

    Returns:
        The URL stored under /dev/studio-manager/api-url.

    Raises:
        typer.Exit: If the parameter is missing or any other SSM error occurs.
    """
    ssm = boto3.client("ssm", region_name="us-east-1")
    try:
        param = ssm.get_parameter(Name="/dev/studio-manager/api-url")
    except ClientError as e:
        error_code = e.response["Error"]["Code"]
        if error_code == "ParameterNotFound":
            console.print(
                "[red]❌ API URL parameter not found in SSM Parameter Store[/red]"
            )
            console.print(
                "Please ensure the Studio Manager infrastructure is deployed."
            )
        else:
            console.print(f"[red]❌ Error retrieving API URL: {e}[/red]")
        raise typer.Exit(1)
    return param["Parameter"]["Value"]
141
-
142
-
143
def make_api_request(
    method: str,
    endpoint: str,
    json_data: Optional[Dict] = None,
    params: Optional[Dict] = None,
) -> requests.Response:
    """Issue an HTTP request against the Studio Manager API.

    Args:
        method: One of "GET", "POST", or "DELETE".
        endpoint: Path appended to the API base URL (e.g. "/engines").
        json_data: JSON body, used for POST requests.
        params: Query parameters, used for GET requests.

    Returns:
        The raw requests.Response (status handling is left to callers).

    Raises:
        ValueError: If an unsupported HTTP method is given.
        typer.Exit: If the request fails at the transport level.
    """
    url = f"{get_api_url()}{endpoint}"

    try:
        if method == "GET":
            return requests.get(url, params=params)
        if method == "POST":
            return requests.post(url, json=json_data)
        if method == "DELETE":
            return requests.delete(url)
        # Not a transport failure, so this propagates past the handler below.
        raise ValueError(f"Unsupported HTTP method: {method}")
    except requests.exceptions.RequestException as e:
        console.print(f"[red]❌ API request failed: {e}[/red]")
        raise typer.Exit(1)
167
-
168
-
169
def format_duration(duration: timedelta) -> str:
    """Render a timedelta as a compact human string, e.g. "3h 7m" or "42m"."""
    total_minutes, _ = divmod(int(duration.total_seconds()), 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours}h {minutes}m" if hours > 0 else f"{minutes}m"
179
-
180
-
181
def get_disk_usage_via_ssm(instance_id: str) -> Optional[str]:
    """Get root-disk usage for an engine via SSM.

    Runs `df` on the instance through the AWS-RunShellScript document and
    parses the root filesystem line into a "used/total GB" string.

    Args:
        instance_id: EC2 instance id of the engine.

    Returns:
        String like "17/50 GB", or None if the command failed, timed out,
        or produced no output.  All errors are swallowed (best-effort).
    """
    try:
        ssm = boto3.client("ssm", region_name="us-east-1")

        # Run df command to get disk usage
        response = ssm.send_command(
            InstanceIds=[instance_id],
            DocumentName="AWS-RunShellScript",
            Parameters={
                "commands": [
                    # Get root filesystem usage in GB
                    'df -BG / | tail -1 | awk \'{gsub(/G/, "", $2); gsub(/G/, "", $3); print $3 "/" $2 " GB"}\''
                ],
                "executionTimeout": ["10"],
            },
        )

        command_id = response["Command"]["CommandId"]

        # Poll up to ~5 s for the command to reach a terminal state.
        # Fix: Cancelled/TimedOut are also terminal; previously only
        # Success/Failed stopped the loop, so those states always burned
        # the full 5 seconds.
        result = None
        for _ in range(5):
            time.sleep(1)
            result = ssm.get_command_invocation(
                CommandId=command_id,
                InstanceId=instance_id,
            )
            if result["Status"] in ("Success", "Failed", "Cancelled", "TimedOut"):
                break

        if result is not None and result["Status"] == "Success":
            output = result["StandardOutputContent"].strip()
            return output if output else None

        return None

    except Exception:
        # Best-effort: disk usage is cosmetic, never let it break the CLI.
        return None
224
-
225
-
226
def get_studio_disk_usage_via_ssm(instance_id: str, username: str) -> Optional[str]:
    """Get disk usage of a user's studio filesystem via SSM.

    Args:
        instance_id: EC2 instance id the studio is attached to.
        username: Studio owner; the studio is mounted at /studios/<username>.

    Returns:
        String like "333/500 GB", or None if the command failed, timed out,
        or produced no output.  All errors are swallowed (best-effort).
    """
    try:
        ssm = boto3.client("ssm", region_name="us-east-1")

        # Run df command to get studio disk usage
        response = ssm.send_command(
            InstanceIds=[instance_id],
            DocumentName="AWS-RunShellScript",
            Parameters={
                "commands": [
                    # Get studio filesystem usage in GB
                    f'df -BG /studios/{username} 2>/dev/null | tail -1 | awk \'{{gsub(/G/, "", $2); gsub(/G/, "", $3); print $3 "/" $2 " GB"}}\''
                ],
                "executionTimeout": ["10"],
            },
        )

        command_id = response["Command"]["CommandId"]

        # Poll up to ~5 s for the command to reach a terminal state.
        # Fix: Cancelled/TimedOut are also terminal; previously only
        # Success/Failed stopped the loop, so those states always burned
        # the full 5 seconds.
        result = None
        for _ in range(5):
            time.sleep(1)
            result = ssm.get_command_invocation(
                CommandId=command_id,
                InstanceId=instance_id,
            )
            if result["Status"] in ("Success", "Failed", "Cancelled", "TimedOut"):
                break

        if result is not None and result["Status"] == "Success":
            output = result["StandardOutputContent"].strip()
            return output if output else None

        return None

    except Exception:
        return None
268
-
269
-
270
def parse_launch_time(launch_time_str: str) -> datetime:
    """Parse an API-supplied launch time into a datetime.

    Tries ISO-8601 first (normalising a trailing "Z" to "+00:00"), then a
    list of known strptime formats; naive strptime results are assumed to
    be UTC.  If nothing matches, falls back to "now" (UTC) so callers
    never crash on an unexpected timestamp format.
    """
    # Preferred path: fromisoformat handles offsets cleanly.
    try:
        return datetime.fromisoformat(launch_time_str.replace("Z", "+00:00"))
    except (ValueError, AttributeError):
        pass

    # Fallback: explicit formats seen in API responses.
    known_formats = (
        "%Y-%m-%dT%H:%M:%S.%fZ",
        "%Y-%m-%dT%H:%M:%SZ",
        "%Y-%m-%dT%H:%M:%S%z",  # ISO format with timezone
        "%Y-%m-%dT%H:%M:%S+00:00",  # Explicit UTC offset
        "%Y-%m-%d %H:%M:%S",
    )
    for fmt in known_formats:
        try:
            parsed = datetime.strptime(launch_time_str, fmt)
        except ValueError:
            continue
        # Treat naive timestamps as UTC.
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    # Last resort: assume the engine just launched.
    return datetime.now(timezone.utc)
301
-
302
-
303
def format_status(state: str, ready: Optional[bool]) -> str:
    """Format an engine state (plus bootstrap readiness) with Rich markup.

    The ``ready`` flag only matters for running engines: True shows a
    check mark, False shows a bootstrapping warning, None omits the hint.
    Unknown states are returned unchanged.
    """
    normalized = state.lower()
    if normalized == "running":
        if ready is True:
            return "[green]Running ✓[/green]"
        if ready is False:
            return "[yellow]Running ⚠ (Bootstrapping...)[/yellow]"
        return "[green]Running[/green]"
    static_states = {
        "stopped": "[dim]Stopped[/dim]",
        "stopping": "[yellow]Stopping...[/yellow]",
        "pending": "[yellow]Starting...[/yellow]",
    }
    return static_states.get(normalized, state)
320
-
321
-
322
def resolve_engine(name_or_id: str, engines: List[Dict]) -> Dict:
    """Resolve an engine by name or instance id, prompting when ambiguous.

    Resolution order: exact instance-id match, unique exact name match,
    then prefix match on either field.  Zero prefix matches aborts the
    CLI; multiple matches trigger an interactive numbered prompt.

    Raises:
        typer.Exit: If nothing matches ``name_or_id``.
    """
    # Exact instance-id match wins outright.
    for engine in engines:
        if engine["instance_id"] == name_or_id:
            return engine

    # A unique exact name match is next.
    named = [e for e in engines if e["name"] == name_or_id]
    if len(named) == 1:
        return named[0]

    # Otherwise fall back to prefix matching on name or instance id.
    matches = [
        e
        for e in engines
        if e["name"].startswith(name_or_id) or e["instance_id"].startswith(name_or_id)
    ]

    if not matches:
        console.print(f"[red]❌ No engine found matching '{name_or_id}'[/red]")
        raise typer.Exit(1)
    if len(matches) == 1:
        return matches[0]

    # Ambiguous: let the user pick from a numbered list.
    console.print(f"Multiple engines match '{name_or_id}':")
    for index, engine in enumerate(matches, 1):
        cost = HOURLY_COSTS.get(engine["engine_type"], 0)
        console.print(
            f" {index}. [cyan]{engine['name']}[/cyan] ({engine['instance_id']}) "
            f"- {engine['engine_type']} - {engine['state']} - ${cost:.2f}/hr"
        )

    while True:
        try:
            choice = IntPrompt.ask(
                "Select engine",
                default=1,
                choices=[str(i) for i in range(1, len(matches) + 1)],
            )
            return matches[choice - 1]
        except (ValueError, IndexError):
            console.print("[red]Invalid selection, please try again[/red]")
366
-
367
-
368
def get_ssh_public_key() -> str:
    """Get the user's SSH public key.

    Discovery order (container-friendly):
    1) DHT_SSH_PUBLIC_KEY env var (direct key content)
    2) DHT_SSH_PUBLIC_KEY_PATH env var (path to a .pub file)
    3) ssh-agent via `ssh-add -L` (requires SSH_AUTH_SOCK)
    4) Conventional files: ~/.ssh/id_ed25519.pub, ~/.ssh/id_rsa.pub

    Raises:
        FileNotFoundError: If no public key can be discovered.
    """
    # 1) Key content provided directly in the environment.
    direct = os.environ.get("DHT_SSH_PUBLIC_KEY")
    if direct and direct.strip():
        return direct.strip()

    # 2) Path to a .pub file provided in the environment.
    path_var = os.environ.get("DHT_SSH_PUBLIC_KEY_PATH")
    if path_var:
        pub_path = Path(path_var).expanduser()
        if pub_path.is_file():
            try:
                return pub_path.read_text().strip()
            except Exception:
                pass

    # 3) Keys loaded into a running ssh-agent.
    try:
        if shutil.which("ssh-add") is not None:
            proc = subprocess.run(["ssh-add", "-L"], capture_output=True, text=True)
            if proc.returncode == 0 and proc.stdout:
                agent_keys = [
                    line.strip() for line in proc.stdout.splitlines() if line.strip()
                ]
                # Prefer ed25519, then rsa, then ecdsa.
                for key_type in ("ssh-ed25519", "ssh-rsa", "ecdsa-sha2-nistp256"):
                    for key in agent_keys:
                        if key.startswith(key_type + " "):
                            return key
                # Fallback to the first key if no preferred type matched.
                if agent_keys:
                    return agent_keys[0]
    except Exception:
        pass

    # 4) Conventional key files in ~/.ssh.
    ssh_dir = Path.home() / ".ssh"
    for candidate in (ssh_dir / "id_ed25519.pub", ssh_dir / "id_rsa.pub"):
        if candidate.is_file():
            try:
                return candidate.read_text().strip()
            except Exception:
                continue

    raise FileNotFoundError(
        "No SSH public key found. Please create one with 'ssh-keygen' first."
    )
427
-
428
-
429
def check_session_manager_plugin():
    """Return True if the AWS Session Manager Plugin is installed; warn otherwise.

    SSH to engines is tunnelled through SSM sessions, so the plugin is
    required for any interactive connection.
    """
    if shutil.which("session-manager-plugin") is not None:
        return True
    console.print(
        "[bold red]⚠️ AWS Session Manager Plugin not found![/bold red]\n"
        "SSH connections to engines require the Session Manager Plugin.\n"
        "Please install it following the setup guide:\n"
        "[link]https://github.com/dayhofflabs/nutshell/blob/main/REFERENCE/setup_guides/new-laptop.md[/link]"
    )
    return False
440
-
441
-
442
def update_ssh_config_entry(
    engine_name: str, instance_id: str, ssh_user: str, idle_timeout: int = 600
):
    """Add or update a single SSH config entry for the given SSH user.

    Rewrites ~/.ssh/config in place: any previous stanza for this engine
    (identified by the SSH_MANAGED_COMMENT marker on its Host line) is
    removed, then a fresh stanza is appended that tunnels SSH through an
    SSM session.

    Args:
        engine_name: Host alias to write into ~/.ssh/config
        instance_id: EC2 instance-id (used by the proxy command)
        ssh_user: Username to place into the SSH stanza
        idle_timeout: Idle timeout **in seconds** to pass to the SSM port-forward. 600 = 10 min.
    """
    config_path = Path.home() / ".ssh" / "config"
    config_path.parent.mkdir(mode=0o700, exist_ok=True)

    # Touch the file if it doesn't exist
    if not config_path.exists():
        config_path.touch(mode=0o600)

    # Read existing config
    content = config_path.read_text()
    lines = content.splitlines() if content else []

    # Remove any existing entry for this engine: once our managed Host
    # line is seen, drop lines until the next "Host " line begins.
    new_lines = []
    skip_until_next_host = False
    for line in lines:
        # Check if this is our managed host
        if (
            line.strip().startswith(f"Host {engine_name}")
            and SSH_MANAGED_COMMENT in line
        ):
            skip_until_next_host = True
        elif line.strip().startswith("Host ") and skip_until_next_host:
            skip_until_next_host = False
            # This is a different host entry, keep it
            new_lines.append(line)
        elif not skip_until_next_host:
            new_lines.append(line)

    # Add the new entry
    if new_lines and new_lines[-1].strip():  # Add blank line if needed
        new_lines.append("")

    # HostName is the instance-id: the ProxyCommand's %h expands to it so
    # the SSM session targets the right instance.
    new_lines.extend(
        [
            f"Host {engine_name} {SSH_MANAGED_COMMENT}",
            f" HostName {instance_id}",
            f" User {ssh_user}",
            f" ProxyCommand sh -c \"AWS_SSM_IDLE_TIMEOUT={idle_timeout} aws ssm start-session --target %h --document-name AWS-StartSSHSession --parameters 'portNumber=%p'\"",
        ]
    )

    # Write back with owner-only permissions (SSH requires this).
    config_path.write_text("\n".join(new_lines))
    config_path.chmod(0o600)
497
-
498
-
499
- # ==================== ENGINE COMMANDS ====================
500
-
501
-
502
@engine_app.command("launch")
def launch_engine(
    name: str = typer.Argument(help="Name for the new engine"),
    engine_type: str = typer.Option(
        "cpu",
        "--type",
        "-t",
        help="Engine type: cpu, cpumax, t4, a10g, a100, 4_t4, 8_t4, 4_a10g, 8_a10g",
    ),
    user: Optional[str] = typer.Option(None, "--user", "-u", help="Override username"),
    boot_disk_size: Optional[int] = typer.Option(
        None,
        "--size",
        "-s",
        help="Boot disk size in GB (default: 50GB, min: 20GB, max: 1000GB)",
    ),
    availability_zone: Optional[str] = typer.Option(
        None,
        "--az",
        help="Prefer a specific Availability Zone (e.g., us-east-1b). If omitted the service will try all public subnets.",
    ),
):
    """Launch a new engine instance.

    Validates the engine type and optional boot-disk size locally, then
    POSTs to the Studio Manager API to create the instance.  The AWS SSO
    username is the engine owner unless --user overrides it.
    """
    username = check_aws_sso()
    if user:
        username = user

    # Validate engine type
    valid_types = [
        "cpu",
        "cpumax",
        "t4",
        "a10g",
        "a100",
        "4_t4",
        "8_t4",
        "4_a10g",
        "8_a10g",
    ]
    if engine_type not in valid_types:
        console.print(f"[red]❌ Invalid engine type: {engine_type}[/red]")
        console.print(f"Valid types: {', '.join(valid_types)}")
        raise typer.Exit(1)

    # Validate boot disk size (service enforces the same 20-1000 GB range)
    if boot_disk_size is not None:
        if boot_disk_size < 20:
            console.print("[red]❌ Boot disk size must be at least 20GB[/red]")
            raise typer.Exit(1)
        if boot_disk_size > 1000:
            console.print("[red]❌ Boot disk size cannot exceed 1000GB[/red]")
            raise typer.Exit(1)

    cost = HOURLY_COSTS.get(engine_type, 0)
    disk_info = f" with {boot_disk_size}GB boot disk" if boot_disk_size else ""
    console.print(
        f"Launching [cyan]{name}[/cyan] ({engine_type}){disk_info} for ${cost:.2f}/hour..."
    )

    # Transient spinner while the API call is in flight.
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        transient=True,
    ) as progress:
        progress.add_task("Creating engine...", total=None)

        request_data: Dict[str, Any] = {
            "name": name,
            "user": username,
            "engine_type": engine_type,
        }
        # Optional fields are only sent when explicitly provided.
        if boot_disk_size is not None:
            request_data["boot_disk_size"] = boot_disk_size
        if availability_zone:
            request_data["availability_zone"] = availability_zone

        response = make_api_request("POST", "/engines", json_data=request_data)

    # 201 Created is the API's success status for engine creation.
    if response.status_code == 201:
        data = response.json()
        console.print(f"[green]✓ Engine launched successfully![/green]")
        console.print(f"Instance ID: [cyan]{data['instance_id']}[/cyan]")
        console.print(f"Type: {data['instance_type']} (${cost:.2f}/hour)")
        if boot_disk_size:
            console.print(f"Boot disk: {boot_disk_size}GB")
        console.print("\nThe engine is initializing. This may take a few minutes.")
        console.print(f"Check status with: [cyan]dh engine status {name}[/cyan]")
    else:
        error = response.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to launch engine: {error}[/red]")
592
-
593
-
594
@engine_app.command("list")
def list_engines(
    user: Optional[str] = typer.Option(None, "--user", "-u", help="Filter by user"),
    running_only: bool = typer.Option(
        False, "--running", help="Show only running engines"
    ),
    stopped_only: bool = typer.Option(
        False, "--stopped", help="Show only stopped engines"
    ),
    detailed: bool = typer.Option(
        False, "--detailed", "-d", help="Show detailed status (slower)"
    ),
):
    """List engines (shows all engines by default).

    Renders a Rich table of engines with optional user/state filters.
    With --detailed, the API is also asked for readiness and disk usage
    is fetched over SSM (one call per running engine, hence slower).
    """
    # Validate SSO login; the username itself is not needed here.
    check_aws_sso()

    params = {}
    if user:
        params["user"] = user
    if detailed:
        params["check_ready"] = "true"

    response = make_api_request("GET", "/engines", params=params)

    if response.status_code == 200:
        data = response.json()
        engines = data.get("engines", [])

        # Filter by state if requested
        if running_only:
            engines = [e for e in engines if e["state"].lower() == "running"]
        elif stopped_only:
            engines = [e for e in engines if e["state"].lower() == "stopped"]

        if not engines:
            console.print("No engines found.")
            return

        # Fix: a previous version fetched DayhoffInitStage tags here into
        # a stages_map that was never read - a wasted EC2 API round-trip.

        # Create table
        table = Table(title="Engines", box=box.ROUNDED)
        table.add_column("Name", style="cyan")
        table.add_column("Instance ID", style="dim")
        table.add_column("Type")
        table.add_column("User")
        table.add_column("Status")
        if detailed:
            table.add_column("Disk Usage")
        table.add_column("Uptime/Since")
        table.add_column("$/hour", justify="right")

        for engine in engines:
            launch_time = parse_launch_time(engine["launch_time"])
            uptime = datetime.now(timezone.utc) - launch_time
            hourly_cost = HOURLY_COSTS.get(engine["engine_type"], 0)

            if engine["state"].lower() == "running":
                time_str = format_duration(uptime)
                # Only get disk usage if detailed mode (one SSM call each)
                if detailed:
                    disk_usage = get_disk_usage_via_ssm(engine["instance_id"]) or "-"
                else:
                    disk_usage = None
            else:
                # Stopped engines show their launch timestamp instead.
                time_str = launch_time.strftime("%Y-%m-%d %H:%M")
                disk_usage = "-" if detailed else None

            row_data = [
                engine["name"],
                engine["instance_id"],
                engine["engine_type"],
                engine["user"],
                format_status(engine["state"], engine.get("ready")),
            ]
            if detailed:
                row_data.append(disk_usage)
            row_data.extend(
                [
                    time_str,
                    f"${hourly_cost:.2f}",
                ]
            )

            table.add_row(*row_data)

        console.print(table)
        if not detailed and any(e["state"].lower() == "running" for e in engines):
            console.print(
                "\n[dim]Tip: Use --detailed to see disk usage and bootstrap status (slower)[/dim]"
            )
    else:
        error = response.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to list engines: {error}[/red]")
691
-
692
-
693
- @engine_app.command("status")
694
- def engine_status(
695
- name_or_id: str = typer.Argument(help="Engine name or instance ID"),
696
- detailed: bool = typer.Option(False, "--detailed", "-d", help="Show detailed status (slower)"),
697
- show_log: bool = typer.Option(False, "--show-log", help="Show bootstrap log (requires --detailed)"),
698
- ):
699
- """Show engine status and information."""
700
- check_aws_sso()
701
-
702
- # Get all engines to resolve name
703
- response = make_api_request("GET", "/engines")
704
- if response.status_code != 200:
705
- console.print("[red]❌ Failed to fetch engines[/red]")
706
- raise typer.Exit(1)
707
-
708
- engines = response.json().get("engines", [])
709
- engine = resolve_engine(name_or_id, engines)
710
-
711
- # Fast status display (default)
712
- if not detailed:
713
- # Skip the API call for studios - use basic info we already have
714
- attached_studios = []
715
- studio_user = engine.get("user") # Use the engine's user as studio owner
716
-
717
- # Fetch idle status via SSM with longer timeout
718
- ssm = boto3.client("ssm", region_name="us-east-1")
719
- idle_data = None # Use None to indicate no data received
720
-
721
- if engine["state"].lower() == "running":
722
- try:
723
- resp = ssm.send_command(
724
- InstanceIds=[engine["instance_id"]],
725
- DocumentName="AWS-RunShellScript",
726
- Parameters={
727
- "commands": [
728
- "cat /var/run/idle-detector/last_state.json 2>/dev/null || echo '{}'"
729
- ],
730
- "executionTimeout": ["10"],
731
- },
732
- )
733
- cid = resp["Command"]["CommandId"]
734
-
735
- # Wait up to 3 seconds for result
736
- for _ in range(6): # 6 * 0.5 = 3 seconds
737
- time.sleep(0.5)
738
- inv = ssm.get_command_invocation(
739
- CommandId=cid, InstanceId=engine["instance_id"]
740
- )
741
- if inv["Status"] in ["Success", "Failed"]:
742
- break
743
-
744
- if inv["Status"] == "Success":
745
- content = inv["StandardOutputContent"].strip()
746
- if content and content != "{}":
747
- idle_data = json.loads(content)
748
- else:
749
- idle_data = {} # Empty response but SSM worked
750
- except Exception:
751
- idle_data = None # SSM failed
752
-
753
- # Determine running state display
754
- running_state = engine["state"].lower()
755
- if running_state == "running":
756
- run_disp = "[green]Running[/green]"
757
- elif running_state == "pending":
758
- run_disp = "[yellow]Starting...[/yellow]"
759
- elif running_state == "stopping":
760
- run_disp = "[yellow]Stopping...[/yellow]"
761
- elif running_state == "stopped":
762
- run_disp = "[dim]Stopped[/dim]"
763
- else:
764
- run_disp = engine["state"].capitalize()
765
-
766
- # Determine idle/active status
767
- idle_disp = ""
768
- if running_state == "running":
769
- if idle_data is None:
770
- # SSM failed - we don't know the status
771
- idle_disp = " [dim]N/A[/dim]"
772
- elif not idle_data:
773
- # Empty data - likely very early in boot
774
- idle_disp = " [dim]N/A[/dim]"
775
- else:
776
- # We have data
777
- is_idle = idle_data.get("idle", False)
778
- timeout_sec = idle_data.get("timeout_sec")
779
- idle_seconds = idle_data.get("idle_seconds", 0) if is_idle else 0
780
-
781
- if is_idle:
782
- if isinstance(timeout_sec, int) and isinstance(idle_seconds, int):
783
- remaining = max(0, timeout_sec - idle_seconds)
784
- remaining_mins = remaining // 60
785
- if remaining_mins == 0:
786
- idle_disp = f" [yellow]Idle {idle_seconds//60}m/{timeout_sec//60}m: [red]<1m[/red] left[/yellow]"
787
- else:
788
- idle_disp = f" [yellow]Idle {idle_seconds//60}m/{timeout_sec//60}m: [red]{remaining_mins}m[/red] left[/yellow]"
789
- else:
790
- idle_disp = " [yellow]Idle ?/?[/yellow]"
791
- else:
792
- # Actively not idle
793
- idle_disp = " [green]Active[/green]"
794
-
795
- # Build status lines - minimal info for fast view
796
- status_lines = [
797
- f"[blue]{engine['name']}[/blue] {run_disp}{idle_disp}\n",
798
- ]
799
-
800
- # Add activity sensors if we have idle data
801
- if idle_data and idle_data.get("reasons"):
802
- status_lines.append("") # blank line before sensors
803
-
804
- sensor_map = {
805
- "CoffeeLockSensor": ("☕", "Coffee"),
806
- "ActiveLoginSensor": ("🐚", "SSH"),
807
- "IDEConnectionSensor": ("🖥 ", "IDE"),
808
- "DockerWorkloadSensor": ("🐳", "Docker"),
809
- }
810
-
811
- for r in idle_data.get("reasons", []):
812
- sensor = r.get("sensor", "Unknown")
813
- active = r.get("active", False)
814
- icon, label = sensor_map.get(sensor, ("?", sensor))
815
- status_str = "[green]YES[/green]" if active else "[dim]nope[/dim]"
816
- status_lines.append(f" {icon} {label:6} {status_str}")
817
-
818
- # Display in a nice panel
819
- console.print(
820
- Panel("\n".join(status_lines), title="Engine Status", border_style="blue")
821
- )
822
- return # Exit early for fast status
823
-
824
- # Get detailed engine status including idle detector info (for --detailed mode)
825
- response = make_api_request("GET", f"/engines/{engine['instance_id']}")
826
- if response.status_code != 200:
827
- console.print("[red]❌ Failed to fetch engine details[/red]")
828
- raise typer.Exit(1)
829
-
830
- engine_details = response.json()
831
- engine = engine_details.get("engine", engine) # Use detailed info if available
832
- idle_detector = engine_details.get("idle_detector", {}) or {}
833
- attached_studios = engine_details.get("attached_studios", [])
834
-
835
- # Calculate costs
836
- launch_time = parse_launch_time(engine["launch_time"])
837
- uptime = datetime.now(timezone.utc) - launch_time
838
- hourly_cost = HOURLY_COSTS.get(engine["engine_type"], 0)
839
- # total_cost intentionally not shown in status view
840
-
841
- stages_map = _fetch_init_stages([engine["instance_id"]])
842
- stage_val = stages_map.get(engine["instance_id"], "-")
843
-
844
- # Try to fetch actual boot time via SSM (best-effort)
845
- boot_time_str: Optional[str] = None
846
- try:
847
- if engine["state"].lower() == "running":
848
- ssm = boto3.client("ssm", region_name="us-east-1")
849
- resp = ssm.send_command(
850
- InstanceIds=[engine["instance_id"]],
851
- DocumentName="AWS-RunShellScript",
852
- Parameters={
853
- "commands": ["uptime -s || who -b | awk '{print $3\" \"$4}'"]
854
- },
855
- )
856
- cid = resp["Command"]["CommandId"]
857
- time.sleep(1)
858
- inv = ssm.get_command_invocation(
859
- CommandId=cid, InstanceId=engine["instance_id"]
860
- )
861
- if inv.get("Status") == "Success":
862
- boot_time_str = (
863
- (inv.get("StandardOutputContent") or "").strip().splitlines()[0]
864
- if inv.get("StandardOutputContent")
865
- else None
866
- )
867
- except Exception:
868
- boot_time_str = None
869
-
870
- started_line = (
871
- f"[bold]Started:[/bold] {boot_time_str} ({format_duration(uptime)} ago)"
872
- if boot_time_str
873
- else f"[bold]Started:[/bold] {launch_time.strftime('%Y-%m-%d %H:%M:%S')} ({format_duration(uptime)} ago)"
874
- )
875
-
876
- # ---------------- Front-loaded summary ----------------
877
- running_state = engine["state"].lower()
878
- if running_state == "running":
879
- run_disp = "[green]Running[/green]"
880
- elif running_state == "pending":
881
- run_disp = "[yellow]Starting...[/yellow]"
882
- elif running_state == "stopping":
883
- run_disp = "[yellow]Stopping...[/yellow]"
884
- elif running_state == "stopped":
885
- run_disp = "[dim]Stopped[/dim]"
886
- else:
887
- run_disp = engine["state"].capitalize()
888
-
889
- # Compose Active/Idle header with extra detail when idle
890
- def _compute_active_disp(idle_info: Dict[str, Any]) -> str:
891
- if idle_info.get("status") == "active":
892
- return "[green]Active[/green]"
893
- if running_state in ("stopped", "stopping"):
894
- return "[dim]N/A[/dim]"
895
-
896
- # If we don't have idle info at all, show N/A
897
- if not idle_info.get("available"):
898
- return "[dim]N/A[/dim]"
899
-
900
- # If idle, show time/threshold with time remaining if available
901
- if idle_info.get("status") == "idle":
902
- idle_seconds_v = idle_info.get("idle_seconds")
903
- thresh_v = idle_info.get("idle_threshold")
904
- if isinstance(idle_seconds_v, (int, float)) and isinstance(thresh_v, (int, float)):
905
- remaining = max(0, int(thresh_v) - int(idle_seconds_v))
906
- remaining_mins = remaining // 60
907
- if remaining_mins == 0:
908
- return f"[yellow]Idle {int(idle_seconds_v)//60}m/{int(thresh_v)//60}m: [red]<1m[/red] left[/yellow]"
909
- else:
910
- return f"[yellow]Idle {int(idle_seconds_v)//60}m/{int(thresh_v)//60}m: [red]{remaining_mins}m[/red] left[/yellow]"
911
- elif isinstance(thresh_v, (int, float)):
912
- return f"[yellow]Idle ?/{int(thresh_v)//60}m[/yellow]"
913
- else:
914
- return "[yellow]Idle ?/?[/yellow]"
915
-
916
- # Default to N/A if we can't determine status
917
- return "[dim]N/A[/dim]"
918
-
919
- active_disp = _compute_active_disp(idle_detector)
920
-
921
- top_lines = [
922
- f"[blue]{engine['name']}[/blue] {run_disp} {active_disp}\n",
923
- ]
924
-
925
- # Studios summary next, with studio name in purple/magenta
926
- studios_line = None
927
- if attached_studios:
928
- stu_texts = [
929
- f"[magenta]{s.get('user', 'studio')}[/magenta] ({s.get('studio_id', 'unknown')})"
930
- for s in attached_studios
931
- ]
932
- studios_line = "Studios: " + ", ".join(stu_texts)
933
- top_lines.append(studios_line)
934
-
935
- # Paragraph break
936
- top_lines.append("")
937
-
938
- # ---------------- Details block (white/default) ----------------
939
- status_lines = [
940
- f"Name: {engine['name']}",
941
- f"Instance: {engine['instance_id']}",
942
- f"Type: {engine['engine_type']} ({engine['instance_type']})",
943
- f"Status: {engine['state']}",
944
- f"User: {engine['user']}",
945
- f"IP: {engine.get('public_ip', 'N/A')}",
946
- started_line,
947
- f"$/hour: ${hourly_cost:.2f}",
948
- ]
949
-
950
- # Disk usage (like list --detailed)
951
- if engine["state"].lower() == "running":
952
- disk_usage = get_disk_usage_via_ssm(engine["instance_id"]) or "-"
953
- status_lines.append(f"Disk: {disk_usage}")
954
-
955
- # Idle timeout (show even when not idle)
956
- idle_threshold_secs: Optional[int] = None
957
- # Prefer value from idle detector overlay if present
958
- try:
959
- if isinstance(idle_detector.get("idle_threshold"), (int, float)):
960
- idle_threshold_secs = int(idle_detector["idle_threshold"])
961
- except Exception:
962
- idle_threshold_secs = None
963
-
964
- if idle_threshold_secs is None and engine["state"].lower() == "running":
965
- # Fallback: read /etc/engine.env via SSM
966
- try:
967
- ssm = boto3.client("ssm", region_name="us-east-1")
968
- resp = ssm.send_command(
969
- InstanceIds=[engine["instance_id"]],
970
- DocumentName="AWS-RunShellScript",
971
- Parameters={
972
- "commands": [
973
- "grep -E '^IDLE_TIMEOUT_SECONDS=' /etc/engine.env | cut -d'=' -f2 || echo 1800",
974
- ],
975
- "executionTimeout": ["5"],
976
- },
977
- )
978
- cid = resp["Command"]["CommandId"]
979
- time.sleep(1)
980
- inv = ssm.get_command_invocation(
981
- CommandId=cid, InstanceId=engine["instance_id"]
982
- )
983
- if inv.get("Status") == "Success":
984
- out = (inv.get("StandardOutputContent") or "").strip()
985
- if out:
986
- idle_threshold_secs = int(out.splitlines()[0].strip())
987
- except Exception:
988
- idle_threshold_secs = None
989
-
990
- if idle_threshold_secs is None:
991
- idle_threshold_secs = 1800
992
-
993
- status_lines.append(
994
- f"Idle timeout: {idle_threshold_secs//60}m ({idle_threshold_secs}s)"
995
- )
996
-
997
- # Health report (only if bootstrap finished)
998
- if stage_val == "finished":
999
- try:
1000
- ssm = boto3.client("ssm", region_name="us-east-1")
1001
- res = ssm.send_command(
1002
- InstanceIds=[engine["instance_id"]],
1003
- DocumentName="AWS-RunShellScript",
1004
- Parameters={
1005
- "commands": [
1006
- "cat /opt/dayhoff/state/engine-health.json 2>/dev/null || cat /var/run/engine-health.json 2>/dev/null || true"
1007
- ],
1008
- "executionTimeout": ["10"],
1009
- },
1010
- )
1011
- cid = res["Command"]["CommandId"]
1012
- time.sleep(1)
1013
- inv = ssm.get_command_invocation(
1014
- CommandId=cid, InstanceId=engine["instance_id"]
1015
- )
1016
- if inv["Status"] == "Success":
1017
- import json as _json
1018
-
1019
- health = _json.loads(inv["StandardOutputContent"].strip() or "{}")
1020
- status_lines.append("")
1021
- status_lines.append("[bold]Health:[/bold]")
1022
- status_lines.append(
1023
- f" • GPU Drivers: {'OK' if health.get('drivers_ok') else 'MISSING'}"
1024
- )
1025
- idle_stat = health.get("idle_detector_service") or health.get(
1026
- "idle_detector_timer", "unknown"
1027
- )
1028
- status_lines.append(f" • Idle Detector: {idle_stat}")
1029
- except Exception:
1030
- pass
1031
-
1032
- # Try to enrich/fallback idle-detector details from on-engine summary file via SSM
1033
def _fetch_idle_summary_via_ssm(instance_id: str) -> Optional[Dict]:
    """Best-effort fetch of the engine's idle-detector state via SSM.

    Runs ``cat /var/run/idle-detector/last_state.json`` on the instance and
    converts that file's schema (either the new ``reasons`` list or the old
    ``forensics`` map) into the ``idle_detector`` dict shape used by the CLI
    display code. Returns None on any failure (SSM error, command timeout,
    empty or invalid JSON) so callers can treat the overlay as optional.
    """
    try:
        ssm = boto3.client("ssm", region_name="us-east-1")
        res = ssm.send_command(
            InstanceIds=[instance_id],
            DocumentName="AWS-RunShellScript",
            Parameters={
                "commands": [
                    # `|| true` keeps the command "successful" when the file
                    # is absent; that surfaces as empty stdout below.
                    "cat /var/run/idle-detector/last_state.json 2>/dev/null || true",
                ],
                "executionTimeout": ["5"],
            },
        )
        cid = res["Command"]["CommandId"]
        # Wait up to 2 seconds for SSM command to complete (was 1 second)
        for _ in range(4):  # 4 * 0.5 = 2 seconds
            time.sleep(0.5)
            inv = ssm.get_command_invocation(CommandId=cid, InstanceId=instance_id)
            if inv["Status"] in ["Success", "Failed"]:
                break
        if inv["Status"] != "Success":
            return None
        content = inv["StandardOutputContent"].strip()
        if not content:
            return None
        data = json.loads(content)
        # Convert last_state schema (new or old) to idle_detector schema used by CLI output
        idle_info: Dict[str, Any] = {"available": True}

        # Active/idle
        idle_flag = bool(data.get("idle", False))
        idle_info["status"] = "idle" if idle_flag else "active"

        # Threshold and elapsed
        if isinstance(data.get("timeout_sec"), (int, float)):
            idle_info["idle_threshold"] = int(data["timeout_sec"])  # seconds
        if isinstance(data.get("idle_seconds"), (int, float)):
            idle_info["idle_seconds"] = int(data["idle_seconds"])

        # Keep raw reasons for sensor display when available (new schema)
        if isinstance(data.get("reasons"), list):
            idle_info["_reasons_raw"] = data["reasons"]
        else:
            # Fallback: synthesize reasons from the old forensics layout
            f_all = data.get("forensics", {}) or {}
            synthesized = []

            # Build one reasons-style entry from an old-layout forensics key.
            def _mk(sensor_name: str, key: str):
                entry = f_all.get(key, {}) or {}
                synthesized.append(
                    {
                        "sensor": sensor_name,
                        "active": bool(entry.get("active", False)),
                        "reason": entry.get("reason", ""),
                        "forensic": entry.get("forensic", {}),
                    }
                )

            # Order matters: this is the order sensors are displayed in.
            _mk("CoffeeLockSensor", "coffee")
            _mk("ActiveLoginSensor", "ssh")
            _mk("IDEConnectionSensor", "ide")
            _mk("DockerWorkloadSensor", "docker")
            idle_info["_reasons_raw"] = synthesized

        # Derive details from sensors
        for r in idle_info.get("_reasons_raw", []):
            if not r.get("active"):
                continue
            sensor = (r.get("sensor") or "").lower()
            forensic = r.get("forensic") or {}
            if sensor == "ideconnectionsensor":
                # Prefer unique_pid_count written by new detector
                cnt = forensic.get("unique_pid_count")
                if not isinstance(cnt, int):
                    cnt = forensic.get("matches")
                if isinstance(cnt, int):
                    idle_info["ide_connections"] = {"connection_count": cnt}
                else:
                    # Sensor is active but gave no count; report at least one.
                    idle_info["ide_connections"] = {"connection_count": 1}
            elif sensor == "coffeelocksensor":
                rem = forensic.get("remaining_sec")
                if isinstance(rem, (int, float)) and rem > 0:
                    idle_info["coffee_lock"] = format_duration(
                        timedelta(seconds=int(rem))
                    )
            elif sensor == "activeloginsensor":
                # NOTE(review): assumes forensic keys tty/pid/idle_sec/
                # remote_addr as written by the detector — confirm schema.
                sess = {
                    "tty": forensic.get("tty", "pts/?"),
                    "pid": forensic.get("pid", "?"),
                    "idle_time": forensic.get("idle_sec", 0),
                    "from_ip": forensic.get("remote_addr", "unknown"),
                }
                idle_info.setdefault("ssh_sessions", []).append(sess)
        return idle_info
    except Exception:
        # Best-effort overlay: any failure degrades to "no extra info".
        return None
1129
-
1130
- # Always try to enrich from on-engine summary (fast, best-effort)
1131
- overlay = _fetch_idle_summary_via_ssm(engine["instance_id"])
1132
- if overlay:
1133
- # If API didn't indicate availability, replace entirely; otherwise fill gaps
1134
- if not idle_detector.get("available"):
1135
- idle_detector = overlay
1136
- else:
1137
- for k, v in overlay.items():
1138
- idle_detector.setdefault(k, v)
1139
-
1140
- # Recompute header display using enriched overlay values
1141
- try:
1142
- active_disp = _compute_active_disp(idle_detector)
1143
- top_lines[0] = f"[blue]{engine['name']}[/blue] {run_disp} {active_disp}\n"
1144
- except Exception:
1145
- pass
1146
-
1147
- # Activity Sensors (show all with YES/no)
1148
- if idle_detector.get("available"):
1149
- status_lines.append("")
1150
- status_lines.append("[bold]Activity Sensors:[/bold]")
1151
- reasons_raw = idle_detector.get("_reasons_raw", []) or []
1152
- by_sensor: Dict[str, Dict[str, Any]] = {}
1153
- for r in reasons_raw:
1154
- nm = r.get("sensor")
1155
- if nm:
1156
- by_sensor[nm] = r
1157
-
1158
- def _sensor_line(label: str, key: str, emoji: str) -> str:
1159
- r = by_sensor.get(key, {})
1160
- active = bool(r.get("active"))
1161
- reason_txt = r.get("reason") or ("" if not active else "active")
1162
- flag = "[green]YES[/green]" if active else "[dim]nope[/dim]"
1163
- return (
1164
- f" {emoji} {label}: {flag} {('- ' + reason_txt) if reason_txt else ''}"
1165
- )
1166
-
1167
- status_lines.append(_sensor_line("Coffee", "CoffeeLockSensor", "☕"))
1168
- status_lines.append(_sensor_line("Shell ", "ActiveLoginSensor", "🐚"))
1169
- status_lines.append(_sensor_line(" IDE ", "IDEConnectionSensor", "🖥"))
1170
- status_lines.append(_sensor_line("Docker", "DockerWorkloadSensor", "🐳"))
1171
-
1172
- # Recompute display with latest idle detector data
1173
- active_disp = _compute_active_disp(idle_detector)
1174
- # Rewrite top header line (index 0) to include updated display
1175
- top_lines[0] = f"[blue]{engine['name']}[/blue] {run_disp} {active_disp}\n"
1176
-
1177
- # Combine top summary and details
1178
- all_lines = top_lines + status_lines
1179
- console.print(
1180
- Panel("\n".join(all_lines), title="Engine Status", border_style="blue")
1181
- )
1182
-
1183
- if show_log:
1184
- if not detailed:
1185
- console.print("[yellow]Note: --show-log requires --detailed flag[/yellow]")
1186
- return
1187
- console.print("\n[bold]Bootstrap Log:[/bold]")
1188
- try:
1189
- ssm = boto3.client("ssm", region_name="us-east-1")
1190
- resp = ssm.send_command(
1191
- InstanceIds=[engine["instance_id"]],
1192
- DocumentName="AWS-RunShellScript",
1193
- Parameters={
1194
- "commands": [
1195
- "cat /var/log/engine-setup.log 2>/dev/null || echo 'No setup log found'"
1196
- ],
1197
- "executionTimeout": ["15"],
1198
- },
1199
- )
1200
- cid = resp["Command"]["CommandId"]
1201
- time.sleep(2)
1202
- inv = ssm.get_command_invocation(
1203
- CommandId=cid, InstanceId=engine["instance_id"]
1204
- )
1205
- if inv["Status"] == "Success":
1206
- log_content = inv["StandardOutputContent"].strip()
1207
- if log_content:
1208
- console.print(f"[dim]{log_content}[/dim]")
1209
- else:
1210
- console.print("[yellow]No bootstrap log available[/yellow]")
1211
- else:
1212
- console.print("[red]❌ Could not retrieve bootstrap log[/red]")
1213
- except Exception as e:
1214
- console.print(f"[red]❌ Error fetching log: {e}[/red]")
1215
-
1216
-
1217
- @engine_app.command("stop")
1218
- def stop_engine(
1219
- name_or_id: str = typer.Argument(help="Engine name or instance ID"),
1220
- force: bool = typer.Option(
1221
- False, "--force", "-f", help="Force stop and detach all studios"
1222
- ),
1223
- ):
1224
- """Stop an engine."""
1225
- check_aws_sso()
1226
-
1227
- # Get all engines to resolve name
1228
- response = make_api_request("GET", "/engines")
1229
- if response.status_code != 200:
1230
- console.print("[red]❌ Failed to fetch engines[/red]")
1231
- raise typer.Exit(1)
1232
-
1233
- engines = response.json().get("engines", [])
1234
- engine = resolve_engine(name_or_id, engines)
1235
-
1236
- console.print(f"Stopping engine [cyan]{engine['name']}[/cyan]...")
1237
-
1238
- # First attempt without detaching
1239
- response = make_api_request(
1240
- "POST",
1241
- f"/engines/{engine['instance_id']}/stop",
1242
- json_data={"detach_studios": force},
1243
- )
1244
-
1245
- if response.status_code == 409 and not force:
1246
- # Engine has attached studios
1247
- data = response.json()
1248
- attached_studios = data.get("attached_studios", [])
1249
-
1250
- console.print("\n[yellow]⚠️ This engine has attached studios:[/yellow]")
1251
- for studio in attached_studios:
1252
- console.print(f" • {studio['user']} ({studio['studio_id']})")
1253
-
1254
- if Confirm.ask("\nDetach all studios and stop the engine?"):
1255
- response = make_api_request(
1256
- "POST",
1257
- f"/engines/{engine['instance_id']}/stop",
1258
- json_data={"detach_studios": True},
1259
- )
1260
- else:
1261
- console.print("Stop cancelled.")
1262
- return
1263
-
1264
- if response.status_code == 200:
1265
- console.print(f"[green]✓ Engine stopped successfully![/green]")
1266
- else:
1267
- error = response.json().get("error", "Unknown error")
1268
- console.print(f"[red]❌ Failed to stop engine: {error}[/red]")
1269
-
1270
-
1271
- @engine_app.command("start")
1272
- def start_engine(
1273
- name_or_id: str = typer.Argument(help="Engine name or instance ID"),
1274
- ):
1275
- """Start a stopped engine."""
1276
- check_aws_sso()
1277
-
1278
- # Get all engines to resolve name
1279
- response = make_api_request("GET", "/engines")
1280
- if response.status_code != 200:
1281
- console.print("[red]❌ Failed to fetch engines[/red]")
1282
- raise typer.Exit(1)
1283
-
1284
- engines = response.json().get("engines", [])
1285
- engine = resolve_engine(name_or_id, engines)
1286
-
1287
- console.print(f"Starting engine [cyan]{engine['name']}[/cyan]...")
1288
-
1289
- response = make_api_request("POST", f"/engines/{engine['instance_id']}/start")
1290
-
1291
- if response.status_code == 200:
1292
- data = response.json()
1293
- console.print(f"[green]✓ Engine started successfully![/green]")
1294
- console.print(f"New public IP: {data.get('public_ip', 'Pending...')}")
1295
- else:
1296
- error = response.json().get("error", "Unknown error")
1297
- console.print(f"[red]❌ Failed to start engine: {error}[/red]")
1298
-
1299
-
1300
- @engine_app.command("terminate")
1301
- def terminate_engine(
1302
- name_or_id: str = typer.Argument(help="Engine name or instance ID"),
1303
- ):
1304
- """Permanently terminate an engine."""
1305
- check_aws_sso()
1306
-
1307
- # Get all engines to resolve name
1308
- response = make_api_request("GET", "/engines")
1309
- if response.status_code != 200:
1310
- console.print("[red]❌ Failed to fetch engines[/red]")
1311
- raise typer.Exit(1)
1312
-
1313
- engines = response.json().get("engines", [])
1314
- engine = resolve_engine(name_or_id, engines)
1315
-
1316
- # Calculate cost
1317
- launch_time = parse_launch_time(engine["launch_time"])
1318
- uptime = datetime.now(timezone.utc) - launch_time
1319
- hourly_cost = HOURLY_COSTS.get(engine["engine_type"], 0)
1320
- total_cost = hourly_cost * (uptime.total_seconds() / 3600)
1321
-
1322
- console.print(
1323
- f"\n[yellow]⚠️ This will permanently terminate engine '{engine['name']}'[/yellow]"
1324
- )
1325
- console.print(f"Total cost for this session: ${total_cost:.2f}")
1326
-
1327
- if not Confirm.ask("\nAre you sure you want to terminate this engine?"):
1328
- console.print("Termination cancelled.")
1329
- return
1330
-
1331
- response = make_api_request("DELETE", f"/engines/{engine['instance_id']}")
1332
-
1333
- if response.status_code == 200:
1334
- console.print(f"[green]✓ Engine terminated successfully![/green]")
1335
- else:
1336
- error = response.json().get("error", "Unknown error")
1337
- console.print(f"[red]❌ Failed to terminate engine: {error}[/red]")
1338
-
1339
-
1340
- @engine_app.command("ssh")
1341
- def ssh_engine(
1342
- name_or_id: str = typer.Argument(help="Engine name or instance ID"),
1343
- admin: bool = typer.Option(
1344
- False, "--admin", help="Connect as ec2-user instead of the engine owner user"
1345
- ),
1346
- idle_timeout: int = typer.Option(
1347
- 600,
1348
- "--idle-timeout",
1349
- help="Idle timeout (seconds) for the SSM port-forward (0 = disable)",
1350
- ),
1351
- ):
1352
- """Connect to an engine via SSH.
1353
-
1354
- By default the CLI connects using the engine's owner username (the same one stored in the `User` tag).
1355
- Pass `--admin` to connect with the underlying [`ec2-user`] account for break-glass or debugging.
1356
- """
1357
- username = check_aws_sso()
1358
-
1359
- # Check for Session Manager Plugin
1360
- if not check_session_manager_plugin():
1361
- raise typer.Exit(1)
1362
-
1363
- # Get all engines to resolve name
1364
- response = make_api_request("GET", "/engines")
1365
- if response.status_code != 200:
1366
- console.print("[red]❌ Failed to fetch engines[/red]")
1367
- raise typer.Exit(1)
1368
-
1369
- engines = response.json().get("engines", [])
1370
- engine = resolve_engine(name_or_id, engines)
1371
-
1372
- if engine["state"].lower() != "running":
1373
- console.print(f"[red]❌ Engine is not running (state: {engine['state']})[/red]")
1374
- raise typer.Exit(1)
1375
-
1376
- # Choose SSH user
1377
- ssh_user = "ec2-user" if admin else username
1378
-
1379
- # Update SSH config
1380
- console.print(
1381
- f"Updating SSH config for [cyan]{engine['name']}[/cyan] (user: {ssh_user})..."
1382
- )
1383
- update_ssh_config_entry(
1384
- engine["name"], engine["instance_id"], ssh_user, idle_timeout
1385
- )
1386
-
1387
- # Connect
1388
- console.print(f"[green]✓ Connecting to {engine['name']}...[/green]")
1389
- subprocess.run(["ssh", engine["name"]])
1390
-
1391
-
1392
- @engine_app.command("config-ssh")
1393
- def config_ssh(
1394
- clean: bool = typer.Option(False, "--clean", help="Remove all managed entries"),
1395
- all_engines: bool = typer.Option(
1396
- False, "--all", "-a", help="Include all engines from all users"
1397
- ),
1398
- admin: bool = typer.Option(
1399
- False,
1400
- "--admin",
1401
- help="Generate entries that use ec2-user instead of per-engine owner user",
1402
- ),
1403
- ):
1404
- """Update SSH config with available engines."""
1405
- username = check_aws_sso()
1406
-
1407
- # Only check for Session Manager Plugin if we're not just cleaning
1408
- if not clean and not check_session_manager_plugin():
1409
- raise typer.Exit(1)
1410
-
1411
- if clean:
1412
- console.print("Removing all managed SSH entries...")
1413
- else:
1414
- if all_engines:
1415
- console.print("Updating SSH config with all running engines...")
1416
- else:
1417
- console.print(
1418
- f"Updating SSH config with running engines for [cyan]{username}[/cyan] and [cyan]shared[/cyan]..."
1419
- )
1420
-
1421
- # Get all engines
1422
- response = make_api_request("GET", "/engines")
1423
- if response.status_code != 200:
1424
- console.print("[red]❌ Failed to fetch engines[/red]")
1425
- raise typer.Exit(1)
1426
-
1427
- engines = response.json().get("engines", [])
1428
- running_engines = [e for e in engines if e["state"].lower() == "running"]
1429
-
1430
- # Filter engines based on options
1431
- if not all_engines:
1432
- # Show only current user's engines and shared engines
1433
- running_engines = [
1434
- e for e in running_engines if e["user"] == username or e["user"] == "shared"
1435
- ]
1436
-
1437
- # Read existing config
1438
- config_path = Path.home() / ".ssh" / "config"
1439
- config_path.parent.mkdir(mode=0o700, exist_ok=True)
1440
-
1441
- if config_path.exists():
1442
- content = config_path.read_text()
1443
- lines = content.splitlines()
1444
- else:
1445
- content = ""
1446
- lines = []
1447
-
1448
- # Remove old managed entries
1449
- new_lines = []
1450
- skip_until_next_host = False
1451
- for line in lines:
1452
- if SSH_MANAGED_COMMENT in line:
1453
- skip_until_next_host = True
1454
- elif line.strip().startswith("Host ") and skip_until_next_host:
1455
- skip_until_next_host = False
1456
- # Check if this is a managed host
1457
- if SSH_MANAGED_COMMENT not in line:
1458
- new_lines.append(line)
1459
- elif not skip_until_next_host:
1460
- new_lines.append(line)
1461
-
1462
- # Add new entries if not cleaning
1463
- if not clean:
1464
- for engine in running_engines:
1465
- # Determine ssh user based on --admin flag
1466
- ssh_user = "ec2-user" if admin else username
1467
- new_lines.extend(
1468
- [
1469
- "",
1470
- f"Host {engine['name']} {SSH_MANAGED_COMMENT}",
1471
- f" HostName {engine['instance_id']}",
1472
- f" User {ssh_user}",
1473
- f" ProxyCommand sh -c \"AWS_SSM_IDLE_TIMEOUT=600 aws ssm start-session --target %h --document-name AWS-StartSSHSession --parameters 'portNumber=%p'\"",
1474
- ]
1475
- )
1476
-
1477
- # Write back
1478
- config_path.write_text("\n".join(new_lines))
1479
- config_path.chmod(0o600)
1480
-
1481
- if clean:
1482
- console.print("[green]✓ Removed all managed SSH entries[/green]")
1483
- else:
1484
- console.print(
1485
- f"[green]✓ Updated SSH config with {len(running_engines)} engines[/green]"
1486
- )
1487
- for engine in running_engines:
1488
- user_display = (
1489
- f"[dim]({engine['user']})[/dim]" if engine["user"] != username else ""
1490
- )
1491
- console.print(
1492
- f" • {engine['name']} → {engine['instance_id']} {user_display}"
1493
- )
1494
-
1495
-
1496
- @engine_app.command("coffee")
1497
- def coffee(
1498
- name_or_id: str = typer.Argument(help="Engine name or instance ID"),
1499
- duration: str = typer.Argument("4h", help="Duration (e.g., 2h, 30m, 2h30m)"),
1500
- cancel: bool = typer.Option(
1501
- False, "--cancel", help="Cancel existing coffee lock instead of extending"
1502
- ),
1503
- ):
1504
- """Pour ☕ for an engine: keeps it awake for the given duration (or cancel)."""
1505
- username = check_aws_sso()
1506
-
1507
- # Parse duration
1508
- import re
1509
-
1510
- if not cancel:
1511
- match = re.match(r"(?:(\d+)h)?(?:(\d+)m)?", duration)
1512
- if not match or (not match.group(1) and not match.group(2)):
1513
- console.print(f"[red]❌ Invalid duration format: {duration}[/red]")
1514
- console.print("Use format like: 4h, 30m, 2h30m")
1515
- raise typer.Exit(1)
1516
-
1517
- hours = int(match.group(1) or 0)
1518
- minutes = int(match.group(2) or 0)
1519
- seconds_total = (hours * 60 + minutes) * 60
1520
- if seconds_total == 0:
1521
- console.print("[red]❌ Duration must be greater than zero[/red]")
1522
- raise typer.Exit(1)
1523
-
1524
- # Get all engines to resolve name
1525
- response = make_api_request("GET", "/engines")
1526
- if response.status_code != 200:
1527
- console.print("[red]❌ Failed to fetch engines[/red]")
1528
- raise typer.Exit(1)
1529
-
1530
- engines = response.json().get("engines", [])
1531
- engine = resolve_engine(name_or_id, engines)
1532
-
1533
- if engine["state"].lower() != "running":
1534
- console.print(f"[red]❌ Engine is not running (state: {engine['state']})[/red]")
1535
- raise typer.Exit(1)
1536
-
1537
- if cancel:
1538
- console.print(f"Cancelling coffee for [cyan]{engine['name']}[/cyan]…")
1539
- else:
1540
- console.print(
1541
- f"Pouring coffee for [cyan]{engine['name']}[/cyan] for {duration}…"
1542
- )
1543
-
1544
- # Use SSM to run the engine coffee command
1545
- ssm = boto3.client("ssm", region_name="us-east-1")
1546
- try:
1547
- response = ssm.send_command(
1548
- InstanceIds=[engine["instance_id"]],
1549
- DocumentName="AWS-RunShellScript",
1550
- Parameters={
1551
- "commands": [
1552
- (
1553
- "/usr/local/bin/engine-coffee --cancel"
1554
- if cancel
1555
- else f"/usr/local/bin/engine-coffee {seconds_total}"
1556
- )
1557
- ],
1558
- "executionTimeout": ["60"],
1559
- },
1560
- )
1561
-
1562
- command_id = response["Command"]["CommandId"]
1563
-
1564
- # Wait for command to complete
1565
- for _ in range(10):
1566
- time.sleep(1)
1567
- result = ssm.get_command_invocation(
1568
- CommandId=command_id,
1569
- InstanceId=engine["instance_id"],
1570
- )
1571
- if result["Status"] in ["Success", "Failed"]:
1572
- break
1573
-
1574
- if result["Status"] == "Success":
1575
- if cancel:
1576
- console.print(
1577
- "[green]✓ Coffee cancelled – auto-shutdown re-enabled[/green]"
1578
- )
1579
- else:
1580
- console.print(f"[green]✓ Coffee poured for {duration}[/green]")
1581
- console.print(
1582
- "\n[dim]Note: Detached Docker containers (except dev containers) will also keep the engine awake.[/dim]"
1583
- )
1584
- console.print(
1585
- "[dim]Use coffee for nohup operations or other background tasks.[/dim]"
1586
- )
1587
- else:
1588
- console.print(
1589
- f"[red]❌ Failed to manage coffee: {result.get('StatusDetails', 'Unknown error')}[/red]"
1590
- )
1591
-
1592
- except ClientError as e:
1593
- console.print(f"[red]❌ Failed to manage coffee: {e}[/red]")
1594
-
1595
-
1596
- @engine_app.command("resize")
1597
- def resize_engine(
1598
- name_or_id: str = typer.Argument(help="Engine name or instance ID"),
1599
- size: int = typer.Option(..., "--size", "-s", help="New size in GB"),
1600
- online: bool = typer.Option(
1601
- False,
1602
- "--online",
1603
- help="Resize while running (requires manual filesystem expansion)",
1604
- ),
1605
- force: bool = typer.Option(
1606
- False, "--force", "-f", help="Force resize and detach all studios"
1607
- ),
1608
- ):
1609
- """Resize an engine's boot disk."""
1610
- check_aws_sso()
1611
-
1612
- # Get all engines to resolve name
1613
- response = make_api_request("GET", "/engines")
1614
- if response.status_code != 200:
1615
- console.print("[red]❌ Failed to fetch engines[/red]")
1616
- raise typer.Exit(1)
1617
-
1618
- engines = response.json().get("engines", [])
1619
- engine = resolve_engine(name_or_id, engines)
1620
-
1621
- # Get current volume info to validate size
1622
- ec2 = boto3.client("ec2", region_name="us-east-1")
1623
-
1624
- try:
1625
- # Get instance details to find root volume
1626
- instance_info = ec2.describe_instances(InstanceIds=[engine["instance_id"]])
1627
- instance = instance_info["Reservations"][0]["Instances"][0]
1628
-
1629
- # Find root volume
1630
- root_device = instance.get("RootDeviceName", "/dev/xvda")
1631
- root_volume_id = None
1632
-
1633
- for bdm in instance.get("BlockDeviceMappings", []):
1634
- if bdm["DeviceName"] == root_device:
1635
- root_volume_id = bdm["Ebs"]["VolumeId"]
1636
- break
1637
-
1638
- if not root_volume_id:
1639
- console.print("[red]❌ Could not find root volume[/red]")
1640
- raise typer.Exit(1)
1641
-
1642
- # Get current volume size
1643
- volumes = ec2.describe_volumes(VolumeIds=[root_volume_id])
1644
- current_size = volumes["Volumes"][0]["Size"]
1645
-
1646
- if size <= current_size:
1647
- console.print(
1648
- f"[red]❌ New size ({size}GB) must be larger than current size ({current_size}GB)[/red]"
1649
- )
1650
- raise typer.Exit(1)
1651
-
1652
- console.print(
1653
- f"[yellow]Resizing engine boot disk from {current_size}GB to {size}GB[/yellow]"
1654
- )
1655
-
1656
- # Check if we need to stop the instance
1657
- if not online and engine["state"].lower() == "running":
1658
- console.print("Stopping engine for offline resize...")
1659
- stop_response = make_api_request(
1660
- "POST",
1661
- f"/engines/{engine['instance_id']}/stop",
1662
- json_data={"detach_studios": False},
1663
- )
1664
- if stop_response.status_code != 200:
1665
- console.print("[red]❌ Failed to stop engine[/red]")
1666
- raise typer.Exit(1)
1667
-
1668
- # Wait for instance to stop
1669
- console.print("Waiting for engine to stop...")
1670
- waiter = ec2.get_waiter("instance_stopped")
1671
- waiter.wait(InstanceIds=[engine["instance_id"]])
1672
- console.print("[green]✓ Engine stopped[/green]")
1673
-
1674
- # Call the resize API
1675
- console.print("Resizing volume...")
1676
- resize_response = make_api_request(
1677
- "POST",
1678
- f"/engines/{engine['instance_id']}/resize",
1679
- json_data={"size": size, "detach_studios": force},
1680
- )
1681
-
1682
- if resize_response.status_code == 409 and not force:
1683
- # Engine has attached studios
1684
- data = resize_response.json()
1685
- attached_studios = data.get("attached_studios", [])
1686
-
1687
- console.print("\n[yellow]⚠️ This engine has attached studios:[/yellow]")
1688
- for studio in attached_studios:
1689
- console.print(f" • {studio['user']} ({studio['studio_id']})")
1690
-
1691
- if Confirm.ask("\nDetach all studios and resize the engine?"):
1692
- resize_response = make_api_request(
1693
- "POST",
1694
- f"/engines/{engine['instance_id']}/resize",
1695
- json_data={"size": size, "detach_studios": True},
1696
- )
1697
- else:
1698
- console.print("Resize cancelled.")
1699
- return
1700
-
1701
- if resize_response.status_code != 200:
1702
- error = resize_response.json().get("error", "Unknown error")
1703
- console.print(f"[red]❌ Failed to resize engine: {error}[/red]")
1704
- raise typer.Exit(1)
1705
-
1706
- # Check if studios were detached
1707
- data = resize_response.json()
1708
- detached_studios = data.get("detached_studios", 0)
1709
- if detached_studios > 0:
1710
- console.print(
1711
- f"[green]✓ Detached {detached_studios} studio(s) before resize[/green]"
1712
- )
1713
-
1714
- # Wait for modification to complete
1715
- console.print("Waiting for volume modification to complete...")
1716
- while True:
1717
- mod_state = ec2.describe_volumes_modifications(VolumeIds=[root_volume_id])
1718
- if not mod_state["VolumesModifications"]:
1719
- break # Modification complete
1720
-
1721
- modification = mod_state["VolumesModifications"][0]
1722
- state = modification["ModificationState"]
1723
- progress = modification.get("Progress", 0)
1724
-
1725
- # Show progress updates only for the resize phase
1726
- if state == "modifying":
1727
- console.print(f"[yellow]Progress: {progress}%[/yellow]")
1728
-
1729
- # Exit as soon as optimization starts (resize is complete)
1730
- if state == "optimizing":
1731
- console.print("[green]✓ Volume resized successfully[/green]")
1732
- console.print(
1733
- "[dim]AWS is optimizing the volume in the background (no action needed).[/dim]"
1734
- )
1735
- break
1736
-
1737
- if state == "completed":
1738
- console.print("[green]✓ Volume resized successfully[/green]")
1739
- break
1740
- elif state == "failed":
1741
- console.print("[red]❌ Volume modification failed[/red]")
1742
- raise typer.Exit(1)
1743
-
1744
- time.sleep(2) # Check more frequently for better UX
1745
-
1746
- # If offline resize, start the instance back up
1747
- if not online and engine["state"].lower() == "running":
1748
- console.print("Starting engine back up...")
1749
- start_response = make_api_request(
1750
- "POST", f"/engines/{engine['instance_id']}/start"
1751
- )
1752
- if start_response.status_code != 200:
1753
- console.print(
1754
- "[yellow]⚠️ Failed to restart engine automatically[/yellow]"
1755
- )
1756
- console.print(
1757
- f"Please start it manually: [cyan]dh engine start {engine['name']}[/cyan]"
1758
- )
1759
- else:
1760
- console.print("[green]✓ Engine started[/green]")
1761
- console.print("The filesystem will be automatically expanded on boot.")
1762
-
1763
- elif online and engine["state"].lower() == "running":
1764
- console.print(
1765
- "\n[yellow]⚠️ Online resize complete. You must now expand the filesystem:[/yellow]"
1766
- )
1767
- console.print(f"1. SSH into the engine: [cyan]ssh {engine['name']}[/cyan]")
1768
- console.print("2. Find the root device: [cyan]lsblk[/cyan]")
1769
- console.print(
1770
- "3. Expand the partition: [cyan]sudo growpart /dev/nvme0n1 1[/cyan] (adjust device name as needed)"
1771
- )
1772
- console.print("4. Expand the filesystem: [cyan]sudo xfs_growfs /[/cyan]")
1773
-
1774
- except ClientError as e:
1775
- console.print(f"[red]❌ Failed to resize engine: {e}[/red]")
1776
- raise typer.Exit(1)
1777
-
1778
-
1779
@engine_app.command("gami")
def create_ami(
    name_or_id: str = typer.Argument(
        help="Engine name or instance ID to create AMI from"
    ),
):
    """Create a 'Golden AMI' from a running engine.

    This process is for creating a pre-warmed, standardized machine image
    that can be used to launch new engines more quickly.

    IMPORTANT:
    - The engine MUST have all studios detached before running this command.
    - This process will make the source engine unusable. You should
      plan to TERMINATE the engine after the AMI is created.
    """
    check_aws_sso()

    # Get all engines to resolve name and check status.
    # check_ready=True makes the API include attached-studio info per engine.
    response = make_api_request("GET", "/engines", params={"check_ready": "true"})
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    # --- Pre-flight checks ---

    # 1. The engine must be running for SSM cleanup + snapshotting.
    if engine["state"].lower() != "running":
        console.print(f"[red]❌ Engine '{engine['name']}' is not running.[/red]")
        console.print("Please start it before creating an AMI.")
        raise typer.Exit(1)

    # 2. No studios may be attached: their volumes would be baked into the AMI.
    attached_studios = engine.get("studios", [])
    if attached_studios:
        console.print(
            f"[bold red]❌ Engine '{engine['name']}' has studios attached.[/bold red]"
        )
        console.print("Please detach all studios before creating an AMI:")
        for studio in attached_studios:
            console.print(f"  - {studio['user']} ({studio['studio_id']})")
        console.print("\nTo detach, run [bold]dh studio detach[/bold]")
        raise typer.Exit(1)

    # Construct AMI name and description.
    ami_name = (
        f"prewarmed-engine-{engine['engine_type']}-{datetime.now().strftime('%Y%m%d')}"
    )
    description = (
        f"Amazon Linux 2023 with NVIDIA drivers, Docker, and pre-pulled "
        f"dev container image for {engine['engine_type']} engines"
    )

    console.print(f"Creating AMI from engine [cyan]{engine['name']}[/cyan]...")
    console.print(f"[bold]AMI Name:[/] {ami_name}")
    console.print(f"[bold]Description:[/] {description}")

    console.print(
        "\n[bold yellow]⚠️  Important:[/bold yellow]\n"
        "1. This process will run cleanup scripts on the engine.\n"
        "2. The source engine should be [bold]terminated[/bold] after the AMI is created.\n"
    )

    if not Confirm.ask("Continue with AMI creation?"):
        raise typer.Exit()

    # Create AMI using EC2 client directly, as the backend logic is too complex
    ec2 = boto3.client("ec2", region_name="us-east-1")
    ssm = boto3.client("ssm", region_name="us-east-1")

    try:
        # Clean up instance state before snapshotting so the resulting image
        # boots as a "fresh" machine (no sentinel, no SSM identity, no logs).
        console.print("Cleaning up instance for AMI creation...")
        cleanup_commands = [
            "sudo rm -f /opt/dayhoff/first_boot_complete.sentinel",
            "history -c",
            "sudo rm -rf /tmp/* /var/log/messages /var/log/cloud-init.log",
            "sudo rm -rf /var/lib/amazon/ssm/* /etc/amazon/ssm/*",
            "sleep 2 && sudo systemctl stop amazon-ssm-agent &",  # Stop agent last
        ]

        # Fire-and-forget: the response is intentionally not inspected because
        # the SSM agent stops itself as part of the cleanup.
        ssm.send_command(
            InstanceIds=[engine["instance_id"]],
            DocumentName="AWS-RunShellScript",
            Parameters={"commands": cleanup_commands, "executionTimeout": ["120"]},
        )

        # Acknowledge that the SSM command might be in progress as the agent shuts down
        console.print(
            "[dim]ℹ️  Cleanup command sent (status may show 'InProgress' as SSM agent stops)[/dim]"
        )

        # Create the AMI (NoReboot=False: a reboot gives a consistent snapshot).
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            transient=True,
        ) as progress:
            task = progress.add_task(
                "Creating AMI (this will take several minutes)...", total=None
            )

            response = ec2.create_image(
                InstanceId=engine["instance_id"],
                Name=ami_name,
                Description=description,
                NoReboot=False,
                TagSpecifications=[
                    {
                        "ResourceType": "image",
                        "Tags": [
                            {"Key": "Environment", "Value": "dev"},
                            {"Key": "Type", "Value": "golden-ami"},
                            {"Key": "EngineType", "Value": engine["engine_type"]},
                            {"Key": "Name", "Value": ami_name},
                        ],
                    }
                ],
            )

            ami_id = response["ImageId"]
            progress.update(
                task,
                completed=True,
                description="[green]✓ AMI creation initiated![/green]",
            )

        console.print(f"  [bold]AMI ID:[/] {ami_id}")
        console.print("\nThe AMI creation process will continue in the background.")
        console.print("You can monitor progress in the EC2 Console under 'AMIs'.")
        console.print(
            "\nOnce complete, update the AMI ID in [bold]terraform/environments/dev/variables.tf[/bold] "
            "and run [bold]terraform apply[/bold]."
        )
        console.print(
            f"\nRemember to [bold red]terminate the source engine '{engine['name']}'[/bold red] to save costs."
        )

    except ClientError as e:
        console.print(f"[red]❌ Failed to create AMI: {e}[/red]")
        raise typer.Exit(1)
1924
-
1925
-
1926
- # ==================== STUDIO COMMANDS ====================
1927
-
1928
-
1929
def get_user_studio(username: str) -> Optional[Dict]:
    """Return the studio owned by *username*, or None if absent/unreachable."""
    response = make_api_request("GET", "/studios")
    if response.status_code != 200:
        return None

    # A user has at most one studio; take the first match if any.
    return next(
        (s for s in response.json().get("studios", []) if s["user"] == username),
        None,
    )
1939
-
1940
-
1941
@studio_app.command("create")
def create_studio(
    size_gb: int = typer.Option(50, "--size", "-s", help="Studio size in GB"),
):
    """Create a new studio for the current user."""
    username = check_aws_sso()

    # One studio per user: bail out early if one already exists.
    existing = get_user_studio(username)
    if existing:
        console.print(
            f"[yellow]You already have a studio: {existing['studio_id']}[/yellow]"
        )
        return

    console.print(f"Creating {size_gb}GB studio for user [cyan]{username}[/cyan]...")

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        transient=True,
    ) as progress:
        progress.add_task("Creating studio volume...", total=None)
        response = make_api_request(
            "POST",
            "/studios",
            json_data={"user": username, "size_gb": size_gb},
        )

    if response.status_code != 201:
        error = response.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to create studio: {error}[/red]")
        return

    data = response.json()
    console.print("[green]✓ Studio created successfully![/green]")
    console.print(f"Studio ID: [cyan]{data['studio_id']}[/cyan]")
    console.print(f"Size: {data['size_gb']}GB")
    console.print("\nNext step: [cyan]dh studio attach <engine-name>[/cyan]")
1980
-
1981
-
1982
@studio_app.command("status")
def studio_status(
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Check status for a different user (admin only)"
    ),
):
    """Show status of your studio."""
    username = check_aws_sso()
    target_user = user if user else username

    # Make it obvious when an admin is inspecting someone else's studio.
    if target_user != username:
        console.print(
            f"[yellow]⚠️  Checking studio status for user: {target_user}[/yellow]"
        )

    studio = get_user_studio(target_user)
    if studio is None:
        if target_user == username:
            console.print("[yellow]You don't have a studio yet.[/yellow]")
            console.print("Create one with: [cyan]dh studio create[/cyan]")
        else:
            console.print(f"[yellow]User {target_user} doesn't have a studio.[/yellow]")
        return

    # Map raw status onto a colored display string.
    status = studio["status"]
    if status == "in-use":
        status_display = "[bright_blue]attached[/bright_blue]"
    elif status in ("attaching", "detaching"):
        status_display = f"[yellow]{status}[/yellow]"
    else:
        status_display = f"[green]{status}[/green]"

    status_lines = [
        f"[bold]Studio ID:[/bold] {studio['studio_id']}",
        f"[bold]User:[/bold] {studio['user']}",
        f"[bold]Status:[/bold] {status_display}",
        f"[bold]Size:[/bold] {studio['size_gb']}GB",
        f"[bold]Created:[/bold] {studio['creation_date']}",
    ]

    attached_vm = studio.get("attached_vm_id")
    if attached_vm:
        status_lines.append(f"[bold]Attached to:[/bold] {attached_vm}")

        # Best-effort lookup of the human-readable engine name.
        response = make_api_request("GET", "/engines")
        if response.status_code == 200:
            match = next(
                (
                    e
                    for e in response.json().get("engines", [])
                    if e["instance_id"] == attached_vm
                ),
                None,
            )
            if match:
                status_lines.append(f"[bold]Engine Name:[/bold] {match['name']}")

    console.print(
        Panel(
            "\n".join(status_lines),
            title="Studio Details",
            border_style="blue",
        )
    )
2049
-
2050
-
2051
def _is_studio_attached(target_studio_id: str, target_vm_id: str) -> bool:
    """Return True when the given studio already shows as attached to the VM.

    Using this extra check lets us stop the outer retry loop as soon as the
    asynchronous attach operation actually finishes, even in the unlikely
    event that the operation-tracking DynamoDB record is not yet updated.
    """

    def _attached_here(record) -> bool:
        # A record counts only if it is both in-use AND on the target VM.
        return (
            record.get("status") == "in-use"
            and record.get("attached_vm_id") == target_vm_id
        )

    # First try the per-studio endpoint – fastest.
    direct = make_api_request("GET", f"/studios/{target_studio_id}")
    if direct.status_code == 200 and _attached_here(direct.json()):
        return True

    # Fallback: list + filter (covers edge-cases where the direct endpoint
    # is slower to update IAM/APIGW mapping than the list endpoint).
    listing = make_api_request("GET", "/studios")
    if listing.status_code == 200:
        return any(
            s.get("studio_id") == target_studio_id and _attached_here(s)
            for s in listing.json().get("studios", [])
        )
    return False
2079
-
2080
-
2081
@studio_app.command("attach")
def attach_studio(
    engine_name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Attach a different user's studio (admin only)"
    ),
):
    """Attach your studio to an engine.

    Handles the full workflow: optional studio creation, detach/reattach,
    starting a stopped engine, retrying while the engine boots, and finally
    updating the local SSH config.
    """
    username = check_aws_sso()

    # Check for Session Manager Plugin since we'll update SSH config
    if not check_session_manager_plugin():
        raise typer.Exit(1)

    # Use specified user if provided, otherwise use current user
    target_user = user if user else username

    # Add confirmation when attaching another user's studio
    if target_user != username:
        console.print(f"[yellow]⚠️  Managing studio for user: {target_user}[/yellow]")
        if not Confirm.ask(f"Are you sure you want to attach {target_user}'s studio?"):
            console.print("Operation cancelled.")
            return

    # Get user's studio; offer to create one for the current user.
    studio = get_user_studio(target_user)
    if not studio:
        if target_user == username:
            console.print("[yellow]You don't have a studio yet.[/yellow]")
            if Confirm.ask("Would you like to create one now?"):
                size = IntPrompt.ask("Studio size (GB)", default=50)
                response = make_api_request(
                    "POST",
                    "/studios",
                    json_data={"user": username, "size_gb": size},
                )
                if response.status_code != 201:
                    console.print("[red]❌ Failed to create studio[/red]")
                    raise typer.Exit(1)
                studio = response.json()
            else:
                raise typer.Exit(0)
        else:
            console.print(f"[red]❌ User {target_user} doesn't have a studio.[/red]")
            raise typer.Exit(1)

    # If already attached elsewhere, the studio must be detached first.
    if studio.get("status") == "in-use":
        console.print(
            f"[yellow]Studio is already attached to {studio.get('attached_vm_id')}[/yellow]"
        )
        if not Confirm.ask("Detach and reattach to new engine?"):
            return
        response = make_api_request("POST", f"/studios/{studio['studio_id']}/detach")
        if response.status_code != 200:
            console.print("[red]❌ Failed to detach studio[/red]")
            raise typer.Exit(1)

    # Get all engines to resolve name
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(engine_name_or_id, engines)

    # Flag to track if we started the engine in this command (affects retry length)
    engine_started_now: bool = False

    if engine["state"].lower() != "running":
        console.print(f"[yellow]⚠️  Engine is {engine['state']}[/yellow]")
        if engine["state"].lower() == "stopped" and Confirm.ask(
            "Start the engine first?"
        ):
            response = make_api_request(
                "POST", f"/engines/{engine['instance_id']}/start"
            )
            if response.status_code != 200:
                console.print("[red]❌ Failed to start engine[/red]")
                raise typer.Exit(1)
            console.print("[green]✓ Engine started[/green]")
            # Mark that we booted the engine so attach loop gets extended retries
            engine_started_now = True
            # No further waiting here – attachment attempts below handle retry logic while the
            # engine finishes booting.
        else:
            raise typer.Exit(1)

    # Retrieve SSH public key (required for authorised_keys provisioning)
    try:
        public_key = get_ssh_public_key()
    except FileNotFoundError as e:
        console.print(f"[red]❌ {e}[/red]")
        raise typer.Exit(1)

    console.print(f"Attaching studio to engine [cyan]{engine['name']}[/cyan]...")

    # Determine retry strategy based on whether we just started the engine
    if engine_started_now:
        max_attempts = 40  # About 7 minutes total with exponential backoff
        base_delay = 8
        max_delay = 20
    else:
        max_attempts = 15  # About 2 minutes total with exponential backoff
        base_delay = 5
        max_delay = 10

    # Unified retry loop with exponential backoff
    with Progress(
        SpinnerColumn(),
        TimeElapsedColumn(),
        TextColumn("[progress.description]{task.description}"),
        transient=True,
    ) as prog:
        desc = (
            "Attaching studio (engine is still booting)…"
            if engine_started_now
            else "Attaching studio…"
        )
        task = prog.add_task(desc, total=None)

        consecutive_not_ready = 0
        last_error = None

        for attempt in range(max_attempts):
            # Check if the attach already completed (covers async completion
            # between iterations).
            if _is_studio_attached(studio["studio_id"], engine["instance_id"]):
                success = True
                break

            success, error_msg = _attempt_studio_attach(
                studio, engine, target_user, public_key
            )

            if success:
                break  # success!

            if error_msg:
                # Fatal error – bubble up immediately
                console.print(f"[red]❌ Failed to attach studio: {error_msg}[/red]")

                # Suggest repair command if engine seems broken
                if "not ready" in error_msg.lower() and attempt > 5:
                    console.print(
                        f"\n[yellow]Engine may be in a bad state. Try:[/yellow]"
                    )
                    console.print(f"[dim]  dh engine repair {engine['name']}[/dim]")
                return

            # Track consecutive "not ready" responses
            consecutive_not_ready += 1
            last_error = "Engine not ready"

            # Update progress display
            if attempt % 3 == 0:
                prog.update(
                    task,
                    description=f"{desc} attempt {attempt+1}/{max_attempts}",
                )

            # If engine seems stuck after many attempts, show a hint
            if consecutive_not_ready > 10 and attempt == 10:
                console.print(
                    "[yellow]Engine is taking longer than expected to become ready.[/yellow]"
                )
                console.print(
                    "[dim]This can happen after GAMI creation or if the engine is still bootstrapping.[/dim]"
                )

            # Exponential backoff with jitter
            delay = min(base_delay * (1.5 ** min(attempt, 5)), max_delay)
            delay += time.time() % 2  # Add 0-2 seconds of jitter
            time.sleep(delay)

        else:
            # All attempts exhausted
            console.print(
                f"[yellow]Engine is not becoming ready after {max_attempts} attempts.[/yellow]"
            )
            if last_error:
                console.print(f"[dim]Last issue: {last_error}[/dim]")
            console.print("\n[yellow]You can try:[/yellow]")
            console.print(
                f"  1. Wait a minute and retry: [cyan]dh studio attach {engine['name']}[/cyan]"
            )
            console.print(
                f"  2. Check engine status: [cyan]dh engine status {engine['name']}[/cyan]"
            )
            console.print(
                f"  3. Repair the engine: [cyan]dh engine repair {engine['name']}[/cyan]"
            )
            return

    # Successful attach path
    console.print("[green]✓ Studio attached successfully![/green]")

    # Update SSH config - use target_user for the connection
    update_ssh_config_entry(engine["name"], engine["instance_id"], target_user)
    console.print("[green]✓ SSH config updated[/green]")
    console.print(f"\nConnect with: [cyan]ssh {engine['name']}[/cyan]")
    console.print(f"Files are at: [cyan]/studios/{target_user}[/cyan]")
2285
-
2286
-
2287
def _attempt_studio_attach(studio, engine, target_user, public_key):
    """Issue one attach request.

    Returns a ``(success, fatal_error)`` pair: ``(True, None)`` on success,
    ``(False, None)`` when the caller should retry, and ``(False, message)``
    when the failure is fatal.
    """
    studio_id = studio["studio_id"]
    vm_id = engine["instance_id"]

    response = make_api_request(
        "POST",
        f"/studios/{studio_id}/attach",
        json_data={
            "vm_id": vm_id,
            "user": target_user,
            "public_key": public_key,
        },
    )

    # Fast-path success
    if response.status_code == 200:
        return True, None

    # Asynchronous path – API returned 202 Accepted and operation tracking ID
    if response.status_code == 202:
        # The operation status polling is broken in the Lambda, so we just
        # wait and check if the studio is actually attached
        time.sleep(5)  # Give the async operation a moment to start
        for _ in range(20):  # Check for up to 60 seconds
            if _is_studio_attached(studio_id, vm_id):
                return True, None
            time.sleep(3)
        # Attachment didn't complete in reasonable time – let the caller retry.
        return False, None

    # --- determine if we should retry ---
    error_text = response.json().get("error", "Unknown error")
    err_msg = error_text.lower()

    # "Studio is not available (status: in-use)" means it's already attached
    # somewhere; succeed only if that somewhere is our target engine.
    if (
        response.status_code == 400
        and "not available" in err_msg
        and "in-use" in err_msg
    ):
        if _is_studio_attached(studio_id, vm_id):
            return True, None
        return False, error_text  # attached elsewhere – fatal

    # Conflict / service-unavailable are always worth retrying.
    if response.status_code in (409, 503):
        return False, None

    fatal_patterns = ("permission",)
    recoverable_patterns = (
        "not ready",
        "still starting",
        "initializing",
        "failed to mount",
        "device busy",
        "pending",  # VM state pending
    )

    # Fatal patterns win over recoverable ones; anything unrecognized is fatal.
    if any(p in err_msg for p in fatal_patterns):
        return False, error_text
    if any(p in err_msg for p in recoverable_patterns):
        return False, None
    return False, error_text
2359
-
2360
-
2361
- # Note: _poll_operation was removed because the Lambda's operation tracking is broken.
2362
- # We now use _is_studio_attached() to check if the studio is actually attached instead.
2363
-
2364
-
2365
@studio_app.command("detach")
def detach_studio(
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Detach a different user's studio (admin only)"
    ),
):
    """Detach your studio from its current engine."""
    username = check_aws_sso()
    target_user = user if user else username

    # Acting on another user's studio requires explicit confirmation.
    if target_user != username:
        console.print(f"[yellow]⚠️  Managing studio for user: {target_user}[/yellow]")
        if not Confirm.ask(f"Are you sure you want to detach {target_user}'s studio?"):
            console.print("Operation cancelled.")
            return

    studio = get_user_studio(target_user)
    if studio is None:
        if target_user == username:
            console.print("[yellow]You don't have a studio.[/yellow]")
        else:
            console.print(f"[yellow]User {target_user} doesn't have a studio.[/yellow]")
        return

    # Nothing to do unless the studio is currently attached.
    if studio.get("status") != "in-use":
        if target_user == username:
            console.print("[yellow]Your studio is not attached to any engine.[/yellow]")
        else:
            console.print(
                f"[yellow]{target_user}'s studio is not attached to any engine.[/yellow]"
            )
        return

    console.print(f"Detaching studio from {studio.get('attached_vm_id')}...")

    response = make_api_request("POST", f"/studios/{studio['studio_id']}/detach")
    if response.status_code == 200:
        console.print("[green]✓ Studio detached successfully![/green]")
    else:
        error = response.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to detach studio: {error}[/red]")
2410
-
2411
-
2412
@studio_app.command("delete")
def delete_studio(
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Delete a different user's studio (admin only)"
    ),
):
    """Delete your studio permanently."""
    username = check_aws_sso()
    target_user = user if user else username

    # Deleting someone else's studio is a destructive admin action – shout.
    if target_user != username:
        console.print(
            f"[red]⚠️  ADMIN ACTION: Deleting studio for user: {target_user}[/red]"
        )

    studio = get_user_studio(target_user)
    if studio is None:
        if target_user == username:
            console.print("[yellow]You don't have a studio to delete.[/yellow]")
        else:
            console.print(
                f"[yellow]User {target_user} doesn't have a studio to delete.[/yellow]"
            )
        return

    console.print(
        "[red]⚠️  WARNING: This will permanently delete the studio and all data![/red]"
    )
    console.print(f"Studio ID: {studio['studio_id']}")
    console.print(f"User: {target_user}")
    console.print(f"Size: {studio['size_gb']}GB")

    # Triple confirmation: yes/no, irreversibility, then typed token.
    first_question = (
        f"\nAre you sure you want to delete {target_user}'s studio?"
        if target_user != username
        else "\nAre you sure you want to delete your studio?"
    )
    if not Confirm.ask(first_question):
        console.print("Deletion cancelled.")
        return

    if not Confirm.ask("[red]This action cannot be undone. Continue?[/red]"):
        console.print("Deletion cancelled.")
        return

    if Prompt.ask('Type "DELETE" to confirm permanent deletion') != "DELETE":
        console.print("Deletion cancelled.")
        return

    response = make_api_request("DELETE", f"/studios/{studio['studio_id']}")
    if response.status_code == 200:
        console.print("[green]✓ Studio deleted successfully![/green]")
    else:
        error = response.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to delete studio: {error}[/red]")
2472
-
2473
-
2474
@studio_app.command("list")
def list_studios(
    all_users: bool = typer.Option(
        False, "--all", "-a", help="Show all users' studios"
    ),
):
    """List studios.

    By default only the current user's studios are shown; pass ``--all``
    to include every user's studio.
    """
    username = check_aws_sso()

    response = make_api_request("GET", "/studios")

    if response.status_code != 200:
        error = response.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to list studios: {error}[/red]")
        return

    studios = response.json().get("studios", [])

    # BUG FIX: the --all flag was declared but never honored (everyone's
    # studios were always listed). Honor it per the option's help text.
    if not all_users:
        studios = [s for s in studios if s["user"] == username]

    if not studios:
        console.print("No studios found.")
        return

    # Get all engines to map instance IDs to names
    engines_response = make_api_request("GET", "/engines")
    engines = {}
    if engines_response.status_code == 200:
        for engine in engines_response.json().get("engines", []):
            engines[engine["instance_id"]] = engine["name"]

    # Create table
    table = Table(title="Studios", box=box.ROUNDED)
    table.add_column("Studio ID", style="cyan")
    table.add_column("User")
    table.add_column("Status")
    table.add_column("Size", justify="right")
    table.add_column("Disk Usage", justify="right")
    table.add_column("Attached To")

    for studio in studios:
        # Colorize status: "in-use" displays as "attached".
        if studio["status"] == "in-use":
            status_display = "[bright_blue]attached[/bright_blue]"
        elif studio["status"] in ["attaching", "detaching"]:
            status_display = "[yellow]" + studio["status"] + "[/yellow]"
        else:
            status_display = "[green]available[/green]"

        # Format attached engine info
        attached_to = "-"
        disk_usage = "?/?"
        if studio.get("attached_vm_id"):
            vm_id = studio["attached_vm_id"]
            engine_name = engines.get(vm_id, "unknown")
            attached_to = f"{engine_name} ({vm_id})"

            # Disk usage is only queryable (via SSM) while attached.
            if studio["status"] == "in-use":
                usage = get_studio_disk_usage_via_ssm(vm_id, studio["user"])
                if usage:
                    disk_usage = usage

        table.add_row(
            studio["studio_id"],
            studio["user"],
            status_display,
            f"{studio['size_gb']}GB",
            disk_usage,
            attached_to,
        )

    console.print(table)
2544
-
2545
-
2546
@studio_app.command("reset")
def reset_studio(
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Reset a different user's studio"
    ),
):
    """Reset a stuck studio (admin operation)."""
    username = check_aws_sso()
    target_user = user if user else username

    if target_user != username:
        console.print(f"[yellow]⚠️  Resetting studio for user: {target_user}[/yellow]")

    studio = get_user_studio(target_user)
    if studio is None:
        if target_user == username:
            console.print("[yellow]You don't have a studio.[/yellow]")
        else:
            console.print(f"[yellow]User {target_user} doesn't have a studio.[/yellow]")
        return

    console.print(f"[yellow]⚠️  This will force-reset the studio state[/yellow]")
    console.print(f"Current status: {studio['status']}")
    if studio.get("attached_vm_id"):
        console.print(f"Listed as attached to: {studio['attached_vm_id']}")

    if not Confirm.ask("\nReset studio state?"):
        console.print("Reset cancelled.")
        return

    # The reset bypasses the API and edits DynamoDB directly.
    console.print("Resetting studio state...")

    dynamodb = boto3.resource("dynamodb", region_name="us-east-1")
    table = dynamodb.Table("dev-studios")

    try:
        # The studio_id doubles as the EBS volume ID; verify the volume is not
        # still physically attached before declaring it available.
        ec2 = boto3.client("ec2", region_name="us-east-1")
        volumes = ec2.describe_volumes(VolumeIds=[studio["studio_id"]])

        if volumes["Volumes"]:
            attachments = volumes["Volumes"][0].get("Attachments", [])
            if attachments:
                instance_id = attachments[0]["InstanceId"]
                console.print(
                    f"[red]Volume is still attached to {instance_id}![/red]"
                )
                if Confirm.ask("Force-detach the volume?"):
                    ec2.detach_volume(
                        VolumeId=studio["studio_id"],
                        InstanceId=instance_id,
                        Force=True,
                    )
                    console.print("Waiting for volume to detach...")
                    ec2.get_waiter("volume_available").wait(
                        VolumeIds=[studio["studio_id"]]
                    )

        # Reset in DynamoDB – align attribute names with Studio Manager backend
        table.update_item(
            Key={"StudioID": studio["studio_id"]},
            UpdateExpression="SET #st = :status, AttachedVMID = :vm_id, AttachedDevice = :device",
            ExpressionAttributeNames={"#st": "Status"},
            ExpressionAttributeValues={
                ":status": "available",
                ":vm_id": None,
                ":device": None,
            },
        )

        console.print(f"[green]✓ Studio reset to available state![/green]")

    except ClientError as e:
        console.print(f"[red]❌ Failed to reset studio: {e}[/red]")
2623
-
2624
-
2625
@studio_app.command("resize")
def resize_studio(
    size: int = typer.Option(..., "--size", "-s", help="New size in GB"),
    user: Optional[str] = typer.Option(
        None, "--user", "-u", help="Resize a different user's studio (admin only)"
    ),
):
    """Resize your studio volume (requires detachment).

    EBS volumes can only be grown, so ``size`` must exceed the current size.
    If the studio is attached, the user is prompted and the studio is
    detached first. The command then calls the resize API and polls the EC2
    volume-modification state until the resize phase completes (the
    "optimizing" phase runs in the background on AWS's side). The filesystem
    itself is expanded on the next attach.

    Raises:
        typer.Exit: on invalid size, failed detach, failed resize API call,
            or a failed volume modification.
    """
    username = check_aws_sso()

    # Default to the caller's own studio unless an admin targets another user.
    target_user = user if user else username

    if target_user != username:
        console.print(f"[yellow]⚠️ Resizing studio for user: {target_user}[/yellow]")

    studio = get_user_studio(target_user)
    if not studio:
        if target_user == username:
            console.print("[yellow]You don't have a studio yet.[/yellow]")
        else:
            console.print(f"[yellow]User {target_user} doesn't have a studio.[/yellow]")
        return

    current_size = studio["size_gb"]

    # EBS volumes cannot shrink; only allow growth.
    if size <= current_size:
        console.print(
            f"[red]❌ New size ({size}GB) must be larger than current size ({current_size}GB)[/red]"
        )
        raise typer.Exit(1)

    # A volume attached to a VM must be detached before modification here.
    if studio["status"] == "in-use":
        console.print("[yellow]⚠️ Studio must be detached before resizing[/yellow]")
        console.print(f"Currently attached to: {studio.get('attached_vm_id')}")

        if not Confirm.ask("\nDetach studio and proceed with resize?"):
            console.print("Resize cancelled.")
            return

        console.print("Detaching studio...")
        response = make_api_request("POST", f"/studios/{studio['studio_id']}/detach")
        if response.status_code != 200:
            console.print("[red]❌ Failed to detach studio[/red]")
            raise typer.Exit(1)

        console.print("[green]✓ Studio detached[/green]")

        # Give AWS a moment to finish the detachment before modifying.
        time.sleep(5)

    console.print(f"[yellow]Resizing studio from {current_size}GB to {size}GB[/yellow]")

    # Call the resize API.
    resize_response = make_api_request(
        "POST", f"/studios/{studio['studio_id']}/resize", json_data={"size": size}
    )

    if resize_response.status_code != 200:
        # Guard against non-JSON error bodies: .json() raising here would
        # mask the real API error with a JSON decode traceback.
        try:
            error = resize_response.json().get("error", "Unknown error")
        except ValueError:
            error = resize_response.text or "Unknown error"
        console.print(f"[red]❌ Failed to resize studio: {error}[/red]")
        raise typer.Exit(1)

    # Poll the EC2 volume-modification state until the resize completes.
    ec2 = boto3.client("ec2", region_name="us-east-1")
    console.print("Resizing volume...")

    last_progress = 0  # only report forward progress

    while True:
        try:
            mod_state = ec2.describe_volumes_modifications(
                VolumeIds=[studio["studio_id"]]
            )
            if not mod_state["VolumesModifications"]:
                break  # Modification complete

            modification = mod_state["VolumesModifications"][0]
            state = modification["ModificationState"]
            progress = modification.get("Progress", 0)

            # Show progress updates only for the resize phase.
            if state == "modifying" and progress > last_progress:
                console.print(f"[yellow]Progress: {progress}%[/yellow]")
                last_progress = progress

            # Exit as soon as optimization starts: the capacity change is
            # done and AWS optimizes in the background.
            if state == "optimizing":
                console.print(
                    f"[green]✓ Studio resized successfully to {size}GB![/green]"
                )
                console.print(
                    "[dim]AWS is optimizing the volume in the background (no action needed).[/dim]"
                )
                break

            if state == "completed":
                console.print(
                    f"[green]✓ Studio resized successfully to {size}GB![/green]"
                )
                break
            elif state == "failed":
                console.print("[red]❌ Volume modification failed[/red]")
                raise typer.Exit(1)

            time.sleep(2)  # Check frequently for better UX

        except ClientError:
            # describe_volumes_modifications raises once the modification
            # record ages out — treat as best-effort completion.
            console.print(f"[green]✓ Studio resized successfully to {size}GB![/green]")
            break

    console.print(
        "\n[dim]The filesystem will be automatically expanded when you next attach the studio.[/dim]"
    )
    # Plain string: the original was an f-string with no placeholders.
    console.print("To attach: [cyan]dh studio attach <engine-name>[/cyan]")
2745
-
2746
-
2747
- # ================= Idle timeout command =================
2748
-
2749
-
2750
@engine_app.command("idle")
def idle_timeout_cmd(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    set: Optional[str] = typer.Option(  # NOTE: shadows builtin; kept for compat
        None, "--set", "-s", help="New timeout (e.g., 2h30m, 45m)"
    ),
):
    """Show or set the engine idle-detector timeout.

    Without ``--set``, reads ``IDLE_TIMEOUT_SECONDS`` from /etc/engine.env
    via SSM (defaulting to 1800 if absent). With ``--set``, rewrites the
    value and restarts the idle-detector service, then verifies the SSM
    command actually succeeded before reporting success.

    Raises:
        typer.Exit: if engines can't be listed, the duration is invalid or
            zero, or the update command fails on the instance.
    """
    check_aws_sso()

    # Resolve engine name/ID against the live engine list.
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    ssm = boto3.client("ssm", region_name="us-east-1")

    def _wait_for_invocation(command_id: str, attempts: int = 30):
        """Poll SSM until the command leaves Pending/InProgress; None on timeout."""
        for _ in range(attempts):
            time.sleep(1)
            try:
                inv = ssm.get_command_invocation(
                    CommandId=command_id, InstanceId=engine["instance_id"]
                )
            except ClientError:
                continue  # invocation may not be registered yet
            if inv["Status"] not in ("Pending", "InProgress", "Delayed"):
                return inv
        return None

    if set is None:
        # Show current timeout setting.
        resp = ssm.send_command(
            InstanceIds=[engine["instance_id"]],
            DocumentName="AWS-RunShellScript",
            Parameters={
                "commands": [
                    "grep -E '^IDLE_TIMEOUT_SECONDS=' /etc/engine.env || echo 'IDLE_TIMEOUT_SECONDS=1800'"
                ],
                "executionTimeout": ["10"],
            },
        )
        inv = _wait_for_invocation(resp["Command"]["CommandId"])
        if inv is not None and inv["Status"] == "Success":
            line = inv["StandardOutputContent"].strip()
            # Tolerate a malformed value rather than crashing on int().
            try:
                secs = int(line.split("=")[1]) if "=" in line else 1800
            except ValueError:
                secs = 1800
            console.print(f"Current idle timeout: {secs//60}m ({secs} seconds)")
        else:
            console.print("[red]❌ Could not retrieve idle timeout[/red]")
        return

    # ----- set new value -----
    m = re.match(r"^(?:(\d+)h)?(?:(\d+)m)?$", set)
    if not m:
        console.print("[red]❌ Invalid duration format. Use e.g. 2h, 45m, 1h30m[/red]")
        raise typer.Exit(1)
    hours = int(m.group(1) or 0)
    minutes = int(m.group(2) or 0)
    seconds = hours * 3600 + minutes * 60
    if seconds == 0:
        console.print("[red]❌ Duration must be greater than zero[/red]")
        raise typer.Exit(1)

    console.print(f"Setting idle timeout to {set} ({seconds} seconds)…")

    cmd = (
        "sudo sed -i '/^IDLE_TIMEOUT_SECONDS=/d' /etc/engine.env && "
        f"echo 'IDLE_TIMEOUT_SECONDS={seconds}' | sudo tee -a /etc/engine.env >/dev/null && "
        "sudo systemctl restart engine-idle-detector.service"
    )

    resp = ssm.send_command(
        InstanceIds=[engine["instance_id"]],
        DocumentName="AWS-RunShellScript",
        Parameters={"commands": [cmd], "executionTimeout": ["60"]},
    )
    # Verify the command result instead of blindly reporting success after
    # a fixed sleep (the original claimed success even when the SSM command
    # failed on the instance).
    inv = _wait_for_invocation(resp["Command"]["CommandId"], attempts=60)
    if inv is not None and inv["Status"] == "Success":
        console.print(f"[green]✓ Idle timeout updated to {set}[/green]")
    else:
        status = inv["Status"] if inv is not None else "Unknown"
        console.print(f"[red]❌ Failed to update idle timeout (status: {status})[/red]")
        raise typer.Exit(1)
2825
-
2826
-
2827
- # ================= Engine diagnostics and repair commands =================
2828
-
2829
-
2830
@engine_app.command("debug")
def debug_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
):
    """Debug engine bootstrap status and files.

    Runs a fixed battery of read-only shell checks on the instance via SSM
    (bootstrap stage/health/sentinel files, setup service status, log tail,
    environment file) and prints each result.

    Raises:
        typer.Exit: if the engine list cannot be fetched.
    """
    check_aws_sso()

    # Resolve engine name/ID against the live engine list.
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    console.print(f"[bold]Debug info for {engine['name']}:[/bold]\n")

    ssm = boto3.client("ssm", region_name="us-east-1")

    # (label, shell command) pairs; each command degrades gracefully when a
    # file/service is missing so the check itself still "succeeds".
    checks = [
        (
            "Stage file",
            "cat /opt/dayhoff/state/engine-init.stage 2>/dev/null || cat /var/run/engine-init.stage 2>/dev/null || echo 'MISSING'",
        ),
        (
            "Health file",
            "cat /opt/dayhoff/state/engine-health.json 2>/dev/null || cat /var/run/engine-health.json 2>/dev/null || echo 'MISSING'",
        ),
        (
            "Sentinel file",
            "ls -la /opt/dayhoff/first_boot_complete.sentinel 2>/dev/null || echo 'MISSING'",
        ),
        (
            "Setup service",
            "systemctl status setup-aws-vm.service --no-pager || echo 'Service not found'",
        ),
        (
            "Bootstrap log tail",
            "tail -20 /var/log/engine-setup.log 2>/dev/null || echo 'No log'",
        ),
        ("Environment file", "cat /etc/engine.env 2>/dev/null || echo 'MISSING'"),
    ]

    for name, cmd in checks:
        try:
            resp = ssm.send_command(
                InstanceIds=[engine["instance_id"]],
                DocumentName="AWS-RunShellScript",
                Parameters={"commands": [cmd], "executionTimeout": ["10"]},
            )
            cid = resp["Command"]["CommandId"]

            # Poll until the command reaches a terminal state. The original
            # single 1-second sleep misreported still-running commands as
            # FAILED (and could hit InvocationDoesNotExist).
            inv = None
            for _ in range(15):
                time.sleep(1)
                try:
                    inv = ssm.get_command_invocation(
                        CommandId=cid, InstanceId=engine["instance_id"]
                    )
                except ClientError:
                    continue  # invocation not yet registered with SSM
                if inv["Status"] not in ("Pending", "InProgress", "Delayed"):
                    break

            if inv is not None and inv["Status"] == "Success":
                output = inv["StandardOutputContent"].strip()
                console.print(f"[cyan]{name}:[/cyan]")
                console.print(f"[dim]{output}[/dim]\n")
            else:
                console.print(f"[cyan]{name}:[/cyan] [red]FAILED[/red]\n")

        except Exception as e:
            # Best-effort diagnostics: report the error and move to the next check.
            console.print(f"[cyan]{name}:[/cyan] [red]ERROR: {e}[/red]\n")
2897
-
2898
-
2899
@engine_app.command("repair")
def repair_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
):
    """Repair an engine that's stuck in a bad state (e.g., after GAMI creation).

    Re-creates the /opt/dayhoff directory layout, re-syncs bootstrap scripts
    from S3, restores the first-boot sentinel and stage marker, and restarts
    the SSM agent and idle detector — all via a single SSM command batch.
    The engine must be running; a stopped engine can be started first after
    confirmation.

    Raises:
        typer.Exit: if the engine list can't be fetched, the engine can't be
            started, or the user declines to start a stopped engine.
    """
    check_aws_sso()

    # Get all engines to resolve name.
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    if engine["state"].lower() != "running":
        console.print(
            f"[yellow]⚠️ Engine is {engine['state']}. Must be running to repair.[/yellow]"
        )
        if engine["state"].lower() == "stopped" and Confirm.ask(
            "Start the engine first?"
        ):
            response = make_api_request(
                "POST", f"/engines/{engine['instance_id']}/start"
            )
            if response.status_code != 200:
                console.print("[red]❌ Failed to start engine[/red]")
                raise typer.Exit(1)
            console.print("[green]✓ Engine started[/green]")
            console.print("Waiting for engine to become ready...")
            time.sleep(30)  # Give it time to boot
        else:
            raise typer.Exit(1)

    console.print(f"[bold]Repairing engine [cyan]{engine['name']}[/cyan][/bold]")
    console.print(
        "[dim]This will restore bootstrap state and ensure all services are running[/dim]\n"
    )

    ssm = boto3.client("ssm", region_name="us-east-1")

    # Repair commands — each is tolerant of partial state (`|| true`) so the
    # batch keeps going; the final echoes report what was restored.
    repair_commands = [
        # Create necessary directories
        "sudo mkdir -p /opt/dayhoff /opt/dayhoff/state /opt/dayhoff/scripts",
        # Download scripts from S3 if missing
        "source /etc/engine.env && sudo aws s3 sync s3://${VM_SCRIPTS_BUCKET}/ /opt/dayhoff/scripts/ --exclude '*' --include '*.sh' --quiet",
        "sudo chmod +x /opt/dayhoff/scripts/*.sh 2>/dev/null || true",
        # Restore bootstrap state
        "sudo touch /opt/dayhoff/first_boot_complete.sentinel",
        "echo 'finished' | sudo tee /opt/dayhoff/state/engine-init.stage > /dev/null",
        # Ensure SSM agent is running
        "sudo systemctl restart amazon-ssm-agent 2>/dev/null || true",
        # Restart idle detector (service only)
        "sudo systemctl restart engine-idle-detector.service 2>/dev/null || true",
        # Report status
        "echo '=== Repair Complete ===' && echo 'Sentinel: ' && ls -la /opt/dayhoff/first_boot_complete.sentinel",
        "echo 'Stage: ' && cat /opt/dayhoff/state/engine-init.stage",
        "echo 'Scripts: ' && ls /opt/dayhoff/scripts/*.sh 2>/dev/null | wc -l",
    ]

    try:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            transient=True,
        ) as progress:
            task = progress.add_task("Repairing engine...", total=None)

            response = ssm.send_command(
                InstanceIds=[engine["instance_id"]],
                DocumentName="AWS-RunShellScript",
                Parameters={
                    "commands": repair_commands,
                    "executionTimeout": ["60"],
                },
            )

            command_id = response["Command"]["CommandId"]

            # Wait for the command; include ALL terminal states. The original
            # only checked Success/Failed, so Cancelled/TimedOut commands
            # spun the full 60 s and were then reported with a stale status.
            for _ in range(60):
                time.sleep(1)
                result = ssm.get_command_invocation(
                    CommandId=command_id,
                    InstanceId=engine["instance_id"],
                )
                if result["Status"] in ["Success", "Failed", "Cancelled", "TimedOut"]:
                    break

        if result["Status"] == "Success":
            output = result["StandardOutputContent"]
            console.print("[green]✓ Engine repaired successfully![/green]\n")

            # Show repair results
            if "=== Repair Complete ===" in output:
                repair_section = output.split("=== Repair Complete ===")[1].strip()
                console.print("[bold]Repair Results:[/bold]")
                console.print(repair_section)

            console.print(
                "\n[dim]You should now be able to attach studios to this engine.[/dim]"
            )
        else:
            console.print(
                f"[red]❌ Repair failed: {result.get('StandardErrorContent', 'Unknown error')}[/red]"
            )
            console.print(
                "\n[yellow]Try running 'dh engine debug' for more information.[/yellow]"
            )

    except Exception as e:
        console.print(f"[red]❌ Failed to repair engine: {e}[/red]")