dayhoff-tools 1.5.2__tar.gz → 1.5.4__tar.gz
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dayhoff-tools might be problematic. Click here for more details.
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/PKG-INFO +1 -1
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/cli/engine_commands.py +346 -174
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/pyproject.toml +1 -1
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/README.md +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/__init__.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/chemistry/standardizer.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/chemistry/utils.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/cli/__init__.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/cli/cloud_commands.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/cli/main.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/cli/swarm_commands.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/cli/utility_commands.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/deployment/base.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/deployment/deploy_aws.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/deployment/deploy_gcp.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/deployment/deploy_utils.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/deployment/job_runner.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/deployment/processors.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/deployment/swarm.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/embedders.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/fasta.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/file_ops.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/h5.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/intake/gcp.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/intake/gtdb.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/intake/kegg.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/intake/mmseqs.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/intake/structure.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/intake/uniprot.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/logs.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/sqlite.py +0 -0
- {dayhoff_tools-1.5.2 → dayhoff_tools-1.5.4}/dayhoff_tools/warehouse.py +0 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Engine and Studio management commands for DHT CLI."""
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
|
+
import re
|
|
4
5
|
import shutil
|
|
5
6
|
import subprocess
|
|
6
7
|
import sys
|
|
@@ -19,7 +20,6 @@ from rich.panel import Panel
|
|
|
19
20
|
from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
|
|
20
21
|
from rich.prompt import Confirm, IntPrompt, Prompt
|
|
21
22
|
from rich.table import Table
|
|
22
|
-
import re
|
|
23
23
|
|
|
24
24
|
# Initialize Typer apps
|
|
25
25
|
engine_app = typer.Typer(help="Manage compute engines for development.")
|
|
@@ -34,6 +34,10 @@ HOURLY_COSTS = {
|
|
|
34
34
|
"t4": 0.75, # g4dn.2xlarge
|
|
35
35
|
"a10g": 1.50, # g5.2xlarge
|
|
36
36
|
"a100": 21.96, # p4d.24xlarge
|
|
37
|
+
"4_t4": 3.91, # g4dn.12xlarge
|
|
38
|
+
"8_t4": 7.83, # g4dn.metal
|
|
39
|
+
"4_a10g": 6.24, # g5.12xlarge
|
|
40
|
+
"8_a10g": 16.29, # g5.48xlarge
|
|
37
41
|
}
|
|
38
42
|
|
|
39
43
|
# SSH config management
|
|
@@ -43,6 +47,7 @@ SSH_MANAGED_COMMENT = "# Managed by dh engine"
|
|
|
43
47
|
# Bootstrap stage helpers
|
|
44
48
|
# --------------------------------------------------------------------------------
|
|
45
49
|
|
|
50
|
+
|
|
46
51
|
def _colour_stage(stage: str) -> str:
|
|
47
52
|
"""Return colourised stage name for table output."""
|
|
48
53
|
if not stage:
|
|
@@ -67,7 +72,14 @@ def _fetch_init_stages(instance_ids: List[str]) -> Dict[str, str]:
|
|
|
67
72
|
for res in page["Reservations"]:
|
|
68
73
|
for inst in res["Instances"]:
|
|
69
74
|
iid = inst["InstanceId"]
|
|
70
|
-
tag_val = next(
|
|
75
|
+
tag_val = next(
|
|
76
|
+
(
|
|
77
|
+
t["Value"]
|
|
78
|
+
for t in inst.get("Tags", [])
|
|
79
|
+
if t["Key"] == "DayhoffInitStage"
|
|
80
|
+
),
|
|
81
|
+
None,
|
|
82
|
+
)
|
|
71
83
|
if tag_val:
|
|
72
84
|
stages[iid] = tag_val
|
|
73
85
|
except Exception:
|
|
@@ -167,13 +179,13 @@ def format_duration(duration: timedelta) -> str:
|
|
|
167
179
|
|
|
168
180
|
def get_disk_usage_via_ssm(instance_id: str) -> Optional[str]:
|
|
169
181
|
"""Get disk usage for an engine via SSM.
|
|
170
|
-
|
|
182
|
+
|
|
171
183
|
Returns:
|
|
172
184
|
String like "17/50 GB" or None if failed
|
|
173
185
|
"""
|
|
174
186
|
try:
|
|
175
187
|
ssm = boto3.client("ssm", region_name="us-east-1")
|
|
176
|
-
|
|
188
|
+
|
|
177
189
|
# Run df command to get disk usage
|
|
178
190
|
response = ssm.send_command(
|
|
179
191
|
InstanceIds=[instance_id],
|
|
@@ -181,14 +193,14 @@ def get_disk_usage_via_ssm(instance_id: str) -> Optional[str]:
|
|
|
181
193
|
Parameters={
|
|
182
194
|
"commands": [
|
|
183
195
|
# Get root filesystem usage in GB
|
|
184
|
-
|
|
196
|
+
'df -BG / | tail -1 | awk \'{gsub(/G/, "", $2); gsub(/G/, "", $3); print $3 "/" $2 " GB"}\''
|
|
185
197
|
],
|
|
186
198
|
"executionTimeout": ["10"],
|
|
187
199
|
},
|
|
188
200
|
)
|
|
189
|
-
|
|
201
|
+
|
|
190
202
|
command_id = response["Command"]["CommandId"]
|
|
191
|
-
|
|
203
|
+
|
|
192
204
|
# Wait for command to complete (with timeout)
|
|
193
205
|
for _ in range(5): # 5 second timeout
|
|
194
206
|
time.sleep(1)
|
|
@@ -198,13 +210,13 @@ def get_disk_usage_via_ssm(instance_id: str) -> Optional[str]:
|
|
|
198
210
|
)
|
|
199
211
|
if result["Status"] in ["Success", "Failed"]:
|
|
200
212
|
break
|
|
201
|
-
|
|
213
|
+
|
|
202
214
|
if result["Status"] == "Success":
|
|
203
215
|
output = result["StandardOutputContent"].strip()
|
|
204
216
|
return output if output else None
|
|
205
|
-
|
|
217
|
+
|
|
206
218
|
return None
|
|
207
|
-
|
|
219
|
+
|
|
208
220
|
except Exception as e:
|
|
209
221
|
# logger.debug(f"Failed to get disk usage for {instance_id}: {e}") # Original code had this line commented out
|
|
210
222
|
return None
|
|
@@ -212,13 +224,13 @@ def get_disk_usage_via_ssm(instance_id: str) -> Optional[str]:
|
|
|
212
224
|
|
|
213
225
|
def get_studio_disk_usage_via_ssm(instance_id: str, username: str) -> Optional[str]:
|
|
214
226
|
"""Get disk usage for a studio via SSM.
|
|
215
|
-
|
|
227
|
+
|
|
216
228
|
Returns:
|
|
217
229
|
String like "333/500 GB" or None if failed
|
|
218
230
|
"""
|
|
219
231
|
try:
|
|
220
232
|
ssm = boto3.client("ssm", region_name="us-east-1")
|
|
221
|
-
|
|
233
|
+
|
|
222
234
|
# Run df command to get studio disk usage
|
|
223
235
|
response = ssm.send_command(
|
|
224
236
|
InstanceIds=[instance_id],
|
|
@@ -226,14 +238,14 @@ def get_studio_disk_usage_via_ssm(instance_id: str, username: str) -> Optional[s
|
|
|
226
238
|
Parameters={
|
|
227
239
|
"commands": [
|
|
228
240
|
# Get studio filesystem usage in GB
|
|
229
|
-
f
|
|
241
|
+
f'df -BG /studios/{username} 2>/dev/null | tail -1 | awk \'{{gsub(/G/, "", $2); gsub(/G/, "", $3); print $3 "/" $2 " GB"}}\''
|
|
230
242
|
],
|
|
231
243
|
"executionTimeout": ["10"],
|
|
232
244
|
},
|
|
233
245
|
)
|
|
234
|
-
|
|
246
|
+
|
|
235
247
|
command_id = response["Command"]["CommandId"]
|
|
236
|
-
|
|
248
|
+
|
|
237
249
|
# Wait for command to complete (with timeout)
|
|
238
250
|
for _ in range(5): # 5 second timeout
|
|
239
251
|
time.sleep(1)
|
|
@@ -243,13 +255,13 @@ def get_studio_disk_usage_via_ssm(instance_id: str, username: str) -> Optional[s
|
|
|
243
255
|
)
|
|
244
256
|
if result["Status"] in ["Success", "Failed"]:
|
|
245
257
|
break
|
|
246
|
-
|
|
258
|
+
|
|
247
259
|
if result["Status"] == "Success":
|
|
248
260
|
output = result["StandardOutputContent"].strip()
|
|
249
261
|
return output if output else None
|
|
250
|
-
|
|
262
|
+
|
|
251
263
|
return None
|
|
252
|
-
|
|
264
|
+
|
|
253
265
|
except Exception:
|
|
254
266
|
return None
|
|
255
267
|
|
|
@@ -434,7 +446,7 @@ def launch_engine(
|
|
|
434
446
|
"cpu",
|
|
435
447
|
"--type",
|
|
436
448
|
"-t",
|
|
437
|
-
help="Engine type: cpu, cpumax, t4, a10g, a100",
|
|
449
|
+
help="Engine type: cpu, cpumax, t4, a10g, a100, 4_t4, 8_t4, 4_a10g, 8_a10g",
|
|
438
450
|
),
|
|
439
451
|
user: Optional[str] = typer.Option(None, "--user", "-u", help="Override username"),
|
|
440
452
|
boot_disk_size: Optional[int] = typer.Option(
|
|
@@ -455,7 +467,17 @@ def launch_engine(
|
|
|
455
467
|
username = user
|
|
456
468
|
|
|
457
469
|
# Validate engine type
|
|
458
|
-
valid_types = [
|
|
470
|
+
valid_types = [
|
|
471
|
+
"cpu",
|
|
472
|
+
"cpumax",
|
|
473
|
+
"t4",
|
|
474
|
+
"a10g",
|
|
475
|
+
"a100",
|
|
476
|
+
"4_t4",
|
|
477
|
+
"8_t4",
|
|
478
|
+
"4_a10g",
|
|
479
|
+
"8_a10g",
|
|
480
|
+
]
|
|
459
481
|
if engine_type not in valid_types:
|
|
460
482
|
console.print(f"[red]❌ Invalid engine type: {engine_type}[/red]")
|
|
461
483
|
console.print(f"Valid types: {', '.join(valid_types)}")
|
|
@@ -555,22 +577,17 @@ def list_engines(
|
|
|
555
577
|
table.add_column("Disk Usage")
|
|
556
578
|
table.add_column("Uptime/Since")
|
|
557
579
|
table.add_column("$/hour", justify="right")
|
|
558
|
-
table.add_column("Cost Today", justify="right", style="yellow")
|
|
559
580
|
|
|
560
|
-
total_cost = 0.0
|
|
561
581
|
for engine in engines:
|
|
562
582
|
launch_time = parse_launch_time(engine["launch_time"])
|
|
563
583
|
uptime = datetime.now(timezone.utc) - launch_time
|
|
564
584
|
hourly_cost = HOURLY_COSTS.get(engine["engine_type"], 0)
|
|
565
585
|
|
|
566
586
|
if engine["state"].lower() == "running":
|
|
567
|
-
daily_cost = hourly_cost * min(uptime.total_seconds() / 3600, 24)
|
|
568
|
-
total_cost += daily_cost
|
|
569
587
|
time_str = format_duration(uptime)
|
|
570
588
|
# Get disk usage for running engines
|
|
571
589
|
disk_usage = get_disk_usage_via_ssm(engine["instance_id"]) or "-"
|
|
572
590
|
else:
|
|
573
|
-
daily_cost = 0
|
|
574
591
|
time_str = launch_time.strftime("%Y-%m-%d %H:%M")
|
|
575
592
|
disk_usage = "-"
|
|
576
593
|
|
|
@@ -583,13 +600,9 @@ def list_engines(
|
|
|
583
600
|
disk_usage,
|
|
584
601
|
time_str,
|
|
585
602
|
f"${hourly_cost:.2f}",
|
|
586
|
-
f"${daily_cost:.2f}" if daily_cost > 0 else "-",
|
|
587
603
|
)
|
|
588
604
|
|
|
589
605
|
console.print(table)
|
|
590
|
-
|
|
591
|
-
if total_cost > 0:
|
|
592
|
-
console.print(f"\n[yellow]Total cost today: ${total_cost:.2f}[/yellow]")
|
|
593
606
|
else:
|
|
594
607
|
error = response.json().get("error", "Unknown error")
|
|
595
608
|
console.print(f"[red]❌ Failed to list engines: {error}[/red]")
|
|
@@ -617,7 +630,7 @@ def engine_status(
|
|
|
617
630
|
if response.status_code != 200:
|
|
618
631
|
console.print("[red]❌ Failed to fetch engine details[/red]")
|
|
619
632
|
raise typer.Exit(1)
|
|
620
|
-
|
|
633
|
+
|
|
621
634
|
engine_details = response.json()
|
|
622
635
|
engine = engine_details.get("engine", engine) # Use detailed info if available
|
|
623
636
|
idle_detector = engine_details.get("idle_detector", {})
|
|
@@ -650,26 +663,38 @@ def engine_status(
|
|
|
650
663
|
res = ssm.send_command(
|
|
651
664
|
InstanceIds=[engine["instance_id"]],
|
|
652
665
|
DocumentName="AWS-RunShellScript",
|
|
653
|
-
Parameters={
|
|
666
|
+
Parameters={
|
|
667
|
+
"commands": [
|
|
668
|
+
"cat /opt/dayhoff/state/engine-health.json 2>/dev/null || cat /var/run/engine-health.json 2>/dev/null || true"
|
|
669
|
+
],
|
|
670
|
+
"executionTimeout": ["10"],
|
|
671
|
+
},
|
|
654
672
|
)
|
|
655
673
|
cid = res["Command"]["CommandId"]
|
|
656
674
|
time.sleep(1)
|
|
657
|
-
inv = ssm.get_command_invocation(
|
|
675
|
+
inv = ssm.get_command_invocation(
|
|
676
|
+
CommandId=cid, InstanceId=engine["instance_id"]
|
|
677
|
+
)
|
|
658
678
|
if inv["Status"] == "Success":
|
|
659
679
|
import json as _json
|
|
680
|
+
|
|
660
681
|
health = _json.loads(inv["StandardOutputContent"].strip() or "{}")
|
|
661
682
|
status_lines.append("")
|
|
662
683
|
status_lines.append("[bold]Health:[/bold]")
|
|
663
|
-
status_lines.append(
|
|
664
|
-
|
|
684
|
+
status_lines.append(
|
|
685
|
+
f" • GPU Drivers: {'OK' if health.get('drivers_ok') else 'MISSING'}"
|
|
686
|
+
)
|
|
687
|
+
status_lines.append(
|
|
688
|
+
f" • Idle Detector: {health.get('idle_detector_timer', 'unknown')}"
|
|
689
|
+
)
|
|
665
690
|
except Exception:
|
|
666
691
|
pass
|
|
667
|
-
|
|
692
|
+
|
|
668
693
|
# Idle detector status (from new API endpoint)
|
|
669
694
|
if idle_detector.get("available"):
|
|
670
695
|
status_lines.append("")
|
|
671
696
|
status_lines.append("[bold]Idle Detector:[/bold]")
|
|
672
|
-
|
|
697
|
+
|
|
673
698
|
# Overall status
|
|
674
699
|
if idle_detector["status"] == "active":
|
|
675
700
|
status_lines.append(" [green]✓ Engine ACTIVE[/green]")
|
|
@@ -678,33 +703,41 @@ def engine_status(
|
|
|
678
703
|
idle_threshold = idle_detector.get("idle_threshold", 1800)
|
|
679
704
|
idle_minutes = idle_seconds // 60
|
|
680
705
|
threshold_minutes = idle_threshold // 60
|
|
681
|
-
status_lines.append(
|
|
682
|
-
|
|
706
|
+
status_lines.append(
|
|
707
|
+
f" [yellow]⏱ Engine IDLE ({idle_minutes}/{threshold_minutes} minutes)[/yellow]"
|
|
708
|
+
)
|
|
709
|
+
|
|
683
710
|
# Coffee lock
|
|
684
711
|
if idle_detector.get("coffee_lock"):
|
|
685
|
-
status_lines.append(
|
|
686
|
-
|
|
712
|
+
status_lines.append(
|
|
713
|
+
f" • [cyan]☕ Caffeinated for another {idle_detector['coffee_lock']}[/cyan]"
|
|
714
|
+
)
|
|
715
|
+
|
|
687
716
|
# SSH sessions
|
|
688
717
|
ssh_sessions = idle_detector.get("ssh_sessions", [])
|
|
689
718
|
if ssh_sessions:
|
|
690
719
|
status_lines.append(f" • [blue]SSH Sessions ({len(ssh_sessions)}):[/blue]")
|
|
691
720
|
for session in ssh_sessions:
|
|
692
|
-
status_lines.append(
|
|
693
|
-
|
|
721
|
+
status_lines.append(
|
|
722
|
+
f" - {session['tty']} (pid {session['pid']}, idle {session['idle_time']}) from {session['from_ip']}"
|
|
723
|
+
)
|
|
724
|
+
|
|
694
725
|
# IDE connections
|
|
695
726
|
ide_conn = idle_detector.get("ide_connections")
|
|
696
727
|
if ide_conn:
|
|
697
|
-
status_lines.append(
|
|
728
|
+
status_lines.append(
|
|
729
|
+
f" • [magenta]🖥 IDE connected ({ide_conn['connection_count']} connections)[/magenta]"
|
|
730
|
+
)
|
|
698
731
|
|
|
699
732
|
if attached_studios:
|
|
700
733
|
status_lines.append("")
|
|
701
734
|
status_lines.append("[bold]Attached Studios:[/bold]")
|
|
702
735
|
for studio in attached_studios:
|
|
703
|
-
status_lines.append(
|
|
704
|
-
f" • {studio['user']} ({studio['studio_id']})"
|
|
705
|
-
)
|
|
736
|
+
status_lines.append(f" • {studio['user']} ({studio['studio_id']})")
|
|
706
737
|
|
|
707
|
-
console.print(
|
|
738
|
+
console.print(
|
|
739
|
+
Panel("\n".join(status_lines), title="Engine Status", border_style="blue")
|
|
740
|
+
)
|
|
708
741
|
|
|
709
742
|
if show_log:
|
|
710
743
|
console.print("\n[bold]Bootstrap Log:[/bold]")
|
|
@@ -713,11 +746,18 @@ def engine_status(
|
|
|
713
746
|
resp = ssm.send_command(
|
|
714
747
|
InstanceIds=[engine["instance_id"]],
|
|
715
748
|
DocumentName="AWS-RunShellScript",
|
|
716
|
-
Parameters={
|
|
749
|
+
Parameters={
|
|
750
|
+
"commands": [
|
|
751
|
+
"cat /var/log/engine-setup.log 2>/dev/null || echo 'No setup log found'"
|
|
752
|
+
],
|
|
753
|
+
"executionTimeout": ["15"],
|
|
754
|
+
},
|
|
717
755
|
)
|
|
718
756
|
cid = resp["Command"]["CommandId"]
|
|
719
757
|
time.sleep(2)
|
|
720
|
-
inv = ssm.get_command_invocation(
|
|
758
|
+
inv = ssm.get_command_invocation(
|
|
759
|
+
CommandId=cid, InstanceId=engine["instance_id"]
|
|
760
|
+
)
|
|
721
761
|
if inv["Status"] == "Success":
|
|
722
762
|
log_content = inv["StandardOutputContent"].strip()
|
|
723
763
|
if log_content:
|
|
@@ -856,7 +896,9 @@ def terminate_engine(
|
|
|
856
896
|
@engine_app.command("ssh")
|
|
857
897
|
def ssh_engine(
|
|
858
898
|
name_or_id: str = typer.Argument(help="Engine name or instance ID"),
|
|
859
|
-
admin: bool = typer.Option(
|
|
899
|
+
admin: bool = typer.Option(
|
|
900
|
+
False, "--admin", help="Connect as ec2-user instead of the engine owner user"
|
|
901
|
+
),
|
|
860
902
|
):
|
|
861
903
|
"""Connect to an engine via SSH.
|
|
862
904
|
|
|
@@ -864,7 +906,7 @@ def ssh_engine(
|
|
|
864
906
|
Pass `--admin` to connect with the underlying [`ec2-user`] account for break-glass or debugging.
|
|
865
907
|
"""
|
|
866
908
|
username = check_aws_sso()
|
|
867
|
-
|
|
909
|
+
|
|
868
910
|
# Check for Session Manager Plugin
|
|
869
911
|
if not check_session_manager_plugin():
|
|
870
912
|
raise typer.Exit(1)
|
|
@@ -886,7 +928,9 @@ def ssh_engine(
|
|
|
886
928
|
ssh_user = "ec2-user" if admin else username
|
|
887
929
|
|
|
888
930
|
# Update SSH config
|
|
889
|
-
console.print(
|
|
931
|
+
console.print(
|
|
932
|
+
f"Updating SSH config for [cyan]{engine['name']}[/cyan] (user: {ssh_user})..."
|
|
933
|
+
)
|
|
890
934
|
update_ssh_config_entry(engine["name"], engine["instance_id"], ssh_user)
|
|
891
935
|
|
|
892
936
|
# Connect
|
|
@@ -900,7 +944,11 @@ def config_ssh(
|
|
|
900
944
|
all_engines: bool = typer.Option(
|
|
901
945
|
False, "--all", "-a", help="Include all engines from all users"
|
|
902
946
|
),
|
|
903
|
-
admin: bool = typer.Option(
|
|
947
|
+
admin: bool = typer.Option(
|
|
948
|
+
False,
|
|
949
|
+
"--admin",
|
|
950
|
+
help="Generate entries that use ec2-user instead of per-engine owner user",
|
|
951
|
+
),
|
|
904
952
|
):
|
|
905
953
|
"""Update SSH config with available engines."""
|
|
906
954
|
username = check_aws_sso()
|
|
@@ -964,7 +1012,7 @@ def config_ssh(
|
|
|
964
1012
|
if not clean:
|
|
965
1013
|
for engine in running_engines:
|
|
966
1014
|
# Determine ssh user based on --admin flag
|
|
967
|
-
ssh_user =
|
|
1015
|
+
ssh_user = "ec2-user" if admin else username
|
|
968
1016
|
new_lines.extend(
|
|
969
1017
|
[
|
|
970
1018
|
"",
|
|
@@ -998,7 +1046,9 @@ def config_ssh(
|
|
|
998
1046
|
def coffee(
|
|
999
1047
|
name_or_id: str = typer.Argument(help="Engine name or instance ID"),
|
|
1000
1048
|
duration: str = typer.Argument("4h", help="Duration (e.g., 2h, 30m, 2h30m)"),
|
|
1001
|
-
cancel: bool = typer.Option(
|
|
1049
|
+
cancel: bool = typer.Option(
|
|
1050
|
+
False, "--cancel", help="Cancel existing coffee lock instead of extending"
|
|
1051
|
+
),
|
|
1002
1052
|
):
|
|
1003
1053
|
"""Pour ☕ for an engine: keeps it awake for the given duration (or cancel)."""
|
|
1004
1054
|
username = check_aws_sso()
|
|
@@ -1036,7 +1086,9 @@ def coffee(
|
|
|
1036
1086
|
if cancel:
|
|
1037
1087
|
console.print(f"Cancelling coffee for [cyan]{engine['name']}[/cyan]…")
|
|
1038
1088
|
else:
|
|
1039
|
-
console.print(
|
|
1089
|
+
console.print(
|
|
1090
|
+
f"Pouring coffee for [cyan]{engine['name']}[/cyan] for {duration}…"
|
|
1091
|
+
)
|
|
1040
1092
|
|
|
1041
1093
|
# Use SSM to run the engine coffee command
|
|
1042
1094
|
ssm = boto3.client("ssm", region_name="us-east-1")
|
|
@@ -1046,7 +1098,11 @@ def coffee(
|
|
|
1046
1098
|
DocumentName="AWS-RunShellScript",
|
|
1047
1099
|
Parameters={
|
|
1048
1100
|
"commands": [
|
|
1049
|
-
(
|
|
1101
|
+
(
|
|
1102
|
+
"/usr/local/bin/engine-coffee --cancel"
|
|
1103
|
+
if cancel
|
|
1104
|
+
else f"/usr/local/bin/engine-coffee {seconds_total}"
|
|
1105
|
+
)
|
|
1050
1106
|
],
|
|
1051
1107
|
"executionTimeout": ["60"],
|
|
1052
1108
|
},
|
|
@@ -1066,7 +1122,9 @@ def coffee(
|
|
|
1066
1122
|
|
|
1067
1123
|
if result["Status"] == "Success":
|
|
1068
1124
|
if cancel:
|
|
1069
|
-
console.print(
|
|
1125
|
+
console.print(
|
|
1126
|
+
"[green]✓ Coffee cancelled – auto-shutdown re-enabled[/green]"
|
|
1127
|
+
)
|
|
1070
1128
|
else:
|
|
1071
1129
|
console.print(f"[green]✓ Coffee poured for {duration}[/green]")
|
|
1072
1130
|
console.print(
|
|
@@ -1089,7 +1147,9 @@ def resize_engine(
|
|
|
1089
1147
|
name_or_id: str = typer.Argument(help="Engine name or instance ID"),
|
|
1090
1148
|
size: int = typer.Option(..., "--size", "-s", help="New size in GB"),
|
|
1091
1149
|
online: bool = typer.Option(
|
|
1092
|
-
False,
|
|
1150
|
+
False,
|
|
1151
|
+
"--online",
|
|
1152
|
+
help="Resize while running (requires manual filesystem expansion)",
|
|
1093
1153
|
),
|
|
1094
1154
|
force: bool = typer.Option(
|
|
1095
1155
|
False, "--force", "-f", help="Force resize and detach all studios"
|
|
@@ -1109,59 +1169,65 @@ def resize_engine(
|
|
|
1109
1169
|
|
|
1110
1170
|
# Get current volume info to validate size
|
|
1111
1171
|
ec2 = boto3.client("ec2", region_name="us-east-1")
|
|
1112
|
-
|
|
1172
|
+
|
|
1113
1173
|
try:
|
|
1114
1174
|
# Get instance details to find root volume
|
|
1115
1175
|
instance_info = ec2.describe_instances(InstanceIds=[engine["instance_id"]])
|
|
1116
1176
|
instance = instance_info["Reservations"][0]["Instances"][0]
|
|
1117
|
-
|
|
1177
|
+
|
|
1118
1178
|
# Find root volume
|
|
1119
1179
|
root_device = instance.get("RootDeviceName", "/dev/xvda")
|
|
1120
1180
|
root_volume_id = None
|
|
1121
|
-
|
|
1181
|
+
|
|
1122
1182
|
for bdm in instance.get("BlockDeviceMappings", []):
|
|
1123
1183
|
if bdm["DeviceName"] == root_device:
|
|
1124
1184
|
root_volume_id = bdm["Ebs"]["VolumeId"]
|
|
1125
1185
|
break
|
|
1126
|
-
|
|
1186
|
+
|
|
1127
1187
|
if not root_volume_id:
|
|
1128
1188
|
console.print("[red]❌ Could not find root volume[/red]")
|
|
1129
1189
|
raise typer.Exit(1)
|
|
1130
|
-
|
|
1190
|
+
|
|
1131
1191
|
# Get current volume size
|
|
1132
1192
|
volumes = ec2.describe_volumes(VolumeIds=[root_volume_id])
|
|
1133
1193
|
current_size = volumes["Volumes"][0]["Size"]
|
|
1134
|
-
|
|
1194
|
+
|
|
1135
1195
|
if size <= current_size:
|
|
1136
|
-
console.print(
|
|
1196
|
+
console.print(
|
|
1197
|
+
f"[red]❌ New size ({size}GB) must be larger than current size ({current_size}GB)[/red]"
|
|
1198
|
+
)
|
|
1137
1199
|
raise typer.Exit(1)
|
|
1138
|
-
|
|
1139
|
-
console.print(
|
|
1140
|
-
|
|
1200
|
+
|
|
1201
|
+
console.print(
|
|
1202
|
+
f"[yellow]Resizing engine boot disk from {current_size}GB to {size}GB[/yellow]"
|
|
1203
|
+
)
|
|
1204
|
+
|
|
1141
1205
|
# Check if we need to stop the instance
|
|
1142
1206
|
if not online and engine["state"].lower() == "running":
|
|
1143
1207
|
console.print("Stopping engine for offline resize...")
|
|
1144
1208
|
stop_response = make_api_request(
|
|
1145
|
-
"POST",
|
|
1209
|
+
"POST",
|
|
1210
|
+
f"/engines/{engine['instance_id']}/stop",
|
|
1211
|
+
json_data={"detach_studios": False},
|
|
1146
1212
|
)
|
|
1147
1213
|
if stop_response.status_code != 200:
|
|
1148
1214
|
console.print("[red]❌ Failed to stop engine[/red]")
|
|
1149
1215
|
raise typer.Exit(1)
|
|
1150
|
-
|
|
1216
|
+
|
|
1151
1217
|
# Wait for instance to stop
|
|
1152
1218
|
console.print("Waiting for engine to stop...")
|
|
1153
1219
|
waiter = ec2.get_waiter("instance_stopped")
|
|
1154
1220
|
waiter.wait(InstanceIds=[engine["instance_id"]])
|
|
1155
1221
|
console.print("[green]✓ Engine stopped[/green]")
|
|
1156
|
-
|
|
1222
|
+
|
|
1157
1223
|
# Call the resize API
|
|
1158
1224
|
console.print("Resizing volume...")
|
|
1159
1225
|
resize_response = make_api_request(
|
|
1160
|
-
"POST",
|
|
1226
|
+
"POST",
|
|
1161
1227
|
f"/engines/{engine['instance_id']}/resize",
|
|
1162
|
-
json_data={"size": size, "detach_studios": force}
|
|
1228
|
+
json_data={"size": size, "detach_studios": force},
|
|
1163
1229
|
)
|
|
1164
|
-
|
|
1230
|
+
|
|
1165
1231
|
if resize_response.status_code == 409 and not force:
|
|
1166
1232
|
# Engine has attached studios
|
|
1167
1233
|
data = resize_response.json()
|
|
@@ -1175,71 +1241,85 @@ def resize_engine(
|
|
|
1175
1241
|
resize_response = make_api_request(
|
|
1176
1242
|
"POST",
|
|
1177
1243
|
f"/engines/{engine['instance_id']}/resize",
|
|
1178
|
-
json_data={"size": size, "detach_studios": True}
|
|
1244
|
+
json_data={"size": size, "detach_studios": True},
|
|
1179
1245
|
)
|
|
1180
1246
|
else:
|
|
1181
1247
|
console.print("Resize cancelled.")
|
|
1182
1248
|
return
|
|
1183
|
-
|
|
1249
|
+
|
|
1184
1250
|
if resize_response.status_code != 200:
|
|
1185
1251
|
error = resize_response.json().get("error", "Unknown error")
|
|
1186
1252
|
console.print(f"[red]❌ Failed to resize engine: {error}[/red]")
|
|
1187
1253
|
raise typer.Exit(1)
|
|
1188
|
-
|
|
1254
|
+
|
|
1189
1255
|
# Check if studios were detached
|
|
1190
1256
|
data = resize_response.json()
|
|
1191
1257
|
detached_studios = data.get("detached_studios", 0)
|
|
1192
1258
|
if detached_studios > 0:
|
|
1193
|
-
console.print(
|
|
1194
|
-
|
|
1259
|
+
console.print(
|
|
1260
|
+
f"[green]✓ Detached {detached_studios} studio(s) before resize[/green]"
|
|
1261
|
+
)
|
|
1262
|
+
|
|
1195
1263
|
# Wait for modification to complete
|
|
1196
1264
|
console.print("Waiting for volume modification to complete...")
|
|
1197
1265
|
while True:
|
|
1198
1266
|
mod_state = ec2.describe_volumes_modifications(VolumeIds=[root_volume_id])
|
|
1199
1267
|
if not mod_state["VolumesModifications"]:
|
|
1200
1268
|
break # Modification complete
|
|
1201
|
-
|
|
1269
|
+
|
|
1202
1270
|
modification = mod_state["VolumesModifications"][0]
|
|
1203
1271
|
state = modification["ModificationState"]
|
|
1204
1272
|
progress = modification.get("Progress", 0)
|
|
1205
|
-
|
|
1273
|
+
|
|
1206
1274
|
# Show progress updates only for the resize phase
|
|
1207
1275
|
if state == "modifying":
|
|
1208
1276
|
console.print(f"[yellow]Progress: {progress}%[/yellow]")
|
|
1209
|
-
|
|
1277
|
+
|
|
1210
1278
|
# Exit as soon as optimization starts (resize is complete)
|
|
1211
1279
|
if state == "optimizing":
|
|
1212
1280
|
console.print("[green]✓ Volume resized successfully[/green]")
|
|
1213
|
-
console.print(
|
|
1281
|
+
console.print(
|
|
1282
|
+
"[dim]AWS is optimizing the volume in the background (no action needed).[/dim]"
|
|
1283
|
+
)
|
|
1214
1284
|
break
|
|
1215
|
-
|
|
1285
|
+
|
|
1216
1286
|
if state == "completed":
|
|
1217
1287
|
console.print("[green]✓ Volume resized successfully[/green]")
|
|
1218
1288
|
break
|
|
1219
1289
|
elif state == "failed":
|
|
1220
1290
|
console.print("[red]❌ Volume modification failed[/red]")
|
|
1221
1291
|
raise typer.Exit(1)
|
|
1222
|
-
|
|
1292
|
+
|
|
1223
1293
|
time.sleep(2) # Check more frequently for better UX
|
|
1224
|
-
|
|
1294
|
+
|
|
1225
1295
|
# If offline resize, start the instance back up
|
|
1226
1296
|
if not online and engine["state"].lower() == "running":
|
|
1227
1297
|
console.print("Starting engine back up...")
|
|
1228
|
-
start_response = make_api_request(
|
|
1298
|
+
start_response = make_api_request(
|
|
1299
|
+
"POST", f"/engines/{engine['instance_id']}/start"
|
|
1300
|
+
)
|
|
1229
1301
|
if start_response.status_code != 200:
|
|
1230
|
-
console.print(
|
|
1231
|
-
|
|
1302
|
+
console.print(
|
|
1303
|
+
"[yellow]⚠️ Failed to restart engine automatically[/yellow]"
|
|
1304
|
+
)
|
|
1305
|
+
console.print(
|
|
1306
|
+
f"Please start it manually: [cyan]dh engine start {engine['name']}[/cyan]"
|
|
1307
|
+
)
|
|
1232
1308
|
else:
|
|
1233
1309
|
console.print("[green]✓ Engine started[/green]")
|
|
1234
1310
|
console.print("The filesystem will be automatically expanded on boot.")
|
|
1235
|
-
|
|
1311
|
+
|
|
1236
1312
|
elif online and engine["state"].lower() == "running":
|
|
1237
|
-
console.print(
|
|
1313
|
+
console.print(
|
|
1314
|
+
"\n[yellow]⚠️ Online resize complete. You must now expand the filesystem:[/yellow]"
|
|
1315
|
+
)
|
|
1238
1316
|
console.print(f"1. SSH into the engine: [cyan]ssh {engine['name']}[/cyan]")
|
|
1239
1317
|
console.print("2. Find the root device: [cyan]lsblk[/cyan]")
|
|
1240
|
-
console.print(
|
|
1318
|
+
console.print(
|
|
1319
|
+
"3. Expand the partition: [cyan]sudo growpart /dev/nvme0n1 1[/cyan] (adjust device name as needed)"
|
|
1320
|
+
)
|
|
1241
1321
|
console.print("4. Expand the filesystem: [cyan]sudo xfs_growfs /[/cyan]")
|
|
1242
|
-
|
|
1322
|
+
|
|
1243
1323
|
except ClientError as e:
|
|
1244
1324
|
console.print(f"[red]❌ Failed to resize engine: {e}[/red]")
|
|
1245
1325
|
raise typer.Exit(1)
|
|
@@ -1355,30 +1435,44 @@ def create_ami(
|
|
|
1355
1435
|
# If any user studios are still attached we must detach them before the instance reboots
|
|
1356
1436
|
# for snapshot consistency; otherwise Studio-Manager metadata becomes stale.
|
|
1357
1437
|
|
|
1358
|
-
attached_resp = make_api_request(
|
|
1359
|
-
|
|
1438
|
+
attached_resp = make_api_request(
|
|
1439
|
+
"GET", f"/engines/{engine['instance_id']}/studios"
|
|
1440
|
+
)
|
|
1441
|
+
attached_studios = (
|
|
1442
|
+
attached_resp.json().get("studios", [])
|
|
1443
|
+
if attached_resp.status_code == 200
|
|
1444
|
+
else []
|
|
1445
|
+
)
|
|
1360
1446
|
|
|
1361
1447
|
if attached_studios:
|
|
1362
|
-
console.print(
|
|
1448
|
+
console.print(
|
|
1449
|
+
f"Detaching {len(attached_studios)} studio(s) from this engine…"
|
|
1450
|
+
)
|
|
1363
1451
|
for s in attached_studios:
|
|
1364
1452
|
console.print(f" • {s['user']} ({s['studio_id']})")
|
|
1365
1453
|
|
|
1366
1454
|
for s in attached_studios:
|
|
1367
1455
|
resp = make_api_request("POST", f"/studios/{s['studio_id']}/detach")
|
|
1368
1456
|
if resp.status_code != 200:
|
|
1369
|
-
console.print(
|
|
1457
|
+
console.print(
|
|
1458
|
+
f"[red]❌ Failed to detach {s['studio_id']} – aborting.[/red]"
|
|
1459
|
+
)
|
|
1370
1460
|
return
|
|
1371
1461
|
|
|
1372
1462
|
# Wait briefly for volumes to become available (max 2 min)
|
|
1373
1463
|
# (time is already imported at module level)
|
|
1374
1464
|
ec2_wait = boto3.client("ec2", region_name="us-east-1")
|
|
1375
|
-
vol_ids = [s[
|
|
1465
|
+
vol_ids = [s["studio_id"] for s in attached_studios]
|
|
1376
1466
|
console.print("Waiting for volumes to detach…")
|
|
1377
1467
|
waiter = ec2_wait.get_waiter("volume_available")
|
|
1378
1468
|
try:
|
|
1379
|
-
waiter.wait(
|
|
1469
|
+
waiter.wait(
|
|
1470
|
+
VolumeIds=vol_ids, WaiterConfig={"Delay": 5, "MaxAttempts": 24}
|
|
1471
|
+
)
|
|
1380
1472
|
except Exception:
|
|
1381
|
-
console.print(
|
|
1473
|
+
console.print(
|
|
1474
|
+
"[yellow]Proceeding even though some volumes may still be detaching.[/yellow]"
|
|
1475
|
+
)
|
|
1382
1476
|
|
|
1383
1477
|
# Create the AMI
|
|
1384
1478
|
with Progress(
|
|
@@ -1386,7 +1480,9 @@ def create_ami(
|
|
|
1386
1480
|
TextColumn("[progress.description]{task.description}"),
|
|
1387
1481
|
transient=True,
|
|
1388
1482
|
) as progress:
|
|
1389
|
-
progress.add_task(
|
|
1483
|
+
progress.add_task(
|
|
1484
|
+
"Creating AMI (this will take several minutes)...", total=None
|
|
1485
|
+
)
|
|
1390
1486
|
|
|
1391
1487
|
create_params = {
|
|
1392
1488
|
"InstanceId": engine["instance_id"],
|
|
@@ -1519,18 +1615,22 @@ def create_studio(
|
|
|
1519
1615
|
|
|
1520
1616
|
@studio_app.command("status")
|
|
1521
1617
|
def studio_status(
|
|
1522
|
-
user: Optional[str] = typer.Option(
|
|
1618
|
+
user: Optional[str] = typer.Option(
|
|
1619
|
+
None, "--user", "-u", help="Check status for a different user (admin only)"
|
|
1620
|
+
),
|
|
1523
1621
|
):
|
|
1524
1622
|
"""Show status of your studio."""
|
|
1525
1623
|
username = check_aws_sso()
|
|
1526
|
-
|
|
1624
|
+
|
|
1527
1625
|
# Use specified user if provided, otherwise use current user
|
|
1528
1626
|
target_user = user if user else username
|
|
1529
|
-
|
|
1627
|
+
|
|
1530
1628
|
# Add warning when checking another user's studio
|
|
1531
1629
|
if target_user != username:
|
|
1532
|
-
console.print(
|
|
1533
|
-
|
|
1630
|
+
console.print(
|
|
1631
|
+
f"[yellow]⚠️ Checking studio status for user: {target_user}[/yellow]"
|
|
1632
|
+
)
|
|
1633
|
+
|
|
1534
1634
|
studio = get_user_studio(target_user)
|
|
1535
1635
|
if not studio:
|
|
1536
1636
|
if target_user == username:
|
|
@@ -1585,18 +1685,20 @@ def studio_status(
|
|
|
1585
1685
|
@studio_app.command("attach")
|
|
1586
1686
|
def attach_studio(
|
|
1587
1687
|
engine_name_or_id: str = typer.Argument(help="Engine name or instance ID"),
|
|
1588
|
-
user: Optional[str] = typer.Option(
|
|
1688
|
+
user: Optional[str] = typer.Option(
|
|
1689
|
+
None, "--user", "-u", help="Attach a different user's studio (admin only)"
|
|
1690
|
+
),
|
|
1589
1691
|
):
|
|
1590
1692
|
"""Attach your studio to an engine."""
|
|
1591
1693
|
username = check_aws_sso()
|
|
1592
|
-
|
|
1694
|
+
|
|
1593
1695
|
# Check for Session Manager Plugin since we'll update SSH config
|
|
1594
1696
|
if not check_session_manager_plugin():
|
|
1595
1697
|
raise typer.Exit(1)
|
|
1596
|
-
|
|
1698
|
+
|
|
1597
1699
|
# Use specified user if provided, otherwise use current user
|
|
1598
1700
|
target_user = user if user else username
|
|
1599
|
-
|
|
1701
|
+
|
|
1600
1702
|
# Add confirmation when attaching another user's studio
|
|
1601
1703
|
if target_user != username:
|
|
1602
1704
|
console.print(f"[yellow]⚠️ Managing studio for user: {target_user}[/yellow]")
|
|
@@ -1682,7 +1784,7 @@ def attach_studio(
|
|
|
1682
1784
|
|
|
1683
1785
|
# Determine retry strategy
|
|
1684
1786
|
max_attempts = 40 if engine_started_now else 3
|
|
1685
|
-
retry_delay
|
|
1787
|
+
retry_delay = 10 if engine_started_now else 3
|
|
1686
1788
|
|
|
1687
1789
|
if engine_started_now:
|
|
1688
1790
|
# Long spinner-based loop while the freshly started engine finishes booting
|
|
@@ -1692,17 +1794,24 @@ def attach_studio(
|
|
|
1692
1794
|
TextColumn("[progress.description]{task.description}"),
|
|
1693
1795
|
transient=True,
|
|
1694
1796
|
) as prog:
|
|
1695
|
-
task = prog.add_task(
|
|
1797
|
+
task = prog.add_task(
|
|
1798
|
+
"Attaching studio (engine is still booting)…", total=None
|
|
1799
|
+
)
|
|
1696
1800
|
|
|
1697
1801
|
for attempt in range(max_attempts):
|
|
1698
|
-
success, error_msg = _attempt_studio_attach(
|
|
1802
|
+
success, error_msg = _attempt_studio_attach(
|
|
1803
|
+
studio, engine, target_user, public_key
|
|
1804
|
+
)
|
|
1699
1805
|
|
|
1700
1806
|
if success:
|
|
1701
1807
|
break # success!
|
|
1702
1808
|
|
|
1703
1809
|
# Update spinner every 3rd try to avoid log spam
|
|
1704
1810
|
if attempt % 3 == 0:
|
|
1705
|
-
prog.update(
|
|
1811
|
+
prog.update(
|
|
1812
|
+
task,
|
|
1813
|
+
description=f"Attaching studio (engine is still booting)… {attempt+1}/{max_attempts}",
|
|
1814
|
+
)
|
|
1706
1815
|
|
|
1707
1816
|
if error_msg:
|
|
1708
1817
|
console.print(f"[red]❌ Failed to attach studio: {error_msg}[/red]")
|
|
@@ -1711,15 +1820,19 @@ def attach_studio(
|
|
|
1711
1820
|
time.sleep(retry_delay)
|
|
1712
1821
|
|
|
1713
1822
|
else:
|
|
1714
|
-
console.print(
|
|
1823
|
+
console.print(
|
|
1824
|
+
"[yellow]Engine is still starting up – please retry in a minute.[/yellow]"
|
|
1825
|
+
)
|
|
1715
1826
|
return
|
|
1716
1827
|
else:
|
|
1717
1828
|
# Give the (already-running) engine a little breathing room – e.g. it may still be mounting EFS
|
|
1718
1829
|
max_attempts = 10 # ~1 min total
|
|
1719
|
-
retry_delay
|
|
1830
|
+
retry_delay = 6
|
|
1720
1831
|
|
|
1721
1832
|
for attempt in range(max_attempts):
|
|
1722
|
-
success, error_msg = _attempt_studio_attach(
|
|
1833
|
+
success, error_msg = _attempt_studio_attach(
|
|
1834
|
+
studio, engine, target_user, public_key
|
|
1835
|
+
)
|
|
1723
1836
|
|
|
1724
1837
|
if success:
|
|
1725
1838
|
break # attached!
|
|
@@ -1735,7 +1848,9 @@ def attach_studio(
|
|
|
1735
1848
|
time.sleep(retry_delay)
|
|
1736
1849
|
|
|
1737
1850
|
else:
|
|
1738
|
-
console.print(
|
|
1851
|
+
console.print(
|
|
1852
|
+
"[yellow]Engine is busy or still initialising – please retry in about a minute.[/yellow]"
|
|
1853
|
+
)
|
|
1739
1854
|
return
|
|
1740
1855
|
|
|
1741
1856
|
# Successful attach path
|
|
@@ -1799,14 +1914,16 @@ def _attempt_studio_attach(studio, engine, target_user, public_key):
|
|
|
1799
1914
|
|
|
1800
1915
|
@studio_app.command("detach")
|
|
1801
1916
|
def detach_studio(
|
|
1802
|
-
user: Optional[str] = typer.Option(
|
|
1917
|
+
user: Optional[str] = typer.Option(
|
|
1918
|
+
None, "--user", "-u", help="Detach a different user's studio (admin only)"
|
|
1919
|
+
),
|
|
1803
1920
|
):
|
|
1804
1921
|
"""Detach your studio from its current engine."""
|
|
1805
1922
|
username = check_aws_sso()
|
|
1806
|
-
|
|
1923
|
+
|
|
1807
1924
|
# Use specified user if provided, otherwise use current user
|
|
1808
1925
|
target_user = user if user else username
|
|
1809
|
-
|
|
1926
|
+
|
|
1810
1927
|
# Add confirmation when detaching another user's studio
|
|
1811
1928
|
if target_user != username:
|
|
1812
1929
|
console.print(f"[yellow]⚠️ Managing studio for user: {target_user}[/yellow]")
|
|
@@ -1826,7 +1943,9 @@ def detach_studio(
|
|
|
1826
1943
|
if target_user == username:
|
|
1827
1944
|
console.print("[yellow]Your studio is not attached to any engine.[/yellow]")
|
|
1828
1945
|
else:
|
|
1829
|
-
console.print(
|
|
1946
|
+
console.print(
|
|
1947
|
+
f"[yellow]{target_user}'s studio is not attached to any engine.[/yellow]"
|
|
1948
|
+
)
|
|
1830
1949
|
return
|
|
1831
1950
|
|
|
1832
1951
|
console.print(f"Detaching studio from {studio.get('attached_vm_id')}...")
|
|
@@ -1842,24 +1961,30 @@ def detach_studio(
|
|
|
1842
1961
|
|
|
1843
1962
|
@studio_app.command("delete")
|
|
1844
1963
|
def delete_studio(
|
|
1845
|
-
user: Optional[str] = typer.Option(
|
|
1964
|
+
user: Optional[str] = typer.Option(
|
|
1965
|
+
None, "--user", "-u", help="Delete a different user's studio (admin only)"
|
|
1966
|
+
),
|
|
1846
1967
|
):
|
|
1847
1968
|
"""Delete your studio permanently."""
|
|
1848
1969
|
username = check_aws_sso()
|
|
1849
|
-
|
|
1970
|
+
|
|
1850
1971
|
# Use specified user if provided, otherwise use current user
|
|
1851
1972
|
target_user = user if user else username
|
|
1852
|
-
|
|
1973
|
+
|
|
1853
1974
|
# Extra warning when deleting another user's studio
|
|
1854
1975
|
if target_user != username:
|
|
1855
|
-
console.print(
|
|
1976
|
+
console.print(
|
|
1977
|
+
f"[red]⚠️ ADMIN ACTION: Deleting studio for user: {target_user}[/red]"
|
|
1978
|
+
)
|
|
1856
1979
|
|
|
1857
1980
|
studio = get_user_studio(target_user)
|
|
1858
1981
|
if not studio:
|
|
1859
1982
|
if target_user == username:
|
|
1860
1983
|
console.print("[yellow]You don't have a studio to delete.[/yellow]")
|
|
1861
1984
|
else:
|
|
1862
|
-
console.print(
|
|
1985
|
+
console.print(
|
|
1986
|
+
f"[yellow]User {target_user} doesn't have a studio to delete.[/yellow]"
|
|
1987
|
+
)
|
|
1863
1988
|
return
|
|
1864
1989
|
|
|
1865
1990
|
console.print(
|
|
@@ -1870,7 +1995,11 @@ def delete_studio(
|
|
|
1870
1995
|
console.print(f"Size: {studio['size_gb']}GB")
|
|
1871
1996
|
|
|
1872
1997
|
# Multiple confirmations
|
|
1873
|
-
if not Confirm.ask(
|
|
1998
|
+
if not Confirm.ask(
|
|
1999
|
+
f"\nAre you sure you want to delete {target_user}'s studio?"
|
|
2000
|
+
if target_user != username
|
|
2001
|
+
else "\nAre you sure you want to delete your studio?"
|
|
2002
|
+
):
|
|
1874
2003
|
console.print("Deletion cancelled.")
|
|
1875
2004
|
return
|
|
1876
2005
|
|
|
@@ -1942,7 +2071,7 @@ def list_studios(
|
|
|
1942
2071
|
vm_id = studio["attached_vm_id"]
|
|
1943
2072
|
engine_name = engines.get(vm_id, "unknown")
|
|
1944
2073
|
attached_to = f"{engine_name} ({vm_id})"
|
|
1945
|
-
|
|
2074
|
+
|
|
1946
2075
|
# Try to get disk usage if attached
|
|
1947
2076
|
if studio["status"] == "in-use":
|
|
1948
2077
|
usage = get_studio_disk_usage_via_ssm(vm_id, studio["user"])
|
|
@@ -1966,14 +2095,16 @@ def list_studios(
|
|
|
1966
2095
|
|
|
1967
2096
|
@studio_app.command("reset")
|
|
1968
2097
|
def reset_studio(
|
|
1969
|
-
user: Optional[str] = typer.Option(
|
|
2098
|
+
user: Optional[str] = typer.Option(
|
|
2099
|
+
None, "--user", "-u", help="Reset a different user's studio"
|
|
2100
|
+
),
|
|
1970
2101
|
):
|
|
1971
2102
|
"""Reset a stuck studio (admin operation)."""
|
|
1972
2103
|
username = check_aws_sso()
|
|
1973
|
-
|
|
2104
|
+
|
|
1974
2105
|
# Use specified user if provided, otherwise use current user
|
|
1975
2106
|
target_user = user if user else username
|
|
1976
|
-
|
|
2107
|
+
|
|
1977
2108
|
# Add warning when resetting another user's studio
|
|
1978
2109
|
if target_user != username:
|
|
1979
2110
|
console.print(f"[yellow]⚠️ Resetting studio for user: {target_user}[/yellow]")
|
|
@@ -2044,14 +2175,16 @@ def reset_studio(
|
|
|
2044
2175
|
@studio_app.command("resize")
|
|
2045
2176
|
def resize_studio(
|
|
2046
2177
|
size: int = typer.Option(..., "--size", "-s", help="New size in GB"),
|
|
2047
|
-
user: Optional[str] = typer.Option(
|
|
2178
|
+
user: Optional[str] = typer.Option(
|
|
2179
|
+
None, "--user", "-u", help="Resize a different user's studio (admin only)"
|
|
2180
|
+
),
|
|
2048
2181
|
):
|
|
2049
2182
|
"""Resize your studio volume (requires detachment)."""
|
|
2050
2183
|
username = check_aws_sso()
|
|
2051
|
-
|
|
2184
|
+
|
|
2052
2185
|
# Use specified user if provided, otherwise use current user
|
|
2053
2186
|
target_user = user if user else username
|
|
2054
|
-
|
|
2187
|
+
|
|
2055
2188
|
# Add warning when resizing another user's studio
|
|
2056
2189
|
if target_user != username:
|
|
2057
2190
|
console.print(f"[yellow]⚠️ Resizing studio for user: {target_user}[/yellow]")
|
|
@@ -2065,29 +2198,31 @@ def resize_studio(
|
|
|
2065
2198
|
return
|
|
2066
2199
|
|
|
2067
2200
|
current_size = studio["size_gb"]
|
|
2068
|
-
|
|
2201
|
+
|
|
2069
2202
|
if size <= current_size:
|
|
2070
|
-
console.print(
|
|
2203
|
+
console.print(
|
|
2204
|
+
f"[red]❌ New size ({size}GB) must be larger than current size ({current_size}GB)[/red]"
|
|
2205
|
+
)
|
|
2071
2206
|
raise typer.Exit(1)
|
|
2072
2207
|
|
|
2073
2208
|
# Check if studio is attached
|
|
2074
2209
|
if studio["status"] == "in-use":
|
|
2075
2210
|
console.print("[yellow]⚠️ Studio must be detached before resizing[/yellow]")
|
|
2076
2211
|
console.print(f"Currently attached to: {studio.get('attached_vm_id')}")
|
|
2077
|
-
|
|
2212
|
+
|
|
2078
2213
|
if not Confirm.ask("\nDetach studio and proceed with resize?"):
|
|
2079
2214
|
console.print("Resize cancelled.")
|
|
2080
2215
|
return
|
|
2081
|
-
|
|
2216
|
+
|
|
2082
2217
|
# Detach the studio
|
|
2083
2218
|
console.print("Detaching studio...")
|
|
2084
2219
|
response = make_api_request("POST", f"/studios/{studio['studio_id']}/detach")
|
|
2085
2220
|
if response.status_code != 200:
|
|
2086
2221
|
console.print("[red]❌ Failed to detach studio[/red]")
|
|
2087
2222
|
raise typer.Exit(1)
|
|
2088
|
-
|
|
2223
|
+
|
|
2089
2224
|
console.print("[green]✓ Studio detached[/green]")
|
|
2090
|
-
|
|
2225
|
+
|
|
2091
2226
|
# Wait a moment for detachment to complete
|
|
2092
2227
|
time.sleep(5)
|
|
2093
2228
|
|
|
@@ -2095,68 +2230,79 @@ def resize_studio(
|
|
|
2095
2230
|
|
|
2096
2231
|
# Call the resize API
|
|
2097
2232
|
resize_response = make_api_request(
|
|
2098
|
-
"POST",
|
|
2099
|
-
f"/studios/{studio['studio_id']}/resize",
|
|
2100
|
-
json_data={"size": size}
|
|
2233
|
+
"POST", f"/studios/{studio['studio_id']}/resize", json_data={"size": size}
|
|
2101
2234
|
)
|
|
2102
|
-
|
|
2235
|
+
|
|
2103
2236
|
if resize_response.status_code != 200:
|
|
2104
2237
|
error = resize_response.json().get("error", "Unknown error")
|
|
2105
2238
|
console.print(f"[red]❌ Failed to resize studio: {error}[/red]")
|
|
2106
2239
|
raise typer.Exit(1)
|
|
2107
|
-
|
|
2240
|
+
|
|
2108
2241
|
# Wait for volume modification to complete
|
|
2109
2242
|
ec2 = boto3.client("ec2", region_name="us-east-1")
|
|
2110
2243
|
console.print("Resizing volume...")
|
|
2111
|
-
|
|
2244
|
+
|
|
2112
2245
|
# Track progress
|
|
2113
2246
|
last_progress = 0
|
|
2114
|
-
|
|
2247
|
+
|
|
2115
2248
|
while True:
|
|
2116
2249
|
try:
|
|
2117
|
-
mod_state = ec2.describe_volumes_modifications(
|
|
2250
|
+
mod_state = ec2.describe_volumes_modifications(
|
|
2251
|
+
VolumeIds=[studio["studio_id"]]
|
|
2252
|
+
)
|
|
2118
2253
|
if not mod_state["VolumesModifications"]:
|
|
2119
2254
|
break # Modification complete
|
|
2120
|
-
|
|
2255
|
+
|
|
2121
2256
|
modification = mod_state["VolumesModifications"][0]
|
|
2122
2257
|
state = modification["ModificationState"]
|
|
2123
2258
|
progress = modification.get("Progress", 0)
|
|
2124
|
-
|
|
2259
|
+
|
|
2125
2260
|
# Show progress updates only for the resize phase
|
|
2126
2261
|
if state == "modifying" and progress > last_progress:
|
|
2127
2262
|
console.print(f"[yellow]Progress: {progress}%[/yellow]")
|
|
2128
2263
|
last_progress = progress
|
|
2129
|
-
|
|
2264
|
+
|
|
2130
2265
|
# Exit as soon as optimization starts (resize is complete)
|
|
2131
2266
|
if state == "optimizing":
|
|
2132
|
-
console.print(
|
|
2133
|
-
|
|
2267
|
+
console.print(
|
|
2268
|
+
f"[green]✓ Studio resized successfully to {size}GB![/green]"
|
|
2269
|
+
)
|
|
2270
|
+
console.print(
|
|
2271
|
+
"[dim]AWS is optimizing the volume in the background (no action needed).[/dim]"
|
|
2272
|
+
)
|
|
2134
2273
|
break
|
|
2135
|
-
|
|
2274
|
+
|
|
2136
2275
|
if state == "completed":
|
|
2137
|
-
console.print(
|
|
2276
|
+
console.print(
|
|
2277
|
+
f"[green]✓ Studio resized successfully to {size}GB![/green]"
|
|
2278
|
+
)
|
|
2138
2279
|
break
|
|
2139
2280
|
elif state == "failed":
|
|
2140
2281
|
console.print("[red]❌ Volume modification failed[/red]")
|
|
2141
2282
|
raise typer.Exit(1)
|
|
2142
|
-
|
|
2283
|
+
|
|
2143
2284
|
time.sleep(2) # Check more frequently for better UX
|
|
2144
|
-
|
|
2285
|
+
|
|
2145
2286
|
except ClientError:
|
|
2146
2287
|
# Modification might be complete
|
|
2147
2288
|
console.print(f"[green]✓ Studio resized successfully to {size}GB![/green]")
|
|
2148
2289
|
break
|
|
2149
|
-
|
|
2150
|
-
console.print(
|
|
2290
|
+
|
|
2291
|
+
console.print(
|
|
2292
|
+
"\n[dim]The filesystem will be automatically expanded when you next attach the studio.[/dim]"
|
|
2293
|
+
)
|
|
2151
2294
|
console.print(f"To attach: [cyan]dh studio attach <engine-name>[/cyan]")
|
|
2152
2295
|
|
|
2296
|
+
|
|
2153
2297
|
# ================= Idle timeout command =================
|
|
2154
2298
|
|
|
2155
2299
|
|
|
2156
2300
|
@engine_app.command("idle")
|
|
2157
2301
|
def idle_timeout_cmd(
|
|
2158
2302
|
name_or_id: str = typer.Argument(help="Engine name or instance ID"),
|
|
2159
|
-
set: Optional[str] = typer.Option(
|
|
2303
|
+
set: Optional[str] = typer.Option(
|
|
2304
|
+
None, "--set", "-s", help="New timeout (e.g., 2h30m, 45m)"
|
|
2305
|
+
),
|
|
2160
2306
|
):
|
|
2161
2307
|
"""Show or set the engine idle-detector timeout."""
|
|
2162
2308
|
check_aws_sso()
|
|
@@ -2177,11 +2323,18 @@ def idle_timeout_cmd(
|
|
|
2177
2323
|
resp = ssm.send_command(
|
|
2178
2324
|
InstanceIds=[engine["instance_id"]],
|
|
2179
2325
|
DocumentName="AWS-RunShellScript",
|
|
2180
|
-
Parameters={
|
|
2326
|
+
Parameters={
|
|
2327
|
+
"commands": [
|
|
2328
|
+
"grep -E '^IDLE_TIMEOUT_SECONDS=' /etc/engine.env || echo 'IDLE_TIMEOUT_SECONDS=1800'"
|
|
2329
|
+
],
|
|
2330
|
+
"executionTimeout": ["10"],
|
|
2331
|
+
},
|
|
2181
2332
|
)
|
|
2182
2333
|
cid = resp["Command"]["CommandId"]
|
|
2183
2334
|
time.sleep(1)
|
|
2184
|
-
inv = ssm.get_command_invocation(
|
|
2335
|
+
inv = ssm.get_command_invocation(
|
|
2336
|
+
CommandId=cid, InstanceId=engine["instance_id"]
|
|
2337
|
+
)
|
|
2185
2338
|
if inv["Status"] == "Success":
|
|
2186
2339
|
line = inv["StandardOutputContent"].strip()
|
|
2187
2340
|
secs = int(line.split("=")[1]) if "=" in line else 1800
|
|
@@ -2219,8 +2372,10 @@ def idle_timeout_cmd(
|
|
|
2219
2372
|
time.sleep(2)
|
|
2220
2373
|
console.print(f"[green]✓ Idle timeout updated to {set}[/green]")
|
|
2221
2374
|
|
|
2375
|
+
|
|
2222
2376
|
# Add this near the end, after the idle-timeout command
|
|
2223
2377
|
|
|
2378
|
+
|
|
2224
2379
|
@engine_app.command("debug")
|
|
2225
2380
|
def debug_engine(
|
|
2226
2381
|
name_or_id: str = typer.Argument(help="Engine name or instance ID"),
|
|
@@ -2240,17 +2395,32 @@ def debug_engine(
|
|
|
2240
2395
|
console.print(f"[bold]Debug info for {engine['name']}:[/bold]\n")
|
|
2241
2396
|
|
|
2242
2397
|
ssm = boto3.client("ssm", region_name="us-east-1")
|
|
2243
|
-
|
|
2398
|
+
|
|
2244
2399
|
# Check multiple files and systemd status
|
|
2245
2400
|
checks = [
|
|
2246
|
-
(
|
|
2247
|
-
|
|
2248
|
-
|
|
2249
|
-
|
|
2250
|
-
(
|
|
2401
|
+
(
|
|
2402
|
+
"Stage file",
|
|
2403
|
+
"cat /opt/dayhoff/state/engine-init.stage 2>/dev/null || cat /var/run/engine-init.stage 2>/dev/null || echo 'MISSING'",
|
|
2404
|
+
),
|
|
2405
|
+
(
|
|
2406
|
+
"Health file",
|
|
2407
|
+
"cat /opt/dayhoff/state/engine-health.json 2>/dev/null || cat /var/run/engine-health.json 2>/dev/null || echo 'MISSING'",
|
|
2408
|
+
),
|
|
2409
|
+
(
|
|
2410
|
+
"Sentinel file",
|
|
2411
|
+
"ls -la /opt/dayhoff/first_boot_complete.sentinel 2>/dev/null || echo 'MISSING'",
|
|
2412
|
+
),
|
|
2413
|
+
(
|
|
2414
|
+
"Setup service",
|
|
2415
|
+
"systemctl status setup-aws-vm.service --no-pager || echo 'Service not found'",
|
|
2416
|
+
),
|
|
2417
|
+
(
|
|
2418
|
+
"Bootstrap log tail",
|
|
2419
|
+
"tail -20 /var/log/engine-setup.log 2>/dev/null || echo 'No log'",
|
|
2420
|
+
),
|
|
2251
2421
|
("Environment file", "cat /etc/engine.env 2>/dev/null || echo 'MISSING'"),
|
|
2252
2422
|
]
|
|
2253
|
-
|
|
2423
|
+
|
|
2254
2424
|
for name, cmd in checks:
|
|
2255
2425
|
try:
|
|
2256
2426
|
resp = ssm.send_command(
|
|
@@ -2260,14 +2430,16 @@ def debug_engine(
|
|
|
2260
2430
|
)
|
|
2261
2431
|
cid = resp["Command"]["CommandId"]
|
|
2262
2432
|
time.sleep(1)
|
|
2263
|
-
inv = ssm.get_command_invocation(
|
|
2264
|
-
|
|
2433
|
+
inv = ssm.get_command_invocation(
|
|
2434
|
+
CommandId=cid, InstanceId=engine["instance_id"]
|
|
2435
|
+
)
|
|
2436
|
+
|
|
2265
2437
|
if inv["Status"] == "Success":
|
|
2266
2438
|
output = inv["StandardOutputContent"].strip()
|
|
2267
2439
|
console.print(f"[cyan]{name}:[/cyan]")
|
|
2268
2440
|
console.print(f"[dim]{output}[/dim]\n")
|
|
2269
2441
|
else:
|
|
2270
2442
|
console.print(f"[cyan]{name}:[/cyan] [red]FAILED[/red]\n")
|
|
2271
|
-
|
|
2443
|
+
|
|
2272
2444
|
except Exception as e:
|
|
2273
2445
|
console.print(f"[cyan]{name}:[/cyan] [red]ERROR: {e}[/red]\n")
|
|
@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
|
|
|
5
5
|
|
|
6
6
|
[project]
|
|
7
7
|
name = "dayhoff-tools"
|
|
8
|
-
version = "1.5.
|
|
8
|
+
version = "1.5.4"
|
|
9
9
|
description = "Common tools for all the repos at Dayhoff Labs"
|
|
10
10
|
authors = [
|
|
11
11
|
{name = "Daniel Martin-Alarcon", email = "dma@dayhofflabs.com"}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|