dayhoff-tools 1.7.3__py3-none-any.whl → 1.7.5__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
- dayhoff_tools/cli/engine_commands.py +267 -61
- dayhoff_tools/cli/utility_commands.py +1 -1
- {dayhoff_tools-1.7.3.dist-info → dayhoff_tools-1.7.5.dist-info}/METADATA +1 -1
- {dayhoff_tools-1.7.3.dist-info → dayhoff_tools-1.7.5.dist-info}/RECORD +6 -6
- {dayhoff_tools-1.7.3.dist-info → dayhoff_tools-1.7.5.dist-info}/WHEEL +0 -0
- {dayhoff_tools-1.7.3.dist-info → dayhoff_tools-1.7.5.dist-info}/entry_points.txt +0 -0
dayhoff_tools/cli/engine_commands.py

@@ -319,6 +319,88 @@ def format_status(state: str, ready: Optional[bool]) -> str:
     return state


+# --------------------------------------------------------------------------------
+# Audit helpers (Phase 1 observability)
+# --------------------------------------------------------------------------------
+
+
+def _get_engine_audit_bucket() -> Optional[str]:
+    """Return the engine audit bucket name from SSM Parameter Store, if configured."""
+    try:
+        ssm = boto3.client("ssm", region_name="us-east-1")
+        resp = ssm.get_parameter(Name="/dev/studio-manager/engine-audit-bucket")
+        return resp["Parameter"]["Value"]
+    except ClientError:
+        return None
+
+
+def _fetch_last_audit_via_ssm(instance_id: str) -> Optional[Dict]:
+    """Fetch last shutdown attempt audit from the engine via SSM (fast best-effort)."""
+    try:
+        ssm = boto3.client("ssm", region_name="us-east-1")
+        resp = ssm.send_command(
+            InstanceIds=[instance_id],
+            DocumentName="AWS-RunShellScript",
+            Parameters={
+                "commands": [
+                    "cat /var/log/idle-detector/last_shutdown_attempt.json 2>/dev/null || true",
+                ],
+                "executionTimeout": ["3"],
+            },
+        )
+        cid = resp["Command"]["CommandId"]
+        time.sleep(1)
+        inv = ssm.get_command_invocation(CommandId=cid, InstanceId=instance_id)
+        if inv["Status"] != "Success":
+            return None
+        content = inv["StandardOutputContent"].strip()
+        if not content:
+            return None
+        return json.loads(content)
+    except Exception:
+        return None
+
+
+def _fetch_last_audit_via_s3(instance_id: str) -> Optional[Dict]:
+    """Fetch the newest audit object from S3 if available."""
+    bucket = _get_engine_audit_bucket()
+    if not bucket:
+        return None
+    try:
+        s3 = boto3.client("s3", region_name="us-east-1")
+        paginator = s3.get_paginator("list_objects_v2")
+        newest = None
+        for page in paginator.paginate(
+            Bucket=bucket, Prefix=f"{instance_id}/", MaxKeys=1000
+        ):
+            for obj in page.get("Contents", []):
+                lm = obj.get("LastModified")
+                if newest is None or (lm and lm > newest["LastModified"]):
+                    newest = obj
+        if not newest:
+            return None
+        obj = s3.get_object(Bucket=bucket, Key=newest["Key"])
+        data = obj["Body"].read().decode("utf-8")
+        return json.loads(data)
+    except Exception:
+        return None
+
+
+def _summarize_audit(audit: Dict) -> str:
+    """Return a compact one-line summary for status output."""
+    ts = audit.get("ts", "?")
+    shutdown = audit.get("shutdown", {})
+    result = shutdown.get("result", "?")
+    detach = audit.get("detach", {})
+    num_detached = sum(
+        1 for r in detach.get("results", []) if r.get("status") == "success"
+    )
+    idle = audit.get("idle", {})
+    elapsed = int(idle.get("elapsed_sec", 0))
+    threshold = int(idle.get("threshold_sec", 0))
+    return f"Last shutdown attempt: {ts} result={result} detach={num_detached} idle={elapsed//60}/{threshold//60}m"
+
+
 def resolve_engine(name_or_id: str, engines: List[Dict]) -> Dict:
     """Resolve engine by name or ID with interactive selection."""
     # Exact ID match
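The exact audit schema is not shown in this diff; it is only implied by the keys these helpers read. A minimal sketch of such a record, assuming the shape that `_summarize_audit()` and `dh engine why` consume (all values, sensor names, and keys' contents below are hypothetical):

```python
# Illustrative audit record, inferred from the keys read by _summarize_audit()
# and the `engine why` command. Values and sensor names are hypothetical.
sample_audit = {
    "ts": "2025-01-01T12:00:00Z",
    "shutdown": {"result": "success"},
    "detach": {
        "results": [
            {"studio_id": "studio-abc123", "status": "success", "error": None},
        ]
    },
    "idle": {
        "elapsed_sec": 3600,
        "threshold_sec": 1800,
        "reason_snapshot": [
            {"sensor": "ssh", "active": False, "reason": "no sessions"},
        ],
    },
    "s3": {"uploaded": True, "bucket": "engine-audit-bucket", "key": "i-0123/audit.json"},
}

# _summarize_audit(sample_audit) would then render roughly:
# "Last shutdown attempt: 2025-01-01T12:00:00Z result=success detach=1 idle=60/30m"
```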
@@ -397,7 +479,9 @@ def get_ssh_public_key() -> str:
     if shutil.which("ssh-add") is not None:
         proc = subprocess.run(["ssh-add", "-L"], capture_output=True, text=True)
         if proc.returncode == 0 and proc.stdout:
-            keys = [
+            keys = [
+                line.strip() for line in proc.stdout.splitlines() if line.strip()
+            ]
             # Prefer ed25519, then rsa
             for pref in ("ssh-ed25519", "ssh-rsa", "ecdsa-sha2-nistp256"):
                 for k in keys:
@@ -437,7 +521,9 @@ def check_session_manager_plugin():
         return True


-def update_ssh_config_entry(
+def update_ssh_config_entry(
+    engine_name: str, instance_id: str, ssh_user: str, idle_timeout: int = 600
+):
     """Add or update a single SSH config entry for the given SSH user.

     Args:
@@ -462,7 +548,10 @@ def update_ssh_config_entry(engine_name: str, instance_id: str, ssh_user: str, i
     skip_until_next_host = False
     for line in lines:
         # Check if this is our managed host
-        if
+        if (
+            line.strip().startswith(f"Host {engine_name}")
+            and SSH_MANAGED_COMMENT in line
+        ):
             skip_until_next_host = True
         elif line.strip().startswith("Host ") and skip_until_next_host:
             skip_until_next_host = False
@@ -474,13 +563,15 @@ def update_ssh_config_entry(engine_name: str, instance_id: str, ssh_user: str, i
     # Add the new entry
     if new_lines and new_lines[-1].strip():  # Add blank line if needed
         new_lines.append("")
-
-    new_lines.extend(
-
-
-
-
-
+
+    new_lines.extend(
+        [
+            f"Host {engine_name} {SSH_MANAGED_COMMENT}",
+            f" HostName {instance_id}",
+            f" User {ssh_user}",
+            f" ProxyCommand sh -c \"AWS_SSM_IDLE_TIMEOUT={idle_timeout} aws ssm start-session --target %h --document-name AWS-StartSSHSession --parameters 'portNumber=%p'\"",
+        ]
+    )

     # Write back
     config_path.write_text("\n".join(new_lines))
@@ -662,16 +753,20 @@ def list_engines(
             ]
             if detailed:
                 row_data.append(disk_usage)
-                row_data.extend(
-
-
-
-
+                row_data.extend(
+                    [
+                        time_str,
+                        f"${hourly_cost:.2f}",
+                    ]
+                )
+
             table.add_row(*row_data)

         console.print(table)
         if not detailed and any(e["state"].lower() == "running" for e in engines):
-            console.print(
+            console.print(
+                "\n[dim]Tip: Use --detailed to see disk usage and bootstrap status (slower)[/dim]"
+            )
     else:
         error = response.json().get("error", "Unknown error")
         console.print(f"[red]❌ Failed to list engines: {error}[/red]")
@@ -798,6 +893,16 @@ def engine_status(
             f" • [magenta]🖥 IDE connected ({ide_conn['connection_count']} connections)[/magenta]"
         )

+    # Audit one-liner (best-effort SSM fetch)
+    try:
+        last_audit = _fetch_last_audit_via_ssm(engine["instance_id"])
+        if last_audit:
+            status_lines.append("")
+            status_lines.append("[bold]Idle Audit:[/bold]")
+            status_lines.append(f" • {_summarize_audit(last_audit)}")
+    except Exception:
+        pass
+
     if attached_studios:
         status_lines.append("")
         status_lines.append("[bold]Attached Studios:[/bold]")
@@ -839,6 +944,67 @@ def engine_status(
         console.print(f"[red]❌ Error fetching log: {e}[/red]")


+@engine_app.command("why")
+def engine_why(
+    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
+    raw: bool = typer.Option(False, "--raw", help="Print raw audit JSON"),
+):
+    """Explain the last idle-detector shutdown attempt for an engine.
+
+    Tries SSM (on-instance file) first, then falls back to S3 audit bucket.
+    """
+    check_aws_sso()
+
+    # Resolve engine
+    response = make_api_request("GET", "/engines")
+    if response.status_code != 200:
+        console.print("[red]❌ Failed to fetch engines[/red]")
+        raise typer.Exit(1)
+
+    engines = response.json().get("engines", [])
+    engine = resolve_engine(name_or_id, engines)
+
+    audit = _fetch_last_audit_via_ssm(
+        engine["instance_id"]
+    ) or _fetch_last_audit_via_s3(engine["instance_id"])
+    if not audit:
+        console.print("No audit found (engine may not have attempted shutdown yet).")
+        raise typer.Exit(0)
+
+    if raw:
+        console.print_json(data=audit)
+        return
+
+    # Pretty summary
+    status = _summarize_audit(audit)
+    panel_lines = [
+        f"[bold]{status}[/bold]",
+        "",
+        "[bold]Sensors:[/bold]",
+    ]
+    for r in audit.get("idle", {}).get("reason_snapshot", []):
+        active = "✓" if r.get("active") else "-"
+        reason = r.get("reason") or ""
+        sensor = r.get("sensor")
+        panel_lines.append(f" {active} {sensor}: {reason}")
+    panel_lines.append("")
+    panel_lines.append("[bold]Detach results:[/bold]")
+    for res in audit.get("detach", {}).get("results", []):
+        panel_lines.append(
+            f" - {res.get('studio_id')}: {res.get('status')} {res.get('error') or ''}"
+        )
+    s3_info = audit.get("s3", {})
+    if s3_info.get("uploaded"):
+        panel_lines.append("")
+        panel_lines.append(
+            f"[dim]S3: s3://{s3_info.get('bucket')}/{s3_info.get('key')}[/dim]"
+        )
+
+    console.print(
+        Panel("\n".join(panel_lines), title="Idle Shutdown Audit", border_style="blue")
+    )
+
+
 @engine_app.command("stop")
 def stop_engine(
     name_or_id: str = typer.Argument(help="Engine name or instance ID"),
@@ -969,7 +1135,9 @@ def ssh_engine(
         False, "--admin", help="Connect as ec2-user instead of the engine owner user"
     ),
     idle_timeout: int = typer.Option(
-        600,
+        600,
+        "--idle-timeout",
+        help="Idle timeout (seconds) for the SSM port-forward (0 = disable)",
     ),
 ):
     """Connect to an engine via SSH.
@@ -1003,7 +1171,9 @@ def ssh_engine(
     console.print(
         f"Updating SSH config for [cyan]{engine['name']}[/cyan] (user: {ssh_user})..."
     )
-    update_ssh_config_entry(
+    update_ssh_config_entry(
+        engine["name"], engine["instance_id"], ssh_user, idle_timeout
+    )

     # Connect
     console.print(f"[green]✓ Connecting to {engine['name']}...[/green]")
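To make the idle-timeout plumbing concrete: the managed block that `update_ssh_config_entry()` writes into `~/.ssh/config` can be previewed by reusing the same f-strings shown in the earlier hunk. A minimal sketch, with a hypothetical engine name, instance ID, and timeout, and a placeholder standing in for the module's `SSH_MANAGED_COMMENT` constant:

```python
# Sketch only: mirrors the f-strings from update_ssh_config_entry() above.
# Engine name, instance ID, timeout value, and the SSH_MANAGED_COMMENT
# placeholder are hypothetical.
engine_name = "my-engine"
instance_id = "i-0123456789abcdef0"
ssh_user = "ec2-user"
idle_timeout = 900  # wired through the new --idle-timeout option of `dh engine ssh`
SSH_MANAGED_COMMENT = "# managed by dh"  # stand-in for the module's real marker

entry = [
    f"Host {engine_name} {SSH_MANAGED_COMMENT}",
    f" HostName {instance_id}",
    f" User {ssh_user}",
    f" ProxyCommand sh -c \"AWS_SSM_IDLE_TIMEOUT={idle_timeout} aws ssm start-session "
    f"--target %h --document-name AWS-StartSSHSession --parameters 'portNumber=%p'\"",
]
print("\n".join(entry))
```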
@@ -1421,7 +1591,7 @@ def create_ami(
     if response.status_code != 200:
         console.print("[red]❌ Failed to fetch engines[/red]")
         raise typer.Exit(1)
-
+
     engines = response.json().get("engines", [])
     engine = resolve_engine(name_or_id, engines)

@@ -1436,15 +1606,19 @@ def create_ami(
     # 2. Check for attached studios from the detailed API response
     attached_studios = engine.get("studios", [])
     if attached_studios:
-        console.print(
+        console.print(
+            f"[bold red]❌ Engine '{engine['name']}' has studios attached.[/bold red]"
+        )
         console.print("Please detach all studios before creating an AMI:")
         for studio in attached_studios:
-
+            console.print(f" - {studio['user']} ({studio['studio_id']})")
         console.print("\nTo detach, run [bold]dh studio detach[/bold]")
         raise typer.Exit(1)

     # Construct AMI name and description
-    ami_name =
+    ami_name = (
+        f"prewarmed-engine-{engine['engine_type']}-{datetime.now().strftime('%Y%m%d')}"
+    )
     description = (
         f"Amazon Linux 2023 with NVIDIA drivers, Docker, and pre-pulled "
         f"dev container image for {engine['engine_type']} engines"
@@ -1453,7 +1627,7 @@ def create_ami(
     console.print(f"Creating AMI from engine [cyan]{engine['name']}[/cyan]...")
     console.print(f"[bold]AMI Name:[/] {ami_name}")
     console.print(f"[bold]Description:[/] {description}")
-
+
     console.print(
         "\n[bold yellow]⚠️ Important:[/bold yellow]\n"
         "1. This process will run cleanup scripts on the engine.\n"
@@ -1475,7 +1649,7 @@ def create_ami(
         "history -c",
         "sudo rm -rf /tmp/* /var/log/messages /var/log/cloud-init.log",
         "sudo rm -rf /var/lib/amazon/ssm/* /etc/amazon/ssm/*",
-        "sleep 2 && sudo systemctl stop amazon-ssm-agent &",
+        "sleep 2 && sudo systemctl stop amazon-ssm-agent &",  # Stop agent last
     ]

     cleanup_response = ssm.send_command(
@@ -1485,16 +1659,20 @@ def create_ami(
     )

     # Acknowledge that the SSM command might be in progress as the agent shuts down
-    console.print(
-
+    console.print(
+        "[dim]ℹ️ Cleanup command sent (status may show 'InProgress' as SSM agent stops)[/dim]"
+    )
+
     # Create the AMI
     with Progress(
         SpinnerColumn(),
         TextColumn("[progress.description]{task.description}"),
         transient=True,
     ) as progress:
-        task = progress.add_task(
-
+        task = progress.add_task(
+            "Creating AMI (this will take several minutes)...", total=None
+        )
+
         response = ec2.create_image(
             InstanceId=engine["instance_id"],
             Name=ami_name,
@@ -1506,7 +1684,7 @@ def create_ami(
             "Tags": [
                 {"Key": "Environment", "Value": "dev"},
                 {"Key": "Type", "Value": "golden-ami"},
-                {"Key": "EngineType", "Value": engine[
+                {"Key": "EngineType", "Value": engine["engine_type"]},
                 {"Key": "Name", "Value": ami_name},
             ],
         }
@@ -1514,7 +1692,11 @@ def create_ami(
         )

         ami_id = response["ImageId"]
-        progress.update(
+        progress.update(
+            task,
+            completed=True,
+            description=f"[green]✓ AMI creation initiated![/green]",
+        )

     console.print(f" [bold]AMI ID:[/] {ami_id}")
     console.print("\nThe AMI creation process will continue in the background.")
@@ -1523,14 +1705,15 @@ def create_ami(
         "\nOnce complete, update the AMI ID in [bold]terraform/environments/dev/variables.tf[/bold] "
         "and run [bold]terraform apply[/bold]."
     )
-    console.print(
+    console.print(
+        f"\nRemember to [bold red]terminate the source engine '{engine['name']}'[/bold red] to save costs."
+    )

     except ClientError as e:
         console.print(f"[red]❌ Failed to create AMI: {e}[/red]")
         raise typer.Exit(1)


-
 # ==================== STUDIO COMMANDS ====================


@@ -1803,9 +1986,13 @@ def attach_studio(
         TextColumn("[progress.description]{task.description}"),
         transient=True,
     ) as prog:
-        desc =
+        desc = (
+            "Attaching studio (engine is still booting)…"
+            if engine_started_now
+            else "Attaching studio…"
+        )
         task = prog.add_task(desc, total=None)
-
+
         consecutive_not_ready = 0
         last_error = None

@@ -1814,7 +2001,7 @@ def attach_studio(
             if _is_studio_attached(studio["studio_id"], engine["instance_id"]):
                 success = True
                 break
-
+
             success, error_msg = _attempt_studio_attach(
                 studio, engine, target_user, public_key
             )
@@ -1825,24 +2012,26 @@ def attach_studio(
             if error_msg:
                 # Fatal error – bubble up immediately
                 console.print(f"[red]❌ Failed to attach studio: {error_msg}[/red]")
-
+
                 # Suggest repair command if engine seems broken
                 if "not ready" in error_msg.lower() and attempt > 5:
-                    console.print(
+                    console.print(
+                        f"\n[yellow]Engine may be in a bad state. Try:[/yellow]"
+                    )
                     console.print(f"[dim] dh engine repair {engine['name']}[/dim]")
                 return

             # Track consecutive "not ready" responses
             consecutive_not_ready += 1
             last_error = "Engine not ready"
-
+
             # Update progress display
             if attempt % 3 == 0:
                 prog.update(
                     task,
                     description=f"{desc} attempt {attempt+1}/{max_attempts}",
                 )
-
+
             # If engine seems stuck after many attempts, show a hint
             if consecutive_not_ready > 10 and attempt == 10:
                 console.print(
@@ -1865,9 +2054,15 @@ def attach_studio(
         if last_error:
             console.print(f"[dim]Last issue: {last_error}[/dim]")
         console.print("\n[yellow]You can try:[/yellow]")
-        console.print(
-
-
+        console.print(
+            f" 1. Wait a minute and retry: [cyan]dh studio attach {engine['name']}[/cyan]"
+        )
+        console.print(
+            f" 2. Check engine status: [cyan]dh engine status {engine['name']}[/cyan]"
+        )
+        console.print(
+            f" 3. Repair the engine: [cyan]dh engine repair {engine['name']}[/cyan]"
+        )
         return

     # Successful attach path
@@ -1900,13 +2095,13 @@ def _attempt_studio_attach(studio, engine, target_user, public_key):
         # The operation status polling is broken in the Lambda, so we just
         # wait and check if the studio is actually attached
         time.sleep(5)  # Give the async operation a moment to start
-
+
         # Check periodically if the studio is attached
         for check in range(20):  # Check for up to 60 seconds
             if _is_studio_attached(studio["studio_id"], engine["instance_id"]):
                 return True, None
             time.sleep(3)
-
+
         # If we get here, attachment didn't complete in reasonable time
         return False, None  # Return None to trigger retry

@@ -1914,15 +2109,19 @@ def _attempt_studio_attach(studio, engine, target_user, public_key):
         recoverable = False
         error_text = response.json().get("error", "Unknown error")
         err_msg = error_text.lower()
-
+
         # Check for "Studio is not available (status: in-use)" which means it's already attached
-        if
+        if (
+            response.status_code == 400
+            and "not available" in err_msg
+            and "in-use" in err_msg
+        ):
             # Studio is already attached somewhere - check if it's to THIS engine
             if _is_studio_attached(studio["studio_id"], engine["instance_id"]):
                 return True, None  # It's attached to our target engine - success!
             else:
                 return False, error_text  # It's attached elsewhere - fatal error
-
+
         if response.status_code in (409, 503):
             recoverable = True
         else:
@@ -1952,7 +2151,7 @@ def _attempt_studio_attach(studio, engine, target_user, public_key):

 # Note: _poll_operation was removed because the Lambda's operation tracking is broken.
 # We now use _is_studio_attached() to check if the studio is actually attached instead.
-
+

 @studio_app.command("detach")
 def detach_studio(
@@ -2504,9 +2703,15 @@ def repair_engine(
     engine = resolve_engine(name_or_id, engines)

     if engine["state"].lower() != "running":
-        console.print(
-
-
+        console.print(
+            f"[yellow]⚠️ Engine is {engine['state']}. Must be running to repair.[/yellow]"
+        )
+        if engine["state"].lower() == "stopped" and Confirm.ask(
+            "Start the engine first?"
+        ):
+            response = make_api_request(
+                "POST", f"/engines/{engine['instance_id']}/start"
+            )
             if response.status_code != 200:
                 console.print("[red]❌ Failed to start engine[/red]")
                 raise typer.Exit(1)
@@ -2517,7 +2722,9 @@ def repair_engine(
         raise typer.Exit(1)

     console.print(f"[bold]Repairing engine [cyan]{engine['name']}[/cyan][/bold]")
-    console.print(
+    console.print(
+        "[dim]This will restore bootstrap state and ensure all services are running[/dim]\n"
+    )

     ssm = boto3.client("ssm", region_name="us-east-1")

@@ -2525,22 +2732,17 @@ def repair_engine(
     repair_commands = [
         # Create necessary directories
         "sudo mkdir -p /opt/dayhoff /opt/dayhoff/state /opt/dayhoff/scripts",
-
         # Download scripts from S3 if missing
         "source /etc/engine.env && sudo aws s3 sync s3://${VM_SCRIPTS_BUCKET}/ /opt/dayhoff/scripts/ --exclude '*' --include '*.sh' --quiet",
         "sudo chmod +x /opt/dayhoff/scripts/*.sh 2>/dev/null || true",
-
         # Restore bootstrap state
         "sudo touch /opt/dayhoff/first_boot_complete.sentinel",
         "echo 'finished' | sudo tee /opt/dayhoff/state/engine-init.stage > /dev/null",
-
         # Ensure SSM agent is running
         "sudo systemctl restart amazon-ssm-agent 2>/dev/null || true",
-
         # Restart idle detector
         "sudo systemctl restart engine-idle-detector.timer 2>/dev/null || true",
         "sudo systemctl restart engine-idle-detector.service 2>/dev/null || true",
-
         # Report status
         "echo '=== Repair Complete ===' && echo 'Sentinel: ' && ls -la /opt/dayhoff/first_boot_complete.sentinel",
         "echo 'Stage: ' && cat /opt/dayhoff/state/engine-init.stage",
@@ -2579,19 +2781,23 @@ def repair_engine(
         if result["Status"] == "Success":
             output = result["StandardOutputContent"]
             console.print("[green]✓ Engine repaired successfully![/green]\n")
-
+
             # Show repair results
             if "=== Repair Complete ===" in output:
                 repair_section = output.split("=== Repair Complete ===")[1].strip()
                 console.print("[bold]Repair Results:[/bold]")
                 console.print(repair_section)
-
-            console.print(
+
+            console.print(
+                "\n[dim]You should now be able to attach studios to this engine.[/dim]"
+            )
         else:
             console.print(
                 f"[red]❌ Repair failed: {result.get('StandardErrorContent', 'Unknown error')}[/red]"
             )
-            console.print(
+            console.print(
+                "\n[yellow]Try running 'dh engine debug' for more information.[/yellow]"
+            )

     except Exception as e:
         console.print(f"[red]❌ Failed to repair engine: {e}[/red]")
dayhoff_tools/cli/utility_commands.py

@@ -225,7 +225,7 @@ def build_and_upload_wheel(bump_part: str = "patch"):
         pass
     if proj_name == "dayhoff-tools":
         print("Re-installing dayhoff-tools into the active environment…")
-        reinstall_cmd = ["uv", "pip", "install", "-e", "."]
+        reinstall_cmd = ["uv", "pip", "install", "-e", ".[full]"]
         print(f"Running command: {BLUE}{' '.join(reinstall_cmd)}{RESET}")
         subprocess.run(reinstall_cmd, check=True)
         print("dayhoff-tools reinstalled in the current environment.")
{dayhoff_tools-1.7.3.dist-info → dayhoff_tools-1.7.5.dist-info}/RECORD

@@ -3,10 +3,10 @@ dayhoff_tools/chemistry/standardizer.py,sha256=uMn7VwHnx02nc404eO6fRuS4rsl4dvSPf
 dayhoff_tools/chemistry/utils.py,sha256=jt-7JgF-GeeVC421acX-bobKbLU_X94KNOW24p_P-_M,2257
 dayhoff_tools/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dayhoff_tools/cli/cloud_commands.py,sha256=33qcWLmq-FwEXMdL3F0OHm-5Stlh2r65CldyEZgQ1no,40904
-dayhoff_tools/cli/engine_commands.py,sha256=
+dayhoff_tools/cli/engine_commands.py,sha256=0syRVrJjWtRi7Y_q7MbEA5PKJ8TSXtEodHzxXu2Ymhs,102461
 dayhoff_tools/cli/main.py,sha256=LLMybU9KbtV_F4rwvoYAQZKTTF1nswlSZIfDMKdkh00,5925
 dayhoff_tools/cli/swarm_commands.py,sha256=5EyKj8yietvT5lfoz8Zx0iQvVaNgc3SJX1z2zQR6o6M,5614
-dayhoff_tools/cli/utility_commands.py,sha256=
+dayhoff_tools/cli/utility_commands.py,sha256=q2XyNy1uMGKg95cRXv1uA4MhaUDqlDBvkQwmvRo3hnA,25865
 dayhoff_tools/deployment/base.py,sha256=mYp560l6hSDFtyY2H42VoM8k9VUzfwuiyh9Knqpgc28,17441
 dayhoff_tools/deployment/deploy_aws.py,sha256=GvZpE2YIFA5Dl9rkAljFjtUypmPDNbWgw8NicHYTP24,18265
 dayhoff_tools/deployment/deploy_gcp.py,sha256=xgaOVsUDmP6wSEMYNkm1yRNcVskfdz80qJtCulkBIAM,8860
@@ -27,7 +27,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
 dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
 dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
 dayhoff_tools/warehouse.py,sha256=heaYc64qplgN3_1WVPFmqj53goStioWwY5NqlWc4c0s,24453
-dayhoff_tools-1.7.
-dayhoff_tools-1.7.
-dayhoff_tools-1.7.
-dayhoff_tools-1.7.
+dayhoff_tools-1.7.5.dist-info/METADATA,sha256=tnZl6OD70iojC4Avmej4LaBhAAQO2afIzZ4PBj6BEws,2914
+dayhoff_tools-1.7.5.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+dayhoff_tools-1.7.5.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
+dayhoff_tools-1.7.5.dist-info/RECORD,,
{dayhoff_tools-1.7.3.dist-info → dayhoff_tools-1.7.5.dist-info}/WHEEL: File without changes
{dayhoff_tools-1.7.3.dist-info → dayhoff_tools-1.7.5.dist-info}/entry_points.txt: File without changes