dayhoff-tools 1.3.16__py3-none-any.whl → 1.3.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,6 +18,7 @@ from rich.panel import Panel
18
18
  from rich.progress import Progress, SpinnerColumn, TextColumn
19
19
  from rich.prompt import Confirm, IntPrompt, Prompt
20
20
  from rich.table import Table
21
+ import re
21
22
 
22
23
  # Initialize Typer apps
23
24
  engine_app = typer.Typer(help="Manage compute engines for development.")
@@ -37,6 +38,41 @@ HOURLY_COSTS = {
37
38
  # SSH config management
38
39
  SSH_MANAGED_COMMENT = "# Managed by dh engine"
39
40
 
41
+ # --------------------------------------------------------------------------------
42
+ # Bootstrap stage helpers
43
+ # --------------------------------------------------------------------------------
44
+
45
+ def _colour_stage(stage: str) -> str:
46
+ """Return colourised stage name for table output."""
47
+ if not stage:
48
+ return "[dim]-[/dim]"
49
+ low = stage.lower()
50
+ if low.startswith("error"):
51
+ return f"[red]{stage}[/red]"
52
+ if low == "finished":
53
+ return f"[green]{stage}[/green]"
54
+ return f"[yellow]{stage}[/yellow]"
55
+
56
+
57
+ def _fetch_init_stages(instance_ids: List[str]) -> Dict[str, str]:
58
+ """Fetch DayhoffInitStage tag for many instances in one call."""
59
+ if not instance_ids:
60
+ return {}
61
+ ec2 = boto3.client("ec2", region_name="us-east-1")
62
+ stages: Dict[str, str] = {}
63
+ try:
64
+ paginator = ec2.get_paginator("describe_instances")
65
+ for page in paginator.paginate(InstanceIds=instance_ids):
66
+ for res in page["Reservations"]:
67
+ for inst in res["Instances"]:
68
+ iid = inst["InstanceId"]
69
+ tag_val = next((t["Value"] for t in inst.get("Tags", []) if t["Key"] == "DayhoffInitStage"), None)
70
+ if tag_val:
71
+ stages[iid] = tag_val
72
+ except Exception:
73
+ pass # best-effort
74
+ return stages
75
+
40
76
 
41
77
  def check_aws_sso() -> str:
42
78
  """Check AWS SSO status and return username."""
@@ -486,6 +522,9 @@ def list_engines(
486
522
  console.print("No engines found.")
487
523
  return
488
524
 
525
+ # Fetch bootstrap stages once
526
+ stages_map = _fetch_init_stages([e["instance_id"] for e in engines])
527
+
489
528
  # Create table
490
529
  table = Table(title="Engines", box=box.ROUNDED)
491
530
  table.add_column("Name", style="cyan")
@@ -493,6 +532,7 @@ def list_engines(
493
532
  table.add_column("Type")
494
533
  table.add_column("User")
495
534
  table.add_column("Status")
535
+ table.add_column("Stage")
496
536
  table.add_column("Disk Usage")
497
537
  table.add_column("Uptime/Since")
498
538
  table.add_column("$/hour", justify="right")
@@ -515,12 +555,15 @@ def list_engines(
515
555
  time_str = launch_time.strftime("%Y-%m-%d %H:%M")
516
556
  disk_usage = "-"
517
557
 
558
+ stage_display = _colour_stage(stages_map.get(engine["instance_id"], "-"))
559
+
518
560
  table.add_row(
519
561
  engine["name"],
520
562
  engine["instance_id"],
521
563
  engine["engine_type"],
522
564
  engine["user"],
523
565
  format_status(engine["state"], engine.get("ready")),
566
+ stage_display,
524
567
  disk_usage,
525
568
  time_str,
526
569
  f"${hourly_cost:.2f}",
@@ -539,8 +582,9 @@ def list_engines(
539
582
  @engine_app.command("status")
540
583
  def engine_status(
541
584
  name_or_id: str = typer.Argument(help="Engine name or instance ID"),
585
+ show_log: bool = typer.Option(False, "--show-log", help="Show bootstrap log"),
542
586
  ):
543
- """Show detailed status of an engine."""
587
+ """Show detailed engine status and information."""
544
588
  check_aws_sso()
545
589
 
546
590
  # Get all engines to resolve name
@@ -566,18 +610,43 @@ def engine_status(
566
610
  hourly_cost = HOURLY_COSTS.get(engine["engine_type"], 0)
567
611
  total_cost = hourly_cost * (uptime.total_seconds() / 3600)
568
612
 
569
- # Create status panel
613
+ stages_map = _fetch_init_stages([engine["instance_id"]])
614
+ stage_val = stages_map.get(engine["instance_id"], "-")
615
+
570
616
  status_lines = [
571
617
  f"[bold]Name:[/bold] {engine['name']}",
572
618
  f"[bold]Instance:[/bold] {engine['instance_id']}",
573
619
  f"[bold]Type:[/bold] {engine['engine_type']} ({engine['instance_type']})",
574
620
  f"[bold]Status:[/bold] {format_status(engine['state'], engine.get('ready'))}",
621
+ f"[bold]Bootstrap:[/bold] {_colour_stage(stage_val)}",
575
622
  f"[bold]User:[/bold] {engine['user']}",
576
623
  f"[bold]IP:[/bold] {engine.get('public_ip', 'N/A')}",
577
624
  f"[bold]Launched:[/bold] {launch_time.strftime('%Y-%m-%d %H:%M:%S')} ({format_duration(uptime)} ago)",
578
625
  f"[bold]Cost:[/bold] ${hourly_cost:.2f}/hour (${total_cost:.2f} total)",
579
626
  ]
580
627
 
628
+ # Health report (only if bootstrap finished)
629
+ if stage_val == "finished":
630
+ try:
631
+ ssm = boto3.client("ssm", region_name="us-east-1")
632
+ res = ssm.send_command(
633
+ InstanceIds=[engine["instance_id"]],
634
+ DocumentName="AWS-RunShellScript",
635
+ Parameters={"commands": ["cat /var/run/engine-health.json || true"], "executionTimeout": ["10"]},
636
+ )
637
+ cid = res["Command"]["CommandId"]
638
+ time.sleep(1)
639
+ inv = ssm.get_command_invocation(CommandId=cid, InstanceId=engine["instance_id"])
640
+ if inv["Status"] == "Success":
641
+ import json as _json
642
+ health = _json.loads(inv["StandardOutputContent"].strip() or "{}")
643
+ status_lines.append("")
644
+ status_lines.append("[bold]Health:[/bold]")
645
+ status_lines.append(f" • GPU Drivers: {'OK' if health.get('drivers_ok') else 'MISSING'}")
646
+ status_lines.append(f" • Idle Detector: {health.get('idle_detector_timer', 'unknown')}")
647
+ except Exception:
648
+ pass
649
+
581
650
  if attached_studios:
582
651
  status_lines.append("")
583
652
  status_lines.append("[bold]Attached Studios:[/bold]")
@@ -587,12 +656,30 @@ def engine_status(
587
656
  f" • {studio['user']} ({studio['studio_id']}) - attached {attach_time}"
588
657
  )
589
658
 
590
- panel = Panel(
591
- "\n".join(status_lines),
592
- title="Engine Details",
593
- border_style="blue",
594
- )
595
- console.print(panel)
659
+ console.print(Panel("\n".join(status_lines), title="Engine Status", border_style="blue"))
660
+
661
+ if show_log:
662
+ console.print("\n[bold]Bootstrap Log:[/bold]")
663
+ try:
664
+ ssm = boto3.client("ssm", region_name="us-east-1")
665
+ resp = ssm.send_command(
666
+ InstanceIds=[engine["instance_id"]],
667
+ DocumentName="AWS-RunShellScript",
668
+ Parameters={"commands": ["cat /var/log/engine-setup.log 2>/dev/null || echo 'No setup log found'"], "executionTimeout": ["15"]},
669
+ )
670
+ cid = resp["Command"]["CommandId"]
671
+ time.sleep(2)
672
+ inv = ssm.get_command_invocation(CommandId=cid, InstanceId=engine["instance_id"])
673
+ if inv["Status"] == "Success":
674
+ log_content = inv["StandardOutputContent"].strip()
675
+ if log_content:
676
+ console.print(f"[dim]{log_content}[/dim]")
677
+ else:
678
+ console.print("[yellow]No bootstrap log available[/yellow]")
679
+ else:
680
+ console.print("[red]❌ Could not retrieve bootstrap log[/red]")
681
+ except Exception as e:
682
+ console.print(f"[red]❌ Error fetching log: {e}[/red]")
596
683
 
597
684
 
598
685
  @engine_app.command("stop")
@@ -1909,3 +1996,125 @@ def resize_studio(
1909
1996
 
1910
1997
  console.print("\n[dim]The filesystem will be automatically expanded when you next attach the studio.[/dim]")
1911
1998
  console.print(f"To attach: [cyan]dh studio attach <engine-name>[/cyan]")
1999
+
2000
+ # ================= Idle timeout command =================
2001
+
2002
+
2003
def _await_ssm_invocation(
    ssm,
    command_id: str,
    instance_id: str,
    max_wait: float = 30.0,
    poll_interval: float = 1.0,
) -> Optional[dict]:
    """Poll an SSM command invocation until it leaves the in-flight states.

    Args:
        ssm: boto3 SSM client used to send the command.
        command_id: ID returned by ``send_command``.
        instance_id: Instance the command was sent to.
        max_wait: Approximate upper bound, in seconds, on the time spent polling.
        poll_interval: Seconds to sleep before each poll.

    Returns:
        The last ``get_command_invocation`` response, or ``None`` if the
        invocation never became visible. On timeout the returned status may
        still be in-flight, so callers must check ``Status`` themselves.
    """
    in_flight = {"Pending", "InProgress", "Delayed"}
    deadline = time.monotonic() + max_wait
    invocation = None
    while True:
        time.sleep(poll_interval)
        try:
            invocation = ssm.get_command_invocation(
                CommandId=command_id, InstanceId=instance_id
            )
        except ssm.exceptions.InvocationDoesNotExist:
            # The invocation may not be registered yet right after send_command.
            invocation = None
        if invocation is not None and invocation["Status"] not in in_flight:
            return invocation
        if time.monotonic() >= deadline:
            return invocation


@engine_app.command("idle-timeout")
def idle_timeout_cmd(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    set: Optional[str] = typer.Option(
        None, "--set", "-s", help="New timeout (e.g., 2h30m, 45m)"
    ),
):
    """Show or set the engine idle-detector timeout."""
    # NOTE: the docstring above doubles as the CLI help text, so details live
    # in comments. Without --set the current IDLE_TIMEOUT_SECONDS value is read
    # from /etc/engine.env over SSM; with --set the value is rewritten and the
    # idle-detector timer restarted. Exits with code 1 on invalid input, on a
    # failed engine lookup, or when the update command does not succeed.
    check_aws_sso()

    # Resolve engine
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)
    instance_id = engine["instance_id"]

    ssm = boto3.client("ssm", region_name="us-east-1")

    if set is None:
        # ----- show current value -----
        resp = ssm.send_command(
            InstanceIds=[instance_id],
            DocumentName="AWS-RunShellScript",
            Parameters={
                "commands": [
                    "grep -E '^IDLE_TIMEOUT_SECONDS=' /etc/engine.env || echo 'IDLE_TIMEOUT_SECONDS=1800'"
                ],
                "executionTimeout": ["10"],
            },
        )
        inv = _await_ssm_invocation(
            ssm, resp["Command"]["CommandId"], instance_id, max_wait=15.0
        )
        if inv is None or inv["Status"] != "Success":
            console.print("[red]❌ Could not retrieve idle timeout[/red]")
            return

        # grep may print several matching lines; the set path appends, so the
        # last numeric entry is the one to report. Fall back to 1800 (the same
        # default the remote command echoes) when nothing parseable came back.
        values = re.findall(
            r"^IDLE_TIMEOUT_SECONDS=(\d+)\s*$",
            inv["StandardOutputContent"],
            flags=re.MULTILINE,
        )
        secs = int(values[-1]) if values else 1800
        console.print(f"Current idle timeout: {secs//60}m ({secs} seconds)")
        return

    # ----- set new value -----
    m = re.fullmatch(r"(?:(\d+)h)?(?:(\d+)m)?", set.strip())
    if not m:
        console.print("[red]❌ Invalid duration format. Use e.g. 2h, 45m, 1h30m[/red]")
        raise typer.Exit(1)
    hours = int(m.group(1) or 0)
    minutes = int(m.group(2) or 0)
    seconds = hours * 3600 + minutes * 60
    if seconds == 0:
        # Also covers the empty string, which the all-optional pattern accepts.
        console.print("[red]❌ Duration must be greater than zero[/red]")
        raise typer.Exit(1)

    console.print(f"Setting idle timeout to {set} ({seconds} seconds)…")

    # `seconds` is an int computed above, so interpolating it is shell-safe.
    cmd = (
        "sudo sed -i '/^IDLE_TIMEOUT_SECONDS=/d' /etc/engine.env && "
        f"echo 'IDLE_TIMEOUT_SECONDS={seconds}' | sudo tee -a /etc/engine.env >/dev/null && "
        "sudo systemctl restart engine-idle-detector.timer"
    )

    resp = ssm.send_command(
        InstanceIds=[instance_id],
        DocumentName="AWS-RunShellScript",
        Parameters={"commands": [cmd], "executionTimeout": ["60"]},
    )
    inv = _await_ssm_invocation(
        ssm, resp["Command"]["CommandId"], instance_id, max_wait=60.0
    )

    # Only claim success once the remote command has actually succeeded.
    if inv is None or inv["Status"] != "Success":
        status = inv["Status"] if inv is not None else "unknown"
        console.print(
            f"[red]❌ Failed to update idle timeout (status: {status})[/red]"
        )
        detail = (inv or {}).get("StandardErrorContent", "").strip()
        if detail:
            # Remote stderr is arbitrary text; print it without markup parsing.
            console.print(detail, markup=False)
        raise typer.Exit(1)

    console.print(f"[green]✓ Idle timeout updated to {set}[/green]")
2068
+
2069
+ # ================= Debug command =================
2070
+
2071
@engine_app.command("debug")
def debug_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
):
    """Debug engine bootstrap status and files."""
    # NOTE: the docstring above doubles as the CLI help text, so details live
    # in comments. Each check below is run on the instance through SSM
    # (AWS-RunShellScript) and its stdout is printed; a check that errors or
    # does not succeed is reported and the remaining checks still run.
    from rich.markup import escape  # local import; rich is already a file dependency

    check_aws_sso()

    # Resolve engine
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)
    instance_id = engine["instance_id"]

    console.print(f"[bold]Debug info for {engine['name']}:[/bold]\n")

    ssm = boto3.client("ssm", region_name="us-east-1")

    # Check multiple files and systemd status
    checks = [
        ("Stage file", "cat /var/run/engine-init.stage 2>/dev/null || echo 'MISSING'"),
        ("Health file", "cat /var/run/engine-health.json 2>/dev/null || echo 'MISSING'"),
        ("Sentinel file", "ls -la /opt/dayhoff/first_boot_complete.sentinel 2>/dev/null || echo 'MISSING'"),
        ("Setup service", "systemctl status setup-aws-vm.service --no-pager || echo 'Service not found'"),
        ("Bootstrap log tail", "tail -20 /var/log/engine-setup.log 2>/dev/null || echo 'No log'"),
        ("Environment file", "cat /etc/engine.env 2>/dev/null || echo 'MISSING'"),
    ]

    # Statuses that mean the command has not finished yet.
    in_flight = {"Pending", "InProgress", "Delayed"}

    for name, cmd in checks:
        try:
            resp = ssm.send_command(
                InstanceIds=[instance_id],
                DocumentName="AWS-RunShellScript",
                Parameters={"commands": [cmd], "executionTimeout": ["10"]},
            )
            cid = resp["Command"]["CommandId"]

            # Poll instead of a single fetch after a fixed sleep, so a command
            # that is merely still running is not reported as FAILED.
            inv = None
            deadline = time.monotonic() + 15
            while True:
                time.sleep(1)
                try:
                    inv = ssm.get_command_invocation(
                        CommandId=cid, InstanceId=instance_id
                    )
                except ssm.exceptions.InvocationDoesNotExist:
                    # The invocation may not be registered yet.
                    inv = None
                finished = inv is not None and inv["Status"] not in in_flight
                if finished or time.monotonic() >= deadline:
                    break

            if inv is not None and inv["Status"] == "Success":
                output = inv["StandardOutputContent"].strip()
                console.print(f"[cyan]{name}:[/cyan]")
                # Remote output is arbitrary text; escape it so bracketed log
                # fragments are shown literally instead of parsed as markup.
                console.print(f"[dim]{escape(output)}[/dim]\n")
            else:
                console.print(f"[cyan]{name}:[/cyan] [red]FAILED[/red]\n")

        except Exception as e:
            # Keep going with the remaining checks; this is a diagnostics command.
            console.print(f"[cyan]{name}:[/cyan] [red]ERROR: {escape(str(e))}[/red]\n")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: dayhoff-tools
3
- Version: 1.3.16
3
+ Version: 1.3.18
4
4
  Summary: Common tools for all the repos at Dayhoff Labs
5
5
  Author: Daniel Martin-Alarcon
6
6
  Author-email: dma@dayhofflabs.com
@@ -3,7 +3,7 @@ dayhoff_tools/chemistry/standardizer.py,sha256=uMn7VwHnx02nc404eO6fRuS4rsl4dvSPf
3
3
  dayhoff_tools/chemistry/utils.py,sha256=jt-7JgF-GeeVC421acX-bobKbLU_X94KNOW24p_P-_M,2257
4
4
  dayhoff_tools/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  dayhoff_tools/cli/cloud_commands.py,sha256=33qcWLmq-FwEXMdL3F0OHm-5Stlh2r65CldyEZgQ1no,40904
6
- dayhoff_tools/cli/engine_commands.py,sha256=TX9IwHkpb-x3OvvydwwhsXqxCpXnZ9TCNiOvYXYGP94,70265
6
+ dayhoff_tools/cli/engine_commands.py,sha256=4Z00VAwAaZKv0z1cXimH_JF6UKgGnX4rUawLxfFf8Zs,79197
7
7
  dayhoff_tools/cli/main.py,sha256=rgeEHD9lJ8SBCR34BTLb7gVInHUUdmEBNXAJnq5yEU4,4795
8
8
  dayhoff_tools/cli/swarm_commands.py,sha256=5EyKj8yietvT5lfoz8Zx0iQvVaNgc3SJX1z2zQR6o6M,5614
9
9
  dayhoff_tools/cli/utility_commands.py,sha256=qs8vH9TBFHsOPC3X8cU3qZigM3dDn-2Ytq4o_F2WubU,27874
@@ -27,7 +27,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
27
27
  dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
28
28
  dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
29
29
  dayhoff_tools/warehouse.py,sha256=8YbnQ--usrEgDQGfvpV4MrMji55A0rq2hZaOgFGh6ag,15896
30
- dayhoff_tools-1.3.16.dist-info/METADATA,sha256=Ylw3uOqRFudCtgZdphUqKDLMBn0bH7O6-Ns8ZGTQ5R4,2825
31
- dayhoff_tools-1.3.16.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
32
- dayhoff_tools-1.3.16.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
33
- dayhoff_tools-1.3.16.dist-info/RECORD,,
30
+ dayhoff_tools-1.3.18.dist-info/METADATA,sha256=hQN8_0h-PnFGTHJw8p775acWefGqk3y1klZuUkhmJO8,2825
31
+ dayhoff_tools-1.3.18.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
32
+ dayhoff_tools-1.3.18.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
33
+ dayhoff_tools-1.3.18.dist-info/RECORD,,