PyPI - dayhoff-tools - Versions diffs - 1.3.16__tar.gz → 1.3.18__tar.gz - Mend

dayhoff-tools 1.3.16tar.gz → 1.3.18tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

{dayhoff_tools-1.3.16 → dayhoff_tools-1.3.18}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: dayhoff-tools
-Version: 1.3.16
+Version: 1.3.18
 Summary: Common tools for all the repos at Dayhoff Labs
 Author: Daniel Martin-Alarcon
 Author-email: dma@dayhofflabs.com

{dayhoff_tools-1.3.16 → dayhoff_tools-1.3.18}/dayhoff_tools/cli/engine_commands.py RENAMED Viewed

@@ -18,6 +18,7 @@ from rich.panel import Panel
 from rich.progress import Progress, SpinnerColumn, TextColumn
 from rich.prompt import Confirm, IntPrompt, Prompt
 from rich.table import Table
+import re
 # Initialize Typer apps
 engine_app = typer.Typer(help="Manage compute engines for development.")
@@ -37,6 +38,41 @@ HOURLY_COSTS = {
 # SSH config management
 SSH_MANAGED_COMMENT = "# Managed by dh engine"
+# --------------------------------------------------------------------------------
+# Bootstrap stage helpers
+# --------------------------------------------------------------------------------
+def _colour_stage(stage: str) -> str:
+    """Return the stage name wrapped in Rich colour markup for table output.
+
+    Args:
+        stage: Bootstrap stage value. An empty/blank value, or the ``"-"``
+            placeholder callers use when no stage tag was found, means
+            "no stage known".
+
+    Returns:
+        A dim dash when no stage is known; otherwise the stage text in red
+        if it starts with "error", green if it equals "finished" (both
+        compared case-insensitively), and yellow for anything else.
+    """
+    # Callers default a missing tag to "-"; without this check the placeholder
+    # would be coloured yellow as if it were an in-progress stage.
+    if not stage or stage.strip() in ("", "-"):
+        return "[dim]-[/dim]"
+    low = stage.lower()
+    if low.startswith("error"):
+        return f"[red]{stage}[/red]"
+    if low == "finished":
+        return f"[green]{stage}[/green]"
+    return f"[yellow]{stage}[/yellow]"
+def _fetch_init_stages(instance_ids: List[str]) -> Dict[str, str]:
+    """Fetch the ``DayhoffInitStage`` tag for many instances in one paginated query.
+
+    Args:
+        instance_ids: EC2 instance IDs to look up. An empty list returns an
+            empty mapping without contacting AWS.
+
+    Returns:
+        Mapping of instance ID to its non-empty ``DayhoffInitStage`` tag
+        value. Instances without the tag are omitted. The lookup is
+        best-effort: if the query fails, whatever was collected so far is
+        returned instead of raising.
+    """
+    if not instance_ids:
+        return {}
+    import logging
+
+    ec2 = boto3.client("ec2", region_name="us-east-1")
+    stages: Dict[str, str] = {}
+    try:
+        paginator = ec2.get_paginator("describe_instances")
+        for page in paginator.paginate(InstanceIds=instance_ids):
+            for res in page["Reservations"]:
+                for inst in res["Instances"]:
+                    tag_val = next(
+                        (t["Value"] for t in inst.get("Tags", []) if t["Key"] == "DayhoffInitStage"),
+                        None,
+                    )
+                    if tag_val:
+                        stages[inst["InstanceId"]] = tag_val
+    except Exception as exc:
+        # Still best-effort (the stage is display-only), but leave a trace so
+        # an all-missing "Stage" column can be diagnosed.
+        logging.getLogger(__name__).debug("Could not fetch DayhoffInitStage tags: %s", exc)
+    return stages
 def check_aws_sso() -> str:
     """Check AWS SSO status and return username."""
@@ -486,6 +522,9 @@ def list_engines(
             console.print("No engines found.")
             return
+        # Fetch bootstrap stages once
+        stages_map = _fetch_init_stages([e["instance_id"] for e in engines])
         # Create table
         table = Table(title="Engines", box=box.ROUNDED)
         table.add_column("Name", style="cyan")
@@ -493,6 +532,7 @@ def list_engines(
         table.add_column("Type")
         table.add_column("User")
         table.add_column("Status")
+        table.add_column("Stage")
         table.add_column("Disk Usage")
         table.add_column("Uptime/Since")
         table.add_column("$/hour", justify="right")
@@ -515,12 +555,15 @@ def list_engines(
                 time_str = launch_time.strftime("%Y-%m-%d %H:%M")
                 disk_usage = "-"
+            stage_display = _colour_stage(stages_map.get(engine["instance_id"], "-"))
             table.add_row(
                 engine["name"],
                 engine["instance_id"],
                 engine["engine_type"],
                 engine["user"],
                 format_status(engine["state"], engine.get("ready")),
+                stage_display,
                 disk_usage,
                 time_str,
                 f"${hourly_cost:.2f}",
@@ -539,8 +582,9 @@ def list_engines(
 @engine_app.command("status")
 def engine_status(
     name_or_id: str = typer.Argument(help="Engine name or instance ID"),
+    show_log: bool = typer.Option(False, "--show-log", help="Show bootstrap log"),
 ):
-    """Show detailed status of an engine."""
+    """Show detailed engine status and information."""
     check_aws_sso()
     # Get all engines to resolve name
@@ -566,18 +610,43 @@ def engine_status(
     hourly_cost = HOURLY_COSTS.get(engine["engine_type"], 0)
     total_cost = hourly_cost * (uptime.total_seconds() / 3600)
-    # Create status panel
+    stages_map = _fetch_init_stages([engine["instance_id"]])
+    stage_val = stages_map.get(engine["instance_id"], "-")
     status_lines = [
         f"[bold]Name:[/bold]        {engine['name']}",
         f"[bold]Instance:[/bold]    {engine['instance_id']}",
         f"[bold]Type:[/bold]        {engine['engine_type']} ({engine['instance_type']})",
         f"[bold]Status:[/bold]      {format_status(engine['state'], engine.get('ready'))}",
+        f"[bold]Bootstrap:[/bold]   {_colour_stage(stage_val)}",
         f"[bold]User:[/bold]        {engine['user']}",
         f"[bold]IP:[/bold]          {engine.get('public_ip', 'N/A')}",
         f"[bold]Launched:[/bold]    {launch_time.strftime('%Y-%m-%d %H:%M:%S')} ({format_duration(uptime)} ago)",
         f"[bold]Cost:[/bold]        ${hourly_cost:.2f}/hour (${total_cost:.2f} total)",
     ]
+    # Health report (only if bootstrap finished)
+    if stage_val == "finished":
+        try:
+            ssm = boto3.client("ssm", region_name="us-east-1")
+            res = ssm.send_command(
+                InstanceIds=[engine["instance_id"]],
+                DocumentName="AWS-RunShellScript",
+                Parameters={"commands": ["cat /var/run/engine-health.json || true"], "executionTimeout": ["10"]},
+            )
+            cid = res["Command"]["CommandId"]
+            time.sleep(1)
+            inv = ssm.get_command_invocation(CommandId=cid, InstanceId=engine["instance_id"])
+            if inv["Status"] == "Success":
+                import json as _json
+                health = _json.loads(inv["StandardOutputContent"].strip() or "{}")
+                status_lines.append("")
+                status_lines.append("[bold]Health:[/bold]")
+                status_lines.append(f"  • GPU Drivers: {'OK' if health.get('drivers_ok') else 'MISSING'}")
+                status_lines.append(f"  • Idle Detector: {health.get('idle_detector_timer', 'unknown')}")
+        except Exception:
+            pass
     if attached_studios:
         status_lines.append("")
         status_lines.append("[bold]Attached Studios:[/bold]")
@@ -587,12 +656,30 @@ def engine_status(
                 f"  • {studio['user']} ({studio['studio_id']}) - attached {attach_time}"
             )
-    panel = Panel(
-        "\n".join(status_lines),
-        title="Engine Details",
-        border_style="blue",
-    )
-    console.print(panel)
+    console.print(Panel("\n".join(status_lines), title="Engine Status", border_style="blue"))
+    if show_log:
+        console.print("\n[bold]Bootstrap Log:[/bold]")
+        try:
+            ssm = boto3.client("ssm", region_name="us-east-1")
+            resp = ssm.send_command(
+                InstanceIds=[engine["instance_id"]],
+                DocumentName="AWS-RunShellScript",
+                Parameters={"commands": ["cat /var/log/engine-setup.log 2>/dev/null || echo 'No setup log found'"], "executionTimeout": ["15"]},
+            )
+            cid = resp["Command"]["CommandId"]
+            time.sleep(2)
+            inv = ssm.get_command_invocation(CommandId=cid, InstanceId=engine["instance_id"])
+            if inv["Status"] == "Success":
+                log_content = inv["StandardOutputContent"].strip()
+                if log_content:
+                    console.print(f"[dim]{log_content}[/dim]")
+                else:
+                    console.print("[yellow]No bootstrap log available[/yellow]")
+            else:
+                console.print("[red]❌ Could not retrieve bootstrap log[/red]")
+        except Exception as e:
+            console.print(f"[red]❌ Error fetching log: {e}[/red]")
 @engine_app.command("stop")
@@ -1909,3 +1996,125 @@ def resize_studio(
     console.print("\n[dim]The filesystem will be automatically expanded when you next attach the studio.[/dim]")
     console.print(f"To attach: [cyan]dh studio attach <engine-name>[/cyan]")
+# ================= Idle timeout command =================
+@engine_app.command("idle-timeout")
+def idle_timeout_cmd(
+    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
+    set: Optional[str] = typer.Option(None, "--set", "-s", help="New timeout (e.g., 2h30m, 45m)")
+):
+    """Show or set the engine idle-detector timeout.
+
+    Without ``--set``, reads ``IDLE_TIMEOUT_SECONDS`` from ``/etc/engine.env``
+    on the engine via SSM and prints it (1800 seconds is shown when the key
+    is absent). With ``--set``, rewrites that key and restarts
+    ``engine-idle-detector.timer``.
+
+    Args:
+        name_or_id: Engine name or instance ID, resolved through ``resolve_engine``.
+        set: New timeout as hours and/or minutes, e.g. ``2h``, ``45m``, ``1h30m``.
+
+    Raises:
+        typer.Exit: With code 1 when the engine list cannot be fetched, the
+            duration is invalid or zero, the stored value cannot be parsed,
+            or the remote update does not finish successfully.
+    """
+    check_aws_sso()
+    # Resolve engine
+    response = make_api_request("GET", "/engines")
+    if response.status_code != 200:
+        console.print("[red]❌ Failed to fetch engines[/red]")
+        raise typer.Exit(1)
+    engines = response.json().get("engines", [])
+    engine = resolve_engine(name_or_id, engines)
+    instance_id = engine["instance_id"]
+    ssm = boto3.client("ssm", region_name="us-east-1")
+
+    def _run(command: str, timeout_s: int) -> dict:
+        """Run one shell command via SSM and wait until it reaches a final status."""
+        resp = ssm.send_command(
+            InstanceIds=[instance_id],
+            DocumentName="AWS-RunShellScript",
+            Parameters={"commands": [command], "executionTimeout": [str(timeout_s)]},
+        )
+        cid = resp["Command"]["CommandId"]
+        # A single fixed sleep races with command delivery; poll instead, with
+        # a small grace period on top of the remote execution timeout.
+        deadline = time.monotonic() + timeout_s + 5
+        while True:
+            time.sleep(1)
+            try:
+                inv = ssm.get_command_invocation(CommandId=cid, InstanceId=instance_id)
+            except ssm.exceptions.InvocationDoesNotExist:
+                # The invocation may not be registered yet right after send_command.
+                inv = {"Status": "Pending"}
+            if inv["Status"] not in ("Pending", "InProgress", "Delayed") or time.monotonic() >= deadline:
+                return inv
+
+    if set is None:
+        # Show current
+        inv = _run(
+            "grep -E '^IDLE_TIMEOUT_SECONDS=' /etc/engine.env || echo 'IDLE_TIMEOUT_SECONDS=1800'",
+            10,
+        )
+        if inv["Status"] != "Success":
+            console.print("[red]❌ Could not retrieve idle timeout[/red]")
+            return
+        # grep may print several matching lines; use the last one.
+        # NOTE(review): assumes the last assignment is the effective one — confirm.
+        lines = inv["StandardOutputContent"].strip().splitlines()
+        value = lines[-1].partition("=")[2].strip().strip("'\"") if lines else ""
+        try:
+            secs = int(value) if value else 1800
+        except ValueError:
+            console.print(f"[red]❌ Unexpected idle timeout value: {value!r}[/red]")
+            raise typer.Exit(1)
+        console.print(f"Current idle timeout: {secs//60}m ({secs} seconds)")
+        return
+
+    # ----- set new value -----
+    m = re.fullmatch(r"(?:(\d+)h)?(?:(\d+)m)?", set)
+    if not m:
+        console.print("[red]❌ Invalid duration format. Use e.g. 2h, 45m, 1h30m[/red]")
+        raise typer.Exit(1)
+    hours = int(m.group(1) or 0)
+    minutes = int(m.group(2) or 0)
+    seconds = hours * 3600 + minutes * 60
+    if seconds == 0:
+        # Also covers the empty string, which the pattern accepts.
+        console.print("[red]❌ Duration must be greater than zero[/red]")
+        raise typer.Exit(1)
+    console.print(f"Setting idle timeout to {set} ({seconds} seconds)…")
+    cmd = (
+        "sudo sed -i '/^IDLE_TIMEOUT_SECONDS=/d' /etc/engine.env && "
+        f"echo 'IDLE_TIMEOUT_SECONDS={seconds}' | sudo tee -a /etc/engine.env >/dev/null && "
+        "sudo systemctl restart engine-idle-detector.timer"
+    )
+    inv = _run(cmd, 60)
+    if inv["Status"] != "Success":
+        # Report the real outcome instead of unconditionally claiming success.
+        console.print(f"[red]❌ Failed to update idle timeout (status: {inv['Status']})[/red]")
+        detail = (inv.get("StandardErrorContent") or "").strip()
+        if detail:
+            console.print(f"[dim]{detail}[/dim]")
+        raise typer.Exit(1)
+    console.print(f"[green]✓ Idle timeout updated to {set}[/green]")
+# ================= Debug command =================
+@engine_app.command("debug")
+def debug_engine(
+    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
+):
+    """Debug engine bootstrap status and files.
+
+    Runs a fixed list of read-only shell checks on the engine through SSM
+    (stage file, health file, sentinel file, setup service status, bootstrap
+    log tail, environment file) and prints each result. A failing check is
+    reported inline and does not stop the remaining checks.
+
+    Args:
+        name_or_id: Engine name or instance ID, resolved through ``resolve_engine``.
+
+    Raises:
+        typer.Exit: With code 1 when the engine list cannot be fetched.
+    """
+    check_aws_sso()
+    # Resolve engine
+    response = make_api_request("GET", "/engines")
+    if response.status_code != 200:
+        console.print("[red]❌ Failed to fetch engines[/red]")
+        raise typer.Exit(1)
+    engines = response.json().get("engines", [])
+    engine = resolve_engine(name_or_id, engines)
+    instance_id = engine["instance_id"]
+    console.print(f"[bold]Debug info for {engine['name']}:[/bold]\n")
+    ssm = boto3.client("ssm", region_name="us-east-1")
+    # Check multiple files and systemd status
+    checks = [
+        ("Stage file", "cat /var/run/engine-init.stage 2>/dev/null || echo 'MISSING'"),
+        ("Health file", "cat /var/run/engine-health.json 2>/dev/null || echo 'MISSING'"),
+        ("Sentinel file", "ls -la /opt/dayhoff/first_boot_complete.sentinel 2>/dev/null || echo 'MISSING'"),
+        ("Setup service", "systemctl status setup-aws-vm.service --no-pager || echo 'Service not found'"),
+        ("Bootstrap log tail", "tail -20 /var/log/engine-setup.log 2>/dev/null || echo 'No log'"),
+        ("Environment file", "cat /etc/engine.env 2>/dev/null || echo 'MISSING'"),
+    ]
+    for name, cmd in checks:
+        try:
+            resp = ssm.send_command(
+                InstanceIds=[instance_id],
+                DocumentName="AWS-RunShellScript",
+                Parameters={"commands": [cmd], "executionTimeout": ["10"]},
+            )
+            cid = resp["Command"]["CommandId"]
+            # Poll until the command reaches a final status; a single fixed
+            # sleep reports slow-but-healthy commands as failures.
+            deadline = time.monotonic() + 15
+            while True:
+                time.sleep(1)
+                try:
+                    inv = ssm.get_command_invocation(CommandId=cid, InstanceId=instance_id)
+                except ssm.exceptions.InvocationDoesNotExist:
+                    # The invocation may not be registered yet right after send_command.
+                    inv = {"Status": "Pending"}
+                if inv["Status"] not in ("Pending", "InProgress", "Delayed") or time.monotonic() >= deadline:
+                    break
+            if inv["Status"] == "Success":
+                output = inv["StandardOutputContent"].strip()
+                console.print(f"[cyan]{name}:[/cyan]")
+                console.print(f"[dim]{output}[/dim]\n")
+            else:
+                console.print(f"[cyan]{name}:[/cyan] [red]FAILED[/red]\n")
+        except Exception as e:
+            # Keep going: one failing check should not hide the others.
+            console.print(f"[cyan]{name}:[/cyan] [red]ERROR: {e}[/red]\n")

{dayhoff_tools-1.3.16 → dayhoff_tools-1.3.18}/pyproject.toml RENAMED Viewed

@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
 [project]
 name = "dayhoff-tools"
-version = "1.3.16"
+version = "1.3.18"
 description = "Common tools for all the repos at Dayhoff Labs"
 authors = [
     {name = "Daniel Martin-Alarcon", email = "dma@dayhofflabs.com"}