dayhoff-tools 1.10.0__tar.gz → 1.10.1__tar.gz
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/PKG-INFO +1 -1
- dayhoff_tools-1.10.1/dayhoff_tools/cli/engine/__init__.py +1 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/engine/coffee.py +110 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/engine/config_ssh.py +113 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/engine/debug.py +79 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/engine/gami.py +160 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/engine/idle.py +148 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/engine/launch.py +101 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/engine/list.py +116 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/engine/repair.py +128 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/engine/resize.py +195 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/engine/ssh.py +62 -0
- dayhoff_tools-1.10.0/dayhoff_tools/cli/engine/engine_core.py → dayhoff_tools-1.10.1/dayhoff_tools/cli/engine/status.py +6 -201
- dayhoff_tools-1.10.0/dayhoff_tools/cli/engine/__init__.py → dayhoff_tools-1.10.1/dayhoff_tools/cli/engine_studio_commands.py +22 -22
- dayhoff_tools-1.10.1/dayhoff_tools/cli/engine_studio_utils/__init__.py +1 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/engine_studio_utils/api_utils.py +47 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/engine_studio_utils/aws_utils.py +102 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/engine_studio_utils/constants.py +21 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/engine_studio_utils/formatting.py +210 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/engine_studio_utils/ssh_utils.py +141 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/cli/main.py +1 -2
- dayhoff_tools-1.10.1/dayhoff_tools/cli/studio/__init__.py +1 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/studio/attach.py +314 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/studio/create.py +48 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/studio/delete.py +71 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/studio/detach.py +56 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/studio/list.py +81 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/studio/reset.py +90 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/studio/resize.py +134 -0
- dayhoff_tools-1.10.1/dayhoff_tools/cli/studio/status.py +78 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/pyproject.toml +1 -1
- dayhoff_tools-1.10.0/dayhoff_tools/cli/engine/engine_maintenance.py +0 -431
- dayhoff_tools-1.10.0/dayhoff_tools/cli/engine/engine_management.py +0 -505
- dayhoff_tools-1.10.0/dayhoff_tools/cli/engine/shared.py +0 -501
- dayhoff_tools-1.10.0/dayhoff_tools/cli/engine/studio_commands.py +0 -825
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/README.md +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/__init__.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/chemistry/standardizer.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/chemistry/utils.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/cli/__init__.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/cli/cloud_commands.py +0 -0
- dayhoff_tools-1.10.0/dayhoff_tools/cli/engine/engine_lifecycle.py → dayhoff_tools-1.10.1/dayhoff_tools/cli/engine/lifecycle.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/cli/swarm_commands.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/cli/utility_commands.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/deployment/base.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/deployment/deploy_aws.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/deployment/deploy_gcp.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/deployment/deploy_utils.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/deployment/job_runner.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/deployment/processors.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/deployment/swarm.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/embedders.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/fasta.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/file_ops.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/h5.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/intake/gcp.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/intake/gtdb.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/intake/kegg.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/intake/mmseqs.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/intake/structure.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/intake/uniprot.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/logs.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/sqlite.py +0 -0
- {dayhoff_tools-1.10.0 → dayhoff_tools-1.10.1}/dayhoff_tools/warehouse.py +0 -0
dayhoff_tools-1.10.1/dayhoff_tools/cli/engine/__init__.py
@@ -0,0 +1 @@
+"""Engine management commands."""
dayhoff_tools-1.10.1/dayhoff_tools/cli/engine/coffee.py
@@ -0,0 +1,110 @@
+"""Engine coffee command."""
+
+import re
+import time
+
+import boto3
+import typer
+from botocore.exceptions import ClientError
+
+from ..engine_studio_utils.api_utils import make_api_request
+from ..engine_studio_utils.aws_utils import check_aws_sso
+from ..engine_studio_utils.constants import console
+from ..engine_studio_utils.formatting import resolve_engine
+
+
+def coffee(
+    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
+    duration: str = typer.Argument("4h", help="Duration (e.g., 2h, 30m, 2h30m)"),
+    cancel: bool = typer.Option(
+        False, "--cancel", help="Cancel existing coffee lock instead of extending"
+    ),
+):
+    """Pour ☕ for an engine: keeps it awake for the given duration (or cancel)."""
+    username = check_aws_sso()
+
+    # Parse duration
+    if not cancel:
+        match = re.match(r"(?:(\d+)h)?(?:(\d+)m)?", duration)
+        if not match or (not match.group(1) and not match.group(2)):
+            console.print(f"[red]❌ Invalid duration format: {duration}[/red]")
+            console.print("Use format like: 4h, 30m, 2h30m")
+            raise typer.Exit(1)
+
+        hours = int(match.group(1) or 0)
+        minutes = int(match.group(2) or 0)
+        seconds_total = (hours * 60 + minutes) * 60
+        if seconds_total == 0:
+            console.print("[red]❌ Duration must be greater than zero[/red]")
+            raise typer.Exit(1)
+
+    # Get all engines to resolve name
+    response = make_api_request("GET", "/engines")
+    if response.status_code != 200:
+        console.print("[red]❌ Failed to fetch engines[/red]")
+        raise typer.Exit(1)
+
+    engines = response.json().get("engines", [])
+    engine = resolve_engine(name_or_id, engines)
+
+    if engine["state"].lower() != "running":
+        console.print(f"[red]❌ Engine is not running (state: {engine['state']})[/red]")
+        raise typer.Exit(1)
+
+    if cancel:
+        console.print(f"Cancelling coffee for [cyan]{engine['name']}[/cyan]…")
+    else:
+        console.print(
+            f"Pouring coffee for [cyan]{engine['name']}[/cyan] for {duration}…"
+        )
+
+    # Use SSM to run the engine coffee command
+    ssm = boto3.client("ssm", region_name="us-east-1")
+    try:
+        response = ssm.send_command(
+            InstanceIds=[engine["instance_id"]],
+            DocumentName="AWS-RunShellScript",
+            Parameters={
+                "commands": [
+                    (
+                        "/usr/local/bin/engine-coffee --cancel"
+                        if cancel
+                        else f"/usr/local/bin/engine-coffee {seconds_total}"
+                    )
+                ],
+                "executionTimeout": ["60"],
+            },
+        )
+
+        command_id = response["Command"]["CommandId"]
+
+        # Wait for command to complete
+        for _ in range(10):
+            time.sleep(1)
+            result = ssm.get_command_invocation(
+                CommandId=command_id,
+                InstanceId=engine["instance_id"],
+            )
+            if result["Status"] in ["Success", "Failed"]:
+                break
+
+        if result["Status"] == "Success":
+            if cancel:
+                console.print(
+                    "[green]✓ Coffee cancelled – auto-shutdown re-enabled[/green]"
+                )
+            else:
+                console.print(f"[green]✓ Coffee poured for {duration}[/green]")
+                console.print(
+                    "\n[dim]Note: Detached Docker containers (except dev containers) will also keep the engine awake.[/dim]"
+                )
+                console.print(
+                    "[dim]Use coffee for nohup operations or other background tasks.[/dim]"
+                )
+        else:
+            console.print(
+                f"[red]❌ Failed to manage coffee: {result.get('StatusDetails', 'Unknown error')}[/red]"
+            )
+
+    except ClientError as e:
+        console.print(f"[red]❌ Failed to manage coffee: {e}[/red]")
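The duration grammar accepted by `coffee` is compact enough to illustrate in isolation. Below is a minimal standalone sketch of the same parsing logic; the `parse_duration` helper name is hypothetical and introduced here only for illustration:

```python
import re


def parse_duration(duration: str) -> int:
    """Convert '4h', '30m', or '2h30m' into seconds, mirroring coffee.py's checks."""
    match = re.match(r"(?:(\d+)h)?(?:(\d+)m)?", duration)
    if not match or (not match.group(1) and not match.group(2)):
        raise ValueError(f"Invalid duration format: {duration}")
    hours = int(match.group(1) or 0)
    minutes = int(match.group(2) or 0)
    seconds_total = (hours * 60 + minutes) * 60
    if seconds_total == 0:
        raise ValueError("Duration must be greater than zero")
    return seconds_total


assert parse_duration("2h30m") == 9000
assert parse_duration("45m") == 2700
```

The resulting second count is what the command passes to `/usr/local/bin/engine-coffee` over SSM.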
dayhoff_tools-1.10.1/dayhoff_tools/cli/engine/config_ssh.py
@@ -0,0 +1,113 @@
+"""Engine config-ssh command."""
+
+from pathlib import Path
+
+import typer
+
+from ..engine_studio_utils.api_utils import make_api_request
+from ..engine_studio_utils.aws_utils import check_aws_sso
+from ..engine_studio_utils.constants import SSH_MANAGED_COMMENT, console
+from ..engine_studio_utils.ssh_utils import check_session_manager_plugin
+
+
+def config_ssh(
+    clean: bool = typer.Option(False, "--clean", help="Remove all managed entries"),
+    all_engines: bool = typer.Option(
+        False, "--all", "-a", help="Include all engines from all users"
+    ),
+    admin: bool = typer.Option(
+        False,
+        "--admin",
+        help="Generate entries that use ec2-user instead of per-engine owner user",
+    ),
+):
+    """Update SSH config with available engines."""
+    username = check_aws_sso()
+
+    # Only check for Session Manager Plugin if we're not just cleaning
+    if not clean and not check_session_manager_plugin():
+        raise typer.Exit(1)
+
+    if clean:
+        console.print("Removing all managed SSH entries...")
+    else:
+        if all_engines:
+            console.print("Updating SSH config with all running engines...")
+        else:
+            console.print(
+                f"Updating SSH config with running engines for [cyan]{username}[/cyan] and [cyan]shared[/cyan]..."
+            )
+
+    # Get all engines
+    response = make_api_request("GET", "/engines")
+    if response.status_code != 200:
+        console.print("[red]❌ Failed to fetch engines[/red]")
+        raise typer.Exit(1)
+
+    engines = response.json().get("engines", [])
+    running_engines = [e for e in engines if e["state"].lower() == "running"]
+
+    # Filter engines based on options
+    if not all_engines:
+        # Show only current user's engines and shared engines
+        running_engines = [
+            e for e in running_engines if e["user"] == username or e["user"] == "shared"
+        ]
+
+    # Read existing config
+    config_path = Path.home() / ".ssh" / "config"
+    config_path.parent.mkdir(mode=0o700, exist_ok=True)
+
+    if config_path.exists():
+        content = config_path.read_text()
+        lines = content.splitlines()
+    else:
+        content = ""
+        lines = []
+
+    # Remove old managed entries
+    new_lines = []
+    skip_until_next_host = False
+    for line in lines:
+        if SSH_MANAGED_COMMENT in line:
+            skip_until_next_host = True
+        elif line.strip().startswith("Host ") and skip_until_next_host:
+            skip_until_next_host = False
+            # Check if this is a managed host
+            if SSH_MANAGED_COMMENT not in line:
+                new_lines.append(line)
+        elif not skip_until_next_host:
+            new_lines.append(line)
+
+    # Add new entries if not cleaning
+    if not clean:
+        for engine in running_engines:
+            # Determine ssh user based on --admin flag
+            ssh_user = "ec2-user" if admin else username
+            new_lines.extend(
+                [
+                    "",
+                    f"Host {engine['name']} {SSH_MANAGED_COMMENT}",
+                    f" HostName {engine['instance_id']}",
+                    f" User {ssh_user}",
+                    f" ProxyCommand sh -c \"AWS_SSM_IDLE_TIMEOUT=600 aws ssm start-session --target %h --document-name AWS-StartSSHSession --parameters 'portNumber=%p'\"",
+                ]
+            )
+
+    # Write back
+    config_path.write_text("\n".join(new_lines))
+    config_path.chmod(0o600)
+
+    if clean:
+        console.print("[green]✓ Removed all managed SSH entries[/green]")
+    else:
+        console.print(
+            f"[green]✓ Updated SSH config with {len(running_engines)} engines[/green]"
+        )
+        for engine in running_engines:
+            user_display = (
+                f"[dim]({engine['user']})[/dim]" if engine["user"] != username else ""
+            )
+            console.print(
+                f" • {engine['name']} → {engine['instance_id']} {user_display}"
+            )
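The scan that strips previously managed entries is the subtlest part of `config_ssh`. Here is a small self-contained sketch of the same loop run against a toy config; the marker string is a placeholder, since the real `SSH_MANAGED_COMMENT` value lives in `constants.py`, which is not part of this diff:

```python
# Placeholder marker for illustration only; the real value comes from
# dayhoff_tools.cli.engine_studio_utils.constants.SSH_MANAGED_COMMENT.
SSH_MANAGED_COMMENT = "# dayhoff-managed"

sample_config = """\
Host personal-box
    HostName example.com

Host old-engine {marker}
    HostName i-0123456789abcdef0
    User alice

Host another-box
    HostName 10.0.0.5
""".format(marker=SSH_MANAGED_COMMENT)

new_lines = []
skip_until_next_host = False
for line in sample_config.splitlines():
    if SSH_MANAGED_COMMENT in line:
        skip_until_next_host = True       # drop the managed Host line itself ...
    elif line.strip().startswith("Host ") and skip_until_next_host:
        skip_until_next_host = False      # ... and everything up to the next Host block
        if SSH_MANAGED_COMMENT not in line:
            new_lines.append(line)
    elif not skip_until_next_host:
        new_lines.append(line)

# 'old-engine' and its indented lines are gone; the other two hosts survive.
print("\n".join(new_lines))
```

New managed entries are appended only after this pass, so re-running the command replaces them rather than duplicating them.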
dayhoff_tools-1.10.1/dayhoff_tools/cli/engine/debug.py
@@ -0,0 +1,79 @@
+"""Engine debug command."""
+
+import time
+
+import boto3
+import typer
+
+from ..engine_studio_utils.api_utils import make_api_request
+from ..engine_studio_utils.aws_utils import check_aws_sso
+from ..engine_studio_utils.constants import console
+from ..engine_studio_utils.formatting import resolve_engine
+
+
+def debug_engine(
+    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
+):
+    """Debug engine bootstrap status and files."""
+    check_aws_sso()
+
+    # Resolve engine
+    response = make_api_request("GET", "/engines")
+    if response.status_code != 200:
+        console.print("[red]❌ Failed to fetch engines[/red]")
+        raise typer.Exit(1)
+
+    engines = response.json().get("engines", [])
+    engine = resolve_engine(name_or_id, engines)
+
+    console.print(f"[bold]Debug info for {engine['name']}:[/bold]\n")
+
+    ssm = boto3.client("ssm", region_name="us-east-1")
+
+    # Check multiple files and systemd status
+    checks = [
+        (
+            "Stage file",
+            "cat /opt/dayhoff/state/engine-init.stage 2>/dev/null || cat /var/run/engine-init.stage 2>/dev/null || echo 'MISSING'",
+        ),
+        (
+            "Health file",
+            "cat /opt/dayhoff/state/engine-health.json 2>/dev/null || cat /var/run/engine-health.json 2>/dev/null || echo 'MISSING'",
+        ),
+        (
+            "Sentinel file",
+            "ls -la /opt/dayhoff/first_boot_complete.sentinel 2>/dev/null || echo 'MISSING'",
+        ),
+        (
+            "Setup service",
+            "systemctl status setup-aws-vm.service --no-pager || echo 'Service not found'",
+        ),
+        (
+            "Bootstrap log tail",
+            "tail -20 /var/log/engine-setup.log 2>/dev/null || echo 'No log'",
+        ),
+        ("Environment file", "cat /etc/engine.env 2>/dev/null || echo 'MISSING'"),
+    ]
+
+    for name, cmd in checks:
+        try:
+            resp = ssm.send_command(
+                InstanceIds=[engine["instance_id"]],
+                DocumentName="AWS-RunShellScript",
+                Parameters={"commands": [cmd], "executionTimeout": ["10"]},
+            )
+            cid = resp["Command"]["CommandId"]
+            time.sleep(1)
+            inv = ssm.get_command_invocation(
+                CommandId=cid, InstanceId=engine["instance_id"]
+            )
+
+            if inv["Status"] == "Success":
+                output = inv["StandardOutputContent"].strip()
+                console.print(f"[cyan]{name}:[/cyan]")
+                console.print(f"[dim]{output}[/dim]\n")
+            else:
+                console.print(f"[cyan]{name}:[/cyan] [red]FAILED[/red]\n")
+
+        except Exception as e:
+            console.print(f"[cyan]{name}:[/cyan] [red]ERROR: {e}[/red]\n")
dayhoff_tools-1.10.1/dayhoff_tools/cli/engine/gami.py
@@ -0,0 +1,160 @@
+"""Engine GAMI (Golden AMI) creation command."""
+
+from datetime import datetime
+
+import boto3
+import typer
+from botocore.exceptions import ClientError
+from rich.progress import Progress, SpinnerColumn, TextColumn
+from rich.prompt import Confirm
+
+from ..engine_studio_utils.api_utils import make_api_request
+from ..engine_studio_utils.aws_utils import check_aws_sso
+from ..engine_studio_utils.constants import console
+from ..engine_studio_utils.formatting import resolve_engine
+
+
+def create_ami(
+    name_or_id: str = typer.Argument(
+        help="Engine name or instance ID to create AMI from"
+    ),
+):
+    """Create a 'Golden AMI' from a running engine.
+
+    This process is for creating a pre-warmed, standardized machine image
+    that can be used to launch new engines more quickly.
+
+    IMPORTANT:
+    - The engine MUST have all studios detached before running this command.
+    - This process will make the source engine unusable. You should
+      plan to TERMINATE the engine after the AMI is created.
+    """
+    check_aws_sso()
+
+    # Get all engines to resolve name and check status
+    # We pass check_ready=True to get attached studio info
+    response = make_api_request("GET", "/engines", params={"check_ready": "true"})
+    if response.status_code != 200:
+        console.print("[red]❌ Failed to fetch engines[/red]")
+        raise typer.Exit(1)
+
+    engines = response.json().get("engines", [])
+    engine = resolve_engine(name_or_id, engines)
+
+    # --- Pre-flight checks ---
+
+    # 1. Check if engine is running
+    if engine["state"].lower() != "running":
+        console.print(f"[red]❌ Engine '{engine['name']}' is not running.[/red]")
+        console.print("Please start it before creating an AMI.")
+        raise typer.Exit(1)
+
+    # 2. Check for attached studios from the detailed API response
+    attached_studios = engine.get("studios", [])
+    if attached_studios:
+        console.print(
+            f"[bold red]❌ Engine '{engine['name']}' has studios attached.[/bold red]"
+        )
+        console.print("Please detach all studios before creating an AMI:")
+        for studio in attached_studios:
+            console.print(f" - {studio['user']} ({studio['studio_id']})")
+        console.print("\nTo detach, run [bold]dh studio detach[/bold]")
+        raise typer.Exit(1)
+
+    # Construct AMI name and description
+    ami_name = (
+        f"prewarmed-engine-{engine['engine_type']}-{datetime.now().strftime('%Y%m%d')}"
+    )
+    description = (
+        f"Amazon Linux 2023 with NVIDIA drivers, Docker, and pre-pulled "
+        f"dev container image for {engine['engine_type']} engines"
+    )
+
+    console.print(f"Creating AMI from engine [cyan]{engine['name']}[/cyan]...")
+    console.print(f"[bold]AMI Name:[/] {ami_name}")
+    console.print(f"[bold]Description:[/] {description}")
+
+    console.print(
+        "\n[bold yellow]⚠️ Important:[/bold yellow]\n"
+        "1. This process will run cleanup scripts on the engine.\n"
+        "2. The source engine should be [bold]terminated[/bold] after the AMI is created.\n"
+    )
+
+    if not Confirm.ask("Continue with AMI creation?"):
+        raise typer.Exit()
+
+    # Create AMI using EC2 client directly, as the backend logic is too complex
+    ec2 = boto3.client("ec2", region_name="us-east-1")
+    ssm = boto3.client("ssm", region_name="us-east-1")
+
+    try:
+        # Clean up instance state before snapshotting
+        console.print("Cleaning up instance for AMI creation...")
+        cleanup_commands = [
+            "sudo rm -f /opt/dayhoff/first_boot_complete.sentinel",
+            "history -c",
+            "sudo rm -rf /tmp/* /var/log/messages /var/log/cloud-init.log",
+            "sudo rm -rf /var/lib/amazon/ssm/* /etc/amazon/ssm/*",
+            "sleep 2 && sudo systemctl stop amazon-ssm-agent &",  # Stop agent last
+        ]
+
+        cleanup_response = ssm.send_command(
+            InstanceIds=[engine["instance_id"]],
+            DocumentName="AWS-RunShellScript",
+            Parameters={"commands": cleanup_commands, "executionTimeout": ["120"]},
+        )
+
+        # Acknowledge that the SSM command might be in progress as the agent shuts down
+        console.print(
+            "[dim]ℹ️ Cleanup command sent (status may show 'InProgress' as SSM agent stops)[/dim]"
+        )
+
+        # Create the AMI
+        with Progress(
+            SpinnerColumn(),
+            TextColumn("[progress.description]{task.description}"),
+            transient=True,
+        ) as progress:
+            task = progress.add_task(
+                "Creating AMI (this will take several minutes)...", total=None
+            )
+
+            response = ec2.create_image(
+                InstanceId=engine["instance_id"],
+                Name=ami_name,
+                Description=description,
+                NoReboot=False,
+                TagSpecifications=[
+                    {
+                        "ResourceType": "image",
+                        "Tags": [
+                            {"Key": "Environment", "Value": "dev"},
+                            {"Key": "Type", "Value": "golden-ami"},
+                            {"Key": "EngineType", "Value": engine["engine_type"]},
+                            {"Key": "Name", "Value": ami_name},
+                        ],
+                    }
+                ],
+            )
+
+            ami_id = response["ImageId"]
+            progress.update(
+                task,
+                completed=True,
+                description=f"[green]✓ AMI creation initiated![/green]",
+            )
+
+        console.print(f" [bold]AMI ID:[/] {ami_id}")
+        console.print("\nThe AMI creation process will continue in the background.")
+        console.print("You can monitor progress in the EC2 Console under 'AMIs'.")
+        console.print(
+            "\nOnce complete, update the AMI ID in [bold]terraform/environments/dev/variables.tf[/bold] "
+            "and run [bold]terraform apply[/bold]."
+        )
+        console.print(
+            f"\nRemember to [bold red]terminate the source engine '{engine['name']}'[/bold red] to save costs."
+        )
+
+    except ClientError as e:
+        console.print(f"[red]❌ Failed to create AMI: {e}[/red]")
+        raise typer.Exit(1)
dayhoff_tools-1.10.1/dayhoff_tools/cli/engine/idle.py
@@ -0,0 +1,148 @@
+"""Engine idle timeout command."""
+
+import re
+import time
+from typing import Optional
+
+import boto3
+import typer
+
+from ..engine_studio_utils.api_utils import make_api_request
+from ..engine_studio_utils.aws_utils import check_aws_sso
+from ..engine_studio_utils.constants import console
+from ..engine_studio_utils.formatting import resolve_engine
+
+
+def idle_timeout_cmd(
+    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
+    set: Optional[str] = typer.Option(
+        None, "--set", "-s", help="New timeout (e.g., 2h30m, 45m)"
+    ),
+    slack: Optional[str] = typer.Option(
+        None, "--slack", help="Set Slack notifications: none, default, all"
+    ),
+):
+    """Show or set engine idle-detector settings."""
+    check_aws_sso()
+
+    # Resolve engine
+    response = make_api_request("GET", "/engines")
+    if response.status_code != 200:
+        console.print("[red]❌ Failed to fetch engines[/red]")
+        raise typer.Exit(1)
+
+    engines = response.json().get("engines", [])
+    engine = resolve_engine(name_or_id, engines)
+
+    ssm = boto3.client("ssm", region_name="us-east-1")
+
+    # Handle slack notifications change
+    if slack:
+        slack = slack.lower()
+        if slack not in ["none", "default", "all"]:
+            console.print("[red]❌ Invalid slack option. Use: none, default, all[/red]")
+            raise typer.Exit(1)
+
+        console.print(f"Setting Slack notifications to [bold]{slack}[/bold]...")
+
+        if slack == "none":
+            settings = {
+                "SLACK_NOTIFY_WARNINGS": "false",
+                "SLACK_NOTIFY_IDLE_START": "false",
+                "SLACK_NOTIFY_IDLE_END": "false",
+                "SLACK_NOTIFY_SHUTDOWN": "false",
+            }
+        elif slack == "default":
+            settings = {
+                "SLACK_NOTIFY_WARNINGS": "true",
+                "SLACK_NOTIFY_IDLE_START": "false",
+                "SLACK_NOTIFY_IDLE_END": "false",
+                "SLACK_NOTIFY_SHUTDOWN": "true",
+            }
+        else:  # all
+            settings = {
+                "SLACK_NOTIFY_WARNINGS": "true",
+                "SLACK_NOTIFY_IDLE_START": "true",
+                "SLACK_NOTIFY_IDLE_END": "true",
+                "SLACK_NOTIFY_SHUTDOWN": "true",
+            }
+
+        commands = []
+        for key, value in settings.items():
+            # Use a robust sed command that adds the line if it doesn't exist
+            commands.append(
+                f"grep -q '^{key}=' /etc/engine.env && sudo sed -i 's|^{key}=.*|{key}={value}|' /etc/engine.env || echo '{key}={value}' | sudo tee -a /etc/engine.env > /dev/null"
+            )
+
+        # Instead of restarting service, send SIGHUP to reload config
+        commands.append(
+            "sudo pkill -HUP -f engine-idle-detector.py || sudo systemctl restart engine-idle-detector.service"
+        )
+
+        resp = ssm.send_command(
+            InstanceIds=[engine["instance_id"]],
+            DocumentName="AWS-RunShellScript",
+            Parameters={"commands": commands, "executionTimeout": ["60"]},
+        )
+        cid = resp["Command"]["CommandId"]
+        time.sleep(2)  # Give it a moment to process
+        console.print(f"[green]✓ Slack notifications updated to '{slack}'[/green]")
+        console.print("[dim]Note: Settings updated without resetting idle timer[/dim]")
+
+    # Handle setting new timeout value
+    if set is not None:
+        m = re.match(r"^(?:(\d+)h)?(?:(\d+)m)?$", set)
+        if not m:
+            console.print(
+                "[red]❌ Invalid duration format. Use e.g. 2h, 45m, 1h30m[/red]"
+            )
+            raise typer.Exit(1)
+        hours = int(m.group(1) or 0)
+        minutes = int(m.group(2) or 0)
+        seconds = hours * 3600 + minutes * 60
+        if seconds == 0:
+            console.print("[red]❌ Duration must be greater than zero[/red]")
+            raise typer.Exit(1)
+
+        console.print(f"Setting idle timeout to {set} ({seconds} seconds)…")
+
+        cmd = (
+            "sudo sed -i '/^IDLE_TIMEOUT_SECONDS=/d' /etc/engine.env && "
+            f"echo 'IDLE_TIMEOUT_SECONDS={seconds}' | sudo tee -a /etc/engine.env >/dev/null && "
+            "sudo systemctl restart engine-idle-detector.service"
+        )
+
+        resp = ssm.send_command(
+            InstanceIds=[engine["instance_id"]],
+            DocumentName="AWS-RunShellScript",
+            Parameters={"commands": [cmd], "executionTimeout": ["60"]},
+        )
+        cid = resp["Command"]["CommandId"]
+        time.sleep(2)
+        console.print(f"[green]✓ Idle timeout updated to {set}[/green]")
+
+    # If no action was specified, show current timeout
+    if set is None and slack is None:
+        # Show current timeout setting
+        resp = ssm.send_command(
+            InstanceIds=[engine["instance_id"]],
+            DocumentName="AWS-RunShellScript",
+            Parameters={
+                "commands": [
+                    "grep -E '^IDLE_TIMEOUT_SECONDS=' /etc/engine.env || echo 'IDLE_TIMEOUT_SECONDS=1800'"
+                ],
+                "executionTimeout": ["10"],
+            },
+        )
+        cid = resp["Command"]["CommandId"]
+        time.sleep(1)
+        inv = ssm.get_command_invocation(
+            CommandId=cid, InstanceId=engine["instance_id"]
+        )
+        if inv["Status"] == "Success":
+            line = inv["StandardOutputContent"].strip()
+            secs = int(line.split("=")[1]) if "=" in line else 1800
+            console.print(f"Current idle timeout: {secs//60}m ({secs} seconds)")
+        else:
+            console.print("[red]❌ Could not retrieve idle timeout[/red]")
+        return
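For a concrete sense of what the `--slack` branch ships to the instance, here is a short sketch that rebuilds the same upsert commands for the `default` preset and prints them. No SSM call is made; this only reproduces the string construction shown above:

```python
settings = {
    "SLACK_NOTIFY_WARNINGS": "true",
    "SLACK_NOTIFY_IDLE_START": "false",
    "SLACK_NOTIFY_IDLE_END": "false",
    "SLACK_NOTIFY_SHUTDOWN": "true",
}

commands = []
for key, value in settings.items():
    # Same upsert pattern as idle.py: edit the line in place if it exists,
    # otherwise append it to /etc/engine.env.
    commands.append(
        f"grep -q '^{key}=' /etc/engine.env && "
        f"sudo sed -i 's|^{key}=.*|{key}={value}|' /etc/engine.env || "
        f"echo '{key}={value}' | sudo tee -a /etc/engine.env > /dev/null"
    )

for cmd in commands:
    print(cmd)
```

The idle detector is then told to reload its configuration via SIGHUP, with a service restart as the fallback, which is why the settings change without resetting the idle timer.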