PyPI - dayhoff-tools - Versions diffs - 1.5.6__py3-none-any.whl → 1.5.8__py3-none-any.whl - Mend

dayhoff-tools 1.5.6__py3-none-any.whl → 1.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dayhoff-tools might be problematic. Click here for more details.

Files changed (6) hide show

dayhoff_tools/cli/engine_commands.py CHANGED Viewed

@@ -1523,32 +1523,60 @@ def create_ami(
             # Restore the source engine to a normal state
             console.print("Restoring source engine state...")
+            # Wait for instance to come back after reboot (AMI creation reboots by default)
+            console.print("[dim]Waiting for engine to reboot after snapshot...[/dim]")
+            ec2_waiter = ec2.get_waiter('instance_status_ok')
+            try:
+                ec2_waiter.wait(
+                    InstanceIds=[engine["instance_id"]],
+                    WaiterConfig={'Delay': 10, 'MaxAttempts': 30}  # Wait up to 5 minutes
+                )
+            except Exception as e:
+                console.print(f"[yellow]⚠️  Warning: Engine may still be rebooting: {e}[/yellow]")
+            # Now restore the sentinel and restart services
             restore_response = ssm.send_command(
                 InstanceIds=[engine["instance_id"]],
                 DocumentName="AWS-RunShellScript",
                 Parameters={
                     "commands": [
+                        # Ensure the directories exist
+                        "sudo mkdir -p /opt/dayhoff /opt/dayhoff/state",
+                        # Recreate the sentinel file
                         "sudo touch /opt/dayhoff/first_boot_complete.sentinel",
-                        "sudo systemctl restart engine-idle-detector.timer",
+                        # Mark bootstrap as finished
+                        "echo 'finished' | sudo tee /opt/dayhoff/state/engine-init.stage > /dev/null",
+                        # Restart idle detector if it exists
+                        "sudo systemctl restart engine-idle-detector.timer 2>/dev/null || true",
+                        # Ensure SSM agent is running
+                        "sudo systemctl start amazon-ssm-agent 2>/dev/null || true",
                     ],
                     "executionTimeout": ["60"],
                 },
             )
-            # Quick wait to see if it failed immediately
-            time.sleep(1)
+            # Wait for restore command to complete
             restore_command_id = restore_response["Command"]["CommandId"]
-            result = ssm.get_command_invocation(
-                CommandId=restore_command_id,
-                InstanceId=engine["instance_id"],
-            )
-            if result["Status"] not in ["Pending", "InProgress", "Success"]:
+            for _ in range(10):
+                time.sleep(2)
+                result = ssm.get_command_invocation(
+                    CommandId=restore_command_id,
+                    InstanceId=engine["instance_id"],
+                )
+                if result["Status"] in ["Success", "Failed"]:
+                    break
+            if result["Status"] == "Success":
                 console.print(
-                    "[yellow]⚠️  Warning: Failed to restore source engine state.[/yellow]"
+                    "[green]✓ Source engine restored to normal operation.[/green]"
                 )
             else:
                 console.print(
-                    "[green]✓ Source engine restored to normal operation.[/green]"
+                    "[yellow]⚠️  Warning: Engine state restoration incomplete. You may need to run:[/yellow]"
+                )
+                console.print(
+                    f"[dim]  dh engine repair {engine['name']}[/dim]"
                 )
             console.print(
@@ -1792,52 +1820,28 @@ def attach_studio(
     console.print(f"Attaching studio to engine [cyan]{engine['name']}[/cyan]...")
-    # Determine retry strategy
-    max_attempts = 40 if engine_started_now else 3
-    retry_delay = 10 if engine_started_now else 3
+    # Determine retry strategy based on whether we just started the engine
     if engine_started_now:
-        # Long spinner-based loop while the freshly started engine finishes booting
-        with Progress(
-            SpinnerColumn(),
-            TimeElapsedColumn(),
-            TextColumn("[progress.description]{task.description}"),
-            transient=True,
-        ) as prog:
-            task = prog.add_task(
-                "Attaching studio (engine is still booting)…", total=None
-            )
-            for attempt in range(max_attempts):
-                success, error_msg = _attempt_studio_attach(
-                    studio, engine, target_user, public_key
-                )
-                if success:
-                    break  # success!
-                # Update spinner every 3rd try to avoid log spam
-                if attempt % 3 == 0:
-                    prog.update(
-                        task,
-                        description=f"Attaching studio (engine is still booting)… {attempt+1}/{max_attempts}",
-                    )
-                if error_msg:
-                    console.print(f"[red]❌ Failed to attach studio: {error_msg}[/red]")
-                    return
-                time.sleep(retry_delay)
-            else:
-                console.print(
-                    "[yellow]Engine is still starting up – please retry in a minute.[/yellow]"
-                )
-                return
+        max_attempts = 40  # About 7 minutes total with exponential backoff
+        base_delay = 8
+        max_delay = 20
     else:
-        # Give the (already-running) engine a little breathing room – e.g. it may still be mounting EFS
-        max_attempts = 10  # ~1 min total
-        retry_delay = 6
+        max_attempts = 15  # About 2 minutes total with exponential backoff
+        base_delay = 5
+        max_delay = 10
+    # Unified retry loop with exponential backoff
+    with Progress(
+        SpinnerColumn(),
+        TimeElapsedColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        transient=True,
+    ) as prog:
+        desc = "Attaching studio (engine is still booting)…" if engine_started_now else "Attaching studio…"
+        task = prog.add_task(desc, total=None)
+        consecutive_not_ready = 0
+        last_error = None
         for attempt in range(max_attempts):
             success, error_msg = _attempt_studio_attach(
@@ -1845,22 +1849,54 @@ def attach_studio(
             )
             if success:
-                break  # attached!
+                break  # success!
             if error_msg:
-                # Fatal – bubble up immediately
+                # Fatal error – bubble up immediately
                 console.print(f"[red]❌ Failed to attach studio: {error_msg}[/red]")
+                # Suggest repair command if engine seems broken
+                if "not ready" in error_msg.lower() and attempt > 5:
+                    console.print(f"\n[yellow]Engine may be in a bad state. Try:[/yellow]")
+                    console.print(f"[dim]  dh engine repair {engine['name']}[/dim]")
                 return
-            # Recoverable and still not ready – short wait + optional info
-            if attempt < max_attempts - 1:
-                console.print("[dim]Engine not ready yet – retrying …[/dim]")
-                time.sleep(retry_delay)
+            # Track consecutive "not ready" responses
+            consecutive_not_ready += 1
+            last_error = "Engine not ready"
+            # Update progress display
+            if attempt % 3 == 0:
+                prog.update(
+                    task,
+                    description=f"{desc} attempt {attempt+1}/{max_attempts}",
+                )
+            # If engine seems stuck after many attempts, show a hint
+            if consecutive_not_ready > 10 and attempt == 10:
+                console.print(
+                    "[yellow]Engine is taking longer than expected to become ready.[/yellow]"
+                )
+                console.print(
+                    "[dim]This can happen after GAMI creation or if the engine is still bootstrapping.[/dim]"
+                )
+            # Exponential backoff with jitter
+            delay = min(base_delay * (1.5 ** min(attempt, 5)), max_delay)
+            delay += time.time() % 2  # Add 0-2 seconds of jitter
+            time.sleep(delay)
         else:
+            # All attempts exhausted
             console.print(
-                "[yellow]Engine is busy or still initialising – please retry in about a minute.[/yellow]"
+                f"[yellow]Engine is not becoming ready after {max_attempts} attempts.[/yellow]"
             )
+            if last_error:
+                console.print(f"[dim]Last issue: {last_error}[/dim]")
+            console.print("\n[yellow]You can try:[/yellow]")
+            console.print(f"  1. Wait a minute and retry: [cyan]dh studio attach {engine['name']}[/cyan]")
+            console.print(f"  2. Check engine status: [cyan]dh engine status {engine['name']}[/cyan]")
+            console.print(f"  3. Repair the engine: [cyan]dh engine repair {engine['name']}[/cyan]")
             return
     # Successful attach path
@@ -2453,3 +2489,113 @@ def debug_engine(
         except Exception as e:
             console.print(f"[cyan]{name}:[/cyan] [red]ERROR: {e}[/red]\n")
+@engine_app.command("repair")
+def repair_engine(
+    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
+):
+    """Repair an engine that's stuck in a bad state (e.g., after GAMI creation)."""
+    check_aws_sso()
+    # Get all engines to resolve name
+    response = make_api_request("GET", "/engines")
+    if response.status_code != 200:
+        console.print("[red]❌ Failed to fetch engines[/red]")
+        raise typer.Exit(1)
+    engines = response.json().get("engines", [])
+    engine = resolve_engine(name_or_id, engines)
+    if engine["state"].lower() != "running":
+        console.print(f"[yellow]⚠️  Engine is {engine['state']}. Must be running to repair.[/yellow]")
+        if engine["state"].lower() == "stopped" and Confirm.ask("Start the engine first?"):
+            response = make_api_request("POST", f"/engines/{engine['instance_id']}/start")
+            if response.status_code != 200:
+                console.print("[red]❌ Failed to start engine[/red]")
+                raise typer.Exit(1)
+            console.print("[green]✓ Engine started[/green]")
+            console.print("Waiting for engine to become ready...")
+            time.sleep(30)  # Give it time to boot
+        else:
+            raise typer.Exit(1)
+    console.print(f"[bold]Repairing engine [cyan]{engine['name']}[/cyan][/bold]")
+    console.print("[dim]This will restore bootstrap state and ensure all services are running[/dim]\n")
+    ssm = boto3.client("ssm", region_name="us-east-1")
+    # Repair commands
+    repair_commands = [
+        # Create necessary directories
+        "sudo mkdir -p /opt/dayhoff /opt/dayhoff/state /opt/dayhoff/scripts",
+        # Download scripts from S3 if missing
+        "source /etc/engine.env && sudo aws s3 sync s3://${VM_SCRIPTS_BUCKET}/ /opt/dayhoff/scripts/ --exclude '*' --include '*.sh' --quiet",
+        "sudo chmod +x /opt/dayhoff/scripts/*.sh 2>/dev/null || true",
+        # Restore bootstrap state
+        "sudo touch /opt/dayhoff/first_boot_complete.sentinel",
+        "echo 'finished' | sudo tee /opt/dayhoff/state/engine-init.stage > /dev/null",
+        # Ensure SSM agent is running
+        "sudo systemctl restart amazon-ssm-agent 2>/dev/null || true",
+        # Restart idle detector
+        "sudo systemctl restart engine-idle-detector.timer 2>/dev/null || true",
+        "sudo systemctl restart engine-idle-detector.service 2>/dev/null || true",
+        # Report status
+        "echo '=== Repair Complete ===' && echo 'Sentinel: ' && ls -la /opt/dayhoff/first_boot_complete.sentinel",
+        "echo 'Stage: ' && cat /opt/dayhoff/state/engine-init.stage",
+        "echo 'Scripts: ' && ls /opt/dayhoff/scripts/*.sh 2>/dev/null | wc -l",
+    ]
+    try:
+        with Progress(
+            SpinnerColumn(),
+            TextColumn("[progress.description]{task.description}"),
+            transient=True,
+        ) as progress:
+            task = progress.add_task("Repairing engine...", total=None)
+            response = ssm.send_command(
+                InstanceIds=[engine["instance_id"]],
+                DocumentName="AWS-RunShellScript",
+                Parameters={
+                    "commands": repair_commands,
+                    "executionTimeout": ["60"],
+                },
+            )
+            command_id = response["Command"]["CommandId"]
+            # Wait for command
+            for _ in range(60):
+                time.sleep(1)
+                result = ssm.get_command_invocation(
+                    CommandId=command_id,
+                    InstanceId=engine["instance_id"],
+                )
+                if result["Status"] in ["Success", "Failed"]:
+                    break
+        if result["Status"] == "Success":
+            output = result["StandardOutputContent"]
+            console.print("[green]✓ Engine repaired successfully![/green]\n")
+            # Show repair results
+            if "=== Repair Complete ===" in output:
+                repair_section = output.split("=== Repair Complete ===")[1].strip()
+                console.print("[bold]Repair Results:[/bold]")
+                console.print(repair_section)
+            console.print("\n[dim]You should now be able to attach studios to this engine.[/dim]")
+        else:
+            console.print(
+                f"[red]❌ Repair failed: {result.get('StandardErrorContent', 'Unknown error')}[/red]"
+            )
+            console.print("\n[yellow]Try running 'dh engine debug' for more information.[/yellow]")
+    except Exception as e:
+        console.print(f"[red]❌ Failed to repair engine: {e}[/red]")

dayhoff_tools/warehouse.py CHANGED Viewed

@@ -9,6 +9,25 @@ from zoneinfo import ZoneInfo
 # Import cloud helper lazily inside functions to avoid heavy deps at module load
+def _find_project_root() -> Path | None:
+    """
+    Find the project root by searching upwards from the current directory for
+    a `.git` directory or a `pyproject.toml` file.
+    Returns:
+        The path to the project root, or None if not found.
+    """
+    current_dir = Path.cwd().resolve()
+    while current_dir != current_dir.parent:
+        if (current_dir / ".git").is_dir() or (current_dir / "pyproject.toml").is_file():
+            return current_dir
+        current_dir = current_dir.parent
+    # Check the final directory in the hierarchy (e.g., '/')
+    if (current_dir / ".git").is_dir() or (current_dir / "pyproject.toml").is_file():
+        return current_dir
+    return None
 def _warn_if_gcp_default_sa(force_prompt: bool = False) -> None:
     """Warn the user when the active gcloud principal is the default VM service
     account.  See detailed docstring later in file (duplicate for early
@@ -528,11 +547,17 @@ def import_from_warehouse_typer() -> None:
     import questionary
     # Ensure execution from root directory
-    cwd = Path(os.getcwd())
-    if cwd.parent.name != "workspaces" or str(cwd.parent.parent) != cwd.root:
-        raise Exception(
-            f"This command must be executed from the repo's root directory (/workspaces/reponame). Current directory: {cwd}"
+    project_root = _find_project_root()
+    cwd = Path.cwd()
+    if not project_root or project_root != cwd:
+        error_msg = (
+            "This command must be run from the project's root directory, which is"
+            " expected to contain a `.git` folder or a `pyproject.toml` file.\n"
+            f"Current directory: {cwd}"
         )
+        if project_root:
+            error_msg += f"\nDetected project root: {project_root}"
+        raise Exception(error_msg)
     # Use questionary for prompts instead of typer
     warehouse_path = questionary.text("Warehouse path:").ask()
@@ -574,11 +599,17 @@ def get_from_warehouse_typer() -> None:
     import questionary
     # Ensure execution from root directory
-    cwd = Path(os.getcwd())
-    if cwd.parent.name != "workspaces" or str(cwd.parent.parent) != cwd.root:
-        raise Exception(
-            f"This command must be executed from the repo's root directory (/workspaces/reponame). Current directory: {cwd}"
+    project_root = _find_project_root()
+    cwd = Path.cwd()
+    if not project_root or project_root != cwd:
+        error_msg = (
+            "This command must be run from the project's root directory, which is"
+            " expected to contain a `.git` folder or a `pyproject.toml` file.\n"
+            f"Current directory: {cwd}"
         )
+        if project_root:
+            error_msg += f"\nDetected project root: {project_root}"
+        raise Exception(error_msg)
     # Use questionary for prompts instead of typer
     warehouse_path = questionary.text("Warehouse path:").ask()
@@ -619,11 +650,17 @@ def add_to_warehouse_typer() -> None:
     import questionary
     # Ensure execution from root directory
-    cwd = Path(os.getcwd())
-    if cwd.parent.name != "workspaces" or str(cwd.parent.parent) != cwd.root:
-        raise Exception(
-            f"This command must be executed from the repo's root directory (/workspaces/reponame). Current directory: {cwd}"
+    project_root = _find_project_root()
+    cwd = Path.cwd()
+    if not project_root or project_root != cwd:
+        error_msg = (
+            "This command must be run from the project's root directory, which is"
+            " expected to contain a `.git` folder or a `pyproject.toml` file.\n"
+            f"Current directory: {cwd}"
         )
+        if project_root:
+            error_msg += f"\nDetected project root: {project_root}"
+        raise Exception(error_msg)
     # Prompt for the data file path
     warehouse_path = questionary.text("Data file to be registered:").ask()

{dayhoff_tools-1.5.6.dist-info → dayhoff_tools-1.5.8.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: dayhoff-tools
-Version: 1.5.6
+Version: 1.5.8
 Summary: Common tools for all the repos at Dayhoff Labs
 Author: Daniel Martin-Alarcon
 Author-email: dma@dayhofflabs.com

{dayhoff_tools-1.5.6.dist-info → dayhoff_tools-1.5.8.dist-info}/RECORD RENAMED Viewed

@@ -3,7 +3,7 @@ dayhoff_tools/chemistry/standardizer.py,sha256=uMn7VwHnx02nc404eO6fRuS4rsl4dvSPf
 dayhoff_tools/chemistry/utils.py,sha256=jt-7JgF-GeeVC421acX-bobKbLU_X94KNOW24p_P-_M,2257
 dayhoff_tools/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dayhoff_tools/cli/cloud_commands.py,sha256=33qcWLmq-FwEXMdL3F0OHm-5Stlh2r65CldyEZgQ1no,40904
-dayhoff_tools/cli/engine_commands.py,sha256=_WzJpxwGVmN0vem6oVi5FosQkKaEGCLUgoUHoKSWejg,89475
+dayhoff_tools/cli/engine_commands.py,sha256=oY291nhCsU470Alol8VxXn_e2fbB7ykXFayH3AICK9g,96371
 dayhoff_tools/cli/main.py,sha256=tRN7WCBHg6uyNp6rA54pKTCoVmBntta2i0Yas3bUpZ4,4853
 dayhoff_tools/cli/swarm_commands.py,sha256=5EyKj8yietvT5lfoz8Zx0iQvVaNgc3SJX1z2zQR6o6M,5614
 dayhoff_tools/cli/utility_commands.py,sha256=FRZTPrjsG_qmIIqoNxd1Q1vVkS_5w8aY33IrVYVNCLg,18131
@@ -26,8 +26,8 @@ dayhoff_tools/intake/structure.py,sha256=ufN3gAodQxhnt7psK1VTQeu9rKERmo_PhoxIbB4
 dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJqE4,16456
 dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
 dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
-dayhoff_tools/warehouse.py,sha256=fV3goH2cH1Y0oLpGERnu4p70P2JfByJHjBh_oMRv9C0,23134
-dayhoff_tools-1.5.6.dist-info/METADATA,sha256=Hi5GVM9uuiyHHt9vaL4FPmpUCxd60O8Y7031pGfZhIU,2914
-dayhoff_tools-1.5.6.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-dayhoff_tools-1.5.6.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
-dayhoff_tools-1.5.6.dist-info/RECORD,,
+dayhoff_tools/warehouse.py,sha256=heaYc64qplgN3_1WVPFmqj53goStioWwY5NqlWc4c0s,24453
+dayhoff_tools-1.5.8.dist-info/METADATA,sha256=M5694yUrFz-O9IfiuxskFZZae8mXnJ5xA1GiRJfHHJQ,2914
+dayhoff_tools-1.5.8.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+dayhoff_tools-1.5.8.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
+dayhoff_tools-1.5.8.dist-info/RECORD,,

{dayhoff_tools-1.5.6.dist-info → dayhoff_tools-1.5.8.dist-info}/WHEEL RENAMED Viewed

File without changes

{dayhoff_tools-1.5.6.dist-info → dayhoff_tools-1.5.8.dist-info}/entry_points.txt RENAMED Viewed

File without changes

dayhoff-tools 1.5.6__py3-none-any.whl → 1.5.8__py3-none-any.whl

Potentially problematic release.

dayhoff-tools 1.5.6__py3-none-any.whl → 1.5.8__py3-none-any.whl