dayhoff-tools: dayhoff_tools-1.5.6-py3-none-any.whl → dayhoff_tools-1.5.8-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dayhoff-tools might be problematic. More details are available on the registry page for this release (the original link was not preserved in this text).

@@ -1523,32 +1523,60 @@ def create_ami(
1523
1523
 
1524
1524
  # Restore the source engine to a normal state
1525
1525
  console.print("Restoring source engine state...")
1526
+
1527
+ # Wait for instance to come back after reboot (AMI creation reboots by default)
1528
+ console.print("[dim]Waiting for engine to reboot after snapshot...[/dim]")
1529
+ ec2_waiter = ec2.get_waiter('instance_status_ok')
1530
+ try:
1531
+ ec2_waiter.wait(
1532
+ InstanceIds=[engine["instance_id"]],
1533
+ WaiterConfig={'Delay': 10, 'MaxAttempts': 30} # Wait up to 5 minutes
1534
+ )
1535
+ except Exception as e:
1536
+ console.print(f"[yellow]⚠️ Warning: Engine may still be rebooting: {e}[/yellow]")
1537
+
1538
+ # Now restore the sentinel and restart services
1526
1539
  restore_response = ssm.send_command(
1527
1540
  InstanceIds=[engine["instance_id"]],
1528
1541
  DocumentName="AWS-RunShellScript",
1529
1542
  Parameters={
1530
1543
  "commands": [
1544
+ # Ensure the directories exist
1545
+ "sudo mkdir -p /opt/dayhoff /opt/dayhoff/state",
1546
+ # Recreate the sentinel file
1531
1547
  "sudo touch /opt/dayhoff/first_boot_complete.sentinel",
1532
- "sudo systemctl restart engine-idle-detector.timer",
1548
+ # Mark bootstrap as finished
1549
+ "echo 'finished' | sudo tee /opt/dayhoff/state/engine-init.stage > /dev/null",
1550
+ # Restart idle detector if it exists
1551
+ "sudo systemctl restart engine-idle-detector.timer 2>/dev/null || true",
1552
+ # Ensure SSM agent is running
1553
+ "sudo systemctl start amazon-ssm-agent 2>/dev/null || true",
1533
1554
  ],
1534
1555
  "executionTimeout": ["60"],
1535
1556
  },
1536
1557
  )
1537
1558
 
1538
- # Quick wait to see if it failed immediately
1539
- time.sleep(1)
1559
+ # Wait for restore command to complete
1540
1560
  restore_command_id = restore_response["Command"]["CommandId"]
1541
- result = ssm.get_command_invocation(
1542
- CommandId=restore_command_id,
1543
- InstanceId=engine["instance_id"],
1544
- )
1545
- if result["Status"] not in ["Pending", "InProgress", "Success"]:
1561
+ for _ in range(10):
1562
+ time.sleep(2)
1563
+ result = ssm.get_command_invocation(
1564
+ CommandId=restore_command_id,
1565
+ InstanceId=engine["instance_id"],
1566
+ )
1567
+ if result["Status"] in ["Success", "Failed"]:
1568
+ break
1569
+
1570
+ if result["Status"] == "Success":
1546
1571
  console.print(
1547
- "[yellow]⚠️ Warning: Failed to restore source engine state.[/yellow]"
1572
+ "[green] Source engine restored to normal operation.[/green]"
1548
1573
  )
1549
1574
  else:
1550
1575
  console.print(
1551
- "[green] Source engine restored to normal operation.[/green]"
1576
+ "[yellow]⚠️ Warning: Engine state restoration incomplete. You may need to run:[/yellow]"
1577
+ )
1578
+ console.print(
1579
+ f"[dim] dh engine repair {engine['name']}[/dim]"
1552
1580
  )
1553
1581
 
1554
1582
  console.print(
@@ -1792,52 +1820,28 @@ def attach_studio(
1792
1820
 
1793
1821
  console.print(f"Attaching studio to engine [cyan]{engine['name']}[/cyan]...")
1794
1822
 
1795
- # Determine retry strategy
1796
- max_attempts = 40 if engine_started_now else 3
1797
- retry_delay = 10 if engine_started_now else 3
1798
-
1823
+ # Determine retry strategy based on whether we just started the engine
1799
1824
  if engine_started_now:
1800
- # Long spinner-based loop while the freshly started engine finishes booting
1801
- with Progress(
1802
- SpinnerColumn(),
1803
- TimeElapsedColumn(),
1804
- TextColumn("[progress.description]{task.description}"),
1805
- transient=True,
1806
- ) as prog:
1807
- task = prog.add_task(
1808
- "Attaching studio (engine is still booting)…", total=None
1809
- )
1810
-
1811
- for attempt in range(max_attempts):
1812
- success, error_msg = _attempt_studio_attach(
1813
- studio, engine, target_user, public_key
1814
- )
1815
-
1816
- if success:
1817
- break # success!
1818
-
1819
- # Update spinner every 3rd try to avoid log spam
1820
- if attempt % 3 == 0:
1821
- prog.update(
1822
- task,
1823
- description=f"Attaching studio (engine is still booting)… {attempt+1}/{max_attempts}",
1824
- )
1825
-
1826
- if error_msg:
1827
- console.print(f"[red]❌ Failed to attach studio: {error_msg}[/red]")
1828
- return
1829
-
1830
- time.sleep(retry_delay)
1831
-
1832
- else:
1833
- console.print(
1834
- "[yellow]Engine is still starting up – please retry in a minute.[/yellow]"
1835
- )
1836
- return
1825
+ max_attempts = 40 # About 7 minutes total with exponential backoff
1826
+ base_delay = 8
1827
+ max_delay = 20
1837
1828
  else:
1838
- # Give the (already-running) engine a little breathing room – e.g. it may still be mounting EFS
1839
- max_attempts = 10 # ~1 min total
1840
- retry_delay = 6
1829
+ max_attempts = 15 # About 2 minutes total with exponential backoff
1830
+ base_delay = 5
1831
+ max_delay = 10
1832
+
1833
+ # Unified retry loop with exponential backoff
1834
+ with Progress(
1835
+ SpinnerColumn(),
1836
+ TimeElapsedColumn(),
1837
+ TextColumn("[progress.description]{task.description}"),
1838
+ transient=True,
1839
+ ) as prog:
1840
+ desc = "Attaching studio (engine is still booting)…" if engine_started_now else "Attaching studio…"
1841
+ task = prog.add_task(desc, total=None)
1842
+
1843
+ consecutive_not_ready = 0
1844
+ last_error = None
1841
1845
 
1842
1846
  for attempt in range(max_attempts):
1843
1847
  success, error_msg = _attempt_studio_attach(
@@ -1845,22 +1849,54 @@ def attach_studio(
1845
1849
  )
1846
1850
 
1847
1851
  if success:
1848
- break # attached!
1852
+ break # success!
1849
1853
 
1850
1854
  if error_msg:
1851
- # Fatal – bubble up immediately
1855
+ # Fatal error – bubble up immediately
1852
1856
  console.print(f"[red]❌ Failed to attach studio: {error_msg}[/red]")
1857
+
1858
+ # Suggest repair command if engine seems broken
1859
+ if "not ready" in error_msg.lower() and attempt > 5:
1860
+ console.print(f"\n[yellow]Engine may be in a bad state. Try:[/yellow]")
1861
+ console.print(f"[dim] dh engine repair {engine['name']}[/dim]")
1853
1862
  return
1854
1863
 
1855
- # Recoverable and still not ready – short wait + optional info
1856
- if attempt < max_attempts - 1:
1857
- console.print("[dim]Engine not ready yet – retrying …[/dim]")
1858
- time.sleep(retry_delay)
1864
+ # Track consecutive "not ready" responses
1865
+ consecutive_not_ready += 1
1866
+ last_error = "Engine not ready"
1867
+
1868
+ # Update progress display
1869
+ if attempt % 3 == 0:
1870
+ prog.update(
1871
+ task,
1872
+ description=f"{desc} attempt {attempt+1}/{max_attempts}",
1873
+ )
1874
+
1875
+ # If engine seems stuck after many attempts, show a hint
1876
+ if consecutive_not_ready > 10 and attempt == 10:
1877
+ console.print(
1878
+ "[yellow]Engine is taking longer than expected to become ready.[/yellow]"
1879
+ )
1880
+ console.print(
1881
+ "[dim]This can happen after GAMI creation or if the engine is still bootstrapping.[/dim]"
1882
+ )
1883
+
1884
+ # Exponential backoff with jitter
1885
+ delay = min(base_delay * (1.5 ** min(attempt, 5)), max_delay)
1886
+ delay += time.time() % 2 # Add 0-2 seconds of jitter
1887
+ time.sleep(delay)
1859
1888
 
1860
1889
  else:
1890
+ # All attempts exhausted
1861
1891
  console.print(
1862
- "[yellow]Engine is busy or still initialising please retry in about a minute.[/yellow]"
1892
+ f"[yellow]Engine is not becoming ready after {max_attempts} attempts.[/yellow]"
1863
1893
  )
1894
+ if last_error:
1895
+ console.print(f"[dim]Last issue: {last_error}[/dim]")
1896
+ console.print("\n[yellow]You can try:[/yellow]")
1897
+ console.print(f" 1. Wait a minute and retry: [cyan]dh studio attach {engine['name']}[/cyan]")
1898
+ console.print(f" 2. Check engine status: [cyan]dh engine status {engine['name']}[/cyan]")
1899
+ console.print(f" 3. Repair the engine: [cyan]dh engine repair {engine['name']}[/cyan]")
1864
1900
  return
1865
1901
 
1866
1902
  # Successful attach path
@@ -2453,3 +2489,113 @@ def debug_engine(
2453
2489
 
2454
2490
  except Exception as e:
2455
2491
  console.print(f"[cyan]{name}:[/cyan] [red]ERROR: {e}[/red]\n")
2492
+
2493
+
2494
+ @engine_app.command("repair")
2495
+ def repair_engine(
2496
+ name_or_id: str = typer.Argument(help="Engine name or instance ID"),
2497
+ ):
2498
+ """Repair an engine that's stuck in a bad state (e.g., after GAMI creation)."""
2499
+ check_aws_sso()
2500
+
2501
+ # Get all engines to resolve name
2502
+ response = make_api_request("GET", "/engines")
2503
+ if response.status_code != 200:
2504
+ console.print("[red]❌ Failed to fetch engines[/red]")
2505
+ raise typer.Exit(1)
2506
+
2507
+ engines = response.json().get("engines", [])
2508
+ engine = resolve_engine(name_or_id, engines)
2509
+
2510
+ if engine["state"].lower() != "running":
2511
+ console.print(f"[yellow]⚠️ Engine is {engine['state']}. Must be running to repair.[/yellow]")
2512
+ if engine["state"].lower() == "stopped" and Confirm.ask("Start the engine first?"):
2513
+ response = make_api_request("POST", f"/engines/{engine['instance_id']}/start")
2514
+ if response.status_code != 200:
2515
+ console.print("[red]❌ Failed to start engine[/red]")
2516
+ raise typer.Exit(1)
2517
+ console.print("[green]✓ Engine started[/green]")
2518
+ console.print("Waiting for engine to become ready...")
2519
+ time.sleep(30) # Give it time to boot
2520
+ else:
2521
+ raise typer.Exit(1)
2522
+
2523
+ console.print(f"[bold]Repairing engine [cyan]{engine['name']}[/cyan][/bold]")
2524
+ console.print("[dim]This will restore bootstrap state and ensure all services are running[/dim]\n")
2525
+
2526
+ ssm = boto3.client("ssm", region_name="us-east-1")
2527
+
2528
+ # Repair commands
2529
+ repair_commands = [
2530
+ # Create necessary directories
2531
+ "sudo mkdir -p /opt/dayhoff /opt/dayhoff/state /opt/dayhoff/scripts",
2532
+
2533
+ # Download scripts from S3 if missing
2534
+ "source /etc/engine.env && sudo aws s3 sync s3://${VM_SCRIPTS_BUCKET}/ /opt/dayhoff/scripts/ --exclude '*' --include '*.sh' --quiet",
2535
+ "sudo chmod +x /opt/dayhoff/scripts/*.sh 2>/dev/null || true",
2536
+
2537
+ # Restore bootstrap state
2538
+ "sudo touch /opt/dayhoff/first_boot_complete.sentinel",
2539
+ "echo 'finished' | sudo tee /opt/dayhoff/state/engine-init.stage > /dev/null",
2540
+
2541
+ # Ensure SSM agent is running
2542
+ "sudo systemctl restart amazon-ssm-agent 2>/dev/null || true",
2543
+
2544
+ # Restart idle detector
2545
+ "sudo systemctl restart engine-idle-detector.timer 2>/dev/null || true",
2546
+ "sudo systemctl restart engine-idle-detector.service 2>/dev/null || true",
2547
+
2548
+ # Report status
2549
+ "echo '=== Repair Complete ===' && echo 'Sentinel: ' && ls -la /opt/dayhoff/first_boot_complete.sentinel",
2550
+ "echo 'Stage: ' && cat /opt/dayhoff/state/engine-init.stage",
2551
+ "echo 'Scripts: ' && ls /opt/dayhoff/scripts/*.sh 2>/dev/null | wc -l",
2552
+ ]
2553
+
2554
+ try:
2555
+ with Progress(
2556
+ SpinnerColumn(),
2557
+ TextColumn("[progress.description]{task.description}"),
2558
+ transient=True,
2559
+ ) as progress:
2560
+ task = progress.add_task("Repairing engine...", total=None)
2561
+
2562
+ response = ssm.send_command(
2563
+ InstanceIds=[engine["instance_id"]],
2564
+ DocumentName="AWS-RunShellScript",
2565
+ Parameters={
2566
+ "commands": repair_commands,
2567
+ "executionTimeout": ["60"],
2568
+ },
2569
+ )
2570
+
2571
+ command_id = response["Command"]["CommandId"]
2572
+
2573
+ # Wait for command
2574
+ for _ in range(60):
2575
+ time.sleep(1)
2576
+ result = ssm.get_command_invocation(
2577
+ CommandId=command_id,
2578
+ InstanceId=engine["instance_id"],
2579
+ )
2580
+ if result["Status"] in ["Success", "Failed"]:
2581
+ break
2582
+
2583
+ if result["Status"] == "Success":
2584
+ output = result["StandardOutputContent"]
2585
+ console.print("[green]✓ Engine repaired successfully![/green]\n")
2586
+
2587
+ # Show repair results
2588
+ if "=== Repair Complete ===" in output:
2589
+ repair_section = output.split("=== Repair Complete ===")[1].strip()
2590
+ console.print("[bold]Repair Results:[/bold]")
2591
+ console.print(repair_section)
2592
+
2593
+ console.print("\n[dim]You should now be able to attach studios to this engine.[/dim]")
2594
+ else:
2595
+ console.print(
2596
+ f"[red]❌ Repair failed: {result.get('StandardErrorContent', 'Unknown error')}[/red]"
2597
+ )
2598
+ console.print("\n[yellow]Try running 'dh engine debug' for more information.[/yellow]")
2599
+
2600
+ except Exception as e:
2601
+ console.print(f"[red]❌ Failed to repair engine: {e}[/red]")
@@ -9,6 +9,25 @@ from zoneinfo import ZoneInfo
9
9
  # Import cloud helper lazily inside functions to avoid heavy deps at module load
10
10
 
11
11
 
12
+ def _find_project_root() -> Path | None:
13
+ """
14
+ Find the project root by searching upwards from the current directory for
15
+ a `.git` directory or a `pyproject.toml` file.
16
+
17
+ Returns:
18
+ The path to the project root, or None if not found.
19
+ """
20
+ current_dir = Path.cwd().resolve()
21
+ while current_dir != current_dir.parent:
22
+ if (current_dir / ".git").is_dir() or (current_dir / "pyproject.toml").is_file():
23
+ return current_dir
24
+ current_dir = current_dir.parent
25
+ # Check the final directory in the hierarchy (e.g., '/')
26
+ if (current_dir / ".git").is_dir() or (current_dir / "pyproject.toml").is_file():
27
+ return current_dir
28
+ return None
29
+
30
+
12
31
  def _warn_if_gcp_default_sa(force_prompt: bool = False) -> None:
13
32
  """Warn the user when the active gcloud principal is the default VM service
14
33
  account. See detailed docstring later in file (duplicate for early
@@ -528,11 +547,17 @@ def import_from_warehouse_typer() -> None:
528
547
  import questionary
529
548
 
530
549
  # Ensure execution from root directory
531
- cwd = Path(os.getcwd())
532
- if cwd.parent.name != "workspaces" or str(cwd.parent.parent) != cwd.root:
533
- raise Exception(
534
- f"This command must be executed from the repo's root directory (/workspaces/reponame). Current directory: {cwd}"
550
+ project_root = _find_project_root()
551
+ cwd = Path.cwd()
552
+ if not project_root or project_root != cwd:
553
+ error_msg = (
554
+ "This command must be run from the project's root directory, which is"
555
+ " expected to contain a `.git` folder or a `pyproject.toml` file.\n"
556
+ f"Current directory: {cwd}"
535
557
  )
558
+ if project_root:
559
+ error_msg += f"\nDetected project root: {project_root}"
560
+ raise Exception(error_msg)
536
561
 
537
562
  # Use questionary for prompts instead of typer
538
563
  warehouse_path = questionary.text("Warehouse path:").ask()
@@ -574,11 +599,17 @@ def get_from_warehouse_typer() -> None:
574
599
  import questionary
575
600
 
576
601
  # Ensure execution from root directory
577
- cwd = Path(os.getcwd())
578
- if cwd.parent.name != "workspaces" or str(cwd.parent.parent) != cwd.root:
579
- raise Exception(
580
- f"This command must be executed from the repo's root directory (/workspaces/reponame). Current directory: {cwd}"
602
+ project_root = _find_project_root()
603
+ cwd = Path.cwd()
604
+ if not project_root or project_root != cwd:
605
+ error_msg = (
606
+ "This command must be run from the project's root directory, which is"
607
+ " expected to contain a `.git` folder or a `pyproject.toml` file.\n"
608
+ f"Current directory: {cwd}"
581
609
  )
610
+ if project_root:
611
+ error_msg += f"\nDetected project root: {project_root}"
612
+ raise Exception(error_msg)
582
613
 
583
614
  # Use questionary for prompts instead of typer
584
615
  warehouse_path = questionary.text("Warehouse path:").ask()
@@ -619,11 +650,17 @@ def add_to_warehouse_typer() -> None:
619
650
  import questionary
620
651
 
621
652
  # Ensure execution from root directory
622
- cwd = Path(os.getcwd())
623
- if cwd.parent.name != "workspaces" or str(cwd.parent.parent) != cwd.root:
624
- raise Exception(
625
- f"This command must be executed from the repo's root directory (/workspaces/reponame). Current directory: {cwd}"
653
+ project_root = _find_project_root()
654
+ cwd = Path.cwd()
655
+ if not project_root or project_root != cwd:
656
+ error_msg = (
657
+ "This command must be run from the project's root directory, which is"
658
+ " expected to contain a `.git` folder or a `pyproject.toml` file.\n"
659
+ f"Current directory: {cwd}"
626
660
  )
661
+ if project_root:
662
+ error_msg += f"\nDetected project root: {project_root}"
663
+ raise Exception(error_msg)
627
664
 
628
665
  # Prompt for the data file path
629
666
  warehouse_path = questionary.text("Data file to be registered:").ask()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: dayhoff-tools
3
- Version: 1.5.6
3
+ Version: 1.5.8
4
4
  Summary: Common tools for all the repos at Dayhoff Labs
5
5
  Author: Daniel Martin-Alarcon
6
6
  Author-email: dma@dayhofflabs.com
@@ -3,7 +3,7 @@ dayhoff_tools/chemistry/standardizer.py,sha256=uMn7VwHnx02nc404eO6fRuS4rsl4dvSPf
3
3
  dayhoff_tools/chemistry/utils.py,sha256=jt-7JgF-GeeVC421acX-bobKbLU_X94KNOW24p_P-_M,2257
4
4
  dayhoff_tools/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  dayhoff_tools/cli/cloud_commands.py,sha256=33qcWLmq-FwEXMdL3F0OHm-5Stlh2r65CldyEZgQ1no,40904
6
- dayhoff_tools/cli/engine_commands.py,sha256=_WzJpxwGVmN0vem6oVi5FosQkKaEGCLUgoUHoKSWejg,89475
6
+ dayhoff_tools/cli/engine_commands.py,sha256=oY291nhCsU470Alol8VxXn_e2fbB7ykXFayH3AICK9g,96371
7
7
  dayhoff_tools/cli/main.py,sha256=tRN7WCBHg6uyNp6rA54pKTCoVmBntta2i0Yas3bUpZ4,4853
8
8
  dayhoff_tools/cli/swarm_commands.py,sha256=5EyKj8yietvT5lfoz8Zx0iQvVaNgc3SJX1z2zQR6o6M,5614
9
9
  dayhoff_tools/cli/utility_commands.py,sha256=FRZTPrjsG_qmIIqoNxd1Q1vVkS_5w8aY33IrVYVNCLg,18131
@@ -26,8 +26,8 @@ dayhoff_tools/intake/structure.py,sha256=ufN3gAodQxhnt7psK1VTQeu9rKERmo_PhoxIbB4
26
26
  dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJqE4,16456
27
27
  dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
28
28
  dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
29
- dayhoff_tools/warehouse.py,sha256=fV3goH2cH1Y0oLpGERnu4p70P2JfByJHjBh_oMRv9C0,23134
30
- dayhoff_tools-1.5.6.dist-info/METADATA,sha256=Hi5GVM9uuiyHHt9vaL4FPmpUCxd60O8Y7031pGfZhIU,2914
31
- dayhoff_tools-1.5.6.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
32
- dayhoff_tools-1.5.6.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
33
- dayhoff_tools-1.5.6.dist-info/RECORD,,
29
+ dayhoff_tools/warehouse.py,sha256=heaYc64qplgN3_1WVPFmqj53goStioWwY5NqlWc4c0s,24453
30
+ dayhoff_tools-1.5.8.dist-info/METADATA,sha256=M5694yUrFz-O9IfiuxskFZZae8mXnJ5xA1GiRJfHHJQ,2914
31
+ dayhoff_tools-1.5.8.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
32
+ dayhoff_tools-1.5.8.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
33
+ dayhoff_tools-1.5.8.dist-info/RECORD,,