dayhoff-tools 1.7.4__py3-none-any.whl → 1.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -319,6 +319,88 @@ def format_status(state: str, ready: Optional[bool]) -> str:
319
319
  return state
320
320
 
321
321
 
322
+ # --------------------------------------------------------------------------------
323
+ # Audit helpers (Phase 1 observability)
324
+ # --------------------------------------------------------------------------------
325
+
326
+
327
def _get_engine_audit_bucket() -> Optional[str]:
    """Look up the engine audit bucket name in SSM Parameter Store.

    Returns:
        The value of the ``/dev/studio-manager/engine-audit-bucket`` parameter,
        or None when the lookup raises a ``ClientError``.
    """
    try:
        parameter_store = boto3.client("ssm", region_name="us-east-1")
        lookup = parameter_store.get_parameter(
            Name="/dev/studio-manager/engine-audit-bucket"
        )
    except ClientError:
        # Treated as "no audit bucket configured" by callers.
        return None
    else:
        return lookup["Parameter"]["Value"]
335
+
336
+
337
def _fetch_last_audit_via_ssm(instance_id: str) -> Optional[Dict]:
    """Fetch the last shutdown-attempt audit from the engine via SSM (best-effort).

    Sends an ``AWS-RunShellScript`` command that prints
    ``/var/log/idle-detector/last_shutdown_attempt.json`` on the instance and
    parses the command's stdout as JSON.

    The invocation is polled for a few seconds instead of being read once
    after a fixed sleep: right after ``send_command`` the invocation may not
    be registered yet, or may still be pending/in progress, and a single read
    would then report "no audit" even though the file exists.

    Args:
        instance_id: EC2 instance ID of the engine to query.

    Returns:
        The parsed audit object, or None when the command does not finish
        successfully within the wait budget, produces no output, produces
        output that is not a JSON object, or any error occurs.
    """
    poll_interval_sec = 0.5
    max_wait_sec = 4.0  # Keeps `engine status` responsive; this is best-effort.
    unfinished_states = ("Pending", "InProgress", "Delayed")

    try:
        ssm = boto3.client("ssm", region_name="us-east-1")
        resp = ssm.send_command(
            InstanceIds=[instance_id],
            DocumentName="AWS-RunShellScript",
            Parameters={
                "commands": [
                    "cat /var/log/idle-detector/last_shutdown_attempt.json 2>/dev/null || true",
                ],
                "executionTimeout": ["3"],
            },
        )
        cid = resp["Command"]["CommandId"]

        inv = None
        deadline = time.monotonic() + max_wait_sec
        while time.monotonic() < deadline:
            time.sleep(poll_interval_sec)
            try:
                inv = ssm.get_command_invocation(
                    CommandId=cid, InstanceId=instance_id
                )
            except ssm.exceptions.InvocationDoesNotExist:
                # The invocation is not visible yet; keep waiting.
                continue
            if inv["Status"] not in unfinished_states:
                break

        if inv is None or inv["Status"] != "Success":
            return None
        content = inv["StandardOutputContent"].strip()
        if not content:
            return None
        audit = json.loads(content)
        # Callers use dict access on the result; reject any other JSON shape.
        return audit if isinstance(audit, dict) else None
    except Exception:
        # Best-effort helper: any AWS or parsing failure means "no audit".
        return None
362
+
363
+
364
def _fetch_last_audit_via_s3(instance_id: str) -> Optional[Dict]:
    """Load the most recently modified audit object for an engine from S3.

    Objects are listed under the ``<instance_id>/`` prefix of the bucket
    returned by ``_get_engine_audit_bucket()``; the one with the greatest
    ``LastModified`` is downloaded and parsed as JSON.

    Args:
        instance_id: EC2 instance ID used as the key prefix.

    Returns:
        The parsed JSON content, or None when no bucket is configured, no
        object is found, or any error occurs while listing/reading/parsing.
    """
    bucket_name = _get_engine_audit_bucket()
    if not bucket_name:
        return None

    try:
        client = boto3.client("s3", region_name="us-east-1")
        lister = client.get_paginator("list_objects_v2")
        latest = None
        for listing in lister.paginate(
            Bucket=bucket_name, Prefix=f"{instance_id}/", MaxKeys=1000
        ):
            for candidate in listing.get("Contents", []):
                if latest is not None:
                    modified = candidate.get("LastModified")
                    # Keep the current pick unless this one is strictly newer.
                    if not modified or not (modified > latest["LastModified"]):
                        continue
                latest = candidate

        if not latest:
            return None

        payload = client.get_object(Bucket=bucket_name, Key=latest["Key"])
        text = payload["Body"].read().decode("utf-8")
        return json.loads(text)
    except Exception:
        return None
387
+
388
+
389
+ def _summarize_audit(audit: Dict) -> str:
390
+ """Return a compact one-line summary for status output."""
391
+ ts = audit.get("ts", "?")
392
+ shutdown = audit.get("shutdown", {})
393
+ result = shutdown.get("result", "?")
394
+ detach = audit.get("detach", {})
395
+ num_detached = sum(
396
+ 1 for r in detach.get("results", []) if r.get("status") == "success"
397
+ )
398
+ idle = audit.get("idle", {})
399
+ elapsed = int(idle.get("elapsed_sec", 0))
400
+ threshold = int(idle.get("threshold_sec", 0))
401
+ return f"Last shutdown attempt: {ts} result={result} detach={num_detached} idle={elapsed//60}/{threshold//60}m"
402
+
403
+
322
404
  def resolve_engine(name_or_id: str, engines: List[Dict]) -> Dict:
323
405
  """Resolve engine by name or ID with interactive selection."""
324
406
  # Exact ID match
@@ -397,7 +479,9 @@ def get_ssh_public_key() -> str:
397
479
  if shutil.which("ssh-add") is not None:
398
480
  proc = subprocess.run(["ssh-add", "-L"], capture_output=True, text=True)
399
481
  if proc.returncode == 0 and proc.stdout:
400
- keys = [line.strip() for line in proc.stdout.splitlines() if line.strip()]
482
+ keys = [
483
+ line.strip() for line in proc.stdout.splitlines() if line.strip()
484
+ ]
401
485
  # Prefer ed25519, then rsa
402
486
  for pref in ("ssh-ed25519", "ssh-rsa", "ecdsa-sha2-nistp256"):
403
487
  for k in keys:
@@ -437,7 +521,9 @@ def check_session_manager_plugin():
437
521
  return True
438
522
 
439
523
 
440
- def update_ssh_config_entry(engine_name: str, instance_id: str, ssh_user: str, idle_timeout: int = 600):
524
+ def update_ssh_config_entry(
525
+ engine_name: str, instance_id: str, ssh_user: str, idle_timeout: int = 600
526
+ ):
441
527
  """Add or update a single SSH config entry for the given SSH user.
442
528
 
443
529
  Args:
@@ -462,7 +548,10 @@ def update_ssh_config_entry(engine_name: str, instance_id: str, ssh_user: str, i
462
548
  skip_until_next_host = False
463
549
  for line in lines:
464
550
  # Check if this is our managed host
465
- if line.strip().startswith(f"Host {engine_name}") and SSH_MANAGED_COMMENT in line:
551
+ if (
552
+ line.strip().startswith(f"Host {engine_name}")
553
+ and SSH_MANAGED_COMMENT in line
554
+ ):
466
555
  skip_until_next_host = True
467
556
  elif line.strip().startswith("Host ") and skip_until_next_host:
468
557
  skip_until_next_host = False
@@ -474,13 +563,15 @@ def update_ssh_config_entry(engine_name: str, instance_id: str, ssh_user: str, i
474
563
  # Add the new entry
475
564
  if new_lines and new_lines[-1].strip(): # Add blank line if needed
476
565
  new_lines.append("")
477
-
478
- new_lines.extend([
479
- f"Host {engine_name} {SSH_MANAGED_COMMENT}",
480
- f" HostName {instance_id}",
481
- f" User {ssh_user}",
482
- f" ProxyCommand sh -c \"AWS_SSM_IDLE_TIMEOUT={idle_timeout} aws ssm start-session --target %h --document-name AWS-StartSSHSession --parameters 'portNumber=%p'\"",
483
- ])
566
+
567
+ new_lines.extend(
568
+ [
569
+ f"Host {engine_name} {SSH_MANAGED_COMMENT}",
570
+ f" HostName {instance_id}",
571
+ f" User {ssh_user}",
572
+ f" ProxyCommand sh -c \"AWS_SSM_IDLE_TIMEOUT={idle_timeout} aws ssm start-session --target %h --document-name AWS-StartSSHSession --parameters 'portNumber=%p'\"",
573
+ ]
574
+ )
484
575
 
485
576
  # Write back
486
577
  config_path.write_text("\n".join(new_lines))
@@ -662,16 +753,20 @@ def list_engines(
662
753
  ]
663
754
  if detailed:
664
755
  row_data.append(disk_usage)
665
- row_data.extend([
666
- time_str,
667
- f"${hourly_cost:.2f}",
668
- ])
669
-
756
+ row_data.extend(
757
+ [
758
+ time_str,
759
+ f"${hourly_cost:.2f}",
760
+ ]
761
+ )
762
+
670
763
  table.add_row(*row_data)
671
764
 
672
765
  console.print(table)
673
766
  if not detailed and any(e["state"].lower() == "running" for e in engines):
674
- console.print("\n[dim]Tip: Use --detailed to see disk usage and bootstrap status (slower)[/dim]")
767
+ console.print(
768
+ "\n[dim]Tip: Use --detailed to see disk usage and bootstrap status (slower)[/dim]"
769
+ )
675
770
  else:
676
771
  error = response.json().get("error", "Unknown error")
677
772
  console.print(f"[red]❌ Failed to list engines: {error}[/red]")
@@ -798,6 +893,16 @@ def engine_status(
798
893
  f" • [magenta]🖥 IDE connected ({ide_conn['connection_count']} connections)[/magenta]"
799
894
  )
800
895
 
896
+ # Audit one-liner (best-effort SSM fetch)
897
+ try:
898
+ last_audit = _fetch_last_audit_via_ssm(engine["instance_id"])
899
+ if last_audit:
900
+ status_lines.append("")
901
+ status_lines.append("[bold]Idle Audit:[/bold]")
902
+ status_lines.append(f" • {_summarize_audit(last_audit)}")
903
+ except Exception:
904
+ pass
905
+
801
906
  if attached_studios:
802
907
  status_lines.append("")
803
908
  status_lines.append("[bold]Attached Studios:[/bold]")
@@ -839,6 +944,67 @@ def engine_status(
839
944
  console.print(f"[red]❌ Error fetching log: {e}[/red]")
840
945
 
841
946
 
947
@engine_app.command("why")
def engine_why(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    raw: bool = typer.Option(False, "--raw", help="Print raw audit JSON"),
):
    """Explain the last idle-detector shutdown attempt for an engine.

    Tries SSM (on-instance file) first, then falls back to S3 audit bucket.
    """
    check_aws_sso()

    # Look up the engine list so the name/ID argument can be resolved.
    listing = make_api_request("GET", "/engines")
    if listing.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engine = resolve_engine(name_or_id, listing.json().get("engines", []))

    # Prefer the on-instance file; only consult S3 when SSM yields nothing.
    audit = _fetch_last_audit_via_ssm(engine["instance_id"])
    if not audit:
        audit = _fetch_last_audit_via_s3(engine["instance_id"])
    if not audit:
        console.print("No audit found (engine may not have attempted shutdown yet).")
        raise typer.Exit(0)

    if raw:
        console.print_json(data=audit)
        return

    # Human-readable report: headline, per-sensor state, detach outcomes.
    headline = _summarize_audit(audit)
    report = [f"[bold]{headline}[/bold]", "", "[bold]Sensors:[/bold]"]

    sensor_rows = audit.get("idle", {}).get("reason_snapshot", [])
    for row in sensor_rows:
        marker = "✓" if row.get("active") else "-"
        why = row.get("reason") or ""
        report.append(f" {marker} {row.get('sensor')}: {why}")

    report.extend(["", "[bold]Detach results:[/bold]"])
    detach_rows = audit.get("detach", {}).get("results", [])
    for outcome in detach_rows:
        report.append(
            f" - {outcome.get('studio_id')}: {outcome.get('status')} {outcome.get('error') or ''}"
        )

    # Point at the uploaded copy only when the audit says it was uploaded.
    upload = audit.get("s3", {})
    if upload.get("uploaded"):
        report.extend(
            ["", f"[dim]S3: s3://{upload.get('bucket')}/{upload.get('key')}[/dim]"]
        )

    body = "\n".join(report)
    console.print(Panel(body, title="Idle Shutdown Audit", border_style="blue"))
1006
+
1007
+
842
1008
  @engine_app.command("stop")
843
1009
  def stop_engine(
844
1010
  name_or_id: str = typer.Argument(help="Engine name or instance ID"),
@@ -969,7 +1135,9 @@ def ssh_engine(
969
1135
  False, "--admin", help="Connect as ec2-user instead of the engine owner user"
970
1136
  ),
971
1137
  idle_timeout: int = typer.Option(
972
- 600, "--idle-timeout", help="Idle timeout (seconds) for the SSM port-forward (0 = disable)"
1138
+ 600,
1139
+ "--idle-timeout",
1140
+ help="Idle timeout (seconds) for the SSM port-forward (0 = disable)",
973
1141
  ),
974
1142
  ):
975
1143
  """Connect to an engine via SSH.
@@ -1003,7 +1171,9 @@ def ssh_engine(
1003
1171
  console.print(
1004
1172
  f"Updating SSH config for [cyan]{engine['name']}[/cyan] (user: {ssh_user})..."
1005
1173
  )
1006
- update_ssh_config_entry(engine["name"], engine["instance_id"], ssh_user, idle_timeout)
1174
+ update_ssh_config_entry(
1175
+ engine["name"], engine["instance_id"], ssh_user, idle_timeout
1176
+ )
1007
1177
 
1008
1178
  # Connect
1009
1179
  console.print(f"[green]✓ Connecting to {engine['name']}...[/green]")
@@ -1421,7 +1591,7 @@ def create_ami(
1421
1591
  if response.status_code != 200:
1422
1592
  console.print("[red]❌ Failed to fetch engines[/red]")
1423
1593
  raise typer.Exit(1)
1424
-
1594
+
1425
1595
  engines = response.json().get("engines", [])
1426
1596
  engine = resolve_engine(name_or_id, engines)
1427
1597
 
@@ -1436,15 +1606,19 @@ def create_ami(
1436
1606
  # 2. Check for attached studios from the detailed API response
1437
1607
  attached_studios = engine.get("studios", [])
1438
1608
  if attached_studios:
1439
- console.print(f"[bold red]❌ Engine '{engine['name']}' has studios attached.[/bold red]")
1609
+ console.print(
1610
+ f"[bold red]❌ Engine '{engine['name']}' has studios attached.[/bold red]"
1611
+ )
1440
1612
  console.print("Please detach all studios before creating an AMI:")
1441
1613
  for studio in attached_studios:
1442
- console.print(f" - {studio['user']} ({studio['studio_id']})")
1614
+ console.print(f" - {studio['user']} ({studio['studio_id']})")
1443
1615
  console.print("\nTo detach, run [bold]dh studio detach[/bold]")
1444
1616
  raise typer.Exit(1)
1445
1617
 
1446
1618
  # Construct AMI name and description
1447
- ami_name = f"prewarmed-engine-{engine['engine_type']}-{datetime.now().strftime('%Y%m%d')}"
1619
+ ami_name = (
1620
+ f"prewarmed-engine-{engine['engine_type']}-{datetime.now().strftime('%Y%m%d')}"
1621
+ )
1448
1622
  description = (
1449
1623
  f"Amazon Linux 2023 with NVIDIA drivers, Docker, and pre-pulled "
1450
1624
  f"dev container image for {engine['engine_type']} engines"
@@ -1453,7 +1627,7 @@ def create_ami(
1453
1627
  console.print(f"Creating AMI from engine [cyan]{engine['name']}[/cyan]...")
1454
1628
  console.print(f"[bold]AMI Name:[/] {ami_name}")
1455
1629
  console.print(f"[bold]Description:[/] {description}")
1456
-
1630
+
1457
1631
  console.print(
1458
1632
  "\n[bold yellow]⚠️ Important:[/bold yellow]\n"
1459
1633
  "1. This process will run cleanup scripts on the engine.\n"
@@ -1475,7 +1649,7 @@ def create_ami(
1475
1649
  "history -c",
1476
1650
  "sudo rm -rf /tmp/* /var/log/messages /var/log/cloud-init.log",
1477
1651
  "sudo rm -rf /var/lib/amazon/ssm/* /etc/amazon/ssm/*",
1478
- "sleep 2 && sudo systemctl stop amazon-ssm-agent &", # Stop agent last
1652
+ "sleep 2 && sudo systemctl stop amazon-ssm-agent &", # Stop agent last
1479
1653
  ]
1480
1654
 
1481
1655
  cleanup_response = ssm.send_command(
@@ -1485,16 +1659,20 @@ def create_ami(
1485
1659
  )
1486
1660
 
1487
1661
  # Acknowledge that the SSM command might be in progress as the agent shuts down
1488
- console.print("[dim]ℹ️ Cleanup command sent (status may show 'InProgress' as SSM agent stops)[/dim]")
1489
-
1662
+ console.print(
1663
+ "[dim]ℹ️ Cleanup command sent (status may show 'InProgress' as SSM agent stops)[/dim]"
1664
+ )
1665
+
1490
1666
  # Create the AMI
1491
1667
  with Progress(
1492
1668
  SpinnerColumn(),
1493
1669
  TextColumn("[progress.description]{task.description}"),
1494
1670
  transient=True,
1495
1671
  ) as progress:
1496
- task = progress.add_task("Creating AMI (this will take several minutes)...", total=None)
1497
-
1672
+ task = progress.add_task(
1673
+ "Creating AMI (this will take several minutes)...", total=None
1674
+ )
1675
+
1498
1676
  response = ec2.create_image(
1499
1677
  InstanceId=engine["instance_id"],
1500
1678
  Name=ami_name,
@@ -1506,7 +1684,7 @@ def create_ami(
1506
1684
  "Tags": [
1507
1685
  {"Key": "Environment", "Value": "dev"},
1508
1686
  {"Key": "Type", "Value": "golden-ami"},
1509
- {"Key": "EngineType", "Value": engine['engine_type']},
1687
+ {"Key": "EngineType", "Value": engine["engine_type"]},
1510
1688
  {"Key": "Name", "Value": ami_name},
1511
1689
  ],
1512
1690
  }
@@ -1514,7 +1692,11 @@ def create_ami(
1514
1692
  )
1515
1693
 
1516
1694
  ami_id = response["ImageId"]
1517
- progress.update(task, completed=True, description=f"[green]✓ AMI creation initiated![/green]")
1695
+ progress.update(
1696
+ task,
1697
+ completed=True,
1698
+ description=f"[green]✓ AMI creation initiated![/green]",
1699
+ )
1518
1700
 
1519
1701
  console.print(f" [bold]AMI ID:[/] {ami_id}")
1520
1702
  console.print("\nThe AMI creation process will continue in the background.")
@@ -1523,14 +1705,15 @@ def create_ami(
1523
1705
  "\nOnce complete, update the AMI ID in [bold]terraform/environments/dev/variables.tf[/bold] "
1524
1706
  "and run [bold]terraform apply[/bold]."
1525
1707
  )
1526
- console.print(f"\nRemember to [bold red]terminate the source engine '{engine['name']}'[/bold red] to save costs.")
1708
+ console.print(
1709
+ f"\nRemember to [bold red]terminate the source engine '{engine['name']}'[/bold red] to save costs."
1710
+ )
1527
1711
 
1528
1712
  except ClientError as e:
1529
1713
  console.print(f"[red]❌ Failed to create AMI: {e}[/red]")
1530
1714
  raise typer.Exit(1)
1531
1715
 
1532
1716
 
1533
-
1534
1717
  # ==================== STUDIO COMMANDS ====================
1535
1718
 
1536
1719
 
@@ -1803,9 +1986,13 @@ def attach_studio(
1803
1986
  TextColumn("[progress.description]{task.description}"),
1804
1987
  transient=True,
1805
1988
  ) as prog:
1806
- desc = "Attaching studio (engine is still booting)…" if engine_started_now else "Attaching studio…"
1989
+ desc = (
1990
+ "Attaching studio (engine is still booting)…"
1991
+ if engine_started_now
1992
+ else "Attaching studio…"
1993
+ )
1807
1994
  task = prog.add_task(desc, total=None)
1808
-
1995
+
1809
1996
  consecutive_not_ready = 0
1810
1997
  last_error = None
1811
1998
 
@@ -1814,7 +2001,7 @@ def attach_studio(
1814
2001
  if _is_studio_attached(studio["studio_id"], engine["instance_id"]):
1815
2002
  success = True
1816
2003
  break
1817
-
2004
+
1818
2005
  success, error_msg = _attempt_studio_attach(
1819
2006
  studio, engine, target_user, public_key
1820
2007
  )
@@ -1825,24 +2012,26 @@ def attach_studio(
1825
2012
  if error_msg:
1826
2013
  # Fatal error – bubble up immediately
1827
2014
  console.print(f"[red]❌ Failed to attach studio: {error_msg}[/red]")
1828
-
2015
+
1829
2016
  # Suggest repair command if engine seems broken
1830
2017
  if "not ready" in error_msg.lower() and attempt > 5:
1831
- console.print(f"\n[yellow]Engine may be in a bad state. Try:[/yellow]")
2018
+ console.print(
2019
+ f"\n[yellow]Engine may be in a bad state. Try:[/yellow]"
2020
+ )
1832
2021
  console.print(f"[dim] dh engine repair {engine['name']}[/dim]")
1833
2022
  return
1834
2023
 
1835
2024
  # Track consecutive "not ready" responses
1836
2025
  consecutive_not_ready += 1
1837
2026
  last_error = "Engine not ready"
1838
-
2027
+
1839
2028
  # Update progress display
1840
2029
  if attempt % 3 == 0:
1841
2030
  prog.update(
1842
2031
  task,
1843
2032
  description=f"{desc} attempt {attempt+1}/{max_attempts}",
1844
2033
  )
1845
-
2034
+
1846
2035
  # If engine seems stuck after many attempts, show a hint
1847
2036
  if consecutive_not_ready > 10 and attempt == 10:
1848
2037
  console.print(
@@ -1865,9 +2054,15 @@ def attach_studio(
1865
2054
  if last_error:
1866
2055
  console.print(f"[dim]Last issue: {last_error}[/dim]")
1867
2056
  console.print("\n[yellow]You can try:[/yellow]")
1868
- console.print(f" 1. Wait a minute and retry: [cyan]dh studio attach {engine['name']}[/cyan]")
1869
- console.print(f" 2. Check engine status: [cyan]dh engine status {engine['name']}[/cyan]")
1870
- console.print(f" 3. Repair the engine: [cyan]dh engine repair {engine['name']}[/cyan]")
2057
+ console.print(
2058
+ f" 1. Wait a minute and retry: [cyan]dh studio attach {engine['name']}[/cyan]"
2059
+ )
2060
+ console.print(
2061
+ f" 2. Check engine status: [cyan]dh engine status {engine['name']}[/cyan]"
2062
+ )
2063
+ console.print(
2064
+ f" 3. Repair the engine: [cyan]dh engine repair {engine['name']}[/cyan]"
2065
+ )
1871
2066
  return
1872
2067
 
1873
2068
  # Successful attach path
@@ -1900,13 +2095,13 @@ def _attempt_studio_attach(studio, engine, target_user, public_key):
1900
2095
  # The operation status polling is broken in the Lambda, so we just
1901
2096
  # wait and check if the studio is actually attached
1902
2097
  time.sleep(5) # Give the async operation a moment to start
1903
-
2098
+
1904
2099
  # Check periodically if the studio is attached
1905
2100
  for check in range(20): # Check for up to 60 seconds
1906
2101
  if _is_studio_attached(studio["studio_id"], engine["instance_id"]):
1907
2102
  return True, None
1908
2103
  time.sleep(3)
1909
-
2104
+
1910
2105
  # If we get here, attachment didn't complete in reasonable time
1911
2106
  return False, None # Return None to trigger retry
1912
2107
 
@@ -1914,15 +2109,19 @@ def _attempt_studio_attach(studio, engine, target_user, public_key):
1914
2109
  recoverable = False
1915
2110
  error_text = response.json().get("error", "Unknown error")
1916
2111
  err_msg = error_text.lower()
1917
-
2112
+
1918
2113
  # Check for "Studio is not available (status: in-use)" which means it's already attached
1919
- if response.status_code == 400 and "not available" in err_msg and "in-use" in err_msg:
2114
+ if (
2115
+ response.status_code == 400
2116
+ and "not available" in err_msg
2117
+ and "in-use" in err_msg
2118
+ ):
1920
2119
  # Studio is already attached somewhere - check if it's to THIS engine
1921
2120
  if _is_studio_attached(studio["studio_id"], engine["instance_id"]):
1922
2121
  return True, None # It's attached to our target engine - success!
1923
2122
  else:
1924
2123
  return False, error_text # It's attached elsewhere - fatal error
1925
-
2124
+
1926
2125
  if response.status_code in (409, 503):
1927
2126
  recoverable = True
1928
2127
  else:
@@ -1952,7 +2151,7 @@ def _attempt_studio_attach(studio, engine, target_user, public_key):
1952
2151
 
1953
2152
  # Note: _poll_operation was removed because the Lambda's operation tracking is broken.
1954
2153
  # We now use _is_studio_attached() to check if the studio is actually attached instead.
1955
-
2154
+
1956
2155
 
1957
2156
  @studio_app.command("detach")
1958
2157
  def detach_studio(
@@ -2504,9 +2703,15 @@ def repair_engine(
2504
2703
  engine = resolve_engine(name_or_id, engines)
2505
2704
 
2506
2705
  if engine["state"].lower() != "running":
2507
- console.print(f"[yellow]⚠️ Engine is {engine['state']}. Must be running to repair.[/yellow]")
2508
- if engine["state"].lower() == "stopped" and Confirm.ask("Start the engine first?"):
2509
- response = make_api_request("POST", f"/engines/{engine['instance_id']}/start")
2706
+ console.print(
2707
+ f"[yellow]⚠️ Engine is {engine['state']}. Must be running to repair.[/yellow]"
2708
+ )
2709
+ if engine["state"].lower() == "stopped" and Confirm.ask(
2710
+ "Start the engine first?"
2711
+ ):
2712
+ response = make_api_request(
2713
+ "POST", f"/engines/{engine['instance_id']}/start"
2714
+ )
2510
2715
  if response.status_code != 200:
2511
2716
  console.print("[red]❌ Failed to start engine[/red]")
2512
2717
  raise typer.Exit(1)
@@ -2517,7 +2722,9 @@ def repair_engine(
2517
2722
  raise typer.Exit(1)
2518
2723
 
2519
2724
  console.print(f"[bold]Repairing engine [cyan]{engine['name']}[/cyan][/bold]")
2520
- console.print("[dim]This will restore bootstrap state and ensure all services are running[/dim]\n")
2725
+ console.print(
2726
+ "[dim]This will restore bootstrap state and ensure all services are running[/dim]\n"
2727
+ )
2521
2728
 
2522
2729
  ssm = boto3.client("ssm", region_name="us-east-1")
2523
2730
 
@@ -2525,22 +2732,17 @@ def repair_engine(
2525
2732
  repair_commands = [
2526
2733
  # Create necessary directories
2527
2734
  "sudo mkdir -p /opt/dayhoff /opt/dayhoff/state /opt/dayhoff/scripts",
2528
-
2529
2735
  # Download scripts from S3 if missing
2530
2736
  "source /etc/engine.env && sudo aws s3 sync s3://${VM_SCRIPTS_BUCKET}/ /opt/dayhoff/scripts/ --exclude '*' --include '*.sh' --quiet",
2531
2737
  "sudo chmod +x /opt/dayhoff/scripts/*.sh 2>/dev/null || true",
2532
-
2533
2738
  # Restore bootstrap state
2534
2739
  "sudo touch /opt/dayhoff/first_boot_complete.sentinel",
2535
2740
  "echo 'finished' | sudo tee /opt/dayhoff/state/engine-init.stage > /dev/null",
2536
-
2537
2741
  # Ensure SSM agent is running
2538
2742
  "sudo systemctl restart amazon-ssm-agent 2>/dev/null || true",
2539
-
2540
2743
  # Restart idle detector
2541
2744
  "sudo systemctl restart engine-idle-detector.timer 2>/dev/null || true",
2542
2745
  "sudo systemctl restart engine-idle-detector.service 2>/dev/null || true",
2543
-
2544
2746
  # Report status
2545
2747
  "echo '=== Repair Complete ===' && echo 'Sentinel: ' && ls -la /opt/dayhoff/first_boot_complete.sentinel",
2546
2748
  "echo 'Stage: ' && cat /opt/dayhoff/state/engine-init.stage",
@@ -2579,19 +2781,23 @@ def repair_engine(
2579
2781
  if result["Status"] == "Success":
2580
2782
  output = result["StandardOutputContent"]
2581
2783
  console.print("[green]✓ Engine repaired successfully![/green]\n")
2582
-
2784
+
2583
2785
  # Show repair results
2584
2786
  if "=== Repair Complete ===" in output:
2585
2787
  repair_section = output.split("=== Repair Complete ===")[1].strip()
2586
2788
  console.print("[bold]Repair Results:[/bold]")
2587
2789
  console.print(repair_section)
2588
-
2589
- console.print("\n[dim]You should now be able to attach studios to this engine.[/dim]")
2790
+
2791
+ console.print(
2792
+ "\n[dim]You should now be able to attach studios to this engine.[/dim]"
2793
+ )
2590
2794
  else:
2591
2795
  console.print(
2592
2796
  f"[red]❌ Repair failed: {result.get('StandardErrorContent', 'Unknown error')}[/red]"
2593
2797
  )
2594
- console.print("\n[yellow]Try running 'dh engine debug' for more information.[/yellow]")
2798
+ console.print(
2799
+ "\n[yellow]Try running 'dh engine debug' for more information.[/yellow]"
2800
+ )
2595
2801
 
2596
2802
  except Exception as e:
2597
2803
  console.print(f"[red]❌ Failed to repair engine: {e}[/red]")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: dayhoff-tools
3
- Version: 1.7.4
3
+ Version: 1.7.5
4
4
  Summary: Common tools for all the repos at Dayhoff Labs
5
5
  Author: Daniel Martin-Alarcon
6
6
  Author-email: dma@dayhofflabs.com
@@ -3,7 +3,7 @@ dayhoff_tools/chemistry/standardizer.py,sha256=uMn7VwHnx02nc404eO6fRuS4rsl4dvSPf
3
3
  dayhoff_tools/chemistry/utils.py,sha256=jt-7JgF-GeeVC421acX-bobKbLU_X94KNOW24p_P-_M,2257
4
4
  dayhoff_tools/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  dayhoff_tools/cli/cloud_commands.py,sha256=33qcWLmq-FwEXMdL3F0OHm-5Stlh2r65CldyEZgQ1no,40904
6
- dayhoff_tools/cli/engine_commands.py,sha256=_NQI9x4VKtAC9vTDdA4NGrILf2kb005t3Kw0aZ5oroE,96485
6
+ dayhoff_tools/cli/engine_commands.py,sha256=0syRVrJjWtRi7Y_q7MbEA5PKJ8TSXtEodHzxXu2Ymhs,102461
7
7
  dayhoff_tools/cli/main.py,sha256=LLMybU9KbtV_F4rwvoYAQZKTTF1nswlSZIfDMKdkh00,5925
8
8
  dayhoff_tools/cli/swarm_commands.py,sha256=5EyKj8yietvT5lfoz8Zx0iQvVaNgc3SJX1z2zQR6o6M,5614
9
9
  dayhoff_tools/cli/utility_commands.py,sha256=q2XyNy1uMGKg95cRXv1uA4MhaUDqlDBvkQwmvRo3hnA,25865
@@ -27,7 +27,7 @@ dayhoff_tools/intake/uniprot.py,sha256=BZYJQF63OtPcBBnQ7_P9gulxzJtqyorgyuDiPeOJq
27
27
  dayhoff_tools/logs.py,sha256=DKdeP0k0kliRcilwvX0mUB2eipO5BdWUeHwh-VnsICs,838
28
28
  dayhoff_tools/sqlite.py,sha256=jV55ikF8VpTfeQqqlHSbY8OgfyfHj8zgHNpZjBLos_E,18672
29
29
  dayhoff_tools/warehouse.py,sha256=heaYc64qplgN3_1WVPFmqj53goStioWwY5NqlWc4c0s,24453
30
- dayhoff_tools-1.7.4.dist-info/METADATA,sha256=veRc-71WGUY0_5WZCxZUibD-omlTEE_lOvac4WjYOOg,2914
31
- dayhoff_tools-1.7.4.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
32
- dayhoff_tools-1.7.4.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
33
- dayhoff_tools-1.7.4.dist-info/RECORD,,
30
+ dayhoff_tools-1.7.5.dist-info/METADATA,sha256=tnZl6OD70iojC4Avmej4LaBhAAQO2afIzZ4PBj6BEws,2914
31
+ dayhoff_tools-1.7.5.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
32
+ dayhoff_tools-1.7.5.dist-info/entry_points.txt,sha256=iAf4jteNqW3cJm6CO6czLxjW3vxYKsyGLZ8WGmxamSc,49
33
+ dayhoff_tools-1.7.5.dist-info/RECORD,,