dayhoff-tools 1.9.26__py3-none-any.whl → 1.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. dayhoff_tools/cli/engine/__init__.py +1 -323
  2. dayhoff_tools/cli/engine/coffee.py +110 -0
  3. dayhoff_tools/cli/engine/config_ssh.py +113 -0
  4. dayhoff_tools/cli/engine/debug.py +79 -0
  5. dayhoff_tools/cli/engine/gami.py +160 -0
  6. dayhoff_tools/cli/engine/idle.py +148 -0
  7. dayhoff_tools/cli/engine/launch.py +101 -0
  8. dayhoff_tools/cli/engine/list.py +116 -0
  9. dayhoff_tools/cli/engine/repair.py +128 -0
  10. dayhoff_tools/cli/engine/resize.py +195 -0
  11. dayhoff_tools/cli/engine/ssh.py +62 -0
  12. dayhoff_tools/cli/engine/{engine_core.py → status.py} +6 -201
  13. dayhoff_tools/cli/engine_studio_commands.py +323 -0
  14. dayhoff_tools/cli/engine_studio_utils/__init__.py +1 -0
  15. dayhoff_tools/cli/engine_studio_utils/api_utils.py +47 -0
  16. dayhoff_tools/cli/engine_studio_utils/aws_utils.py +102 -0
  17. dayhoff_tools/cli/engine_studio_utils/constants.py +21 -0
  18. dayhoff_tools/cli/engine_studio_utils/formatting.py +210 -0
  19. dayhoff_tools/cli/engine_studio_utils/ssh_utils.py +141 -0
  20. dayhoff_tools/cli/main.py +1 -2
  21. dayhoff_tools/cli/studio/__init__.py +1 -0
  22. dayhoff_tools/cli/studio/attach.py +314 -0
  23. dayhoff_tools/cli/studio/create.py +48 -0
  24. dayhoff_tools/cli/studio/delete.py +71 -0
  25. dayhoff_tools/cli/studio/detach.py +56 -0
  26. dayhoff_tools/cli/studio/list.py +81 -0
  27. dayhoff_tools/cli/studio/reset.py +90 -0
  28. dayhoff_tools/cli/studio/resize.py +134 -0
  29. dayhoff_tools/cli/studio/status.py +78 -0
  30. {dayhoff_tools-1.9.26.dist-info → dayhoff_tools-1.10.1.dist-info}/METADATA +1 -1
  31. dayhoff_tools-1.10.1.dist-info/RECORD +61 -0
  32. dayhoff_tools/cli/engine/engine_maintenance.py +0 -431
  33. dayhoff_tools/cli/engine/engine_management.py +0 -505
  34. dayhoff_tools/cli/engine/shared.py +0 -501
  35. dayhoff_tools/cli/engine/studio_commands.py +0 -825
  36. dayhoff_tools-1.9.26.dist-info/RECORD +0 -39
  37. /dayhoff_tools/cli/engine/{engine_lifecycle.py → lifecycle.py} +0 -0
  38. {dayhoff_tools-1.9.26.dist-info → dayhoff_tools-1.10.1.dist-info}/WHEEL +0 -0
  39. {dayhoff_tools-1.9.26.dist-info → dayhoff_tools-1.10.1.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,195 @@
1
+ """Engine resize command."""
2
+
3
+ import time
4
+
5
+ import boto3
6
+ import typer
7
+ from botocore.exceptions import ClientError
8
+ from rich.prompt import Confirm
9
+
10
+ from ..engine_studio_utils.api_utils import make_api_request
11
+ from ..engine_studio_utils.aws_utils import check_aws_sso
12
+ from ..engine_studio_utils.constants import console
13
+ from ..engine_studio_utils.formatting import resolve_engine
14
+
15
+
16
def resize_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    size: int = typer.Option(..., "--size", "-s", help="New size in GB"),
    online: bool = typer.Option(
        False,
        "--online",
        help="Resize while running (requires manual filesystem expansion)",
    ),
    force: bool = typer.Option(
        False, "--force", "-f", help="Force resize and detach all studios"
    ),
):
    """Resize an engine's boot disk."""
    # NOTE: the docstring is deliberately kept to one line; the parameters are
    # typer.Argument/typer.Option objects, so it is presumably surfaced as the
    # command's help text.
    #
    # Flow:
    #   1. Resolve the engine from the /engines listing.
    #   2. Find the root EBS volume and require `size` to exceed its current size.
    #   3. Unless --online, stop a running engine and wait for it to stop.
    #   4. POST the resize; on 409 (attached studios) ask before detaching.
    #   5. Poll the volume modification until it reaches optimizing/completed.
    #   6. Restart the engine if this command stopped it, or print manual
    #      filesystem-expansion steps for an online resize.
    #
    # Raises typer.Exit(1) on any API/AWS failure or invalid size.

    # Function-scope import: WaiterError is raised by ec2 waiters on timeout and
    # is NOT a ClientError subclass, so it needs its own except entry.
    from botocore.exceptions import WaiterError

    check_aws_sso()

    # Get all engines to resolve name
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)
    instance_id = engine["instance_id"]

    # Get current volume info to validate size
    ec2 = boto3.client("ec2", region_name="us-east-1")

    # Set once the stop request has been accepted, so that every exit path
    # (success, cancel, failure) brings the engine back up in `finally`.
    stopped_for_resize = False
    # Set once the volume modification is known to have gone through.
    resized = False

    try:
        # Get instance details to find root volume
        instance_info = ec2.describe_instances(InstanceIds=[instance_id])
        instance = instance_info["Reservations"][0]["Instances"][0]

        # Find root volume
        root_device = instance.get("RootDeviceName", "/dev/xvda")
        root_volume_id = None
        for bdm in instance.get("BlockDeviceMappings", []):
            if bdm["DeviceName"] == root_device:
                root_volume_id = bdm["Ebs"]["VolumeId"]
                break

        if not root_volume_id:
            console.print("[red]❌ Could not find root volume[/red]")
            raise typer.Exit(1)

        # Get current volume size
        volumes = ec2.describe_volumes(VolumeIds=[root_volume_id])
        current_size = volumes["Volumes"][0]["Size"]

        if size <= current_size:
            console.print(
                f"[red]❌ New size ({size}GB) must be larger than current size ({current_size}GB)[/red]"
            )
            raise typer.Exit(1)

        console.print(
            f"[yellow]Resizing engine boot disk from {current_size}GB to {size}GB[/yellow]"
        )

        # `engine` is the snapshot taken before any stop request, so this
        # reflects the state the user started from.
        was_running = engine["state"].lower() == "running"

        # Check if we need to stop the instance
        if not online and was_running:
            console.print("Stopping engine for offline resize...")
            stop_response = make_api_request(
                "POST",
                f"/engines/{instance_id}/stop",
                json_data={"detach_studios": False},
            )
            if stop_response.status_code != 200:
                console.print("[red]❌ Failed to stop engine[/red]")
                raise typer.Exit(1)
            stopped_for_resize = True

            # Wait for instance to stop
            console.print("Waiting for engine to stop...")
            waiter = ec2.get_waiter("instance_stopped")
            waiter.wait(InstanceIds=[instance_id])
            console.print("[green]✓ Engine stopped[/green]")

        # Call the resize API
        console.print("Resizing volume...")
        resize_response = make_api_request(
            "POST",
            f"/engines/{instance_id}/resize",
            json_data={"size": size, "detach_studios": force},
        )

        if resize_response.status_code == 409 and not force:
            # Engine has attached studios
            data = resize_response.json()
            attached_studios = data.get("attached_studios", [])

            console.print("\n[yellow]⚠️ This engine has attached studios:[/yellow]")
            for studio in attached_studios:
                console.print(f" • {studio['user']} ({studio['studio_id']})")

            if Confirm.ask("\nDetach all studios and resize the engine?"):
                resize_response = make_api_request(
                    "POST",
                    f"/engines/{instance_id}/resize",
                    json_data={"size": size, "detach_studios": True},
                )
            else:
                console.print("Resize cancelled.")
                # `finally` restarts the engine if it was stopped above.
                return

        if resize_response.status_code != 200:
            error = resize_response.json().get("error", "Unknown error")
            console.print(f"[red]❌ Failed to resize engine: {error}[/red]")
            raise typer.Exit(1)

        # Check if studios were detached
        data = resize_response.json()
        detached_studios = data.get("detached_studios", 0)
        if detached_studios > 0:
            console.print(
                f"[green]✓ Detached {detached_studios} studio(s) before resize[/green]"
            )

        # Wait for modification to complete
        console.print("Waiting for volume modification to complete...")
        while True:
            mod_state = ec2.describe_volumes_modifications(VolumeIds=[root_volume_id])
            if not mod_state["VolumesModifications"]:
                break  # Modification complete

            modification = mod_state["VolumesModifications"][0]
            state = modification["ModificationState"]
            progress = modification.get("Progress", 0)

            # Show progress updates only for the resize phase
            if state == "modifying":
                console.print(f"[yellow]Progress: {progress}%[/yellow]")

            # Exit as soon as optimization starts (resize is complete)
            if state == "optimizing":
                console.print("[green]✓ Volume resized successfully[/green]")
                console.print(
                    "[dim]AWS is optimizing the volume in the background (no action needed).[/dim]"
                )
                break

            if state == "completed":
                console.print("[green]✓ Volume resized successfully[/green]")
                break
            elif state == "failed":
                console.print("[red]❌ Volume modification failed[/red]")
                raise typer.Exit(1)

            time.sleep(2)  # Check more frequently for better UX

        resized = True

        # Offline resizes are restarted in `finally`; an online resize of a
        # running engine needs the filesystem grown by hand.
        if online and was_running:
            console.print(
                "\n[yellow]⚠️ Online resize complete. You must now expand the filesystem:[/yellow]"
            )
            console.print(f"1. SSH into the engine: [cyan]ssh {engine['name']}[/cyan]")
            console.print("2. Find the root device: [cyan]lsblk[/cyan]")
            console.print(
                "3. Expand the partition: [cyan]sudo growpart /dev/nvme0n1 1[/cyan] (adjust device name as needed)"
            )
            console.print("4. Expand the filesystem: [cyan]sudo xfs_growfs /[/cyan]")

    except (ClientError, WaiterError) as e:
        console.print(f"[red]❌ Failed to resize engine: {e}[/red]")
        raise typer.Exit(1)
    finally:
        # Never leave an engine stopped just because the resize was cancelled
        # or failed after this command stopped it.
        if stopped_for_resize:
            _restart_after_resize(engine, resized)


def _restart_after_resize(engine, resized):
    """Start an engine that resize_engine stopped; fall back to a manual hint."""
    console.print("Starting engine back up...")
    start_response = make_api_request(
        "POST", f"/engines/{engine['instance_id']}/start"
    )
    if start_response.status_code != 200:
        console.print("[yellow]⚠️ Failed to restart engine automatically[/yellow]")
        console.print(
            f"Please start it manually: [cyan]dh engine start {engine['name']}[/cyan]"
        )
        return

    console.print("[green]✓ Engine started[/green]")
    if resized:
        # Only promise filesystem growth when the volume actually grew.
        console.print("The filesystem will be automatically expanded on boot.")
@@ -0,0 +1,62 @@
1
+ """Engine SSH command."""
2
+
3
+ import subprocess
4
+
5
+ import typer
6
+
7
+ from ..engine_studio_utils.api_utils import make_api_request
8
+ from ..engine_studio_utils.aws_utils import check_aws_sso
9
+ from ..engine_studio_utils.constants import console
10
+ from ..engine_studio_utils.formatting import resolve_engine
11
+ from ..engine_studio_utils.ssh_utils import check_session_manager_plugin, update_ssh_config_entry
12
+
13
+
14
def ssh_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    admin: bool = typer.Option(
        False, "--admin", help="Connect as ec2-user instead of the engine owner user"
    ),
    idle_timeout: int = typer.Option(
        600,
        "--idle-timeout",
        help="Idle timeout (seconds) for the SSM port-forward (0 = disable)",
    ),
):
    """Connect to an engine via SSH.

    By default the CLI connects using the engine's owner username (the same one stored in the `User` tag).
    Pass `--admin` to connect with the underlying [`ec2-user`] account for break-glass or debugging.
    """
    # NOTE(review): the non-admin login below is whatever check_aws_sso()
    # returns, not a value read from the engine record — confirm that matches
    # the "owner username" wording in the docstring.
    sso_user = check_aws_sso()

    # The Session Manager plugin is a hard prerequisite; bail out early.
    plugin_ok = check_session_manager_plugin()
    if not plugin_ok:
        raise typer.Exit(1)

    # Fetch the full engine listing so a name can be resolved to an instance.
    listing = make_api_request("GET", "/engines")
    if listing.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engine = resolve_engine(name_or_id, listing.json().get("engines", []))

    # Only a running engine can accept a connection.
    current_state = engine["state"]
    if current_state.lower() != "running":
        console.print(f"[red]❌ Engine is not running (state: {current_state})[/red]")
        raise typer.Exit(1)

    # Pick the login account.
    if admin:
        ssh_user = "ec2-user"
    else:
        ssh_user = sso_user

    host_alias = engine["name"]

    # Refresh the SSH config entry for this engine before connecting.
    console.print(
        f"Updating SSH config for [cyan]{host_alias}[/cyan] (user: {ssh_user})..."
    )
    update_ssh_config_entry(host_alias, engine["instance_id"], ssh_user, idle_timeout)

    # Hand the terminal over to ssh, addressing the engine by its config alias.
    console.print(f"[green]✓ Connecting to {host_alias}...[/green]")
    subprocess.run(["ssh", host_alias])
@@ -1,4 +1,4 @@
1
- """Core engine commands: launch, list, and status."""
1
+ """Engine status command."""
2
2
 
3
3
  import json
4
4
  import time
@@ -7,214 +7,19 @@ from typing import Any, Dict, Optional
7
7
 
8
8
  import boto3
9
9
  import typer
10
- from rich import box
11
10
  from rich.panel import Panel
12
- from rich.progress import Progress, SpinnerColumn, TextColumn
13
- from rich.table import Table
14
-
15
- from .shared import (
16
- HOURLY_COSTS,
17
- _fetch_init_stages,
18
- check_aws_sso,
19
- console,
11
+
12
+ from ..engine_studio_utils.api_utils import make_api_request
13
+ from ..engine_studio_utils.aws_utils import _fetch_init_stages, check_aws_sso
14
+ from ..engine_studio_utils.constants import HOURLY_COSTS, console
15
+ from ..engine_studio_utils.formatting import (
20
16
  format_duration,
21
- format_status,
22
17
  get_disk_usage_via_ssm,
23
- make_api_request,
24
18
  parse_launch_time,
25
19
  resolve_engine,
26
20
  )
27
21
 
28
22
 
29
def launch_engine(
    name: str = typer.Argument(help="Name for the new engine"),
    engine_type: str = typer.Option(
        "cpu",
        "--type",
        "-t",
        help="Engine type: cpu, cpumax, t4, a10g, a100, 4_t4, 8_t4, 4_a10g, 8_a10g",
    ),
    user: Optional[str] = typer.Option(None, "--user", "-u", help="Override username"),
    boot_disk_size: Optional[int] = typer.Option(
        None,
        "--size",
        "-s",
        help="Boot disk size in GB (default: 50GB, min: 20GB, max: 1000GB)",
    ),
    availability_zone: Optional[str] = typer.Option(
        None,
        "--az",
        help="Prefer a specific Availability Zone (e.g., us-east-1b). If omitted the service will try all public subnets.",
    ),
):
    """Launch a new engine instance."""
    # Validates the engine type and optional boot disk size, POSTs the launch
    # request to /engines, and prints the new instance details.
    # Raises typer.Exit(1) on invalid input or when the API does not return 201.
    username = check_aws_sso()
    if user:
        # An explicit --user wins over the SSO-derived name.
        username = user

    # Validate engine type
    valid_types = [
        "cpu",
        "cpumax",
        "t4",
        "a10g",
        "a100",
        "4_t4",
        "8_t4",
        "4_a10g",
        "8_a10g",
    ]
    if engine_type not in valid_types:
        console.print(f"[red]❌ Invalid engine type: {engine_type}[/red]")
        console.print(f"Valid types: {', '.join(valid_types)}")
        raise typer.Exit(1)

    # Validate boot disk size
    if boot_disk_size is not None:
        if boot_disk_size < 20:
            console.print("[red]❌ Boot disk size must be at least 20GB[/red]")
            raise typer.Exit(1)
        if boot_disk_size > 1000:
            console.print("[red]❌ Boot disk size cannot exceed 1000GB[/red]")
            raise typer.Exit(1)

    # Unknown types fall back to a displayed cost of 0.
    cost = HOURLY_COSTS.get(engine_type, 0)
    disk_info = f" with {boot_disk_size}GB boot disk" if boot_disk_size else ""
    console.print(
        f"Launching [cyan]{name}[/cyan] ({engine_type}){disk_info} for ${cost:.2f}/hour..."
    )

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        transient=True,
    ) as progress:
        progress.add_task("Creating engine...", total=None)

        request_data: Dict[str, Any] = {
            "name": name,
            "user": username,
            "engine_type": engine_type,
        }
        # Optional fields are only sent when the caller supplied them.
        if boot_disk_size is not None:
            request_data["boot_disk_size"] = boot_disk_size
        if availability_zone:
            request_data["availability_zone"] = availability_zone

        response = make_api_request("POST", "/engines", json_data=request_data)

    if response.status_code != 201:
        error = response.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to launch engine: {error}[/red]")
        # Signal the failure to the shell instead of exiting 0.
        raise typer.Exit(1)

    data = response.json()
    console.print("[green]✓ Engine launched successfully![/green]")
    console.print(f"Instance ID: [cyan]{data['instance_id']}[/cyan]")
    console.print(f"Type: {data['instance_type']} (${cost:.2f}/hour)")
    if boot_disk_size:
        console.print(f"Boot disk: {boot_disk_size}GB")
    console.print("\nThe engine is initializing. This may take a few minutes.")
    console.print(f"Check status with: [cyan]dh engine status {name}[/cyan]")
118
-
119
-
120
def list_engines(
    user: Optional[str] = typer.Option(None, "--user", "-u", help="Filter by user"),
    running_only: bool = typer.Option(
        False, "--running", help="Show only running engines"
    ),
    stopped_only: bool = typer.Option(
        False, "--stopped", help="Show only stopped engines"
    ),
    detailed: bool = typer.Option(
        False, "--detailed", "-d", help="Show detailed status (slower)"
    ),
):
    """List engines (shows all engines by default)."""
    # Fetches /engines (optionally filtered by user), applies the state filter,
    # and renders one table row per engine. With --detailed a "Disk Usage"
    # column is added, filled via get_disk_usage_via_ssm for running engines.
    # Raises typer.Exit(1) when the API does not return 200.
    check_aws_sso()

    params = {}
    if user:
        params["user"] = user
    if detailed:
        params["check_ready"] = "true"

    response = make_api_request("GET", "/engines", params=params)

    if response.status_code != 200:
        error = response.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to list engines: {error}[/red]")
        # Signal the failure to the shell instead of exiting 0.
        raise typer.Exit(1)

    engines = response.json().get("engines", [])

    # Filter by state if requested (--running takes precedence over --stopped)
    if running_only:
        engines = [e for e in engines if e["state"].lower() == "running"]
    elif stopped_only:
        engines = [e for e in engines if e["state"].lower() == "stopped"]

    if not engines:
        console.print("No engines found.")
        return

    # Only fetch detailed info if requested (slow)
    # NOTE(review): stages_map is never read below; the call is kept because
    # _fetch_init_stages is defined elsewhere and its side effects are not
    # visible from here — confirm and drop if it is a pure lookup.
    stages_map = {}
    if detailed:
        stages_map = _fetch_init_stages([e["instance_id"] for e in engines])

    # Create table
    table = Table(title="Engines", box=box.ROUNDED)
    table.add_column("Name", style="cyan")
    table.add_column("Instance ID", style="dim")
    table.add_column("Type")
    table.add_column("User")
    table.add_column("Status")
    if detailed:
        table.add_column("Disk Usage")
    table.add_column("Uptime/Since")
    table.add_column("$/hour", justify="right")

    for engine in engines:
        launch_time = parse_launch_time(engine["launch_time"])
        uptime = datetime.now(timezone.utc) - launch_time
        hourly_cost = HOURLY_COSTS.get(engine["engine_type"], 0)

        if engine["state"].lower() == "running":
            # Running engines show elapsed uptime.
            time_str = format_duration(uptime)
            # Only get disk usage if detailed mode
            if detailed:
                disk_usage = get_disk_usage_via_ssm(engine["instance_id"]) or "-"
            else:
                disk_usage = None
        else:
            # Non-running engines show the launch timestamp instead.
            time_str = launch_time.strftime("%Y-%m-%d %H:%M")
            disk_usage = "-" if detailed else None

        row_data = [
            engine["name"],
            engine["instance_id"],
            engine["engine_type"],
            engine["user"],
            format_status(engine["state"], engine.get("ready")),
        ]
        # Keep cell order aligned with the conditional "Disk Usage" column.
        if detailed:
            row_data.append(disk_usage)
        row_data.extend(
            [
                time_str,
                f"${hourly_cost:.2f}",
            ]
        )

        table.add_row(*row_data)

    console.print(table)
    if not detailed and any(e["state"].lower() == "running" for e in engines):
        console.print(
            "\n[dim]Tip: Use --detailed to see disk usage and bootstrap status (slower)[/dim]"
        )
216
-
217
-
218
23
  def engine_status(
219
24
  name_or_id: str = typer.Argument(help="Engine name or instance ID"),
220
25
  detailed: bool = typer.Option(