dayhoff-tools 1.9.9__py3-none-any.whl → 1.9.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,136 @@
1
+ """Engine lifecycle commands: start, stop, and terminate."""
2
+
3
+ from datetime import datetime, timezone
4
+
5
+ import typer
6
+ from rich.prompt import Confirm
7
+
8
+ from .shared import (
9
+ HOURLY_COSTS,
10
+ check_aws_sso,
11
+ console,
12
+ format_duration,
13
+ make_api_request,
14
+ parse_launch_time,
15
+ resolve_engine,
16
+ )
17
+
18
+
19
def start_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
):
    """Start a stopped engine."""
    # NOTE(review): the docstring is deliberately left as a single line because
    # Typer presumably surfaces it as the command's --help text; details live in
    # comments instead.
    #
    # Flow: list engines via GET /engines, resolve `name_or_id` to one engine
    # record, then POST /engines/<instance_id>/start and report the outcome.
    # Exits with code 1 (typer.Exit) only when the engine list cannot be fetched.
    check_aws_sso()

    # Get all engines to resolve name
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    console.print(f"Starting engine [cyan]{engine['name']}[/cyan]...")

    response = make_api_request("POST", f"/engines/{engine['instance_id']}/start")

    if response.status_code == 200:
        data = response.json()
        console.print("[green]✓ Engine started successfully![/green]")
        console.print(f"New public IP: {data.get('public_ip', 'Pending...')}")
        return

    # Error bodies are not guaranteed to be JSON objects (e.g. a gateway error
    # page), so never let the error report itself crash with a traceback.
    # Assumes make_api_request returns a requests-style response whose .json()
    # raises a ValueError subclass on a non-JSON body — TODO confirm.
    try:
        payload = response.json()
    except ValueError:
        payload = None

    if isinstance(payload, dict):
        error = payload.get("error", "Unknown error")
    else:
        error = f"HTTP {response.status_code}"
    console.print(f"[red]❌ Failed to start engine: {error}[/red]")
45
+
46
+
47
def stop_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    force: bool = typer.Option(
        False, "--force", "-f", help="Force stop and detach all studios"
    ),
):
    """Stop an engine."""
    # NOTE(review): the docstring is deliberately left as a single line because
    # Typer presumably surfaces it as the command's --help text; details live in
    # comments instead.
    #
    # Flow: resolve the engine, POST /engines/<instance_id>/stop with
    # detach_studios=<force>. If the API answers 409 and --force was not given,
    # list the attached studios, ask for confirmation, and retry with
    # detach_studios=True. Exits with code 1 (typer.Exit) only when the engine
    # list cannot be fetched.
    check_aws_sso()

    # Get all engines to resolve name
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    console.print(f"Stopping engine [cyan]{engine['name']}[/cyan]...")

    stop_path = f"/engines/{engine['instance_id']}/stop"

    # First attempt: only detach studios when the caller passed --force
    response = make_api_request(
        "POST",
        stop_path,
        json_data={"detach_studios": force},
    )

    if response.status_code == 409 and not force:
        # Engine has attached studios
        data = response.json()
        attached_studios = data.get("attached_studios", [])

        console.print("\n[yellow]⚠️ This engine has attached studios:[/yellow]")
        for studio in attached_studios:
            console.print(f" • {studio['user']} ({studio['studio_id']})")

        if not Confirm.ask("\nDetach all studios and stop the engine?"):
            console.print("Stop cancelled.")
            return

        # Second attempt, now explicitly detaching every studio
        response = make_api_request(
            "POST",
            stop_path,
            json_data={"detach_studios": True},
        )

    if response.status_code == 200:
        console.print("[green]✓ Engine stopped successfully![/green]")
        return

    # Error bodies are not guaranteed to be JSON objects (e.g. a gateway error
    # page), so never let the error report itself crash with a traceback.
    # Assumes make_api_request returns a requests-style response whose .json()
    # raises a ValueError subclass on a non-JSON body — TODO confirm.
    try:
        payload = response.json()
    except ValueError:
        payload = None

    if isinstance(payload, dict):
        error = payload.get("error", "Unknown error")
    else:
        error = f"HTTP {response.status_code}"
    console.print(f"[red]❌ Failed to stop engine: {error}[/red]")
98
+
99
+
100
def terminate_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
):
    """Permanently terminate an engine."""
    # NOTE(review): the docstring is deliberately left as a single line because
    # Typer presumably surfaces it as the command's --help text; details live in
    # comments instead.
    #
    # Flow: resolve the engine, show an estimated cost since launch, ask for
    # confirmation, then DELETE /engines/<instance_id>. Exits with code 1
    # (typer.Exit) only when the engine list cannot be fetched.
    check_aws_sso()

    # Get all engines to resolve name
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    # Calculate cost: hourly rate for the engine type (0 when the type has no
    # entry in HOURLY_COSTS) multiplied by the hours elapsed since launch_time.
    # NOTE(review): subtracting from an aware UTC "now" presumes
    # parse_launch_time returns a timezone-aware datetime — confirm.
    launch_time = parse_launch_time(engine["launch_time"])
    uptime = datetime.now(timezone.utc) - launch_time
    hourly_cost = HOURLY_COSTS.get(engine["engine_type"], 0)
    total_cost = hourly_cost * (uptime.total_seconds() / 3600)

    console.print(
        f"\n[yellow]⚠️ This will permanently terminate engine '{engine['name']}'[/yellow]"
    )
    console.print(f"Total cost for this session: ${total_cost:.2f}")

    if not Confirm.ask("\nAre you sure you want to terminate this engine?"):
        console.print("Termination cancelled.")
        return

    response = make_api_request("DELETE", f"/engines/{engine['instance_id']}")

    if response.status_code == 200:
        console.print("[green]✓ Engine terminated successfully![/green]")
        return

    # Error bodies are not guaranteed to be JSON objects (e.g. a gateway error
    # page), so never let the error report itself crash with a traceback.
    # Assumes make_api_request returns a requests-style response whose .json()
    # raises a ValueError subclass on a non-JSON body — TODO confirm.
    try:
        payload = response.json()
    except ValueError:
        payload = None

    if isinstance(payload, dict):
        error = payload.get("error", "Unknown error")
    else:
        error = f"HTTP {response.status_code}"
    console.print(f"[red]❌ Failed to terminate engine: {error}[/red]")
@@ -0,0 +1,377 @@
1
+ """Engine maintenance commands: coffee, idle timeout, debug, and repair."""
2
+
3
+ import re
4
+ import subprocess
5
+ import time
6
+ from typing import Optional
7
+
8
+ import boto3
9
+ import typer
10
+ from botocore.exceptions import ClientError
11
+ from rich.progress import Progress, SpinnerColumn, TextColumn
12
+ from rich.prompt import Confirm
13
+
14
+ from .shared import (
15
+ check_aws_sso,
16
+ console,
17
+ make_api_request,
18
+ resolve_engine,
19
+ )
20
+
21
+
22
def coffee(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    duration: str = typer.Argument("4h", help="Duration (e.g., 2h, 30m, 2h30m)"),
    cancel: bool = typer.Option(
        False, "--cancel", help="Cancel existing coffee lock instead of extending"
    ),
):
    """Pour ☕ for an engine: keeps it awake for the given duration (or cancel)."""
    # NOTE(review): the docstring is deliberately left as a single line because
    # Typer presumably surfaces it as the command's --help text; details live in
    # comments instead.
    #
    # Flow: validate `duration` (skipped with --cancel), resolve the engine,
    # require it to be running, then run /usr/local/bin/engine-coffee on the
    # instance through SSM and poll for the result for up to ~10 seconds.
    # Exits with code 1 (typer.Exit) on an invalid duration, when the engine
    # list cannot be fetched, or when the engine is not running.
    check_aws_sso()

    # Parse duration. fullmatch rejects trailing garbage such as "2h30" or
    # "2hfoo", which an unanchored match would silently truncate to "2h".
    seconds_total = 0
    if not cancel:
        match = re.fullmatch(r"(?:(\d+)h)?(?:(\d+)m)?", duration)
        if not match or (not match.group(1) and not match.group(2)):
            console.print(f"[red]❌ Invalid duration format: {duration}[/red]")
            console.print("Use format like: 4h, 30m, 2h30m")
            raise typer.Exit(1)

        hours = int(match.group(1) or 0)
        minutes = int(match.group(2) or 0)
        seconds_total = (hours * 60 + minutes) * 60
        if seconds_total == 0:
            console.print("[red]❌ Duration must be greater than zero[/red]")
            raise typer.Exit(1)

    # Get all engines to resolve name
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    if engine["state"].lower() != "running":
        console.print(f"[red]❌ Engine is not running (state: {engine['state']})[/red]")
        raise typer.Exit(1)

    if cancel:
        console.print(f"Cancelling coffee for [cyan]{engine['name']}[/cyan]…")
        remote_command = "/usr/local/bin/engine-coffee --cancel"
    else:
        console.print(
            f"Pouring coffee for [cyan]{engine['name']}[/cyan] for {duration}…"
        )
        remote_command = f"/usr/local/bin/engine-coffee {seconds_total}"

    # Use SSM to run the engine coffee command
    ssm = boto3.client("ssm", region_name="us-east-1")
    try:
        response = ssm.send_command(
            InstanceIds=[engine["instance_id"]],
            DocumentName="AWS-RunShellScript",
            Parameters={
                "commands": [remote_command],
                "executionTimeout": ["60"],
            },
        )

        command_id = response["Command"]["CommandId"]

        # Wait for command to complete. `result` starts as a placeholder so the
        # report below still works if no invocation record ever became visible.
        result = {"Status": "Pending"}
        for _ in range(10):
            time.sleep(1)
            try:
                result = ssm.get_command_invocation(
                    CommandId=command_id,
                    InstanceId=engine["instance_id"],
                )
            except ClientError as err:
                # The invocation record can lag slightly behind send_command;
                # keep polling instead of reporting a spurious failure.
                if err.response.get("Error", {}).get("Code") != "InvocationDoesNotExist":
                    raise
                continue
            # Stop on every terminal state, not only Success/Failed.
            if result["Status"] in ("Success", "Failed", "Cancelled", "TimedOut"):
                break

        if result["Status"] == "Success":
            if cancel:
                console.print(
                    "[green]✓ Coffee cancelled – auto-shutdown re-enabled[/green]"
                )
            else:
                console.print(f"[green]✓ Coffee poured for {duration}[/green]")
            console.print(
                "\n[dim]Note: Detached Docker containers (except dev containers) will also keep the engine awake.[/dim]"
            )
            console.print(
                "[dim]Use coffee for nohup operations or other background tasks.[/dim]"
            )
        else:
            console.print(
                f"[red]❌ Failed to manage coffee: {result.get('StatusDetails', 'Unknown error')}[/red]"
            )

    except ClientError as e:
        console.print(f"[red]❌ Failed to manage coffee: {e}[/red]")
119
+
120
+
121
def _run_idle_timeout_ssm_command(ssm, instance_id, command, timeout_seconds, attempts):
    """Run one shell command via SSM and poll until it reaches a terminal state.

    Returns the last invocation dict from ``get_command_invocation``, or ``None``
    when no invocation record became visible within ``attempts`` one-second polls.
    """
    resp = ssm.send_command(
        InstanceIds=[instance_id],
        DocumentName="AWS-RunShellScript",
        Parameters={
            "commands": [command],
            "executionTimeout": [str(timeout_seconds)],
        },
    )
    cid = resp["Command"]["CommandId"]

    inv = None
    for _ in range(attempts):
        time.sleep(1)
        try:
            inv = ssm.get_command_invocation(CommandId=cid, InstanceId=instance_id)
        except ClientError as err:
            # The invocation record can lag slightly behind send_command.
            if err.response.get("Error", {}).get("Code") != "InvocationDoesNotExist":
                raise
            continue
        if inv["Status"] in ("Success", "Failed", "Cancelled", "TimedOut"):
            break
    return inv


def idle_timeout_cmd(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    set: Optional[str] = typer.Option(
        None, "--set", "-s", help="New timeout (e.g., 2h30m, 45m)"
    ),
):
    """Show or set the engine idle-detector timeout."""
    # NOTE(review): the docstring is deliberately left as a single line because
    # Typer presumably surfaces it as the command's --help text; details live in
    # comments instead.
    # NOTE(review): `set` shadows the builtin, but it is part of the function's
    # keyword interface, so it is kept as-is.
    #
    # Without --set: reads IDLE_TIMEOUT_SECONDS from /etc/engine.env on the
    # instance (falling back to 1800) and prints it.
    # With --set: rewrites that line, restarts engine-idle-detector.service and
    # reports the outcome of the remote command.
    # Exits with code 1 (typer.Exit) when the engine list cannot be fetched, on
    # an invalid duration, or when the update command does not succeed.
    check_aws_sso()

    # Resolve engine
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    ssm = boto3.client("ssm", region_name="us-east-1")

    if set is None:
        # Show current timeout setting; poll rather than reading once after a
        # fixed sleep, so a slow command is not misreported as a failure.
        inv = _run_idle_timeout_ssm_command(
            ssm,
            engine["instance_id"],
            "grep -E '^IDLE_TIMEOUT_SECONDS=' /etc/engine.env || echo 'IDLE_TIMEOUT_SECONDS=1800'",
            timeout_seconds=10,
            attempts=10,
        )
        if inv is not None and inv["Status"] == "Success":
            line = inv["StandardOutputContent"].strip()
            _, sep, raw_value = line.partition("=")
            try:
                secs = int(raw_value) if sep else 1800
            except ValueError:
                # e.g. a quoted value or several matching lines in the file
                console.print(
                    f"[red]❌ Could not parse idle timeout from: {line}[/red]"
                )
                return
            console.print(f"Current idle timeout: {secs//60}m ({secs} seconds)")
        else:
            console.print("[red]❌ Could not retrieve idle timeout[/red]")
        return

    # ----- set new value -----
    m = re.match(r"^(?:(\d+)h)?(?:(\d+)m)?$", set)
    if not m:
        console.print("[red]❌ Invalid duration format. Use e.g. 2h, 45m, 1h30m[/red]")
        raise typer.Exit(1)
    hours = int(m.group(1) or 0)
    minutes = int(m.group(2) or 0)
    seconds = hours * 3600 + minutes * 60
    if seconds == 0:
        console.print("[red]❌ Duration must be greater than zero[/red]")
        raise typer.Exit(1)

    console.print(f"Setting idle timeout to {set} ({seconds} seconds)…")

    # Drop any existing assignment, append the new one, then restart the
    # detector service so it picks the new value up.
    cmd = (
        "sudo sed -i '/^IDLE_TIMEOUT_SECONDS=/d' /etc/engine.env && "
        f"echo 'IDLE_TIMEOUT_SECONDS={seconds}' | sudo tee -a /etc/engine.env >/dev/null && "
        "sudo systemctl restart engine-idle-detector.service"
    )

    inv = _run_idle_timeout_ssm_command(
        ssm,
        engine["instance_id"],
        cmd,
        timeout_seconds=60,
        attempts=60,
    )

    # Only claim success when the remote command actually succeeded.
    if inv is not None and inv["Status"] == "Success":
        console.print(f"[green]✓ Idle timeout updated to {set}[/green]")
        return

    status = inv["Status"] if inv is not None else "Unknown"
    console.print(f"[red]❌ Failed to update idle timeout (status: {status})[/red]")
    raise typer.Exit(1)
195
+
196
+
197
def debug_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
):
    """Debug engine bootstrap status and files."""
    # NOTE(review): the docstring is deliberately left as a single line because
    # Typer presumably surfaces it as the command's --help text; details live in
    # comments instead.
    #
    # Flow: resolve the engine, then run a fixed list of read-only shell checks
    # on the instance through SSM, printing each check's stdout (or a
    # FAILED/ERROR marker). Exits with code 1 (typer.Exit) only when the engine
    # list cannot be fetched.
    check_aws_sso()

    # Resolve engine
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    console.print(f"[bold]Debug info for {engine['name']}:[/bold]\n")

    ssm = boto3.client("ssm", region_name="us-east-1")

    # Check multiple files and systemd status. Each entry is (label, command);
    # every command ends in an `|| echo ...` fallback so a missing file or
    # service still produces output instead of a non-zero exit.
    checks = [
        (
            "Stage file",
            "cat /opt/dayhoff/state/engine-init.stage 2>/dev/null || cat /var/run/engine-init.stage 2>/dev/null || echo 'MISSING'",
        ),
        (
            "Health file",
            "cat /opt/dayhoff/state/engine-health.json 2>/dev/null || cat /var/run/engine-health.json 2>/dev/null || echo 'MISSING'",
        ),
        (
            "Sentinel file",
            "ls -la /opt/dayhoff/first_boot_complete.sentinel 2>/dev/null || echo 'MISSING'",
        ),
        (
            "Setup service",
            "systemctl status setup-aws-vm.service --no-pager || echo 'Service not found'",
        ),
        (
            "Bootstrap log tail",
            "tail -20 /var/log/engine-setup.log 2>/dev/null || echo 'No log'",
        ),
        ("Environment file", "cat /etc/engine.env 2>/dev/null || echo 'MISSING'"),
    ]

    for name, cmd in checks:
        # Broad catch is deliberate: this is a best-effort diagnostic and one
        # failing check must not prevent the remaining ones from running.
        try:
            resp = ssm.send_command(
                InstanceIds=[engine["instance_id"]],
                DocumentName="AWS-RunShellScript",
                Parameters={"commands": [cmd], "executionTimeout": ["10"]},
            )
            cid = resp["Command"]["CommandId"]

            # Poll until the command reaches a terminal state (bounded by the
            # 10 s execution timeout) so that a command which is merely still
            # running after one second is not misreported as FAILED.
            for _ in range(10):
                time.sleep(1)
                inv = ssm.get_command_invocation(
                    CommandId=cid, InstanceId=engine["instance_id"]
                )
                if inv["Status"] in ("Success", "Failed", "Cancelled", "TimedOut"):
                    break

            if inv["Status"] == "Success":
                output = inv["StandardOutputContent"].strip()
                console.print(f"[cyan]{name}:[/cyan]")
                console.print(f"[dim]{output}[/dim]\n")
            else:
                console.print(f"[cyan]{name}:[/cyan] [red]FAILED[/red]\n")

        except Exception as e:
            console.print(f"[cyan]{name}:[/cyan] [red]ERROR: {e}[/red]\n")
263
+
264
+
265
def repair_engine(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
):
    """Repair an engine that's stuck in a bad state (e.g., after GAMI creation)."""
    # NOTE(review): the docstring is deliberately left as a single line because
    # Typer presumably surfaces it as the command's --help text; details live in
    # comments instead.
    #
    # Flow: resolve the engine; if it is stopped, offer to start it and wait
    # 30 s; then run a fixed repair script on the instance through SSM and poll
    # for up to ~60 s. Exits with code 1 (typer.Exit) when the engine list
    # cannot be fetched, when the engine is not running and is not (or cannot
    # be) started, or when the start request fails.
    check_aws_sso()

    # Get all engines to resolve name
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    if engine["state"].lower() != "running":
        console.print(
            f"[yellow]⚠️ Engine is {engine['state']}. Must be running to repair.[/yellow]"
        )
        if engine["state"].lower() == "stopped" and Confirm.ask(
            "Start the engine first?"
        ):
            response = make_api_request(
                "POST", f"/engines/{engine['instance_id']}/start"
            )
            if response.status_code != 200:
                console.print("[red]❌ Failed to start engine[/red]")
                raise typer.Exit(1)
            console.print("[green]✓ Engine started[/green]")
            console.print("Waiting for engine to become ready...")
            time.sleep(30)  # Give it time to boot
        else:
            raise typer.Exit(1)

    console.print(f"[bold]Repairing engine [cyan]{engine['name']}[/cyan][/bold]")
    console.print(
        "[dim]This will restore bootstrap state and ensure all services are running[/dim]\n"
    )

    ssm = boto3.client("ssm", region_name="us-east-1")

    # Repair commands
    repair_commands = [
        # Create necessary directories
        "sudo mkdir -p /opt/dayhoff /opt/dayhoff/state /opt/dayhoff/scripts",
        # Download scripts from S3 if missing
        "source /etc/engine.env && sudo aws s3 sync s3://${VM_SCRIPTS_BUCKET}/ /opt/dayhoff/scripts/ --exclude '*' --include '*.sh' --quiet",
        "sudo chmod +x /opt/dayhoff/scripts/*.sh 2>/dev/null || true",
        # Restore bootstrap state
        "sudo touch /opt/dayhoff/first_boot_complete.sentinel",
        "echo 'finished' | sudo tee /opt/dayhoff/state/engine-init.stage > /dev/null",
        # Ensure SSM agent is running
        "sudo systemctl restart amazon-ssm-agent 2>/dev/null || true",
        # Restart idle detector (service only)
        "sudo systemctl restart engine-idle-detector.service 2>/dev/null || true",
        # Report status
        "echo '=== Repair Complete ===' && echo 'Sentinel: ' && ls -la /opt/dayhoff/first_boot_complete.sentinel",
        "echo 'Stage: ' && cat /opt/dayhoff/state/engine-init.stage",
        "echo 'Scripts: ' && ls /opt/dayhoff/scripts/*.sh 2>/dev/null | wc -l",
    ]

    try:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            transient=True,
        ) as progress:
            # The task id is never updated, so it is not bound to a name.
            progress.add_task("Repairing engine...", total=None)

            response = ssm.send_command(
                InstanceIds=[engine["instance_id"]],
                DocumentName="AWS-RunShellScript",
                Parameters={
                    "commands": repair_commands,
                    "executionTimeout": ["60"],
                },
            )

            command_id = response["Command"]["CommandId"]

            # Wait for command; stop on every terminal state so a cancelled or
            # timed-out command does not keep the spinner running for 60 s.
            for _ in range(60):
                time.sleep(1)
                result = ssm.get_command_invocation(
                    CommandId=command_id,
                    InstanceId=engine["instance_id"],
                )
                if result["Status"] in ("Success", "Failed", "Cancelled", "TimedOut"):
                    break

        if result["Status"] == "Success":
            output = result["StandardOutputContent"]
            console.print("[green]✓ Engine repaired successfully![/green]\n")

            # Show repair results: everything the script printed after its
            # "=== Repair Complete ===" marker.
            if "=== Repair Complete ===" in output:
                repair_section = output.split("=== Repair Complete ===")[1].strip()
                console.print("[bold]Repair Results:[/bold]")
                console.print(repair_section)

            console.print(
                "\n[dim]You should now be able to attach studios to this engine.[/dim]"
            )
        else:
            # stderr may be empty (e.g. when the command is still in progress
            # or timed out); fall back to the status details so the message is
            # never blank.
            failure_detail = (
                result.get("StandardErrorContent")
                or result.get("StatusDetails")
                or "Unknown error"
            )
            console.print(f"[red]❌ Repair failed: {failure_detail}[/red]")
            console.print(
                "\n[yellow]Try running 'dh engine debug' for more information.[/yellow]"
            )

    except Exception as e:
        console.print(f"[red]❌ Failed to repair engine: {e}[/red]")