dayhoff-tools 1.9.26__py3-none-any.whl → 1.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. dayhoff_tools/cli/engine/__init__.py +1 -323
  2. dayhoff_tools/cli/engine/coffee.py +110 -0
  3. dayhoff_tools/cli/engine/config_ssh.py +113 -0
  4. dayhoff_tools/cli/engine/debug.py +79 -0
  5. dayhoff_tools/cli/engine/gami.py +160 -0
  6. dayhoff_tools/cli/engine/idle.py +148 -0
  7. dayhoff_tools/cli/engine/launch.py +101 -0
  8. dayhoff_tools/cli/engine/list.py +116 -0
  9. dayhoff_tools/cli/engine/repair.py +128 -0
  10. dayhoff_tools/cli/engine/resize.py +195 -0
  11. dayhoff_tools/cli/engine/ssh.py +62 -0
  12. dayhoff_tools/cli/engine/{engine_core.py → status.py} +6 -201
  13. dayhoff_tools/cli/engine_studio_commands.py +323 -0
  14. dayhoff_tools/cli/engine_studio_utils/__init__.py +1 -0
  15. dayhoff_tools/cli/engine_studio_utils/api_utils.py +47 -0
  16. dayhoff_tools/cli/engine_studio_utils/aws_utils.py +102 -0
  17. dayhoff_tools/cli/engine_studio_utils/constants.py +21 -0
  18. dayhoff_tools/cli/engine_studio_utils/formatting.py +210 -0
  19. dayhoff_tools/cli/engine_studio_utils/ssh_utils.py +141 -0
  20. dayhoff_tools/cli/main.py +1 -2
  21. dayhoff_tools/cli/studio/__init__.py +1 -0
  22. dayhoff_tools/cli/studio/attach.py +314 -0
  23. dayhoff_tools/cli/studio/create.py +48 -0
  24. dayhoff_tools/cli/studio/delete.py +71 -0
  25. dayhoff_tools/cli/studio/detach.py +56 -0
  26. dayhoff_tools/cli/studio/list.py +81 -0
  27. dayhoff_tools/cli/studio/reset.py +90 -0
  28. dayhoff_tools/cli/studio/resize.py +134 -0
  29. dayhoff_tools/cli/studio/status.py +78 -0
  30. {dayhoff_tools-1.9.26.dist-info → dayhoff_tools-1.10.1.dist-info}/METADATA +1 -1
  31. dayhoff_tools-1.10.1.dist-info/RECORD +61 -0
  32. dayhoff_tools/cli/engine/engine_maintenance.py +0 -431
  33. dayhoff_tools/cli/engine/engine_management.py +0 -505
  34. dayhoff_tools/cli/engine/shared.py +0 -501
  35. dayhoff_tools/cli/engine/studio_commands.py +0 -825
  36. dayhoff_tools-1.9.26.dist-info/RECORD +0 -39
  37. dayhoff_tools/cli/engine/{engine_lifecycle.py → lifecycle.py} +0 -0
  38. {dayhoff_tools-1.9.26.dist-info → dayhoff_tools-1.10.1.dist-info}/WHEEL +0 -0
  39. {dayhoff_tools-1.9.26.dist-info → dayhoff_tools-1.10.1.dist-info}/entry_points.txt +0 -0
@@ -1,505 +0,0 @@
- """Engine management commands: SSH, configuration, resizing, and AMI creation."""
-
- import subprocess
- import time
- from datetime import datetime
- from pathlib import Path
- from typing import Optional
-
- import boto3
- import typer
- from botocore.exceptions import ClientError
- from rich.progress import Progress, SpinnerColumn, TextColumn
- from rich.prompt import Confirm
-
- from .shared import (
-     SSH_MANAGED_COMMENT,
-     check_aws_sso,
-     check_session_manager_plugin,
-     console,
-     make_api_request,
-     resolve_engine,
-     update_ssh_config_entry,
- )
-
-
- def ssh_engine(
-     name_or_id: str = typer.Argument(help="Engine name or instance ID"),
-     admin: bool = typer.Option(
-         False, "--admin", help="Connect as ec2-user instead of the engine owner user"
-     ),
-     idle_timeout: int = typer.Option(
-         600,
-         "--idle-timeout",
-         help="Idle timeout (seconds) for the SSM port-forward (0 = disable)",
-     ),
- ):
-     """Connect to an engine via SSH.
-
-     By default the CLI connects using the engine's owner username (the same one stored in the `User` tag).
-     Pass `--admin` to connect with the underlying [`ec2-user`] account for break-glass or debugging.
-     """
-     username = check_aws_sso()
-
-     # Check for Session Manager Plugin
-     if not check_session_manager_plugin():
-         raise typer.Exit(1)
-
-     # Get all engines to resolve name
-     response = make_api_request("GET", "/engines")
-     if response.status_code != 200:
-         console.print("[red]❌ Failed to fetch engines[/red]")
-         raise typer.Exit(1)
-
-     engines = response.json().get("engines", [])
-     engine = resolve_engine(name_or_id, engines)
-
-     if engine["state"].lower() != "running":
-         console.print(f"[red]❌ Engine is not running (state: {engine['state']})[/red]")
-         raise typer.Exit(1)
-
-     # Choose SSH user
-     ssh_user = "ec2-user" if admin else username
-
-     # Update SSH config
-     console.print(
-         f"Updating SSH config for [cyan]{engine['name']}[/cyan] (user: {ssh_user})..."
-     )
-     update_ssh_config_entry(
-         engine["name"], engine["instance_id"], ssh_user, idle_timeout
-     )
-
-     # Connect
-     console.print(f"[green]✓ Connecting to {engine['name']}...[/green]")
-     subprocess.run(["ssh", engine["name"]])
-
-
- def config_ssh(
-     clean: bool = typer.Option(False, "--clean", help="Remove all managed entries"),
-     all_engines: bool = typer.Option(
-         False, "--all", "-a", help="Include all engines from all users"
-     ),
-     admin: bool = typer.Option(
-         False,
-         "--admin",
-         help="Generate entries that use ec2-user instead of per-engine owner user",
-     ),
- ):
-     """Update SSH config with available engines."""
-     username = check_aws_sso()
-
-     # Only check for Session Manager Plugin if we're not just cleaning
-     if not clean and not check_session_manager_plugin():
-         raise typer.Exit(1)
-
-     if clean:
-         console.print("Removing all managed SSH entries...")
-     else:
-         if all_engines:
-             console.print("Updating SSH config with all running engines...")
-         else:
-             console.print(
-                 f"Updating SSH config with running engines for [cyan]{username}[/cyan] and [cyan]shared[/cyan]..."
-             )
-
-     # Get all engines
-     response = make_api_request("GET", "/engines")
-     if response.status_code != 200:
-         console.print("[red]❌ Failed to fetch engines[/red]")
-         raise typer.Exit(1)
-
-     engines = response.json().get("engines", [])
-     running_engines = [e for e in engines if e["state"].lower() == "running"]
-
-     # Filter engines based on options
-     if not all_engines:
-         # Show only current user's engines and shared engines
-         running_engines = [
-             e for e in running_engines if e["user"] == username or e["user"] == "shared"
-         ]
-
-     # Read existing config
-     config_path = Path.home() / ".ssh" / "config"
-     config_path.parent.mkdir(mode=0o700, exist_ok=True)
-
-     if config_path.exists():
-         content = config_path.read_text()
-         lines = content.splitlines()
-     else:
-         content = ""
-         lines = []
-
-     # Remove old managed entries
-     new_lines = []
-     skip_until_next_host = False
-     for line in lines:
-         if SSH_MANAGED_COMMENT in line:
-             skip_until_next_host = True
-         elif line.strip().startswith("Host ") and skip_until_next_host:
-             skip_until_next_host = False
-             # Check if this is a managed host
-             if SSH_MANAGED_COMMENT not in line:
-                 new_lines.append(line)
-         elif not skip_until_next_host:
-             new_lines.append(line)
-
-     # Add new entries if not cleaning
-     if not clean:
-         for engine in running_engines:
-             # Determine ssh user based on --admin flag
-             ssh_user = "ec2-user" if admin else username
-             new_lines.extend(
-                 [
-                     "",
-                     f"Host {engine['name']} {SSH_MANAGED_COMMENT}",
-                     f" HostName {engine['instance_id']}",
-                     f" User {ssh_user}",
-                     f" ProxyCommand sh -c \"AWS_SSM_IDLE_TIMEOUT=600 aws ssm start-session --target %h --document-name AWS-StartSSHSession --parameters 'portNumber=%p'\"",
-                 ]
-             )
-
-     # Write back
-     config_path.write_text("\n".join(new_lines))
-     config_path.chmod(0o600)
-
-     if clean:
-         console.print("[green]✓ Removed all managed SSH entries[/green]")
-     else:
-         console.print(
-             f"[green]✓ Updated SSH config with {len(running_engines)} engines[/green]"
-         )
-         for engine in running_engines:
-             user_display = (
-                 f"[dim]({engine['user']})[/dim]" if engine["user"] != username else ""
-             )
-             console.print(
-                 f" • {engine['name']} → {engine['instance_id']} {user_display}"
-             )
-
-
- def resize_engine(
-     name_or_id: str = typer.Argument(help="Engine name or instance ID"),
-     size: int = typer.Option(..., "--size", "-s", help="New size in GB"),
-     online: bool = typer.Option(
-         False,
-         "--online",
-         help="Resize while running (requires manual filesystem expansion)",
-     ),
-     force: bool = typer.Option(
-         False, "--force", "-f", help="Force resize and detach all studios"
-     ),
- ):
-     """Resize an engine's boot disk."""
-     check_aws_sso()
-
-     # Get all engines to resolve name
-     response = make_api_request("GET", "/engines")
-     if response.status_code != 200:
-         console.print("[red]❌ Failed to fetch engines[/red]")
-         raise typer.Exit(1)
-
-     engines = response.json().get("engines", [])
-     engine = resolve_engine(name_or_id, engines)
-
-     # Get current volume info to validate size
-     ec2 = boto3.client("ec2", region_name="us-east-1")
-
-     try:
-         # Get instance details to find root volume
-         instance_info = ec2.describe_instances(InstanceIds=[engine["instance_id"]])
-         instance = instance_info["Reservations"][0]["Instances"][0]
-
-         # Find root volume
-         root_device = instance.get("RootDeviceName", "/dev/xvda")
-         root_volume_id = None
-
-         for bdm in instance.get("BlockDeviceMappings", []):
-             if bdm["DeviceName"] == root_device:
-                 root_volume_id = bdm["Ebs"]["VolumeId"]
-                 break
-
-         if not root_volume_id:
-             console.print("[red]❌ Could not find root volume[/red]")
-             raise typer.Exit(1)
-
-         # Get current volume size
-         volumes = ec2.describe_volumes(VolumeIds=[root_volume_id])
-         current_size = volumes["Volumes"][0]["Size"]
-
-         if size <= current_size:
-             console.print(
-                 f"[red]❌ New size ({size}GB) must be larger than current size ({current_size}GB)[/red]"
-             )
-             raise typer.Exit(1)
-
-         console.print(
-             f"[yellow]Resizing engine boot disk from {current_size}GB to {size}GB[/yellow]"
-         )
-
-         # Check if we need to stop the instance
-         if not online and engine["state"].lower() == "running":
-             console.print("Stopping engine for offline resize...")
-             stop_response = make_api_request(
-                 "POST",
-                 f"/engines/{engine['instance_id']}/stop",
-                 json_data={"detach_studios": False},
-             )
-             if stop_response.status_code != 200:
-                 console.print("[red]❌ Failed to stop engine[/red]")
-                 raise typer.Exit(1)
-
-             # Wait for instance to stop
-             console.print("Waiting for engine to stop...")
-             waiter = ec2.get_waiter("instance_stopped")
-             waiter.wait(InstanceIds=[engine["instance_id"]])
-             console.print("[green]✓ Engine stopped[/green]")
-
-         # Call the resize API
-         console.print("Resizing volume...")
-         resize_response = make_api_request(
-             "POST",
-             f"/engines/{engine['instance_id']}/resize",
-             json_data={"size": size, "detach_studios": force},
-         )
-
-         if resize_response.status_code == 409 and not force:
-             # Engine has attached studios
-             data = resize_response.json()
-             attached_studios = data.get("attached_studios", [])
-
-             console.print("\n[yellow]⚠️ This engine has attached studios:[/yellow]")
-             for studio in attached_studios:
-                 console.print(f" • {studio['user']} ({studio['studio_id']})")
-
-             if Confirm.ask("\nDetach all studios and resize the engine?"):
-                 resize_response = make_api_request(
-                     "POST",
-                     f"/engines/{engine['instance_id']}/resize",
-                     json_data={"size": size, "detach_studios": True},
-                 )
-             else:
-                 console.print("Resize cancelled.")
-                 return
-
-         if resize_response.status_code != 200:
-             error = resize_response.json().get("error", "Unknown error")
-             console.print(f"[red]❌ Failed to resize engine: {error}[/red]")
-             raise typer.Exit(1)
-
-         # Check if studios were detached
-         data = resize_response.json()
-         detached_studios = data.get("detached_studios", 0)
-         if detached_studios > 0:
-             console.print(
-                 f"[green]✓ Detached {detached_studios} studio(s) before resize[/green]"
-             )
-
-         # Wait for modification to complete
-         console.print("Waiting for volume modification to complete...")
-         while True:
-             mod_state = ec2.describe_volumes_modifications(VolumeIds=[root_volume_id])
-             if not mod_state["VolumesModifications"]:
-                 break  # Modification complete
-
-             modification = mod_state["VolumesModifications"][0]
-             state = modification["ModificationState"]
-             progress = modification.get("Progress", 0)
-
-             # Show progress updates only for the resize phase
-             if state == "modifying":
-                 console.print(f"[yellow]Progress: {progress}%[/yellow]")
-
-             # Exit as soon as optimization starts (resize is complete)
-             if state == "optimizing":
-                 console.print("[green]✓ Volume resized successfully[/green]")
-                 console.print(
-                     "[dim]AWS is optimizing the volume in the background (no action needed).[/dim]"
-                 )
-                 break
-
-             if state == "completed":
-                 console.print("[green]✓ Volume resized successfully[/green]")
-                 break
-             elif state == "failed":
-                 console.print("[red]❌ Volume modification failed[/red]")
-                 raise typer.Exit(1)
-
-             time.sleep(2)  # Check more frequently for better UX
-
-         # If offline resize, start the instance back up
-         if not online and engine["state"].lower() == "running":
-             console.print("Starting engine back up...")
-             start_response = make_api_request(
-                 "POST", f"/engines/{engine['instance_id']}/start"
-             )
-             if start_response.status_code != 200:
-                 console.print(
-                     "[yellow]⚠️ Failed to restart engine automatically[/yellow]"
-                 )
-                 console.print(
-                     f"Please start it manually: [cyan]dh engine start {engine['name']}[/cyan]"
-                 )
-             else:
-                 console.print("[green]✓ Engine started[/green]")
-                 console.print("The filesystem will be automatically expanded on boot.")
-
-         elif online and engine["state"].lower() == "running":
-             console.print(
-                 "\n[yellow]⚠️ Online resize complete. You must now expand the filesystem:[/yellow]"
-             )
-             console.print(f"1. SSH into the engine: [cyan]ssh {engine['name']}[/cyan]")
-             console.print("2. Find the root device: [cyan]lsblk[/cyan]")
-             console.print(
-                 "3. Expand the partition: [cyan]sudo growpart /dev/nvme0n1 1[/cyan] (adjust device name as needed)"
-             )
-             console.print("4. Expand the filesystem: [cyan]sudo xfs_growfs /[/cyan]")
-
-     except ClientError as e:
-         console.print(f"[red]❌ Failed to resize engine: {e}[/red]")
-         raise typer.Exit(1)
-
-
- def create_ami(
-     name_or_id: str = typer.Argument(
-         help="Engine name or instance ID to create AMI from"
-     ),
- ):
-     """Create a 'Golden AMI' from a running engine.
-
-     This process is for creating a pre-warmed, standardized machine image
-     that can be used to launch new engines more quickly.
-
-     IMPORTANT:
-     - The engine MUST have all studios detached before running this command.
-     - This process will make the source engine unusable. You should
-       plan to TERMINATE the engine after the AMI is created.
-     """
-     check_aws_sso()
-
-     # Get all engines to resolve name and check status
-     # We pass check_ready=True to get attached studio info
-     response = make_api_request("GET", "/engines", params={"check_ready": "true"})
-     if response.status_code != 200:
-         console.print("[red]❌ Failed to fetch engines[/red]")
-         raise typer.Exit(1)
-
-     engines = response.json().get("engines", [])
-     engine = resolve_engine(name_or_id, engines)
-
-     # --- Pre-flight checks ---
-
-     # 1. Check if engine is running
-     if engine["state"].lower() != "running":
-         console.print(f"[red]❌ Engine '{engine['name']}' is not running.[/red]")
-         console.print("Please start it before creating an AMI.")
-         raise typer.Exit(1)
-
-     # 2. Check for attached studios from the detailed API response
-     attached_studios = engine.get("studios", [])
-     if attached_studios:
-         console.print(
-             f"[bold red]❌ Engine '{engine['name']}' has studios attached.[/bold red]"
-         )
-         console.print("Please detach all studios before creating an AMI:")
-         for studio in attached_studios:
-             console.print(f" - {studio['user']} ({studio['studio_id']})")
-         console.print("\nTo detach, run [bold]dh studio detach[/bold]")
-         raise typer.Exit(1)
-
-     # Construct AMI name and description
-     ami_name = (
-         f"prewarmed-engine-{engine['engine_type']}-{datetime.now().strftime('%Y%m%d')}"
-     )
-     description = (
-         f"Amazon Linux 2023 with NVIDIA drivers, Docker, and pre-pulled "
-         f"dev container image for {engine['engine_type']} engines"
-     )
-
-     console.print(f"Creating AMI from engine [cyan]{engine['name']}[/cyan]...")
-     console.print(f"[bold]AMI Name:[/] {ami_name}")
-     console.print(f"[bold]Description:[/] {description}")
-
-     console.print(
-         "\n[bold yellow]⚠️ Important:[/bold yellow]\n"
-         "1. This process will run cleanup scripts on the engine.\n"
-         "2. The source engine should be [bold]terminated[/bold] after the AMI is created.\n"
-     )
-
-     if not Confirm.ask("Continue with AMI creation?"):
-         raise typer.Exit()
-
-     # Create AMI using EC2 client directly, as the backend logic is too complex
-     ec2 = boto3.client("ec2", region_name="us-east-1")
-     ssm = boto3.client("ssm", region_name="us-east-1")
-
-     try:
-         # Clean up instance state before snapshotting
-         console.print("Cleaning up instance for AMI creation...")
-         cleanup_commands = [
-             "sudo rm -f /opt/dayhoff/first_boot_complete.sentinel",
-             "history -c",
-             "sudo rm -rf /tmp/* /var/log/messages /var/log/cloud-init.log",
-             "sudo rm -rf /var/lib/amazon/ssm/* /etc/amazon/ssm/*",
-             "sleep 2 && sudo systemctl stop amazon-ssm-agent &",  # Stop agent last
-         ]
-
-         cleanup_response = ssm.send_command(
-             InstanceIds=[engine["instance_id"]],
-             DocumentName="AWS-RunShellScript",
-             Parameters={"commands": cleanup_commands, "executionTimeout": ["120"]},
-         )
-
-         # Acknowledge that the SSM command might be in progress as the agent shuts down
-         console.print(
-             "[dim]ℹ️ Cleanup command sent (status may show 'InProgress' as SSM agent stops)[/dim]"
-         )
-
-         # Create the AMI
-         with Progress(
-             SpinnerColumn(),
-             TextColumn("[progress.description]{task.description}"),
-             transient=True,
-         ) as progress:
-             task = progress.add_task(
-                 "Creating AMI (this will take several minutes)...", total=None
-             )
-
-             response = ec2.create_image(
-                 InstanceId=engine["instance_id"],
-                 Name=ami_name,
-                 Description=description,
-                 NoReboot=False,
-                 TagSpecifications=[
-                     {
-                         "ResourceType": "image",
-                         "Tags": [
-                             {"Key": "Environment", "Value": "dev"},
-                             {"Key": "Type", "Value": "golden-ami"},
-                             {"Key": "EngineType", "Value": engine["engine_type"]},
-                             {"Key": "Name", "Value": ami_name},
-                         ],
-                     }
-                 ],
-             )
-
-             ami_id = response["ImageId"]
-             progress.update(
-                 task,
-                 completed=True,
-                 description=f"[green]✓ AMI creation initiated![/green]",
-             )
-
-         console.print(f" [bold]AMI ID:[/] {ami_id}")
-         console.print("\nThe AMI creation process will continue in the background.")
-         console.print("You can monitor progress in the EC2 Console under 'AMIs'.")
-         console.print(
-             "\nOnce complete, update the AMI ID in [bold]terraform/environments/dev/variables.tf[/bold] "
-             "and run [bold]terraform apply[/bold]."
-         )
-         console.print(
-             f"\nRemember to [bold red]terminate the source engine '{engine['name']}'[/bold red] to save costs."
-         )
-
-     except ClientError as e:
-         console.print(f"[red]❌ Failed to create AMI: {e}[/red]")
-         raise typer.Exit(1)