dayhoff-tools 1.9.8__py3-none-any.whl → 1.9.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dayhoff_tools/cli/engine/__init__.py +49 -0
- dayhoff_tools/cli/engine/engine_core.py +739 -0
- dayhoff_tools/cli/engine/engine_lifecycle.py +136 -0
- dayhoff_tools/cli/engine/engine_maintenance.py +377 -0
- dayhoff_tools/cli/engine/engine_management.py +505 -0
- dayhoff_tools/cli/engine/shared.py +501 -0
- dayhoff_tools/cli/engine/studio_commands.py +825 -0
- dayhoff_tools/cli/main.py +1 -1
- {dayhoff_tools-1.9.8.dist-info → dayhoff_tools-1.9.10.dist-info}/METADATA +1 -1
- {dayhoff_tools-1.9.8.dist-info → dayhoff_tools-1.9.10.dist-info}/RECORD +12 -6
- dayhoff_tools/cli/engine_commands.py +0 -3012
- {dayhoff_tools-1.9.8.dist-info → dayhoff_tools-1.9.10.dist-info}/WHEEL +0 -0
- {dayhoff_tools-1.9.8.dist-info → dayhoff_tools-1.9.10.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,739 @@
|
|
1
|
+
"""Core engine commands: launch, list, and status."""
|
2
|
+
|
3
|
+
import json
|
4
|
+
import time
|
5
|
+
from datetime import datetime, timezone
|
6
|
+
from typing import Any, Dict, Optional
|
7
|
+
|
8
|
+
import boto3
|
9
|
+
import typer
|
10
|
+
from rich import box
|
11
|
+
from rich.panel import Panel
|
12
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn
|
13
|
+
from rich.table import Table
|
14
|
+
|
15
|
+
from .shared import (
|
16
|
+
HOURLY_COSTS,
|
17
|
+
_fetch_init_stages,
|
18
|
+
check_aws_sso,
|
19
|
+
console,
|
20
|
+
format_duration,
|
21
|
+
format_status,
|
22
|
+
get_disk_usage_via_ssm,
|
23
|
+
make_api_request,
|
24
|
+
parse_launch_time,
|
25
|
+
resolve_engine,
|
26
|
+
)
|
27
|
+
|
28
|
+
|
29
|
+
def launch_engine(
    name: str = typer.Argument(help="Name for the new engine"),
    engine_type: str = typer.Option(
        "cpu",
        "--type",
        "-t",
        help="Engine type: cpu, cpumax, t4, a10g, a100, 4_t4, 8_t4, 4_a10g, 8_a10g",
    ),
    user: Optional[str] = typer.Option(None, "--user", "-u", help="Override username"),
    boot_disk_size: Optional[int] = typer.Option(
        None,
        "--size",
        "-s",
        help="Boot disk size in GB (default: 50GB, min: 20GB, max: 1000GB)",
    ),
    availability_zone: Optional[str] = typer.Option(
        None,
        "--az",
        help="Prefer a specific Availability Zone (e.g., us-east-1b). If omitted the service will try all public subnets.",
    ),
):
    """Launch a new engine instance."""
    # The docstring is deliberately left as a one-liner: Typer shows a command's
    # docstring as its help text (presumably this function is registered as a
    # command elsewhere in the package).
    #
    # Flow: authenticate, validate the requested type and disk size locally,
    # POST the request to /engines, then report the outcome.
    # Raises typer.Exit(1) on invalid input or when the API does not answer 201.

    # Always run the SSO check, even when --user overrides the resulting name.
    username = check_aws_sso()
    if user:
        username = user

    # Validate engine type
    valid_types = (
        "cpu",
        "cpumax",
        "t4",
        "a10g",
        "a100",
        "4_t4",
        "8_t4",
        "4_a10g",
        "8_a10g",
    )
    if engine_type not in valid_types:
        console.print(f"[red]❌ Invalid engine type: {engine_type}[/red]")
        console.print(f"Valid types: {', '.join(valid_types)}")
        raise typer.Exit(1)

    # Validate boot disk size (bounds match the --size help text)
    if boot_disk_size is not None:
        if boot_disk_size < 20:
            console.print("[red]❌ Boot disk size must be at least 20GB[/red]")
            raise typer.Exit(1)
        if boot_disk_size > 1000:
            console.print("[red]❌ Boot disk size cannot exceed 1000GB[/red]")
            raise typer.Exit(1)

    cost = HOURLY_COSTS.get(engine_type, 0)
    disk_info = f" with {boot_disk_size}GB boot disk" if boot_disk_size else ""
    console.print(
        f"Launching [cyan]{name}[/cyan] ({engine_type}){disk_info} for ${cost:.2f}/hour..."
    )

    # Optional fields are only sent when the caller supplied them.
    request_data: Dict[str, Any] = {
        "name": name,
        "user": username,
        "engine_type": engine_type,
    }
    if boot_disk_size is not None:
        request_data["boot_disk_size"] = boot_disk_size
    if availability_zone:
        request_data["availability_zone"] = availability_zone

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        transient=True,
    ) as progress:
        progress.add_task("Creating engine...", total=None)

        response = make_api_request("POST", "/engines", json_data=request_data)

        if response.status_code != 201:
            # The error body is not guaranteed to be a JSON object; fall back
            # to the HTTP status instead of crashing while reporting a failure.
            try:
                error = response.json().get("error", "Unknown error")
            except (ValueError, AttributeError):
                error = f"HTTP {response.status_code}"
            console.print(f"[red]❌ Failed to launch engine: {error}[/red]")
            # Signal the failure to the shell, like the validation errors above.
            raise typer.Exit(1)

        data = response.json()
        console.print("[green]✓ Engine launched successfully![/green]")
        console.print(f"Instance ID: [cyan]{data['instance_id']}[/cyan]")
        console.print(f"Type: {data['instance_type']} (${cost:.2f}/hour)")
        if boot_disk_size:
            console.print(f"Boot disk: {boot_disk_size}GB")
        console.print("\nThe engine is initializing. This may take a few minutes.")
        console.print(f"Check status with: [cyan]dh engine status {name}[/cyan]")
|
118
|
+
|
119
|
+
|
120
|
+
def list_engines(
    user: Optional[str] = typer.Option(None, "--user", "-u", help="Filter by user"),
    running_only: bool = typer.Option(
        False, "--running", help="Show only running engines"
    ),
    stopped_only: bool = typer.Option(
        False, "--stopped", help="Show only stopped engines"
    ),
    detailed: bool = typer.Option(
        False, "--detailed", "-d", help="Show detailed status (slower)"
    ),
):
    """List engines (shows all engines by default)."""
    # Fetches engines from the API, optionally narrows them by state, and
    # renders a Rich table. With --detailed an extra "Disk Usage" column is
    # filled via SSM for running engines.
    check_aws_sso()

    query: Dict[str, Any] = {}
    if user:
        query["user"] = user
    if detailed:
        query["check_ready"] = "true"

    response = make_api_request("GET", "/engines", params=query)

    # Guard clause: anything other than 200 is reported and ends the command.
    if response.status_code != 200:
        error = response.json().get("error", "Unknown error")
        console.print(f"[red]❌ Failed to list engines: {error}[/red]")
        return

    engines = response.json().get("engines", [])

    # --running takes precedence when both state filters are given.
    if running_only:
        wanted_state: Optional[str] = "running"
    elif stopped_only:
        wanted_state = "stopped"
    else:
        wanted_state = None
    if wanted_state is not None:
        engines = [e for e in engines if e["state"].lower() == wanted_state]

    if not engines:
        console.print("No engines found.")
        return

    # Init stages are only requested in detailed mode (slow). The returned
    # mapping is not consumed by the table below.
    stages_map = {}
    if detailed:
        stages_map = _fetch_init_stages([e["instance_id"] for e in engines])

    table = Table(title="Engines", box=box.ROUNDED)
    table.add_column("Name", style="cyan")
    table.add_column("Instance ID", style="dim")
    table.add_column("Type")
    table.add_column("User")
    table.add_column("Status")
    if detailed:
        table.add_column("Disk Usage")
    table.add_column("Uptime/Since")
    table.add_column("$/hour", justify="right")

    for eng in engines:
        started = parse_launch_time(eng["launch_time"])
        elapsed = datetime.now(timezone.utc) - started
        rate = HOURLY_COSTS.get(eng["engine_type"], 0)
        is_running = eng["state"].lower() == "running"

        # Running engines show elapsed uptime; others show the launch timestamp.
        if is_running:
            when = format_duration(elapsed)
        else:
            when = started.strftime("%Y-%m-%d %H:%M")

        # Disk usage is only looked up in detailed mode, and only for running engines.
        usage = None
        if detailed:
            if is_running:
                usage = get_disk_usage_via_ssm(eng["instance_id"]) or "-"
            else:
                usage = "-"

        cells = [
            eng["name"],
            eng["instance_id"],
            eng["engine_type"],
            eng["user"],
            format_status(eng["state"], eng.get("ready")),
        ]
        if detailed:
            cells.append(usage)
        cells += [when, f"${rate:.2f}"]

        table.add_row(*cells)

    console.print(table)

    has_running = any(e["state"].lower() == "running" for e in engines)
    if has_running and not detailed:
        console.print(
            "\n[dim]Tip: Use --detailed to see disk usage and bootstrap status (slower)[/dim]"
        )
|
216
|
+
|
217
|
+
|
218
|
+
def engine_status(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    detailed: bool = typer.Option(False, "--detailed", "-d", help="Show detailed status (slower)"),
    show_log: bool = typer.Option(False, "--show-log", help="Show bootstrap log (requires --detailed)"),
):
    """Show engine status and information."""
    # Two modes:
    #   * default (fast): one SSM read of the idle detector's last state and a
    #     compact panel with run state, idle/active status and sensor flags;
    #   * --detailed: API details plus several best-effort SSM reads (boot time,
    #     disk usage, idle timeout, health report, idle summary) and, with
    #     --show-log, the bootstrap log.
    # Raises typer.Exit(1) when the engine list or engine details cannot be fetched.

    # Function-scope import: the coffee-lock branch of _fetch_idle_summary_via_ssm
    # needs `timedelta`, which the module-level `from datetime import ...` omits.
    # Without it the NameError was swallowed and the whole idle overlay was lost.
    from datetime import timedelta

    def _run_state_display(state: str) -> str:
        """Return Rich markup for an engine state string."""
        known = {
            "running": "[green]Running[/green]",
            "pending": "[yellow]Starting...[/yellow]",
            "stopping": "[yellow]Stopping...[/yellow]",
            "stopped": "[dim]Stopped[/dim]",
        }
        return known.get(state.lower(), state.capitalize())

    def _idle_markup(idle_seconds: Any, threshold: Any) -> str:
        """Return Rich markup for an idle engine: elapsed/threshold and time left.

        Values that are not numeric are rendered as "?".
        """
        numeric = (int, float)
        if not isinstance(threshold, numeric):
            return "[yellow]Idle ?/?[/yellow]"
        limit = int(threshold)
        if not isinstance(idle_seconds, numeric):
            return f"[yellow]Idle ?/{limit//60}m[/yellow]"
        spent = int(idle_seconds)
        remaining_mins = max(0, limit - spent) // 60
        left = "<1m" if remaining_mins == 0 else f"{remaining_mins}m"
        return f"[yellow]Idle {spent//60}m/{limit//60}m: [red]{left}[/red] left[/yellow]"

    check_aws_sso()

    # Get all engines to resolve name
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    # ---------------- Fast status display (default) ----------------
    if not detailed:
        running_state = engine["state"].lower()

        # None  -> no data received (not running, or SSM failed)
        # {}    -> SSM worked but the state file was missing/empty
        idle_data: Optional[Dict[str, Any]] = None

        if running_state == "running":
            ssm = boto3.client("ssm", region_name="us-east-1")
            try:
                resp = ssm.send_command(
                    InstanceIds=[engine["instance_id"]],
                    DocumentName="AWS-RunShellScript",
                    Parameters={
                        "commands": [
                            "cat /var/run/idle-detector/last_state.json 2>/dev/null || echo '{}'"
                        ],
                        "executionTimeout": ["10"],
                    },
                )
                cid = resp["Command"]["CommandId"]

                # Wait up to 3 seconds for result
                for _ in range(6):  # 6 * 0.5 = 3 seconds
                    time.sleep(0.5)
                    inv = ssm.get_command_invocation(
                        CommandId=cid, InstanceId=engine["instance_id"]
                    )
                    if inv["Status"] in ["Success", "Failed"]:
                        break

                if inv["Status"] == "Success":
                    content = inv["StandardOutputContent"].strip()
                    if content and content != "{}":
                        idle_data = json.loads(content)
                    else:
                        idle_data = {}  # Empty response but SSM worked
            except Exception:
                # Best-effort: any SSM/parse failure just means "status unknown".
                idle_data = None

        run_disp = _run_state_display(engine["state"])

        # Determine idle/active status (only meaningful while running)
        idle_disp = ""
        if running_state == "running":
            if not idle_data:
                # SSM failed, or empty data (likely very early in boot)
                idle_disp = " [dim]N/A[/dim]"
            elif idle_data.get("idle", False):
                idle_disp = " " + _idle_markup(
                    idle_data.get("idle_seconds", 0), idle_data.get("timeout_sec")
                )
            else:
                idle_disp = " [green]Active[/green]"

        # Build status lines - minimal info for fast view
        status_lines = [
            f"[blue]{engine['name']}[/blue] {run_disp}{idle_disp}",
        ]

        # Add activity sensors if we have idle data
        if idle_data and idle_data.get("reasons"):
            status_lines.append("")  # blank line before sensors

            sensor_map = {
                "CoffeeLockSensor": ("☕", "Coffee"),
                "ActiveLoginSensor": ("🐚", "SSH"),
                "IDEConnectionSensor": ("🖥 ", "IDE"),
                "DockerWorkloadSensor": ("🐳", "Docker"),
            }

            for r in idle_data.get("reasons", []):
                sensor = r.get("sensor", "Unknown")
                active = r.get("active", False)
                icon, label = sensor_map.get(sensor, ("?", sensor))
                status_str = "[green]YES[/green]" if active else "[dim]nope[/dim]"
                status_lines.append(f" {icon} {label:6} {status_str}")

        console.print(
            Panel("\n".join(status_lines), title="Engine Status", border_style="blue")
        )

        # The log is only fetched in detailed mode; say so instead of silently
        # ignoring the flag (this note used to sit after the early return and
        # could never be printed).
        if show_log:
            console.print("[yellow]Note: --show-log requires --detailed flag[/yellow]")
        return  # Exit early for fast status

    # ---------------- Detailed status ----------------
    # Get detailed engine status including idle detector info
    response = make_api_request("GET", f"/engines/{engine['instance_id']}")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engine details[/red]")
        raise typer.Exit(1)

    engine_details = response.json()
    engine = engine_details.get("engine", engine)  # Use detailed info if available
    idle_detector = engine_details.get("idle_detector", {}) or {}
    attached_studios = engine_details.get("attached_studios", [])

    launch_time = parse_launch_time(engine["launch_time"])
    uptime = datetime.now(timezone.utc) - launch_time
    hourly_cost = HOURLY_COSTS.get(engine["engine_type"], 0)
    # total_cost intentionally not shown in status view

    stages_map = _fetch_init_stages([engine["instance_id"]])
    stage_val = stages_map.get(engine["instance_id"], "-")

    # Try to fetch actual boot time via SSM (best-effort)
    boot_time_str: Optional[str] = None
    try:
        if engine["state"].lower() == "running":
            ssm = boto3.client("ssm", region_name="us-east-1")
            resp = ssm.send_command(
                InstanceIds=[engine["instance_id"]],
                DocumentName="AWS-RunShellScript",
                Parameters={
                    "commands": ["uptime -s || who -b | awk '{print $3\" \"$4}'"]
                },
            )
            cid = resp["Command"]["CommandId"]
            time.sleep(1)
            inv = ssm.get_command_invocation(
                CommandId=cid, InstanceId=engine["instance_id"]
            )
            if inv.get("Status") == "Success":
                out_lines = (inv.get("StandardOutputContent") or "").strip().splitlines()
                boot_time_str = out_lines[0] if out_lines else None
    except Exception:
        # Fall back to the launch time reported by the API.
        boot_time_str = None

    started_line = (
        f"[bold]Started:[/bold] {boot_time_str} ({format_duration(uptime)} ago)"
        if boot_time_str
        else f"[bold]Started:[/bold] {launch_time.strftime('%Y-%m-%d %H:%M:%S')} ({format_duration(uptime)} ago)"
    )

    running_state = engine["state"].lower()
    run_disp = _run_state_display(engine["state"])

    def _compute_active_disp(idle_info: Dict[str, Any]) -> str:
        """Return the Active/Idle/N/A header markup for the given idle-detector info."""
        # If we don't have idle info or it's explicitly unavailable, show N/A
        if not idle_info or idle_info.get("available") is False:
            return "[dim]N/A[/dim]"

        if idle_info.get("status") == "active":
            return "[green]Active[/green]"
        if running_state in ("stopped", "stopping"):
            return "[dim]N/A[/dim]"

        # If idle, show time/threshold with time remaining if available
        if idle_info.get("status") == "idle":
            return _idle_markup(
                idle_info.get("idle_seconds"), idle_info.get("idle_threshold")
            )

        # Default to N/A if we can't determine status
        return "[dim]N/A[/dim]"

    # ---------------- Details block (white/default) ----------------
    status_lines = [
        f"Name: {engine['name']}",
        f"Instance: {engine['instance_id']}",
        f"Type: {engine['engine_type']} ({engine['instance_type']})",
        f"Status: {engine['state']}",
        f"User: {engine['user']}",
        f"IP: {engine.get('public_ip', 'N/A')}",
        started_line,
        f"$/hour: ${hourly_cost:.2f}",
    ]

    # Disk usage (like list --detailed)
    if engine["state"].lower() == "running":
        disk_usage = get_disk_usage_via_ssm(engine["instance_id"]) or "-"
        status_lines.append(f"Disk: {disk_usage}")

    # Idle timeout (show even when not idle) - but only if we have data.
    # NOTE(review): this uses the API's idle_detector payload as-is; the SSM
    # overlay fetched further down is not consulted here.
    if idle_detector.get("available"):
        idle_threshold_secs: Optional[int] = None
        # Prefer the value reported by the idle detector if present
        threshold_raw = idle_detector.get("idle_threshold")
        if isinstance(threshold_raw, (int, float)):
            try:
                idle_threshold_secs = int(threshold_raw)
            except (ValueError, OverflowError):
                # NaN / infinity cannot be converted; treat as unknown.
                idle_threshold_secs = None

        if idle_threshold_secs is None and engine["state"].lower() == "running":
            # Fallback: read /etc/engine.env via SSM
            try:
                ssm = boto3.client("ssm", region_name="us-east-1")
                resp = ssm.send_command(
                    InstanceIds=[engine["instance_id"]],
                    DocumentName="AWS-RunShellScript",
                    Parameters={
                        "commands": [
                            "grep -E '^IDLE_TIMEOUT_SECONDS=' /etc/engine.env | cut -d'=' -f2 || echo '?'",
                        ],
                        "executionTimeout": ["5"],
                    },
                )
                cid = resp["Command"]["CommandId"]
                time.sleep(1)
                inv = ssm.get_command_invocation(
                    CommandId=cid, InstanceId=engine["instance_id"]
                )
                if inv.get("Status") == "Success":
                    out = (inv.get("StandardOutputContent") or "").strip()
                    if out and out != "?" and out.isdigit():
                        idle_threshold_secs = int(out)
            except Exception:
                idle_threshold_secs = None

        if idle_threshold_secs is not None:
            status_lines.append(
                f"Idle timeout: {idle_threshold_secs//60}m ({idle_threshold_secs}s)"
            )
        else:
            status_lines.append("Idle timeout: unknown")
    else:
        # No idle detector data available
        status_lines.append("Idle timeout: N/A")

    # Health report (only if bootstrap finished)
    if stage_val == "finished":
        try:
            ssm = boto3.client("ssm", region_name="us-east-1")
            res = ssm.send_command(
                InstanceIds=[engine["instance_id"]],
                DocumentName="AWS-RunShellScript",
                Parameters={
                    "commands": [
                        "cat /opt/dayhoff/state/engine-health.json 2>/dev/null || cat /var/run/engine-health.json 2>/dev/null || true"
                    ],
                    "executionTimeout": ["10"],
                },
            )
            cid = res["Command"]["CommandId"]
            time.sleep(1)
            inv = ssm.get_command_invocation(
                CommandId=cid, InstanceId=engine["instance_id"]
            )
            if inv["Status"] == "Success":
                health = json.loads(inv["StandardOutputContent"].strip() or "{}")
                status_lines.append("")
                status_lines.append("[bold]Health:[/bold]")
                status_lines.append(
                    f" • GPU Drivers: {'OK' if health.get('drivers_ok') else 'MISSING'}"
                )
                idle_stat = health.get("idle_detector_service") or health.get(
                    "idle_detector_timer", "unknown"
                )
                status_lines.append(f" • Idle Detector: {idle_stat}")
        except Exception:
            # Deliberately best-effort: the health section is simply omitted.
            pass

    def _fetch_idle_summary_via_ssm(instance_id: str) -> Optional[Dict]:
        """Read the on-engine idle-detector state file via SSM.

        Returns a dict in the idle_detector shape used by this command
        ("available", "status", "idle_threshold", "idle_seconds",
        "_reasons_raw", plus derived "ide_connections" / "coffee_lock" /
        "ssh_sessions"), or None when the file cannot be read or parsed.
        """
        try:
            ssm = boto3.client("ssm", region_name="us-east-1")
            res = ssm.send_command(
                InstanceIds=[instance_id],
                DocumentName="AWS-RunShellScript",
                Parameters={
                    "commands": [
                        "cat /var/run/idle-detector/last_state.json 2>/dev/null || true",
                    ],
                    "executionTimeout": ["5"],
                },
            )
            cid = res["Command"]["CommandId"]
            # Wait up to 2 seconds for SSM command to complete
            for _ in range(4):  # 4 * 0.5 = 2 seconds
                time.sleep(0.5)
                inv = ssm.get_command_invocation(CommandId=cid, InstanceId=instance_id)
                if inv["Status"] in ["Success", "Failed"]:
                    break
            if inv["Status"] != "Success":
                return None
            content = inv["StandardOutputContent"].strip()
            if not content:
                return None
            data = json.loads(content)

            # Convert last_state schema (new or old) to idle_detector schema used by CLI output
            idle_info: Dict[str, Any] = {"available": True}

            # Active/idle
            idle_flag = bool(data.get("idle", False))
            idle_info["status"] = "idle" if idle_flag else "active"

            # Threshold and elapsed
            if isinstance(data.get("timeout_sec"), (int, float)):
                idle_info["idle_threshold"] = int(data["timeout_sec"])  # seconds
            if isinstance(data.get("idle_seconds"), (int, float)):
                idle_info["idle_seconds"] = int(data["idle_seconds"])

            # Keep raw reasons for sensor display when available (new schema)
            if isinstance(data.get("reasons"), list):
                idle_info["_reasons_raw"] = data["reasons"]
            else:
                # Fallback: synthesize reasons from the old forensics layout
                f_all = data.get("forensics", {}) or {}
                synthesized = []

                def _mk(sensor_name: str, key: str):
                    entry = f_all.get(key, {}) or {}
                    synthesized.append(
                        {
                            "sensor": sensor_name,
                            "active": bool(entry.get("active", False)),
                            "reason": entry.get("reason", ""),
                            "forensic": entry.get("forensic", {}),
                        }
                    )

                _mk("CoffeeLockSensor", "coffee")
                _mk("ActiveLoginSensor", "ssh")
                _mk("IDEConnectionSensor", "ide")
                _mk("DockerWorkloadSensor", "docker")
                idle_info["_reasons_raw"] = synthesized

            # Derive details from the active sensors only
            for r in idle_info.get("_reasons_raw", []):
                if not r.get("active"):
                    continue
                sensor = (r.get("sensor") or "").lower()
                forensic = r.get("forensic") or {}
                if sensor == "ideconnectionsensor":
                    # Prefer unique_pid_count written by new detector
                    cnt = forensic.get("unique_pid_count")
                    if not isinstance(cnt, int):
                        cnt = forensic.get("matches")
                    if isinstance(cnt, int):
                        idle_info["ide_connections"] = {"connection_count": cnt}
                    else:
                        idle_info["ide_connections"] = {"connection_count": 1}
                elif sensor == "coffeelocksensor":
                    rem = forensic.get("remaining_sec")
                    if isinstance(rem, (int, float)) and rem > 0:
                        idle_info["coffee_lock"] = format_duration(
                            timedelta(seconds=int(rem))
                        )
                elif sensor == "activeloginsensor":
                    sess = {
                        "tty": forensic.get("tty", "pts/?"),
                        "pid": forensic.get("pid", "?"),
                        "idle_time": forensic.get("idle_sec", 0),
                        "from_ip": forensic.get("remote_addr", "unknown"),
                    }
                    idle_info.setdefault("ssh_sessions", []).append(sess)
            return idle_info
        except Exception:
            # Best-effort enrichment: any SSM or parsing failure means "no overlay".
            return None

    # Always try to enrich from on-engine summary (fast, best-effort)
    overlay = _fetch_idle_summary_via_ssm(engine["instance_id"])
    if overlay:
        # If API didn't indicate availability, replace entirely; otherwise fill gaps
        if not idle_detector.get("available"):
            idle_detector = overlay
        else:
            for k, v in overlay.items():
                idle_detector.setdefault(k, v)
    elif not idle_detector.get("available"):
        # SSM failed and the API had nothing usable - mark as unavailable
        idle_detector = {"available": False}

    # ---------------- Front-loaded summary ----------------
    # Built once, after the overlay, so the header reflects the latest idle data.
    active_disp = _compute_active_disp(idle_detector)
    top_lines = [
        f"[blue]{engine['name']}[/blue] {run_disp} {active_disp}\n",
    ]

    # Studios summary next, with studio name in purple/magenta
    if attached_studios:
        stu_texts = [
            f"[magenta]{s.get('user', 'studio')}[/magenta] ({s.get('studio_id', 'unknown')})"
            for s in attached_studios
        ]
        top_lines.append("Studios: " + ", ".join(stu_texts))

    # Paragraph break
    top_lines.append("")

    # Activity Sensors (show all with YES/nope)
    if idle_detector.get("available"):
        status_lines.append("")
        status_lines.append("[bold]Activity Sensors:[/bold]")
        reasons_raw = idle_detector.get("_reasons_raw", []) or []
        by_sensor: Dict[str, Dict[str, Any]] = {}
        for r in reasons_raw:
            nm = r.get("sensor")
            if nm:
                by_sensor[nm] = r

        def _sensor_line(label: str, key: str, emoji: str) -> str:
            r = by_sensor.get(key, {})
            active = bool(r.get("active"))
            reason_txt = r.get("reason") or ("" if not active else "active")
            flag = "[green]YES[/green]" if active else "[dim]nope[/dim]"
            return (
                f" {emoji} {label}: {flag} {('- ' + reason_txt) if reason_txt else ''}"
            )

        status_lines.append(_sensor_line("Coffee", "CoffeeLockSensor", "☕"))
        status_lines.append(_sensor_line("Shell ", "ActiveLoginSensor", "🐚"))
        status_lines.append(_sensor_line(" IDE ", "IDEConnectionSensor", "🖥"))
        status_lines.append(_sensor_line("Docker", "DockerWorkloadSensor", "🐳"))

    # Combine top summary and details
    all_lines = top_lines + status_lines
    console.print(
        Panel("\n".join(all_lines), title="Engine Status", border_style="blue")
    )

    if show_log:
        console.print("\n[bold]Bootstrap Log:[/bold]")
        try:
            ssm = boto3.client("ssm", region_name="us-east-1")
            resp = ssm.send_command(
                InstanceIds=[engine["instance_id"]],
                DocumentName="AWS-RunShellScript",
                Parameters={
                    "commands": [
                        "cat /var/log/engine-setup.log 2>/dev/null || echo 'No setup log found'"
                    ],
                    "executionTimeout": ["15"],
                },
            )
            cid = resp["Command"]["CommandId"]
            time.sleep(2)
            inv = ssm.get_command_invocation(
                CommandId=cid, InstanceId=engine["instance_id"]
            )
            if inv["Status"] == "Success":
                log_content = inv["StandardOutputContent"].strip()
                if log_content:
                    console.print(f"[dim]{log_content}[/dim]")
                else:
                    console.print("[yellow]No bootstrap log available[/yellow]")
            else:
                console.print("[red]❌ Could not retrieve bootstrap log[/red]")
        except Exception as e:
            console.print(f"[red]❌ Error fetching log: {e}[/red]")
|