dayhoff-tools 1.9.9__py3-none-any.whl → 1.9.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,653 @@
1
+ """Core engine commands: launch, list, and status."""
2
+
3
+ import json
4
+ import time
5
+ from datetime import datetime, timezone
6
+ from typing import Any, Dict, Optional
7
+
8
+ import boto3
9
+ import typer
10
+ from rich import box
11
+ from rich.panel import Panel
12
+ from rich.progress import Progress, SpinnerColumn, TextColumn
13
+ from rich.table import Table
14
+
15
+ from .shared import (
16
+ HOURLY_COSTS,
17
+ _fetch_init_stages,
18
+ check_aws_sso,
19
+ console,
20
+ format_duration,
21
+ format_status,
22
+ get_disk_usage_via_ssm,
23
+ make_api_request,
24
+ parse_launch_time,
25
+ resolve_engine,
26
+ )
27
+
28
+
29
def launch_engine(
    name: str = typer.Argument(help="Name for the new engine"),
    engine_type: str = typer.Option(
        "cpu",
        "--type",
        "-t",
        help="Engine type: cpu, cpumax, t4, a10g, a100, 4_t4, 8_t4, 4_a10g, 8_a10g",
    ),
    user: Optional[str] = typer.Option(None, "--user", "-u", help="Override username"),
    boot_disk_size: Optional[int] = typer.Option(
        None,
        "--size",
        "-s",
        help="Boot disk size in GB (default: 50GB, min: 20GB, max: 1000GB)",
    ),
    availability_zone: Optional[str] = typer.Option(
        None,
        "--az",
        help="Prefer a specific Availability Zone (e.g., us-east-1b). If omitted the service will try all public subnets.",
    ),
):
    """Launch a new engine instance."""
    # NOTE: the docstring is kept to one line on purpose; these functions are
    # declared with typer.Argument/typer.Option defaults, so the docstring is
    # presumably surfaced as CLI help text. Details live in comments instead.
    #
    # Flow: validate the arguments locally, POST the request to /engines, then
    # report the outcome. Raises typer.Exit(1) on invalid input and on any
    # response other than HTTP 201, so callers and scripts see a failing exit
    # status (the same convention engine_status uses for API failures).

    # The SSO identity is the default owner; --user overrides it.
    username = check_aws_sso()
    if user:
        username = user

    # Validate engine type
    valid_types = [
        "cpu",
        "cpumax",
        "t4",
        "a10g",
        "a100",
        "4_t4",
        "8_t4",
        "4_a10g",
        "8_a10g",
    ]
    if engine_type not in valid_types:
        console.print(f"[red]❌ Invalid engine type: {engine_type}[/red]")
        console.print(f"Valid types: {', '.join(valid_types)}")
        raise typer.Exit(1)

    # Validate boot disk size (only when the caller supplied one)
    if boot_disk_size is not None:
        if boot_disk_size < 20:
            console.print("[red]❌ Boot disk size must be at least 20GB[/red]")
            raise typer.Exit(1)
        if boot_disk_size > 1000:
            console.print("[red]❌ Boot disk size cannot exceed 1000GB[/red]")
            raise typer.Exit(1)

    # Unknown types fall back to a cost of 0 rather than failing the launch.
    cost = HOURLY_COSTS.get(engine_type, 0)
    disk_info = f" with {boot_disk_size}GB boot disk" if boot_disk_size else ""
    console.print(
        f"Launching [cyan]{name}[/cyan] ({engine_type}){disk_info} for ${cost:.2f}/hour..."
    )

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        transient=True,
    ) as progress:
        progress.add_task("Creating engine...", total=None)

        # Optional fields are only sent when explicitly provided.
        request_data: Dict[str, Any] = {
            "name": name,
            "user": username,
            "engine_type": engine_type,
        }
        if boot_disk_size is not None:
            request_data["boot_disk_size"] = boot_disk_size
        if availability_zone:
            request_data["availability_zone"] = availability_zone

        response = make_api_request("POST", "/engines", json_data=request_data)

    if response.status_code != 201:
        # The error body may not be JSON (or not a JSON object); never let
        # error reporting itself crash with a traceback.
        try:
            payload = response.json()
        except ValueError:
            payload = None
        error = (
            payload.get("error", "Unknown error")
            if isinstance(payload, dict)
            else "Unknown error"
        )
        console.print(f"[red]❌ Failed to launch engine: {error}[/red]")
        raise typer.Exit(1)

    data = response.json()
    console.print("[green]✓ Engine launched successfully![/green]")
    console.print(f"Instance ID: [cyan]{data['instance_id']}[/cyan]")
    console.print(f"Type: {data['instance_type']} (${cost:.2f}/hour)")
    if boot_disk_size:
        console.print(f"Boot disk: {boot_disk_size}GB")
    console.print("\nThe engine is initializing. This may take a few minutes.")
    console.print(f"Check status with: [cyan]dh engine status {name}[/cyan]")
118
+
119
+
120
def list_engines(
    user: Optional[str] = typer.Option(None, "--user", "-u", help="Filter by user"),
    running_only: bool = typer.Option(
        False, "--running", help="Show only running engines"
    ),
    stopped_only: bool = typer.Option(
        False, "--stopped", help="Show only stopped engines"
    ),
    detailed: bool = typer.Option(
        False, "--detailed", "-d", help="Show detailed status (slower)"
    ),
):
    """List engines (shows all engines by default)."""
    # NOTE: the docstring is kept to one line on purpose; it is presumably
    # surfaced as CLI help text by Typer. Details live in comments instead.
    #
    # Flow: GET /engines (optionally filtered by user), filter by state on the
    # client side, then render a table. Raises typer.Exit(1) when the API does
    # not answer with HTTP 200 so scripts can detect the failure.

    # Called for its authentication check; the returned username is not needed.
    check_aws_sso()

    params: Dict[str, str] = {}
    if user:
        params["user"] = user
    if detailed:
        params["check_ready"] = "true"

    response = make_api_request("GET", "/engines", params=params)

    if response.status_code != 200:
        # The error body may not be JSON (or not a JSON object); never let
        # error reporting itself crash with a traceback.
        try:
            payload = response.json()
        except ValueError:
            payload = None
        error = (
            payload.get("error", "Unknown error")
            if isinstance(payload, dict)
            else "Unknown error"
        )
        console.print(f"[red]❌ Failed to list engines: {error}[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])

    # Filter by state if requested (--running wins when both flags are given)
    if running_only:
        engines = [e for e in engines if e["state"].lower() == "running"]
    elif stopped_only:
        engines = [e for e in engines if e["state"].lower() == "stopped"]

    if not engines:
        console.print("No engines found.")
        return

    # Only fetch detailed info if requested (slow)
    # NOTE(review): the result is not used anywhere in this function; the call
    # is kept in case it has effects this file does not show — confirm and
    # remove if it is a pure fetch.
    stages_map = {}
    if detailed:
        stages_map = _fetch_init_stages([e["instance_id"] for e in engines])

    # Create table; the "Disk Usage" column only exists in detailed mode.
    table = Table(title="Engines", box=box.ROUNDED)
    table.add_column("Name", style="cyan")
    table.add_column("Instance ID", style="dim")
    table.add_column("Type")
    table.add_column("User")
    table.add_column("Status")
    if detailed:
        table.add_column("Disk Usage")
    table.add_column("Uptime/Since")
    table.add_column("$/hour", justify="right")

    any_running = False
    for engine in engines:
        launch_time = parse_launch_time(engine["launch_time"])
        hourly_cost = HOURLY_COSTS.get(engine["engine_type"], 0)
        is_running = engine["state"].lower() == "running"
        any_running = any_running or is_running

        if is_running:
            # Running engines show elapsed uptime; disk usage is queried per
            # engine, and only in detailed mode.
            time_str = format_duration(datetime.now(timezone.utc) - launch_time)
            disk_usage = (
                get_disk_usage_via_ssm(engine["instance_id"]) or "-"
                if detailed
                else None
            )
        else:
            # Non-running engines show the launch timestamp instead.
            time_str = launch_time.strftime("%Y-%m-%d %H:%M")
            disk_usage = "-" if detailed else None

        row_data = [
            engine["name"],
            engine["instance_id"],
            engine["engine_type"],
            engine["user"],
            format_status(engine["state"], engine.get("ready")),
        ]
        if detailed:
            row_data.append(disk_usage)
        row_data.extend(
            [
                time_str,
                f"${hourly_cost:.2f}",
            ]
        )

        table.add_row(*row_data)

    console.print(table)
    if not detailed and any_running:
        console.print(
            "\n[dim]Tip: Use --detailed to see disk usage and bootstrap status (slower)[/dim]"
        )
216
+
217
+
218
def _engine_state_markup(state: str) -> str:
    """Return the rich-markup label shown for an engine state string."""
    labels = {
        "running": "[green]Running[/green]",
        "pending": "[yellow]Starting...[/yellow]",
        "stopping": "[yellow]Stopping...[/yellow]",
        "stopped": "[dim]Stopped[/dim]",
    }
    # Unrecognised states are shown capitalised, without colour.
    return labels.get(state.lower(), state.capitalize())


def _ssm_shell_output(
    instance_id: str,
    command: str,
    wait_seconds: float,
    execution_timeout: Optional[int] = None,
) -> Optional[str]:
    """Run one shell command on an engine via SSM and return its stdout.

    The command is sent, the function sleeps ``wait_seconds`` and then reads
    the invocation exactly once (no polling). Returns ``None`` unless the
    invocation status is "Success"; a successful run with no output returns
    an empty string. boto3 errors propagate to the caller.
    """
    ssm = boto3.client("ssm", region_name="us-east-1")
    parameters: Dict[str, Any] = {"commands": [command]}
    if execution_timeout is not None:
        parameters["executionTimeout"] = [str(execution_timeout)]
    sent = ssm.send_command(
        InstanceIds=[instance_id],
        DocumentName="AWS-RunShellScript",
        Parameters=parameters,
    )
    time.sleep(wait_seconds)
    invocation = ssm.get_command_invocation(
        CommandId=sent["Command"]["CommandId"], InstanceId=instance_id
    )
    if invocation.get("Status") != "Success":
        return None
    return invocation.get("StandardOutputContent") or ""


def engine_status(
    name_or_id: str = typer.Argument(help="Engine name or instance ID"),
    detailed: bool = typer.Option(False, "--detailed", "-d", help="Show detailed status (slower)"),
    show_log: bool = typer.Option(False, "--show-log", help="Show bootstrap log (requires --detailed)"),
):
    """Show engine status and information."""
    # NOTE: the docstring is kept to one line on purpose; it is presumably
    # surfaced as CLI help text by Typer. Details live in comments instead.
    #
    # Two views:
    #   * default (fast): one header line plus the activity sensors read live
    #     from the engine;
    #   * --detailed: adds API details, boot time, disk usage, idle timeout,
    #     health report and, with --show-log, the bootstrap log.
    # Raises typer.Exit(1) when either API request does not return HTTP 200.
    check_aws_sso()

    # Get all engines to resolve name
    response = make_api_request("GET", "/engines")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engines[/red]")
        raise typer.Exit(1)

    engines = response.json().get("engines", [])
    engine = resolve_engine(name_or_id, engines)

    # Always try to fetch live idle data from the engine for both views
    live_idle_data = _fetch_live_idle_data(engine["instance_id"])

    # Fast status display (default)
    if not detailed:
        running_state = engine["state"].lower()
        run_disp = _engine_state_markup(engine["state"])

        # Format idle display using the unified function
        idle_disp = " " + _format_idle_status_display(live_idle_data, running_state)

        # Build status lines - minimal info for fast view
        status_lines = [
            f"[blue]{engine['name']}[/blue] {run_disp}{idle_disp}",
        ]

        # Add activity sensors if we have live data
        if live_idle_data and live_idle_data.get("_reasons_raw"):
            status_lines.append("")  # blank line before sensors

            sensor_map = {
                "CoffeeLockSensor": ("☕", "Coffee"),
                "ActiveLoginSensor": ("🐚", "SSH"),
                "IDEConnectionSensor": ("🖥 ", "IDE"),
                "DockerWorkloadSensor": ("🐳", "Docker"),
            }

            for r in live_idle_data.get("_reasons_raw", []):
                sensor = r.get("sensor", "Unknown")
                active = r.get("active", False)
                # Sensors without a known icon fall back to "?" and their raw name.
                icon, label = sensor_map.get(sensor, ("?", sensor))
                status_str = "[green]YES[/green]" if active else "[dim]nope[/dim]"
                status_lines.append(f" {icon} {label:6} {status_str}")

        # Display in a nice panel
        console.print(
            Panel("\n".join(status_lines), title="Engine Status", border_style="blue")
        )
        # Tell the user why --show-log did nothing. This check used to sit
        # after the early return below and therefore could never run.
        if show_log:
            console.print("[yellow]Note: --show-log requires --detailed flag[/yellow]")
        return  # Exit early for fast status

    # Get detailed engine status including idle detector info (for --detailed mode)
    response = make_api_request("GET", f"/engines/{engine['instance_id']}")
    if response.status_code != 200:
        console.print("[red]❌ Failed to fetch engine details[/red]")
        raise typer.Exit(1)

    engine_details = response.json()
    engine = engine_details.get("engine", engine)  # Use detailed info if available
    idle_detector = engine_details.get("idle_detector", {}) or {}
    attached_studios = engine_details.get("attached_studios", [])

    # Overlay stale API data with fresh data from the engine
    if live_idle_data:
        # If API didn't indicate availability, replace entirely; otherwise, update.
        if not idle_detector.get("available"):
            idle_detector = live_idle_data
        else:
            idle_detector.update(live_idle_data)
    elif not idle_detector.get("available"):
        # SSM failed - mark as unavailable if we don't have good data from API
        idle_detector = {"available": False}

    instance_id = engine["instance_id"]
    running_state = engine["state"].lower()
    is_running = running_state == "running"

    # Calculate costs (total cost intentionally not shown in status view)
    launch_time = parse_launch_time(engine["launch_time"])
    uptime = datetime.now(timezone.utc) - launch_time
    hourly_cost = HOURLY_COSTS.get(engine["engine_type"], 0)

    stages_map = _fetch_init_stages([instance_id])
    stage_val = stages_map.get(instance_id, "-")

    # Try to fetch actual boot time via SSM (best-effort: any failure simply
    # falls back to the launch time reported by the API)
    boot_time_str: Optional[str] = None
    if is_running:
        try:
            boot_out = _ssm_shell_output(
                instance_id,
                "uptime -s || who -b | awk '{print $3\" \"$4}'",
                wait_seconds=1,
            )
            boot_lines = boot_out.strip().splitlines() if boot_out else []
            boot_time_str = boot_lines[0] if boot_lines else None
        except Exception:
            boot_time_str = None

    started_line = (
        f"[bold]Started:[/bold] {boot_time_str} ({format_duration(uptime)} ago)"
        if boot_time_str
        else f"[bold]Started:[/bold] {launch_time.strftime('%Y-%m-%d %H:%M:%S')} ({format_duration(uptime)} ago)"
    )

    # ---------------- Front-loaded summary ----------------
    run_disp = _engine_state_markup(engine["state"])

    # Recompute header display with latest data
    active_disp = _format_idle_status_display(idle_detector, running_state)

    top_lines = [
        f"[blue]{engine['name']}[/blue] {run_disp} {active_disp}\n",
    ]

    # Studios summary next, with studio name in purple/magenta
    if attached_studios:
        stu_texts = [
            f"[magenta]{s.get('user', 'studio')}[/magenta] ({s.get('studio_id', 'unknown')})"
            for s in attached_studios
        ]
        top_lines.append("Studios: " + ", ".join(stu_texts))

    # Paragraph break
    top_lines.append("")

    # ---------------- Details block (white/default) ----------------
    status_lines = [
        f"Name: {engine['name']}",
        f"Instance: {engine['instance_id']}",
        f"Type: {engine['engine_type']} ({engine['instance_type']})",
        f"Status: {engine['state']}",
        f"User: {engine['user']}",
        f"IP: {engine.get('public_ip', 'N/A')}",
        started_line,
        f"$/hour: ${hourly_cost:.2f}",
    ]

    # Disk usage (like list --detailed)
    if is_running:
        disk_usage = get_disk_usage_via_ssm(instance_id) or "-"
        status_lines.append(f"Disk: {disk_usage}")

    # Idle timeout (show even when not idle) - but only if we have data
    if idle_detector.get("available"):
        idle_threshold_secs: Optional[int] = None
        # Prefer value from idle detector overlay if present
        overlay_threshold = idle_detector.get("idle_threshold")
        if isinstance(overlay_threshold, (int, float)):
            try:
                idle_threshold_secs = int(overlay_threshold)
            except (OverflowError, ValueError):
                # inf / nan cannot be converted; treat as unknown
                idle_threshold_secs = None

        if idle_threshold_secs is None and is_running:
            # Fallback: read /etc/engine.env via SSM (best-effort)
            try:
                env_out = _ssm_shell_output(
                    instance_id,
                    "grep -E '^IDLE_TIMEOUT_SECONDS=' /etc/engine.env | cut -d'=' -f2 || echo '?'",
                    wait_seconds=1,
                    execution_timeout=5,
                )
                env_value = (env_out or "").strip()
                # isdigit() rejects both the empty string and the "?" marker
                if env_value.isdigit():
                    idle_threshold_secs = int(env_value)
            except Exception:
                idle_threshold_secs = None

        if idle_threshold_secs is not None:
            status_lines.append(
                f"Idle timeout: {idle_threshold_secs//60}m ({idle_threshold_secs}s)"
            )
        else:
            status_lines.append("Idle timeout: unknown")
    else:
        # No idle detector data available
        status_lines.append("Idle timeout: N/A")

    # Health report (only if bootstrap finished)
    if stage_val == "finished":
        try:
            health_out = _ssm_shell_output(
                instance_id,
                "cat /opt/dayhoff/state/engine-health.json 2>/dev/null || cat /var/run/engine-health.json 2>/dev/null || true",
                wait_seconds=1,
                execution_timeout=10,
            )
            if health_out is not None:
                health = json.loads(health_out.strip() or "{}")
                idle_stat = health.get("idle_detector_service") or health.get(
                    "idle_detector_timer", "unknown"
                )
                # Build the section first so a failure above never leaves a
                # dangling "Health:" header in the panel.
                health_lines = [
                    "",
                    "[bold]Health:[/bold]",
                    f" • GPU Drivers: {'OK' if health.get('drivers_ok') else 'MISSING'}",
                    f" • Idle Detector: {idle_stat}",
                ]
                status_lines.extend(health_lines)
        except Exception:
            # Best-effort section: omit it entirely when anything goes wrong.
            pass

    # Activity Sensors (show all with YES/no)
    if idle_detector.get("available"):
        status_lines.append("")
        status_lines.append("[bold]Activity Sensors:[/bold]")
        reasons_raw = idle_detector.get("_reasons_raw", []) or []
        # Index the reported reasons by sensor name; later entries win.
        by_sensor: Dict[str, Dict[str, Any]] = {}
        for r in reasons_raw:
            nm = r.get("sensor")
            if nm:
                by_sensor[nm] = r

        sensor_rows = (
            ("Coffee", "CoffeeLockSensor", "☕"),
            ("Shell ", "ActiveLoginSensor", "🐚"),
            (" IDE ", "IDEConnectionSensor", "🖥"),
            ("Docker", "DockerWorkloadSensor", "🐳"),
        )
        for label, key, emoji in sensor_rows:
            # A sensor missing from the report is rendered as inactive.
            r = by_sensor.get(key, {})
            active = bool(r.get("active"))
            reason_txt = r.get("reason") or ("" if not active else "active")
            flag = "[green]YES[/green]" if active else "[dim]nope[/dim]"
            status_lines.append(
                f" {emoji} {label}: {flag} {('- ' + reason_txt) if reason_txt else ''}"
            )

    # Combine top summary and details
    all_lines = top_lines + status_lines
    console.print(
        Panel("\n".join(all_lines), title="Engine Status", border_style="blue")
    )

    if show_log:
        # Local import: rich is already a dependency of this module.
        from rich.markup import escape

        console.print("\n[bold]Bootstrap Log:[/bold]")
        try:
            log_out = _ssm_shell_output(
                instance_id,
                "cat /var/log/engine-setup.log 2>/dev/null || echo 'No setup log found'",
                wait_seconds=2,
                execution_timeout=15,
            )
            if log_out is None:
                console.print("[red]❌ Could not retrieve bootstrap log[/red]")
            else:
                log_content = log_out.strip()
                if log_content:
                    # Escape the log so bracketed text such as "[INFO]" is
                    # printed literally instead of being parsed as markup.
                    console.print(f"[dim]{escape(log_content)}[/dim]")
                else:
                    console.print("[yellow]No bootstrap log available[/yellow]")
        except Exception as e:
            console.print(f"[red]❌ Error fetching log: {e}[/red]")
543
+
544
+
545
+ def _format_idle_status_display(
546
+ idle_info: Optional[Dict[str, Any]], running_state: str
547
+ ) -> str:
548
+ """Computes the rich string for active/idle status display."""
549
+ # If we don't have idle info or it's explicitly unavailable, show N/A
550
+ if not idle_info or idle_info.get("available") is False:
551
+ return "[dim]N/A[/dim]"
552
+
553
+ if idle_info.get("status") == "active":
554
+ return "[green]Active[/green]"
555
+ if running_state in ("stopped", "stopping"):
556
+ return "[dim]N/A[/dim]"
557
+
558
+ # If idle, show time/threshold with time remaining if available
559
+ if idle_info.get("status") == "idle":
560
+ idle_seconds_v = idle_info.get("idle_seconds")
561
+ thresh_v = idle_info.get("idle_threshold")
562
+ if isinstance(idle_seconds_v, (int, float)) and isinstance(
563
+ thresh_v, (int, float)
564
+ ):
565
+ remaining = max(0, int(thresh_v) - int(idle_seconds_v))
566
+ remaining_mins = remaining // 60
567
+ if remaining_mins == 0:
568
+ return f"[yellow]Idle {int(idle_seconds_v)//60}m/{int(thresh_v)//60}m: [red]<1m[/red] left[/yellow]"
569
+ else:
570
+ return f"[yellow]Idle {int(idle_seconds_v)//60}m/{int(thresh_v)//60}m: [red]{remaining_mins}m[/red] left[/yellow]"
571
+ elif isinstance(thresh_v, (int, float)):
572
+ return f"[yellow]Idle ?/{int(thresh_v)//60}m[/yellow]"
573
+ else:
574
+ return "[yellow]Idle ?/?[/yellow]"
575
+
576
+ # Default to N/A if we can't determine status
577
+ return "[dim]N/A[/dim]"
578
+
579
+
580
+ def _fetch_live_idle_data(instance_id: str) -> Optional[Dict]:
581
+ """
582
+ Fetch and parse the live idle detector state from an engine via SSM.
583
+
584
+ This is the single source of truth for on-engine idle status. It fetches
585
+ the `last_state.json` file, parses it, and transforms it into the schema
586
+ used by the CLI for display logic.
587
+ """
588
+ try:
589
+ ssm = boto3.client("ssm", region_name="us-east-1")
590
+ res = ssm.send_command(
591
+ InstanceIds=[instance_id],
592
+ DocumentName="AWS-RunShellScript",
593
+ Parameters={
594
+ "commands": [
595
+ "cat /var/run/idle-detector/last_state.json 2>/dev/null || true",
596
+ ],
597
+ "executionTimeout": ["5"],
598
+ },
599
+ )
600
+ cid = res["Command"]["CommandId"]
601
+ # Wait up to 3 seconds for SSM command to complete
602
+ for _ in range(6): # 6 * 0.5 = 3 seconds
603
+ time.sleep(0.5)
604
+ inv = ssm.get_command_invocation(CommandId=cid, InstanceId=instance_id)
605
+ if inv["Status"] in ["Success", "Failed"]:
606
+ break
607
+ if inv["Status"] != "Success":
608
+ return None
609
+ content = inv["StandardOutputContent"].strip()
610
+ if not content:
611
+ return None
612
+ data = json.loads(content)
613
+ # Convert last_state schema (new or old) to idle_detector schema used by CLI output
614
+ idle_info: Dict[str, Any] = {"available": True}
615
+
616
+ # Active/idle
617
+ idle_flag = bool(data.get("idle", False))
618
+ idle_info["status"] = "idle" if idle_flag else "active"
619
+
620
+ # Threshold and elapsed
621
+ if isinstance(data.get("timeout_sec"), (int, float)):
622
+ idle_info["idle_threshold"] = int(data["timeout_sec"]) # seconds
623
+ if isinstance(data.get("idle_seconds"), (int, float)):
624
+ idle_info["idle_seconds"] = int(data["idle_seconds"])
625
+
626
+ # Keep raw reasons for sensor display when available (new schema)
627
+ if isinstance(data.get("reasons"), list):
628
+ idle_info["_reasons_raw"] = data["reasons"]
629
+ else:
630
+ # Fallback: synthesize reasons from the old forensics layout
631
+ f_all = data.get("forensics", {}) or {}
632
+ synthesized = []
633
+
634
+ def _mk(sensor_name: str, key: str):
635
+ entry = f_all.get(key, {}) or {}
636
+ synthesized.append(
637
+ {
638
+ "sensor": sensor_name,
639
+ "active": bool(entry.get("active", False)),
640
+ "reason": entry.get("reason", ""),
641
+ "forensic": entry.get("forensic", {}),
642
+ }
643
+ )
644
+
645
+ _mk("CoffeeLockSensor", "coffee")
646
+ _mk("ActiveLoginSensor", "ssh")
647
+ _mk("IDEConnectionSensor", "ide")
648
+ _mk("DockerWorkloadSensor", "docker")
649
+ idle_info["_reasons_raw"] = synthesized
650
+
651
+ return idle_info
652
+ except Exception:
653
+ return None