loki-mode 5.49.0 → 5.49.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/autonomy/run.sh CHANGED
@@ -667,6 +667,146 @@ log_error() { echo -e "${RED}[ERROR]${NC} $*"; }
667
667
  log_step() { echo -e "${CYAN}[STEP]${NC} $*"; }
668
668
  log_debug() { [[ "${LOKI_DEBUG:-}" == "true" ]] && echo -e "${CYAN}[DEBUG]${NC} $*" || true; }
669
669
 
670
+ #===============================================================================
671
+ # Process Registry (PID Supervisor)
672
+ # Central registry of all spawned child processes for reliable cleanup
673
+ #===============================================================================
674
+
675
+ PID_REGISTRY_DIR=""
676
+
677
# Point the PID registry at <target>/.loki/pids and make sure it exists.
# Globals:  TARGET_DIR (read; "." is used when unset or empty)
#           PID_REGISTRY_DIR (written)
# Returns:  exit status of mkdir
init_pid_registry() {
    local base_dir="${TARGET_DIR:-.}"
    PID_REGISTRY_DIR="${base_dir}/.loki/pids"
    mkdir -p "${PID_REGISTRY_DIR}"
}
682
+
683
# Read one top-level field from a flat JSON registry entry.
# Usage:   _parse_json_field <file> <field>
# Outputs: the field's value on stdout (an empty line when the field is absent)
# Returns: non-zero when the file cannot be read/parsed
#
# python3 is used when available. Otherwise a pure-bash fallback handles
# single-object JSON with string or bare (number/true/false/null) values.
# The fallback returns string values as written in the file, i.e. JSON escape
# sequences such as \" are NOT decoded.
_parse_json_field() {
    local file="$1" field="$2"
    if command -v python3 >/dev/null 2>&1; then
        python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get(sys.argv[2],''))" "$file" "$field" 2>/dev/null
    else
        local content rest tail value=""
        local key="\"${field}\""
        # Regexes live in variables so bash does not re-interpret the quoting.
        local str_re='^"(([^"\\]|\\.)*)"'      # "...": stops at first unescaped quote
        local bare_re='^([^],}[:space:]]+)'    # number / true / false / null

        content=$(cat -- "$file" 2>/dev/null) || return 1

        # Walk every occurrence of "field" until one is followed by a colon,
        # so a string *value* that happens to equal the key is skipped.
        rest="$content"
        while [[ "$rest" == *"$key"* ]]; do
            rest="${rest#*"$key"}"
            tail="${rest#"${rest%%[![:space:]]*}"}"      # trim leading whitespace
            [[ "$tail" == :* ]] || continue
            tail="${tail#:}"
            tail="${tail#"${tail%%[![:space:]]*}"}"
            if [[ "$tail" =~ $str_re ]]; then
                value="${BASH_REMATCH[1]}"
            elif [[ "$tail" =~ $bare_re ]]; then
                value="${BASH_REMATCH[1]}"
            fi
            break
        done
        printf '%s\n' "$value"
    fi
}
694
+
695
# Register a spawned process in the central registry.
# Usage:   register_pid <pid> <label> [<extra_info>]
# Example: register_pid $! "dashboard" "port=57374"
# Globals: PID_REGISTRY_DIR (read; initialised via init_pid_registry if empty)
#
# Writes $PID_REGISTRY_DIR/<pid>.json holding the pid, label, UTC start time,
# this shell's PID ($$) as "ppid", and the optional extra string.
# Registration is best-effort: an invalid PID is reported on stderr and
# ignored (status 0) so a bad argument cannot abort the calling script.
register_pid() {
    local pid="${1:-}"
    # The PID becomes both a filename and a bare JSON number, so it must be
    # purely numeric (this also blocks path traversal such as "../x").
    case "$pid" in
        ''|*[!0-9]*)
            printf 'register_pid: ignoring invalid PID: %s\n' "$pid" >&2
            return 0
            ;;
    esac
    pid=$((10#$pid))    # drop leading zeros; "007" is not a valid JSON number

    # Sanitize label and extra for JSON safety: escape backslash first, then
    # double-quote, then drop every control character (JSON forbids raw
    # U+0000-U+001F inside strings, not just CR/LF).
    local label="${2:-}" extra="${3:-}"
    label="${label//\\/\\\\}"
    label="${label//\"/\\\"}"
    label="$(printf '%s' "$label" | tr -d '\000-\037')"
    extra="${extra//\\/\\\\}"
    extra="${extra//\"/\\\"}"
    extra="$(printf '%s' "$extra" | tr -d '\000-\037')"

    if [ -z "${PID_REGISTRY_DIR:-}" ]; then
        init_pid_registry
    fi

    local started entry_file
    started="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
    entry_file="$PID_REGISTRY_DIR/${pid}.json"
    printf '{"pid":%s,"label":"%s","started":"%s","ppid":%s,"extra":"%s"}\n' \
        "$pid" "$label" "$started" "$$" "$extra" > "$entry_file"
}
714
+
715
# Unregister a process from the registry (called on clean shutdown).
# Usage:   unregister_pid <pid>
# Globals: PID_REGISTRY_DIR (read; initialised via init_pid_registry if empty)
# Returns: 0 for an empty/non-numeric PID (nothing is removed), otherwise the
#          exit status of rm -f
unregister_pid() {
    local pid="${1:-}"
    # Some callers pass a PID read back from a pid file; only a purely numeric
    # value may be turned into a path, otherwise "../x" would delete a file
    # outside the registry directory.
    case "$pid" in
        ''|*[!0-9]*) return 0 ;;
    esac
    if [ -z "${PID_REGISTRY_DIR:-}" ]; then
        init_pid_registry
    fi
    rm -f -- "$PID_REGISTRY_DIR/${pid}.json" 2>/dev/null
}
722
+
723
# Kill a registered process with SIGTERM -> wait -> SIGKILL escalation, then
# drop its registry entry.
# Usage: kill_registered_pid <pid>
#
# Only PIDs greater than 1 are ever signalled: "kill 0" would hit the caller's
# whole process group and PID 1 is init. Such entries are still unregistered
# so a bogus registry file cannot linger. Empty or non-numeric input is
# ignored entirely.
kill_registered_pid() {
    local pid="${1:-}"
    case "$pid" in
        ''|*[!0-9]*) return 0 ;;
    esac

    if (( 10#$pid > 1 )) && kill -0 "$pid" 2>/dev/null; then
        kill "$pid" 2>/dev/null || true
        # Wait up to 2 seconds for graceful exit, polling every 0.1s so we
        # return promptly once the process is gone.
        local polls=0
        while (( polls < 20 )) && kill -0 "$pid" 2>/dev/null; do
            sleep 0.1
            polls=$((polls + 1))
        done
        # Escalate to SIGKILL if still alive
        if kill -0 "$pid" 2>/dev/null; then
            kill -9 "$pid" 2>/dev/null || true
        fi
    fi
    unregister_pid "$pid"
}
742
+
743
# Scan the registry for orphaned processes and kill them.
# Called on startup and by `loki cleanup`.
# Globals: PID_REGISTRY_DIR (read; initialised via init_pid_registry if empty)
# Outputs: the number of orphans killed, on stdout (warnings go to stderr)
#
# For every <pid>.json entry:
#   - process dead                      -> remove the stale entry
#   - process alive, recorded ppid dead -> orphan: kill it and count it
#   - process alive, ppid alive / ours / unparsable -> leave untouched
# The entry for this shell itself ($$) is always skipped: a session launched
# in the background is registered by a launcher that exits immediately, so
# without this guard the session would classify itself as an orphan and
# terminate itself during its own startup scan.
cleanup_orphan_pids() {
    if [ -z "${PID_REGISTRY_DIR:-}" ]; then
        init_pid_registry
    fi
    local orphan_count=0
    local entry_file pid ppid_val label

    if [ ! -d "$PID_REGISTRY_DIR" ]; then
        echo "0"
        return 0
    fi

    for entry_file in "$PID_REGISTRY_DIR"/*.json; do
        [ -f "$entry_file" ] || continue
        pid="${entry_file##*/}"
        pid="${pid%.json}"

        # Skip non-numeric filenames
        case "$pid" in
            ''|*[!0-9]*) continue ;;
        esac

        # Never treat the running session as an orphan
        if [ "$pid" = "$$" ]; then
            continue
        fi

        if ! kill -0 "$pid" 2>/dev/null; then
            # Process is dead -- clean up stale registry entry
            rm -f "$entry_file" 2>/dev/null
            continue
        fi

        # Process is alive -- check if its parent session is dead
        ppid_val=""
        ppid_val=$(_parse_json_field "$entry_file" "ppid") || true

        # Validate ppid_val is numeric before using with kill
        case "$ppid_val" in
            ''|*[!0-9]*) continue ;;
        esac
        if [ "$ppid_val" = "$$" ]; then
            continue
        fi
        if kill -0 "$ppid_val" 2>/dev/null; then
            continue
        fi

        # Parent is dead -- this is an orphan
        label=""
        label=$(_parse_json_field "$entry_file" "label") || label="unknown"
        log_warn "Killing orphaned process: PID=$pid label=$label (parent $ppid_val is dead)" >&2
        kill_registered_pid "$pid"
        orphan_count=$((orphan_count + 1))
    done

    echo "$orphan_count"
}
790
+
791
# Kill ALL registered processes (used during full shutdown).
# Globals: PID_REGISTRY_DIR (read; initialised via init_pid_registry if empty)
# Returns: 0
#
# Every numeric <pid>.json entry is handed to kill_registered_pid, except the
# entry for this shell itself ($$): signalling ourselves in the middle of
# shutdown would interrupt the remaining cleanup steps, so that entry is only
# removed from the registry.
kill_all_registered() {
    if [ -z "${PID_REGISTRY_DIR:-}" ]; then
        init_pid_registry
    fi

    if [ ! -d "$PID_REGISTRY_DIR" ]; then
        return 0
    fi

    local entry_file pid
    for entry_file in "$PID_REGISTRY_DIR"/*.json; do
        [ -f "$entry_file" ] || continue
        pid="${entry_file##*/}"
        pid="${pid%.json}"
        # Skip non-numeric filenames
        case "$pid" in
            ''|*[!0-9]*) continue ;;
        esac
        if [ "$pid" = "$$" ]; then
            rm -f -- "$entry_file" 2>/dev/null
            continue
        fi
        kill_registered_pid "$pid"
    done
    return 0
}
809
+
670
810
  #===============================================================================
671
811
  # Event Emission (Dashboard Integration)
672
812
  # Writes events to .loki/events.jsonl for dashboard consumption
@@ -1688,6 +1828,7 @@ create_worktree() {
1688
1828
  ) &
1689
1829
  # Capture install PID for cleanup on exit
1690
1830
  WORKTREE_INSTALL_PIDS+=($!)
1831
+ register_pid "$!" "worktree-install" "stream=$stream_name"
1691
1832
 
1692
1833
  log_info "Created worktree: $worktree_path"
1693
1834
  return 0
@@ -1796,6 +1937,7 @@ spawn_worktree_session() {
1796
1937
 
1797
1938
  local pid=$!
1798
1939
  WORKTREE_PIDS[$stream_name]=$pid
1940
+ register_pid "$pid" "worktree-session" "stream=$stream_name"
1799
1941
 
1800
1942
  log_info "Session spawned: $stream_name (PID: $pid)"
1801
1943
  return 0
@@ -2002,6 +2144,7 @@ cleanup_parallel_streams() {
2002
2144
  if kill -0 "$pid" 2>/dev/null; then
2003
2145
  kill "$pid" 2>/dev/null || true
2004
2146
  fi
2147
+ unregister_pid "$pid"
2005
2148
  done
2006
2149
  WORKTREE_INSTALL_PIDS=()
2007
2150
 
@@ -2012,6 +2155,7 @@ cleanup_parallel_streams() {
2012
2155
  log_step "Stopping session: $stream"
2013
2156
  kill "$pid" 2>/dev/null || true
2014
2157
  fi
2158
+ unregister_pid "$pid"
2015
2159
  done
2016
2160
 
2017
2161
  # Wait for all to finish
@@ -2620,8 +2764,8 @@ write_dashboard_state() {
2620
2764
  # Get complexity tier
2621
2765
  local complexity="${DETECTED_COMPLEXITY:-standard}"
2622
2766
 
2623
- # Get RARV cycle step (approximate based on iteration)
2624
- local rarv_step=$((ITERATION_COUNT % 4))
2767
+ # Get RARV cycle step from actual phase tracking (falls back to iteration-based)
2768
+ local rarv_step=${RARV_CURRENT_STEP:-$((ITERATION_COUNT % 4))}
2625
2769
  local rarv_stages='["reason", "act", "reflect", "verify"]'
2626
2770
 
2627
2771
  # Get memory system stats (if available)
@@ -2634,9 +2778,9 @@ write_dashboard_state() {
2634
2778
  [ -d ".loki/memory/skills" ] && procedural_count=$(find ".loki/memory/skills" -type f -name "*.json" 2>/dev/null | wc -l | tr -d ' ')
2635
2779
 
2636
2780
  # Get quality gates status (if available)
2637
- local quality_gates='{"staticAnalysis":"pending","codeReview":"pending","antiSycophancy":"pending","testCoverage":"pending","securityScan":"pending","performance":"pending"}'
2781
+ local quality_gates='null'
2638
2782
  if [ -f ".loki/state/quality-gates.json" ]; then
2639
- quality_gates=$(cat ".loki/state/quality-gates.json" 2>/dev/null || echo "$quality_gates")
2783
+ quality_gates=$(cat ".loki/state/quality-gates.json" 2>/dev/null || echo 'null')
2640
2784
  fi
2641
2785
 
2642
2786
  # Get Completion Council state (v5.25.0)
@@ -3037,6 +3181,7 @@ start_status_monitor() {
3037
3181
  done
3038
3182
  ) &
3039
3183
  STATUS_MONITOR_PID=$!
3184
+ register_pid "$STATUS_MONITOR_PID" "status-monitor"
3040
3185
 
3041
3186
  log_info "Status monitor started"
3042
3187
  log_info "Monitor progress: ${CYAN}watch -n 2 cat .loki/STATUS.txt${NC}"
@@ -3046,6 +3191,7 @@ stop_status_monitor() {
3046
3191
  if [ -n "$STATUS_MONITOR_PID" ]; then
3047
3192
  kill "$STATUS_MONITOR_PID" 2>/dev/null || true
3048
3193
  wait "$STATUS_MONITOR_PID" 2>/dev/null || true
3194
+ unregister_pid "$STATUS_MONITOR_PID"
3049
3195
  fi
3050
3196
  stop_resource_monitor
3051
3197
  }
@@ -3621,6 +3767,7 @@ start_resource_monitor() {
3621
3767
  done
3622
3768
  ) &
3623
3769
  RESOURCE_MONITOR_PID=$!
3770
+ register_pid "$RESOURCE_MONITOR_PID" "resource-monitor"
3624
3771
 
3625
3772
  log_info "Resource monitor started (CPU threshold: ${RESOURCE_CPU_THRESHOLD}%, Memory threshold: ${RESOURCE_MEM_THRESHOLD}%)"
3626
3773
  log_info "Check status: ${CYAN}cat .loki/state/resources.json${NC}"
@@ -3630,6 +3777,7 @@ stop_resource_monitor() {
3630
3777
  if [ -n "$RESOURCE_MONITOR_PID" ]; then
3631
3778
  kill "$RESOURCE_MONITOR_PID" 2>/dev/null || true
3632
3779
  wait "$RESOURCE_MONITOR_PID" 2>/dev/null || true
3780
+ unregister_pid "$RESOURCE_MONITOR_PID"
3633
3781
  fi
3634
3782
  }
3635
3783
 
@@ -4720,12 +4868,14 @@ BUILD_PROMPT
4720
4868
  esac
4721
4869
  ) &
4722
4870
  pids+=($!)
4871
+ register_pid "$!" "code-reviewer" "name=$reviewer_name"
4723
4872
  done
4724
4873
 
4725
4874
  # Wait for all reviewers to complete
4726
4875
  log_info "Waiting for $reviewer_count reviewers to complete (blind review)..."
4727
4876
  for pid in "${pids[@]}"; do
4728
4877
  wait "$pid" || true
4878
+ unregister_pid "$pid"
4729
4879
  done
4730
4880
 
4731
4881
  log_info "All reviewers complete. Aggregating verdicts..."
@@ -5191,6 +5341,7 @@ start_dashboard() {
5191
5341
  LOKI_TLS_CERT="${LOKI_TLS_CERT:-}" LOKI_TLS_KEY="${LOKI_TLS_KEY:-}" \
5192
5342
  LOKI_SKILL_DIR="${skill_dir}" PYTHONPATH="${skill_dir}" nohup "$python_cmd" -m dashboard.server > "$log_file" 2>&1 &
5193
5343
  DASHBOARD_PID=$!
5344
+ register_pid "$DASHBOARD_PID" "dashboard" "port=${DASHBOARD_PORT:-57374}"
5194
5345
 
5195
5346
  # Save PID for later cleanup
5196
5347
  mkdir -p .loki/dashboard
@@ -5224,6 +5375,7 @@ stop_dashboard() {
5224
5375
  if [ -n "$DASHBOARD_PID" ]; then
5225
5376
  kill "$DASHBOARD_PID" 2>/dev/null || true
5226
5377
  wait "$DASHBOARD_PID" 2>/dev/null || true
5378
+ unregister_pid "$DASHBOARD_PID"
5227
5379
  fi
5228
5380
 
5229
5381
  # Also try PID file
@@ -5231,6 +5383,7 @@ stop_dashboard() {
5231
5383
  local saved_pid=$(cat ".loki/dashboard/dashboard.pid" 2>/dev/null)
5232
5384
  if [ -n "$saved_pid" ]; then
5233
5385
  kill "$saved_pid" 2>/dev/null || true
5386
+ unregister_pid "$saved_pid"
5234
5387
  fi
5235
5388
  rm -f ".loki/dashboard/dashboard.pid"
5236
5389
  fi
@@ -7121,6 +7274,7 @@ cleanup() {
7121
7274
  fi
7122
7275
  stop_dashboard
7123
7276
  stop_status_monitor
7277
+ kill_all_registered
7124
7278
  rm -f "$loki_dir/loki.pid" 2>/dev/null
7125
7279
  if [ -f "$loki_dir/session.json" ]; then
7126
7280
  _LOKI_SESSION_FILE="$loki_dir/session.json" python3 -c "
@@ -7148,6 +7302,7 @@ except (json.JSONDecodeError, OSError): pass
7148
7302
  fi
7149
7303
  stop_dashboard
7150
7304
  stop_status_monitor
7305
+ kill_all_registered
7151
7306
  rm -f .loki/loki.pid .loki/PAUSE 2>/dev/null
7152
7307
  # Mark session.json as stopped
7153
7308
  if [ -f ".loki/session.json" ]; then
@@ -7335,6 +7490,7 @@ main() {
7335
7490
  LOKI_RUNNING_FROM_TEMP='' nohup "$original_script" "${cmd_args[@]}" > "$log_file" 2>&1 &
7336
7491
  local bg_pid=$!
7337
7492
  echo "$bg_pid" > "$pid_file"
7493
+ register_pid "$bg_pid" "background-session" "log=$log_file"
7338
7494
 
7339
7495
  echo ""
7340
7496
  echo -e "${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
@@ -7457,6 +7613,14 @@ main() {
7457
7613
  # Write PID file for ALL modes (foreground + background)
7458
7614
  echo "$$" > "$pid_file"
7459
7615
 
7616
+ # Initialize PID registry and clean up orphans from previous sessions
7617
+ init_pid_registry
7618
+ local orphan_count
7619
+ orphan_count=$(cleanup_orphan_pids)
7620
+ if [ "$orphan_count" -gt 0 ]; then
7621
+ log_warn "Killed $orphan_count orphaned process(es) from previous session"
7622
+ fi
7623
+
7460
7624
  # Copy skill files to .loki/skills/ - makes CLI self-contained
7461
7625
  # No need to install Claude Code skill separately
7462
7626
  copy_skill_files
@@ -7536,10 +7700,12 @@ main() {
7536
7700
  run_autonomous "$PRD_PATH"
7537
7701
  ) &
7538
7702
  local main_pid=$!
7703
+ register_pid "$main_pid" "parallel-main" ""
7539
7704
 
7540
7705
  # Run parallel orchestrator
7541
7706
  run_parallel_orchestrator &
7542
7707
  local orchestrator_pid=$!
7708
+ register_pid "$orchestrator_pid" "parallel-orchestrator" ""
7543
7709
 
7544
7710
  # Wait for main session (orchestrator continues watching)
7545
7711
  wait $main_pid || result=$?
@@ -7,7 +7,7 @@ Modules:
7
7
  control: Session control API (start/stop/pause/resume)
8
8
  """
9
9
 
10
- __version__ = "5.49.0"
10
+ __version__ = "5.49.2"
11
11
 
12
12
  # Expose the control app for easy import
13
13
  try:
@@ -404,7 +404,20 @@ async def get_status() -> StatusResponse:
404
404
  iteration = state.get("iteration", 0)
405
405
  complexity = state.get("complexity", "standard")
406
406
  mode = state.get("mode", "")
407
- running_agents = len(state.get("agents", []))
407
+ # Count only agents with alive PIDs (not raw array length)
408
+ agents_list = state.get("agents", [])
409
+ running_agents = 0
410
+ for agent in agents_list:
411
+ agent_pid = agent.get("pid")
412
+ if agent_pid:
413
+ try:
414
+ os.kill(int(agent_pid), 0)
415
+ running_agents += 1
416
+ except (OSError, ValueError, TypeError):
417
+ pass
418
+ else:
419
+ # No PID field -- count as running (legacy data)
420
+ running_agents += 1
408
421
 
409
422
  tasks = state.get("tasks", {})
410
423
  pending_tasks = len(tasks.get("pending", []))
@@ -3138,23 +3151,112 @@ async def get_github_sync_log(
3138
3151
  # =============================================================================
3139
3152
 
3140
3153
 
3154
+ def _resolve_process_state(pid: Optional[int], last_status: str = "",
3155
+ started: str = "", heartbeat: str = "",
3156
+ stale_threshold: int = 30) -> dict[str, Any]:
3157
+ """Resolve process state with honest labels.
3158
+
3159
+ States:
3160
+ RUNNING - PID alive AND heartbeat < stale_threshold seconds
3161
+ STALE - PID alive BUT no heartbeat update in > stale_threshold seconds
3162
+ COMPLETED - last_status marked done/completed and PID exited
3163
+ FAILED - last_status marked failed OR PID exited non-zero
3164
+ CRASHED - PID dead BUT last_status was 'running'
3165
+ UNKNOWN - No PID, no status, or conflicting data
3166
+
3167
+ Returns dict with: state, pid_alive, started, last_heartbeat, duration_seconds
3168
+ """
3169
+ now = datetime.now(timezone.utc)
3170
+ pid_alive = False
3171
+ if pid is not None:
3172
+ try:
3173
+ os.kill(pid, 0)
3174
+ pid_alive = True
3175
+ except (OSError, ValueError, TypeError):
3176
+ pass
3177
+
3178
+ # Parse timestamps
3179
+ started_dt = None
3180
+ heartbeat_dt = None
3181
+ if started:
3182
+ try:
3183
+ started_dt = datetime.fromisoformat(started.replace("Z", "+00:00"))
3184
+ if started_dt.tzinfo is None:
3185
+ started_dt = started_dt.replace(tzinfo=timezone.utc)
3186
+ except (ValueError, AttributeError):
3187
+ pass
3188
+ if heartbeat:
3189
+ try:
3190
+ heartbeat_dt = datetime.fromisoformat(heartbeat.replace("Z", "+00:00"))
3191
+ if heartbeat_dt.tzinfo is None:
3192
+ heartbeat_dt = heartbeat_dt.replace(tzinfo=timezone.utc)
3193
+ except (ValueError, AttributeError):
3194
+ pass
3195
+
3196
+ # Calculate duration
3197
+ duration_seconds = None
3198
+ if started_dt:
3199
+ duration_seconds = round((now - started_dt).total_seconds())
3200
+
3201
+ # Calculate heartbeat age
3202
+ heartbeat_age = None
3203
+ if heartbeat_dt:
3204
+ heartbeat_age = round((now - heartbeat_dt).total_seconds())
3205
+
3206
+ # Resolve state
3207
+ normalized = last_status.lower().strip() if last_status else ""
3208
+ if pid_alive:
3209
+ if heartbeat_age is not None and heartbeat_age > stale_threshold:
3210
+ state = "STALE"
3211
+ else:
3212
+ state = "RUNNING"
3213
+ else:
3214
+ if normalized in ("done", "completed", "complete", "success"):
3215
+ state = "COMPLETED"
3216
+ elif normalized in ("failed", "error", "errored"):
3217
+ state = "FAILED"
3218
+ elif normalized in ("running", "active", "in_progress", "starting"):
3219
+ state = "CRASHED"
3220
+ elif pid is None:
3221
+ state = "UNKNOWN"
3222
+ else:
3223
+ # PID dead, unknown last status
3224
+ state = "CRASHED" if normalized == "" else "UNKNOWN"
3225
+
3226
+ result: dict[str, Any] = {
3227
+ "state": state,
3228
+ "pid_alive": pid_alive,
3229
+ }
3230
+ if started:
3231
+ result["started"] = started
3232
+ if heartbeat:
3233
+ result["last_heartbeat"] = heartbeat
3234
+ if heartbeat_age is not None:
3235
+ result["heartbeat_age_seconds"] = heartbeat_age
3236
+ if duration_seconds is not None:
3237
+ result["duration_seconds"] = duration_seconds
3238
+ return result
3239
+
3240
+
3141
3241
  @app.get("/api/health/processes")
3142
3242
  async def get_process_health(token: Optional[dict] = Depends(auth.get_current_token)):
3143
- """Get health status of all loki processes (dashboard, session, agents)."""
3243
+ """Get health status of all loki processes (dashboard, session, agents).
3244
+
3245
+ Returns honest state labels: RUNNING, STALE, COMPLETED, FAILED, CRASHED, UNKNOWN.
3246
+ Every entry includes timestamps (started, last_heartbeat, duration_seconds).
3247
+ """
3144
3248
  result: dict[str, Any] = {"dashboard": None, "session": None, "agents": []}
3145
3249
 
3146
3250
  loki_dir = _get_loki_dir()
3251
+ now_iso = datetime.now(timezone.utc).isoformat()
3147
3252
 
3148
3253
  # Dashboard PID
3149
3254
  dpid_file = loki_dir / "dashboard" / "dashboard.pid"
3150
3255
  if dpid_file.exists():
3151
3256
  try:
3152
3257
  dpid = int(dpid_file.read_text().strip())
3153
- try:
3154
- os.kill(dpid, 0)
3155
- result["dashboard"] = {"pid": dpid, "status": "alive"}
3156
- except OSError:
3157
- result["dashboard"] = {"pid": dpid, "status": "dead"}
3258
+ state_info = _resolve_process_state(dpid, last_status="running")
3259
+ result["dashboard"] = {"pid": dpid, **state_info}
3158
3260
  except (ValueError, OSError):
3159
3261
  pass
3160
3262
 
@@ -3163,14 +3265,23 @@ async def get_process_health(token: Optional[dict] = Depends(auth.get_current_to
3163
3265
  if spid_file.exists():
3164
3266
  try:
3165
3267
  spid = int(spid_file.read_text().strip())
3166
- try:
3167
- os.kill(spid, 0)
3168
- result["session"] = {"pid": spid, "status": "alive"}
3169
- except OSError:
3170
- result["session"] = {"pid": spid, "status": "dead"}
3268
+ state_info = _resolve_process_state(spid, last_status="running")
3269
+ result["session"] = {"pid": spid, **state_info}
3171
3270
  except (ValueError, OSError):
3172
3271
  pass
3173
3272
 
3273
+ # Read dashboard-state.json for heartbeat timestamp
3274
+ state_file = loki_dir / "dashboard-state.json"
3275
+ state_heartbeat = ""
3276
+ if state_file.exists():
3277
+ try:
3278
+ st = os.stat(state_file)
3279
+ state_heartbeat = datetime.fromtimestamp(
3280
+ st.st_mtime, tz=timezone.utc
3281
+ ).isoformat()
3282
+ except OSError:
3283
+ pass
3284
+
3174
3285
  # Agent PIDs
3175
3286
  agents_file = loki_dir / "state" / "agents.json"
3176
3287
  if agents_file.exists():
@@ -3178,24 +3289,65 @@ async def get_process_health(token: Optional[dict] = Depends(auth.get_current_to
3178
3289
  agents = json.loads(agents_file.read_text())
3179
3290
  for agent in agents:
3180
3291
  pid = agent.get("pid")
3181
- status = "unknown"
3182
- if pid:
3183
- try:
3184
- os.kill(int(pid), 0)
3185
- status = "alive"
3186
- except (OSError, ValueError):
3187
- status = "dead"
3292
+ pid_int = int(pid) if pid else None
3293
+ agent_status = agent.get("status", "")
3294
+ agent_started = agent.get("started", "")
3295
+ agent_heartbeat = agent.get("heartbeat", state_heartbeat)
3296
+ state_info = _resolve_process_state(
3297
+ pid_int,
3298
+ last_status=agent_status,
3299
+ started=agent_started,
3300
+ heartbeat=agent_heartbeat,
3301
+ )
3188
3302
  result["agents"].append({
3189
3303
  "id": agent.get("id", ""),
3190
3304
  "name": agent.get("name", ""),
3191
3305
  "pid": pid,
3192
- "status": status,
3306
+ **state_info,
3193
3307
  })
3194
3308
  except Exception:
3195
3309
  pass
3196
3310
 
3311
+ # PID registry (central process supervisor)
3312
+ pids_dir = loki_dir / "pids"
3313
+ registered: list[dict[str, Any]] = []
3314
+ if pids_dir.exists():
3315
+ for entry_file in sorted(pids_dir.glob("*.json")):
3316
+ try:
3317
+ pid_str = entry_file.stem
3318
+ pid = int(pid_str)
3319
+ entry = json.loads(entry_file.read_text())
3320
+ entry_started = entry.get("started", "")
3321
+ entry_heartbeat = entry.get("heartbeat", "")
3322
+ # Use file mtime as heartbeat fallback
3323
+ if not entry_heartbeat:
3324
+ try:
3325
+ st = os.stat(entry_file)
3326
+ entry_heartbeat = datetime.fromtimestamp(
3327
+ st.st_mtime, tz=timezone.utc
3328
+ ).isoformat()
3329
+ except OSError:
3330
+ pass
3331
+ entry_status = entry.get("status", "running")
3332
+ state_info = _resolve_process_state(
3333
+ pid,
3334
+ last_status=entry_status,
3335
+ started=entry_started,
3336
+ heartbeat=entry_heartbeat,
3337
+ )
3338
+ registered.append({
3339
+ "pid": pid,
3340
+ "label": entry.get("label", "unknown"),
3341
+ "ppid": entry.get("ppid"),
3342
+ **state_info,
3343
+ })
3344
+ except (ValueError, json.JSONDecodeError, OSError):
3345
+ continue
3346
+ result["registered_processes"] = registered
3347
+
3197
3348
  watchdog_enabled = os.environ.get("LOKI_WATCHDOG", "false").lower() == "true"
3198
3349
  result["watchdog_enabled"] = watchdog_enabled
3350
+ result["checked_at"] = now_iso
3199
3351
 
3200
3352
  return result
3201
3353
 
@@ -4774,7 +4774,7 @@ var LokiDashboard=(()=>{var X=Object.defineProperty;var gt=Object.getOwnProperty
4774
4774
  <p>Checklist not initialized</p>
4775
4775
  <p class="hint">The PRD checklist will be created during the first iteration when a PRD is provided.</p>
4776
4776
  </div>
4777
- `}_attachEventListeners(){let t=this.shadowRoot;t&&(t.querySelectorAll(".category-header[data-category]").forEach(e=>{e.addEventListener("click",()=>this._toggleCategory(e.dataset.category))}),t.querySelectorAll("button[data-waive-id]").forEach(e=>{e.addEventListener("click",a=>{a.stopPropagation(),this._waiveItem(e.dataset.waiveId)})}),t.querySelectorAll("button[data-unwaive-id]").forEach(e=>{e.addEventListener("click",a=>{a.stopPropagation(),this._unwaiveItem(e.dataset.unwaiveId)})}))}_escapeHtml(t){return t?String(t).replace(/&/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;").replace(/"/g,"&quot;"):""}};customElements.define("loki-checklist-viewer",G);var ht={not_initialized:{color:"var(--loki-text-muted, #71717a)",label:"Not Started",pulse:!1},starting:{color:"var(--loki-yellow, #ca8a04)",label:"Starting...",pulse:!0},running:{color:"var(--loki-green, #16a34a)",label:"Running",pulse:!0},crashed:{color:"var(--loki-red, #dc2626)",label:"Crashed",pulse:!1},stopped:{color:"var(--loki-text-muted, #a1a1aa)",label:"Stopped",pulse:!1}},J=class extends c{static get observedAttributes(){return["api-url","theme"]}constructor(){super(),this._loading=!1,this._error=null,this._api=null,this._pollInterval=null,this._status=null,this._logs=[],this._lastDataHash=null,this._lastLogsHash=null}connectedCallback(){super.connectedCallback(),this._setupApi(),this._loadData(),this._startPolling()}disconnectedCallback(){super.disconnectedCallback(),this._stopPolling()}attributeChangedCallback(t,e,a){e!==a&&(t==="api-url"&&this._api&&(this._api.baseUrl=a,this._loadData()),t==="theme"&&this._applyTheme())}_setupApi(){let 
t=this.getAttribute("api-url")||window.location.origin;this._api=u({baseUrl:t})}_startPolling(){this._pollInterval=setInterval(()=>this._loadData(),3e3),this._visibilityHandler=()=>{document.hidden?this._pollInterval&&(clearInterval(this._pollInterval),this._pollInterval=null):this._pollInterval||(this._loadData(),this._pollInterval=setInterval(()=>this._loadData(),3e3))},document.addEventListener("visibilitychange",this._visibilityHandler)}_stopPolling(){this._pollInterval&&(clearInterval(this._pollInterval),this._pollInterval=null),this._visibilityHandler&&(document.removeEventListener("visibilitychange",this._visibilityHandler),this._visibilityHandler=null)}async _loadData(){try{let[t,e]=await Promise.all([this._api.getAppRunnerStatus(),this._api.getAppRunnerLogs()]),a=JSON.stringify({status:t?.status,port:t?.port,restarts:t?.restart_count,url:t?.url}),i=JSON.stringify(e?.lines?.slice(-5)||[]),s=i!==this._lastLogsHash;if(a===this._lastDataHash&&!s)return;this._lastDataHash=a,this._lastLogsHash=i,this._status=t,this._logs=e?.lines||[],this._error=null,this.render(),this._scrollLogsToBottom()}catch(t){this._error||(this._error=`Failed to load app status: ${t.message}`,this.render())}}_scrollLogsToBottom(){let t=this.shadowRoot;if(!t)return;let e=t.querySelector(".log-area");e&&(e.scrollTop=e.scrollHeight)}async _handleRestart(){try{await this._api.restartApp(),this._loadData()}catch(t){this._error=`Restart failed: ${t.message}`,this.render()}}async _handleStop(){try{await this._api.stopApp(),this._loadData()}catch(t){this._error=`Stop failed: ${t.message}`,this.render()}}_formatUptime(t){if(!t)return"--";let e=new Date(t),i=Math.floor((new Date-e)/1e3);if(i<60)return`${i}s`;if(i<3600)return`${Math.floor(i/60)}m ${i%60}s`;let s=Math.floor(i/3600),r=Math.floor(i%3600/60);return`${s}h ${r}m`}_isValidUrl(t){if(!t)return!1;try{let e=new URL(t);return e.protocol==="http:"||e.protocol==="https:"}catch{return!1}}_getStyles(){return`
4777
+ `}_attachEventListeners(){let t=this.shadowRoot;t&&(t.querySelectorAll(".category-header[data-category]").forEach(e=>{e.addEventListener("click",()=>this._toggleCategory(e.dataset.category))}),t.querySelectorAll("button[data-waive-id]").forEach(e=>{e.addEventListener("click",a=>{a.stopPropagation(),this._waiveItem(e.dataset.waiveId)})}),t.querySelectorAll("button[data-unwaive-id]").forEach(e=>{e.addEventListener("click",a=>{a.stopPropagation(),this._unwaiveItem(e.dataset.unwaiveId)})}))}_escapeHtml(t){return t?String(t).replace(/&/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;").replace(/"/g,"&quot;"):""}};customElements.define("loki-checklist-viewer",G);var ht={not_initialized:{color:"var(--loki-text-muted, #71717a)",label:"Not Started",pulse:!1},starting:{color:"var(--loki-yellow, #ca8a04)",label:"Starting...",pulse:!0},running:{color:"var(--loki-green, #16a34a)",label:"Running",pulse:!0},stale:{color:"var(--loki-yellow, #ca8a04)",label:"Stale",pulse:!1},completed:{color:"var(--loki-text-muted, #a1a1aa)",label:"Completed",pulse:!1},failed:{color:"var(--loki-red, #dc2626)",label:"Failed",pulse:!1},crashed:{color:"var(--loki-red, #dc2626)",label:"Crashed",pulse:!1},stopped:{color:"var(--loki-text-muted, #a1a1aa)",label:"Stopped",pulse:!1},unknown:{color:"var(--loki-text-muted, #71717a)",label:"Unknown",pulse:!1}},J=class extends c{static get observedAttributes(){return["api-url","theme"]}constructor(){super(),this._loading=!1,this._error=null,this._api=null,this._pollInterval=null,this._status=null,this._logs=[],this._lastDataHash=null,this._lastLogsHash=null}connectedCallback(){super.connectedCallback(),this._setupApi(),this._loadData(),this._startPolling()}disconnectedCallback(){super.disconnectedCallback(),this._stopPolling()}attributeChangedCallback(t,e,a){e!==a&&(t==="api-url"&&this._api&&(this._api.baseUrl=a,this._loadData()),t==="theme"&&this._applyTheme())}_setupApi(){let 
t=this.getAttribute("api-url")||window.location.origin;this._api=u({baseUrl:t})}_startPolling(){this._pollInterval=setInterval(()=>this._loadData(),3e3),this._visibilityHandler=()=>{document.hidden?this._pollInterval&&(clearInterval(this._pollInterval),this._pollInterval=null):this._pollInterval||(this._loadData(),this._pollInterval=setInterval(()=>this._loadData(),3e3))},document.addEventListener("visibilitychange",this._visibilityHandler)}_stopPolling(){this._pollInterval&&(clearInterval(this._pollInterval),this._pollInterval=null),this._visibilityHandler&&(document.removeEventListener("visibilitychange",this._visibilityHandler),this._visibilityHandler=null)}async _loadData(){try{let[t,e]=await Promise.all([this._api.getAppRunnerStatus(),this._api.getAppRunnerLogs()]),a=JSON.stringify({status:t?.status,port:t?.port,restarts:t?.restart_count,url:t?.url}),i=JSON.stringify(e?.lines?.slice(-5)||[]),s=i!==this._lastLogsHash;if(a===this._lastDataHash&&!s)return;this._lastDataHash=a,this._lastLogsHash=i,this._status=t,this._logs=e?.lines||[],this._error=null,this.render(),this._scrollLogsToBottom()}catch(t){this._error||(this._error=`Failed to load app status: ${t.message}`,this.render())}}_scrollLogsToBottom(){let t=this.shadowRoot;if(!t)return;let e=t.querySelector(".log-area");e&&(e.scrollTop=e.scrollHeight)}async _handleRestart(){try{await this._api.restartApp(),this._loadData()}catch(t){this._error=`Restart failed: ${t.message}`,this.render()}}async _handleStop(){try{await this._api.stopApp(),this._loadData()}catch(t){this._error=`Stop failed: ${t.message}`,this.render()}}_formatUptime(t){if(!t)return"--";let e=new Date(t),i=Math.floor((new Date-e)/1e3);if(i<60)return`${i}s`;if(i<3600)return`${Math.floor(i/60)}m ${i%60}s`;let s=Math.floor(i/3600),r=Math.floor(i%3600/60);return`${s}h ${r}m`}_isValidUrl(t){if(!t)return!1;try{let e=new URL(t);return e.protocol==="http:"||e.protocol==="https:"}catch{return!1}}_getStyles(){return`
4778
4778
  .app-status {
4779
4779
  padding: 16px;
4780
4780
  font-family: var(--loki-font-family, system-ui, -apple-system, sans-serif);