npm - agent-relay-runner - Versions diffs - 0.15.0 → 0.16.0 - Mend

agent-relay-runner 0.15.0 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

package/package.json +1 -1
package/plugins/claude/.claude-plugin/plugin.json +1 -1
package/plugins/claude/hooks/permission-request.sh +2 -0
package/plugins/claude/hooks/post-compact.sh +1 -0
package/plugins/claude/hooks/pre-compact.sh +1 -0
package/plugins/claude/hooks/relay-status.sh +28 -1
package/plugins/claude/hooks/session-end.sh +1 -0
package/plugins/claude/hooks/session-start.sh +1 -0
package/plugins/claude/hooks/stop-failure.sh +1 -0
package/plugins/claude/hooks/stop.sh +14 -2
package/plugins/claude/hooks/subagent-start.sh +1 -0
package/plugins/claude/hooks/subagent-stop.sh +1 -0
package/plugins/claude/hooks/user-prompt-submit.sh +1 -0
package/src/adapters/codex.ts +3 -2
package/src/control-server.ts +43 -0
package/src/logger.ts +97 -0
package/src/runner.ts +53 -51

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "agent-relay-runner",
-  "version": "0.15.0",
+  "version": "0.16.0",
   "description": "Unified provider lifecycle runner for Agent Relay",
   "type": "module",
   "bin": {

package/plugins/claude/.claude-plugin/plugin.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "agent-relay-runner",
   "description": "Thin Agent Relay runner bridge for Claude Code",
-  "version": "0.15.0",
+  "version": "0.16.0",
   "agentRelayContracts": {
     "providerPluginProtocol": 1
   }

package/plugins/claude/hooks/permission-request.sh CHANGED Viewed

@@ -1,5 +1,7 @@
 #!/usr/bin/env bash
 set -euo pipefail
+source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
+relay_install_hook_guard permission-request
 port="${AGENT_RELAY_RUNNER_PORT:-}"
 if [[ -z "$port" ]]; then

package/plugins/claude/hooks/post-compact.sh CHANGED Viewed

@@ -1,5 +1,6 @@
 #!/usr/bin/env bash
 set -euo pipefail
 source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
+relay_install_hook_guard post-compact
 relay_post_timeline_status idle provider-turn "" compacted

package/plugins/claude/hooks/pre-compact.sh CHANGED Viewed

@@ -1,5 +1,6 @@
 #!/usr/bin/env bash
 set -euo pipefail
 source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
+relay_install_hook_guard pre-compact
 relay_post_timeline_status busy provider-turn "" compacting

package/plugins/claude/hooks/relay-status.sh CHANGED Viewed

@@ -92,11 +92,38 @@ relay_post_session_end() {
     -d "$body" >/dev/null 2>&1 || true
 }
+# --- Hook FATAL surfacing (#198) -------------------------------------------
+# A hook that dies unexpectedly must never be silent. relay_install_hook_guard
+# arms an ERR trap that reports the failure FATAL to the runner control port,
+# which logs it to the dashboard-surfaced per-agent log. Best-effort and bounded
+# (--max-time 2) so the report itself can never blow the hook's timeout budget.
+relay_hook_fatal_report() {
+  # Best-effort POST of a hook failure to the runner control port (/hook-fatal).
+  #   $1  hook name (defaults to "unknown")
+  #   $2  failure detail (defaults to empty)
+  # Does nothing when AGENT_RELAY_RUNNER_PORT is unset/empty. Always returns 0;
+  # curl is capped at 2s and its failures are ignored.
+  local hook_name="${1:-unknown}"
+  local failure_detail="${2:-}"
+  local runner_port="${AGENT_RELAY_RUNNER_PORT:-}"
+  if [ -z "$runner_port" ]; then
+    return 0
+  fi
+  local payload="{\"hook\":\"$(relay_json_escape "$hook_name")\",\"error\":\"$(relay_json_escape "$failure_detail")\"}"
+  curl -fsS --max-time 2 -X POST "http://127.0.0.1:${runner_port}/hook-fatal" \
+    -H 'Content-Type: application/json' \
+    -d "$payload" >/dev/null 2>&1 || true
+}
+relay_install_hook_guard() {
+  # Arm an ERR trap that reports this hook's failure via relay_hook_fatal_report.
+  #   $1  hook name used in the report (defaults to "unknown"); kept in the
+  #       global RELAY_HOOK_NAME so the trap body can read it when it fires.
+  RELAY_HOOK_NAME="${1:-unknown}"
+  # Fires when a command in the hook's main body fails under `set -e`/pipefail,
+  # just before the shell exits. Reports, then lets the exit proceed.
+  # (ERR is not inherited into functions without `set -E`; this covers the top-level
+  # flow, which is where a silent death actually wedges a turn.)
+  # NOTE(review): bash does not appear to run the ERR trap for a `set -u`
+  # unbound-variable abort (the shell exits during expansion, and only an EXIT
+  # trap would run), so that failure mode is likely NOT reported by this guard.
+  # Confirm against the bash manual; an EXIT-based report would be needed to cover it.
+  # The trap body captures $? first so later commands cannot clobber the exit code.
+  trap 'relay_hook_err_rc=$?; relay_hook_fatal_report "${RELAY_HOOK_NAME:-unknown}" "exit ${relay_hook_err_rc}: ${BASH_COMMAND}"' ERR
+}
 relay_pending_reply_stop_decision() {
   local port="${AGENT_RELAY_RUNNER_PORT:-}"
   [ -z "$port" ] && return 0
   local response
-  response="$(curl -fsS "http://127.0.0.1:${port}/reply-obligations/claude-stop" 2>/dev/null || true)"
+  # --max-time guards the Claude Stop hook's 5s budget: a slow runner/server (e.g. an
+  # un-indexed obligation query) must never block past the timeout, or Claude SIGKILLs
+  # the hook before it clears the turn -> stuck "busy" (#199). On timeout: no block.
+  response="$(curl -fsS --max-time 2 "http://127.0.0.1:${port}/reply-obligations/claude-stop" 2>/dev/null || true)"
   case "$response" in
     *'"decision":"block"'*|*'"decision": "block"'*) ;;
     *) return 0 ;;

package/plugins/claude/hooks/session-end.sh CHANGED Viewed

@@ -1,6 +1,7 @@
 #!/usr/bin/env bash
 set -euo pipefail
 source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
+relay_install_hook_guard session-end
 payload="$(cat || true)"
 reason="$(relay_json_string_field reason "$payload")"

package/plugins/claude/hooks/session-start.sh CHANGED Viewed

@@ -1,6 +1,7 @@
 #!/usr/bin/env bash
 set -euo pipefail
 source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
+relay_install_hook_guard session-start
 payload="$(cat || true)"
 source_kind="$(relay_json_string_field source "$payload")"

package/plugins/claude/hooks/stop-failure.sh CHANGED Viewed

@@ -1,6 +1,7 @@
 #!/usr/bin/env bash
 set -euo pipefail
 source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
+relay_install_hook_guard stop-failure
 payload="$(cat || true)"
 error="$(relay_json_string_field error "$payload")"

package/plugins/claude/hooks/stop.sh CHANGED Viewed

@@ -1,17 +1,29 @@
 #!/usr/bin/env bash
 set -euo pipefail
 source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
+relay_install_hook_guard stop
+# Clearing the turn's busy state is the critical path (#199). Register it on EXIT
+# so it runs even if a side-call below fails or times out under `set -e`. The one
+# exception is the reply-obligation block path, which deliberately keeps the agent
+# busy to answer — it opts out via the flag before exiting.
+_relay_clear_idle_on_exit=1
+trap '[ "${_relay_clear_idle_on_exit:-0}" = "1" ] && relay_post_status_clearing_subagents idle' EXIT
 payload="$(cat || true)"
 stop_hook_active="$(relay_json_bool_field stop_hook_active "$payload")"
 if [ "$stop_hook_active" != "true" ]; then
   last_assistant_msg="$(echo "$payload" | jq -c '.last_assistant_message // empty' 2>/dev/null || true)"
   relay_post_session_turn "$(relay_json_string_field transcript_path "$payload")" "$last_assistant_msg"
-  stop_decision="$(relay_pending_reply_stop_decision)"
+  # `|| true`: under `set -e`, a non-zero from the obligation check must never abort
+  # the hook before the idle-clear — clearing the turn is the critical path (#199).
+  stop_decision="$(relay_pending_reply_stop_decision || true)"
   if [ -n "$stop_decision" ]; then
+    _relay_clear_idle_on_exit=0
     printf '%s\n' "$stop_decision"
     exit 0
   fi
 fi
-relay_post_status_clearing_subagents idle
+# Normal turn end → the EXIT trap posts idle (always, even on an unexpected abort above).
+exit 0

package/plugins/claude/hooks/subagent-start.sh CHANGED Viewed

@@ -4,6 +4,7 @@ set -euo pipefail
 PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}"
 # shellcheck source=/dev/null
 source "${PLUGIN_ROOT}/hooks/relay-status.sh"
+relay_install_hook_guard subagent-start
 payload="$(cat || true)"
 agent_id="$(relay_json_string_field agent_id "$payload")"

package/plugins/claude/hooks/subagent-stop.sh CHANGED Viewed

@@ -4,6 +4,7 @@ set -euo pipefail
 PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}"
 # shellcheck source=/dev/null
 source "${PLUGIN_ROOT}/hooks/relay-status.sh"
+relay_install_hook_guard subagent-stop
 payload="$(cat || true)"
 agent_id="$(relay_json_string_field agent_id "$payload")"

package/plugins/claude/hooks/user-prompt-submit.sh CHANGED Viewed

@@ -1,6 +1,7 @@
 #!/usr/bin/env bash
 set -euo pipefail
 source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
+relay_install_hook_guard user-prompt-submit
 payload="$(cat || true)"
 relay_post_status busy
 # Mirror a terminal/TUI-typed prompt into the dashboard chat and start reasoning

package/src/adapters/codex.ts CHANGED Viewed

@@ -4,6 +4,7 @@ import { basename, join, resolve } from "node:path";
 import type { ContextState, Message } from "agent-relay-sdk";
 import { profileAllowsRelayFeature, providerMessageText, RELAY_CONTEXT, type ManagedProcess, type ProviderAdapter, type ProviderConfig, type ProviderPermissionDecisionInput, type ProviderSessionEvent, type ProviderStatusUpdate, type RunnerSpawnConfig, type SpawnArgs, type TerminalAttachSpec } from "../adapter";
 import { workspaceDepsNoteFromEnv } from "../relay-instructions";
+import { logger } from "../logger";
 /** Relay context prepended to a Codex agent's first turn: the standard relay
  * blurb plus, when running in an isolated workspace, the deps caveat (#159). */
@@ -199,7 +200,7 @@ export class CodexAdapter implements ProviderAdapter {
       input = codexRelayContextBlock() + "\n\n" + input;
       process.meta = { ...(process.meta ?? {}), relayContextSent: true };
     }
-    console.error(`[agent-relay] starting Codex initial prompt in thread ${threadId}`);
+    logger.info("codex", `starting Codex initial prompt in thread ${threadId}`);
     const client = process.meta?.client as CodexAppClient;
     await client.turnStart(threadId, input);
   }
@@ -211,7 +212,7 @@ export class CodexAdapter implements ProviderAdapter {
       text = codexRelayContextBlock() + "\n\n" + text;
       process.meta = { ...(process.meta ?? {}), relayContextSent: true };
     }
-    console.error(codexDeliveryNotice(messages, threadId));
+    logger.info("codex", codexDeliveryNotice(messages, threadId));
     const client = process.meta?.client as CodexAppClient;
     await client.turnStart(threadId, text);
   }

package/src/control-server.ts CHANGED Viewed

@@ -1,6 +1,16 @@
 import type { Server, ServerWebSocket } from "bun";
 import type { Message, ReplyObligation } from "agent-relay-sdk";
 import type { ProviderPermissionDecisionInput, ProviderStatusEvent, SemanticStatus, TerminalAttachSpec } from "./adapter";
+import { logger, parseLogLevel, LOG_LEVELS } from "./logger";
+// A hook that failed in a way it could not handle itself reports here so the
+// failure is never silent (#198 item 5). Phase 1 logs it FATAL to the per-agent
+// log; Phase 2 (#196) will additionally route it through the runner outbox to the
+// server.
+export interface HookFatalReport {
+  // Name of the reporting hook. handleHookFatal substitutes "unknown" when the
+  // request body carries no non-blank string here.
+  hook: string;
+  // Free-form failure detail. handleHookFatal substitutes "(no detail)" when the
+  // request body carries no string here.
+  error: string;
+}
 interface MonitorSocketData {
   kind: "monitor";
@@ -33,6 +43,10 @@ interface ControlServerOptions {
   // transcript. transcriptPath is optional — the runner falls back to the last
   // path it saw during the session.
   onSessionEnd?(input: { reason?: string; transcriptPath?: string }): Promise<void>;
+  // Phase 1 observability (#198): a hook reporting an unhandled failure. The
+  // control server already logs it FATAL; this is the seam for Phase 2 to also
+  // surface it to the server via the runner outbox.
+  onHookFatal?(report: HookFatalReport): void;
 }
 export function startControlServer(options: ControlServerOptions): ControlServer {
@@ -81,6 +95,15 @@ export function startControlServer(options: ControlServerOptions): ControlServer
       if (url.pathname === "/session-end" && req.method === "POST") {
         return handleSessionEnd(req, options);
       }
+      if (url.pathname === "/log-level" && req.method === "GET") {
+        return Response.json({ level: logger.getLevel(), levels: LOG_LEVELS });
+      }
+      if (url.pathname === "/log-level" && req.method === "POST") {
+        return handleLogLevel(req);
+      }
+      if (url.pathname === "/hook-fatal" && req.method === "POST") {
+        return handleHookFatal(req, options);
+      }
       if (url.pathname === "/monitor") {
         const upgraded = srv.upgrade(req, { data: { kind: "monitor" } });
         return upgraded ? undefined : new Response("WebSocket upgrade failed", { status: 400 });
@@ -361,6 +384,26 @@ async function handleSessionEnd(req: Request, options: ControlServerOptions): Pr
   return Response.json({ ok: true });
 }
+/**
+ * POST /log-level: change the process-global logger's level at runtime.
+ *
+ * Expects a JSON object with a string `level`. Responds 400 with the list of
+ * accepted levels when the body is missing, malformed, or names an unknown
+ * level; otherwise applies the level, logs the change, and echoes the new and
+ * previous levels.
+ */
+async function handleLogLevel(req: Request): Promise<Response> {
+  const payload = await req.json().catch(() => null);
+  const requested = isRecord(payload) && typeof payload.level === "string" ? payload.level : undefined;
+  const level = parseLogLevel(requested);
+  if (!level) {
+    return Response.json({ error: `level must be one of: ${LOG_LEVELS.join(", ")}` }, { status: 400 });
+  }
+  const previous = logger.getLevel();
+  logger.setLevel(level);
+  logger.info("logger", `log level set to ${level} (was ${previous}) via control port`);
+  return Response.json({ ok: true, level, previous });
+}
+/**
+ * POST /hook-fatal: record a failure a hook could not handle itself.
+ *
+ * Reads `hook` and `error` from the JSON body, falling back to "unknown" and
+ * "(no detail)" when they are absent or not usable strings. The report is
+ * always logged at fatal level, then offered to the optional `onHookFatal`
+ * callback. The response is `{ ok: true }` regardless of what the callback does.
+ */
+async function handleHookFatal(req: Request, options: ControlServerOptions): Promise<Response> {
+  const payload = await req.json().catch(() => null);
+  let hook = "unknown";
+  let error = "(no detail)";
+  if (isRecord(payload)) {
+    if (typeof payload.hook === "string") {
+      const trimmedHook = payload.hook.trim();
+      if (trimmedHook) hook = trimmedHook;
+    }
+    if (typeof payload.error === "string") error = payload.error;
+  }
+  // Never silent: a hook that couldn't handle its own failure lands here as FATAL.
+  logger.fatal(`hook:${hook}`, error);
+  try {
+    options.onHookFatal?.({ hook, error });
+  } catch {
+    // Reporting must never throw back at the hook, so callback errors are dropped.
+  }
+  return Response.json({ ok: true });
+}
 async function handleStatus(req: Request, options: ControlServerOptions): Promise<Response> {
   const body = await req.json().catch(() => null) as Partial<ProviderStatusEvent> | null;
   const status = body?.status;

package/src/logger.ts ADDED Viewed

@@ -0,0 +1,97 @@
+import { appendFileSync, mkdirSync } from "node:fs";
+import { join } from "node:path";
+// Phase 1 observability (#198): one leveled, runtime-togglable logger for the
+// Runner and the provider adapters below it. Replaces the ad-hoc scatter of
+// `console.error`, `logRunnerDiagnostic` (-> runner-<agent>.log) and
+// `sessionLog`/`sessionDebug` (-> session-mirror-<agent>.log) with a single
+// switch and a single greppable, ANSI-free sink.
+//
+// Sink: the per-agent `session-mirror-<agent>.log` — the file the orchestrator
+// already surfaces to the dashboard log-viewer (captureSessionMirror). One place
+// to look when anything in the Runner misbehaves.
+//
+// Level is read once from AGENT_RELAY_LOG_LEVEL (default "info") and can be
+// flipped at runtime via the control port (no restart) — so a phase refactor can
+// be watched at debug without bouncing the agent.
+// Severity names the logger accepts, listed from least to most severe.
+export type LogLevel = "debug" | "info" | "warn" | "error" | "fatal";
+// Numeric rank per level. Logger.isEnabled emits a record when its rank is
+// greater than or equal to the rank of the logger's current level.
+const ORDER: Record<LogLevel, number> = { debug: 10, info: 20, warn: 30, error: 40, fatal: 50 };
+// Level names in ORDER's key order (debug … fatal); parseLogLevel validates
+// input against this list.
+export const LOG_LEVELS = Object.keys(ORDER) as LogLevel[];
+/**
+ * Normalize a user-supplied level name.
+ *
+ * @param value Raw text (e.g. from an env var or request body); may be absent.
+ * @returns The matching LogLevel after trimming and lower-casing, or
+ *   `undefined` when the input is empty or not a known level.
+ */
+export function parseLogLevel(value: string | undefined | null): LogLevel | undefined {
+  if (!value) return undefined;
+  const normalized = value.trim().toLowerCase();
+  return LOG_LEVELS.find((candidate) => candidate === normalized);
+}
+// Turns an agent id into a filename-safe token: every run of characters outside
+// [a-zA-Z0-9_.-] becomes a single "_", and the result is capped at 180 chars.
+// Must stay in step with the runner's safeLogName and the orchestrator's
+// safeMirrorLogName so all three resolve the identical filename for an agent id.
+function safeLogName(value: string): string {
+  const sanitized = value.replace(/[^a-zA-Z0-9_.-]+/g, "_");
+  return sanitized.substring(0, 180);
+}
+// Options accepted by the Logger constructor and by Logger.configure. Every
+// field is optional; configure() only overwrites the fields that are provided.
+export interface LoggerConfig {
+  // Agent id used to build the log filename (session-mirror-<agent>.log).
+  // The constructor defaults it to "runner".
+  agentId?: string;
+  // Minimum level to emit. The constructor falls back to AGENT_RELAY_LOG_LEVEL,
+  // then to "info".
+  level?: LogLevel;
+  // Stored by the logger (constructor default: false); no Logger method shown
+  // here reads it.
+  headless?: boolean;
+  // Directory the log file is written to. The constructor defaults it to
+  // <HOME or ".">/.agent-relay/logs.
+  logDir?: string;
+}
+/**
+ * Leveled, runtime-togglable logger that appends one line per record to
+ * `<logDir>/session-mirror-<agent>.log`.
+ *
+ * Records below the current level are dropped. Writes are synchronous and
+ * best-effort: if the file cannot be written, only error/fatal records are
+ * echoed to stderr and everything else is discarded.
+ */
+export class Logger {
+  private level: LogLevel;
+  private agentId: string;
+  // Stored from config for callers that set it; not read by any method below.
+  private headless: boolean;
+  private logDir: string;
+  /**
+   * @param config Optional overrides. Level falls back to AGENT_RELAY_LOG_LEVEL,
+   *   then "info"; agentId to "runner"; headless to false; logDir to
+   *   `<HOME or ".">/.agent-relay/logs`.
+   */
+  constructor(config: LoggerConfig = {}) {
+    this.level = config.level ?? parseLogLevel(process.env.AGENT_RELAY_LOG_LEVEL) ?? "info";
+    this.agentId = config.agentId ?? "runner";
+    this.headless = config.headless ?? false;
+    this.logDir = config.logDir ?? join(process.env.HOME || ".", ".agent-relay", "logs");
+  }
+  // Bind the logger to a concrete agent once the runner knows its id. Preserves a
+  // level already set via env/runtime unless an explicit level is passed.
+  configure(config: LoggerConfig): void {
+    if (config.agentId !== undefined) this.agentId = config.agentId;
+    if (config.headless !== undefined) this.headless = config.headless;
+    if (config.logDir !== undefined) this.logDir = config.logDir;
+    if (config.level !== undefined) this.level = config.level;
+  }
+  /** Replace the minimum level that will be emitted. */
+  setLevel(level: LogLevel): void { this.level = level; }
+  /** Current minimum level. */
+  getLevel(): LogLevel { return this.level; }
+  /** True when a record at `level` would be written under the current level. */
+  isEnabled(level: LogLevel): boolean { return ORDER[level] >= ORDER[this.level]; }
+  debug(component: string, message: string): void { this.log("debug", component, message); }
+  info(component: string, message: string): void { this.log("info", component, message); }
+  warn(component: string, message: string): void { this.log("warn", component, message); }
+  error(component: string, message: string): void { this.log("error", component, message); }
+  fatal(component: string, message: string): void { this.log("fatal", component, message); }
+  /**
+   * Format and append one record: `[ISO time] LEVEL [component] message`.
+   *
+   * @param level Severity of the record; dropped when below the current level.
+   * @param component Short source tag shown in brackets.
+   * @param message Record text; line breaks are collapsed so it stays one line.
+   */
+  log(level: LogLevel, component: string, message: string): void {
+    if (!this.isEnabled(level)) return;
+    // The component can carry request-supplied text (e.g. `hook:<name>` from the
+    // /hook-fatal endpoint), so it is collapsed to one line just like the message;
+    // otherwise an embedded newline would split or forge a log record.
+    const line = `[${new Date().toISOString()}] ${level.toUpperCase().padEnd(5)} [${oneLine(component)}] ${oneLine(message)}\n`;
+    try {
+      mkdirSync(this.logDir, { recursive: true });
+      appendFileSync(join(this.logDir, `session-mirror-${safeLogName(this.agentId)}.log`), line);
+    } catch {
+      // Best-effort. If the per-agent file can't be written, surface error/fatal to
+      // stderr so it is not lost entirely (headless: lands in the orchestrator log).
+      if (ORDER[level] >= ORDER.error) { try { console.error(line.trimEnd()); } catch { /* give up */ } }
+    }
+  }
+}
+// Line breaks would split one record across several log lines and break
+// greppability; collapse each one so a multi-line message stays one line.
+// Handles CRLF, LF and a lone CR (a bare carriage return would otherwise survive
+// and let a terminal or viewer overwrite the start of the record).
+function oneLine(message: string): string {
+  return message.replace(/\r\n|\r|\n/g, " ⏎ ");
+}
+// Process-global logger. A runner process serves exactly one agent, so a singleton
+// is the right scope; the runner calls configure() once it knows its id, and
+// adapters import this instance directly (no constructor threading).
+export const logger = new Logger();

package/src/runner.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 import { hostname } from "node:os";
-import { appendFileSync, closeSync, mkdirSync, openSync, readSync, statSync, writeFileSync } from "node:fs";
+import { closeSync, mkdirSync, openSync, readSync, statSync, writeFileSync } from "node:fs";
 import { readFile } from "node:fs/promises";
 import { dirname, join } from "node:path";
 import type { AgentProfile, ContextState, Message, MessageSessionMeta, ProviderCapabilities, TaskStatusInput, WorkspaceMetadata } from "agent-relay-sdk";
@@ -13,6 +13,7 @@ import { extractLastAssistantTurn, extractFinalAssistantMessage, extractHookAssi
 import { agentProfileProjectionReport } from "./profile-projection";
 import { profileUsesHostProviderGlobals } from "./profile-home";
 import { runtimeMetadata } from "./version";
+import { logger, parseLogLevel } from "./logger";
 import { ensureSessionScratch, reapSessionScratch, sweepStaleSessions, type SessionScratchLayout } from "./session-scratch";
 interface RunnerOptions {
@@ -76,12 +77,18 @@ const LOG_TAIL_BYTES = 128 * 1024;
 const PROMPT_ECHO_DEDUP_MS = 30_000;
 // Busy reconciler: a conservative LAST-RESORT backstop for a turn that ended
 // without the provider's Stop hook clearing busy (e.g. ESC straight into the web
-// terminal). It must never fire during a live turn, so it (a) only counts idle
-// after it has actually observed the provider busy, and (b) requires a long,
+// terminal). It must never fire during a live turn, so it requires a long,
 // unbroken idle streak — an active turn shows its working spinner well within
 // this window, which resets the streak. ~32s of uninterrupted idle = really done.
 const BUSY_RECONCILE_POLL_MS = 4_000;
 const BUSY_RECONCILE_IDLE_CONFIRM = 8;
+// When the reconciler never observed the provider busy this turn (a turn faster
+// than the 4s poll — common for short voice/autosend replies), it can't trust a
+// quick idle the way it does after seeing the spinner. But refusing forever wedged
+// fast turns in "busy" when the Stop hook's idle was lost (#199). So we still
+// force-clear, just after a much longer unbroken-idle window — an active turn would
+// have flashed its spinner into at least one of these probes and reset the streak.
+const BUSY_RECONCILE_IDLE_CONFIRM_NO_BUSY = 15;
 // After a dashboard interrupt, give the provider a moment to drop out of its turn,
 // then reconcile immediately so the user sees "stopped" without waiting for the backstop.
 const INTERRUPT_RECONCILE_DELAY_MS = 1_500;
@@ -171,6 +178,14 @@ export class AgentRunner {
   constructor(private readonly options: RunnerOptions) {
     this.agentId = options.agentId ?? options.runnerId;
+    // Bind the process-global logger to this agent. AGENT_RELAY_SESSION_DEBUG=1 is
+    // kept as a back-compat alias for the verbose probe/emit lines, now expressed
+    // as log level "debug" (AGENT_RELAY_LOG_LEVEL still wins when both are set).
+    logger.configure({
+      agentId: this.agentId,
+      headless: options.headless,
+      ...(this.sessionDebugVerbose && !parseLogLevel(process.env.AGENT_RELAY_LOG_LEVEL) ? { level: "debug" as const } : {}),
+    });
     this.currentToken = options.token;
     this.currentTokenJti = options.tokenJti;
     this.currentTokenProfileId = options.tokenProfileId;
@@ -381,7 +396,7 @@ export class AgentRunner {
         startedAt: this.options.startedAt,
       }, null, 2) + "\n", { mode: 0o600 });
     } catch (error) {
-      console.error(`[runner] failed to write runner info file: ${error}`);
+      logger.error("runner", `failed to write runner info file: ${error}`);
     }
   }
@@ -397,7 +412,7 @@ export class AgentRunner {
       const messages = await this.http.pollMessages({ for: this.agentId, unread: true, limit: 100 });
       for (const message of messages) this.enqueueMessage(message);
     } catch (error) {
-      console.error(`[runner] inbox bootstrap failed: ${error}`);
+      logger.error("runner", `inbox bootstrap failed: ${error}`);
     }
   }
@@ -407,7 +422,7 @@ export class AgentRunner {
     try {
       await this.options.adapter.deliverInitialPrompt(this.process, prompt);
     } catch (error) {
-      console.error(`[runner] initial prompt delivery failed: ${error}`);
+      logger.error("runner", `initial prompt delivery failed: ${error}`);
     }
   }
@@ -444,7 +459,7 @@ export class AgentRunner {
             status: "in_progress",
             agentId: this.agentId,
             metadata: { messageId: message.id, completedBy: "runner" },
-          }).catch((error) => console.error(`[runner] task ${taskId} in_progress update failed: ${error}`));
+          }).catch((error) => logger.error("task", `task ${taskId} in_progress update failed: ${error}`));
           // Runner owns claim + status here; drop the server's self-claim instruction
           // so the agent doesn't improvise a stray claim send (see stripRunnerClaimedGuidance).
           toDeliver = { ...message, body: stripRunnerClaimedGuidance(message.body) };
@@ -462,7 +477,7 @@ export class AgentRunner {
     try {
       const prepared = await messagesWithCachedAttachments(deliverable, this.http, {
         agentId: this.agentId,
-        onError: (message) => console.error(`[runner] ${message}`),
+        onError: (message) => logger.error("runner", message),
       });
       await this.options.adapter.deliver(this.process, prepared);
       for (const message of deliverable) {
@@ -471,7 +486,7 @@ export class AgentRunner {
       }
     } catch (error) {
       failed = true;
-      if (shouldLogDeliveryFailure(error)) console.error(`[runner] message delivery failed: ${error}`);
+      if (shouldLogDeliveryFailure(error)) logger.warn("delivery", `message delivery failed: ${error}`);
       for (const message of deliverable) {
         this.clearActiveClaim(message);
         this.pendingMessages.set(message.id, message);
@@ -539,7 +554,7 @@ export class AgentRunner {
         await this.http.deleteAgent(this.agentId).catch(() => {});
         if (this.options.exitProcessOnShutdown !== false) {
           setTimeout(() => void this.stop().catch((error) => {
-            console.error(`[runner] stop after command failed: ${error}`);
+            logger.error("lifecycle", `stop after command failed: ${error}`);
           }).finally(() => process.exit(0)), 10);
         }
       } else if (!this.stopped) {
@@ -674,7 +689,7 @@ export class AgentRunner {
       if (this.shouldStopUnexpectedProviderExit(diagnostics)) {
         const hasResumeId = typeof diagnostics.claudeResumeId === "string" && diagnostics.claudeResumeId.length > 0;
-        console.warn(`[runner] ${this.options.provider} exited; leaving agent offline for manual recovery`);
+        logger.warn("lifecycle", `${this.options.provider} exited; leaving agent offline for manual recovery`);
         this.publishRunnerTimelineEvent({
           status: "provider.restart_decision",
           id: `provider-restart-decision-${this.providerSessionId}-${now}`,
@@ -702,7 +717,7 @@ export class AgentRunner {
       }
       if (runtimeMs < RAPID_EXIT_MS && recent.length > MAX_RAPID_UNEXPECTED_EXITS) {
-        console.error(`[runner] provider session exited ${recent.length} times within ${Math.round(UNEXPECTED_EXIT_WINDOW_MS / 1000)}s; giving up`);
+        logger.error("lifecycle", `provider session exited ${recent.length} times within ${Math.round(UNEXPECTED_EXIT_WINDOW_MS / 1000)}s; giving up`);
         this.publishRunnerTimelineEvent({
           status: "provider.restart_decision",
           id: `provider-restart-decision-${this.providerSessionId}-${now}`,
@@ -726,7 +741,7 @@ export class AgentRunner {
       }
       const delayMs = Math.min(10_000, Math.max(500, 500 * recent.length));
-      console.warn(`[runner] provider session exited unexpectedly after ${Math.round(runtimeMs / 1000)}s; restarting in ${delayMs}ms`);
+      logger.warn("lifecycle", `provider session exited unexpectedly after ${Math.round(runtimeMs / 1000)}s; restarting in ${delayMs}ms`);
       this.publishRunnerTimelineEvent({
         status: "provider.restart_decision",
         id: `provider-restart-decision-${this.providerSessionId}-${now}`,
@@ -751,7 +766,7 @@ export class AgentRunner {
         this.publishStatus();
         this.scheduleDrain();
       } catch (error) {
-        console.error(`[runner] provider restart after unexpected exit failed: ${error}`);
+        logger.error("lifecycle", `provider restart after unexpected exit failed: ${error}`);
         this.setProviderStatus("error");
         this.options.onProviderExit?.(1);
       }
@@ -826,10 +841,10 @@ export class AgentRunner {
   private handleBusError(code: string, message: string): void {
     const action = runnerBusErrorAction(code, this.stopped);
     if (action === "ignore") return;
-    console.error(`[runner] bus error ${code}: ${message}`);
+    logger.error("bus", `bus error ${code}: ${message}`);
     if (action === "stop") {
       void this.stop().catch((error) => {
-        console.error(`[runner] stop after bus error failed: ${error}`);
+        logger.error("bus", `stop after bus error failed: ${error}`);
       }).finally(() => process.exit(0));
     }
   }
@@ -1146,18 +1161,21 @@ export class AgentRunner {
     let activity: "busy" | "idle" | "unknown";
     try { activity = await this.options.adapter.probeActivity(this.process); } catch { return; }
     if (activity === "busy") this.busyReconcileSawBusy = true;
-    // Reset the streak on anything that isn't a confident idle — and never start
-    // counting until we've actually observed the provider busy this turn.
-    if (activity !== "idle" || !this.busyReconcileSawBusy) {
-      if (activity !== "idle") this.busyReconcileIdleStreak = 0;
+    // Reset the streak on anything that isn't a confident idle.
+    if (activity !== "idle") {
+      this.busyReconcileIdleStreak = 0;
       this.sessionDebug(`reconcile probe=${activity} sawBusy=${this.busyReconcileSawBusy} streak=${this.busyReconcileIdleStreak}`);
       return;
     }
     this.busyReconcileIdleStreak += 1;
-    this.sessionDebug(`reconcile probe=idle streak=${this.busyReconcileIdleStreak}/${BUSY_RECONCILE_IDLE_CONFIRM}`);
-    if (this.busyReconcileIdleStreak < BUSY_RECONCILE_IDLE_CONFIRM) return;
+    // Confirm faster once we've seen the spinner this turn; otherwise demand a much
+    // longer all-idle window before trusting it (rescues fast turns without
+    // false-clearing a live turn that simply hasn't flashed busy into a probe yet).
+    const confirm = this.busyReconcileSawBusy ? BUSY_RECONCILE_IDLE_CONFIRM : BUSY_RECONCILE_IDLE_CONFIRM_NO_BUSY;
+    this.sessionDebug(`reconcile probe=idle sawBusy=${this.busyReconcileSawBusy} streak=${this.busyReconcileIdleStreak}/${confirm}`);
+    if (this.busyReconcileIdleStreak < confirm) return;
     this.disarmBusyReconciler();
-    this.forceClearProviderTurn("backstop reconciler");
+    this.forceClearProviderTurn(this.busyReconcileSawBusy ? "backstop reconciler" : "backstop reconciler (no-busy-observed)");
   }
   // Force-clear a stuck provider-turn claim directly. Unlike the idle status path
@@ -1354,36 +1372,24 @@ export class AgentRunner {
     this.logRunnerDiagnostic(`[runner] HTTP liveness update failed: ${suffix}`);
   }
+  // Runner operational diagnostics (HTTP liveness, token renewal failures). Routed
+  // through the leveled logger at warn — see logger.ts. Kept as a thin wrapper so
+  // the existing call sites and their `[runner]` framing stay put.
   private logRunnerDiagnostic(message: string): void {
-    if (this.options.headless) {
-      console.error(message);
-      return;
-    }
-    try {
-      const logDir = join(process.env.HOME || ".", ".agent-relay", "logs");
-      mkdirSync(logDir, { recursive: true });
-      appendFileSync(join(logDir, `runner-${safeLogName(this.agentId)}.log`), `[${new Date().toISOString()}] ${message}\n`);
-    } catch {
-      // Do not write runner diagnostics into an interactive provider TUI.
-    }
+    logger.warn("runner", message.replace(/^\[runner\]\s*/, ""));
   }
-  // Session-mirror diagnostics → a dedicated, ANSI-free, greppable log per agent
-  // (NOT the provider's TUI stdout, which is unreadable). This is the single place
-  // to look when chat/terminal sync misbehaves. Key transitions always log here.
+  // Session-mirror diagnostics → the leveled logger (component "mirror"), written
+  // to the dashboard-surfaced session-mirror-<agent>.log. Key transitions log at
+  // info; the single place to look when chat/terminal sync misbehaves.
   private sessionLog(message: string): void {
-    try {
-      const logDir = join(process.env.HOME || ".", ".agent-relay", "logs");
-      mkdirSync(logDir, { recursive: true });
-      appendFileSync(join(logDir, `session-mirror-${safeLogName(this.agentId)}.log`), `[${new Date().toISOString()}] ${message}\n`);
-    } catch {
-      // best-effort
-    }
+    logger.info("mirror", message);
   }
-  // Verbose, high-frequency lines (per-probe, per-emit) — only when AGENT_RELAY_SESSION_DEBUG=1.
+  // Verbose, high-frequency lines (per-probe, per-emit) — surfaced only at log
+  // level "debug" (AGENT_RELAY_LOG_LEVEL=debug, or flip live via /log-level).
   private sessionDebug(message: string): void {
-    if (this.sessionDebugVerbose) this.sessionLog(message);
+    logger.debug("mirror", message);
   }
   private ensureScratch(): void {
@@ -1648,7 +1654,7 @@ export class AgentRunner {
       })
         .then(() => true)
         .catch((error) => {
-          console.error(`[runner] task ${claim.taskId} completion update failed: ${error}`);
+          logger.error("task", `task ${claim.taskId} completion update failed: ${error}`);
           return false;
         });
       if (!ok) continue;
@@ -1953,10 +1959,6 @@ function httpErrorKey(error: unknown): string {
   return String(error);
 }
-function safeLogName(value: string): string {
-  return value.replace(/[^a-zA-Z0-9_.-]+/g, "_").slice(0, 180);
-}
 function isContextState(value: unknown): value is ContextState {
   if (!value || typeof value !== "object" || Array.isArray(value)) return false;
   const state = value as Record<string, unknown>;