agent-relay-runner 0.15.0 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-relay-runner",
3
- "version": "0.15.0",
3
+ "version": "0.16.0",
4
4
  "description": "Unified provider lifecycle runner for Agent Relay",
5
5
  "type": "module",
6
6
  "bin": {
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "agent-relay-runner",
3
3
  "description": "Thin Agent Relay runner bridge for Claude Code",
4
- "version": "0.15.0",
4
+ "version": "0.16.0",
5
5
  "agentRelayContracts": {
6
6
  "providerPluginProtocol": 1
7
7
  }
@@ -1,5 +1,7 @@
1
1
  #!/usr/bin/env bash
2
2
  set -euo pipefail
3
+ source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
4
+ relay_install_hook_guard permission-request
3
5
 
4
6
  port="${AGENT_RELAY_RUNNER_PORT:-}"
5
7
  if [[ -z "$port" ]]; then
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env bash
2
2
  set -euo pipefail
3
3
  source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
4
+ relay_install_hook_guard post-compact
4
5
 
5
6
  relay_post_timeline_status idle provider-turn "" compacted
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env bash
2
2
  set -euo pipefail
3
3
  source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
4
+ relay_install_hook_guard pre-compact
4
5
 
5
6
  relay_post_timeline_status busy provider-turn "" compacting
@@ -92,11 +92,38 @@ relay_post_session_end() {
92
92
  -d "$body" >/dev/null 2>&1 || true
93
93
  }
94
94
 
95
+ # --- Hook FATAL surfacing (#198) -------------------------------------------
96
+ # A hook that dies unexpectedly must never be silent. relay_install_hook_guard
97
+ # arms an ERR trap that reports the failure FATAL to the runner control port,
98
+ # which logs it to the dashboard-surfaced per-agent log. Best-effort and bounded
99
+ # (--max-time 2) so the report itself can never blow the hook's timeout budget.
100
+ relay_hook_fatal_report() {
101
+ local hook="${1:-unknown}" detail="${2:-}"
102
+ local port="${AGENT_RELAY_RUNNER_PORT:-}"
103
+ [ -z "$port" ] && return 0
104
+ local body="{\"hook\":\"$(relay_json_escape "$hook")\",\"error\":\"$(relay_json_escape "$detail")\"}"
105
+ curl -fsS --max-time 2 -X POST "http://127.0.0.1:${port}/hook-fatal" \
106
+ -H 'Content-Type: application/json' \
107
+ -d "$body" >/dev/null 2>&1 || true
108
+ }
109
+
110
+ relay_install_hook_guard() {
111
+ RELAY_HOOK_NAME="${1:-unknown}"
112
+ # Fires on any unhandled command failure under `set -e`/pipefail (an unbound-variable abort under `set -u` exits without running ERR) in the hook's
113
+ # main body, just before the shell exits. Reports, then lets the exit proceed.
114
+ # (ERR is not inherited into functions without `set -E`; this covers the top-level
115
+ # flow, which is where a silent death actually wedges a turn.)
116
+ trap 'relay_hook_err_rc=$?; relay_hook_fatal_report "${RELAY_HOOK_NAME:-unknown}" "exit ${relay_hook_err_rc}: ${BASH_COMMAND}"' ERR
117
+ }
118
+
95
119
  relay_pending_reply_stop_decision() {
96
120
  local port="${AGENT_RELAY_RUNNER_PORT:-}"
97
121
  [ -z "$port" ] && return 0
98
122
  local response
99
- response="$(curl -fsS "http://127.0.0.1:${port}/reply-obligations/claude-stop" 2>/dev/null || true)"
123
+ # --max-time guards the Claude Stop hook's 5s budget: a slow runner/server (e.g. an
124
+ # un-indexed obligation query) must never block past the timeout, or Claude SIGKILLs
125
+ # the hook before it clears the turn -> stuck "busy" (#199). On timeout: no block.
126
+ response="$(curl -fsS --max-time 2 "http://127.0.0.1:${port}/reply-obligations/claude-stop" 2>/dev/null || true)"
100
127
  case "$response" in
101
128
  *'"decision":"block"'*|*'"decision": "block"'*) ;;
102
129
  *) return 0 ;;
@@ -1,6 +1,7 @@
1
1
  #!/usr/bin/env bash
2
2
  set -euo pipefail
3
3
  source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
4
+ relay_install_hook_guard session-end
4
5
 
5
6
  payload="$(cat || true)"
6
7
  reason="$(relay_json_string_field reason "$payload")"
@@ -1,6 +1,7 @@
1
1
  #!/usr/bin/env bash
2
2
  set -euo pipefail
3
3
  source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
4
+ relay_install_hook_guard session-start
4
5
 
5
6
  payload="$(cat || true)"
6
7
  source_kind="$(relay_json_string_field source "$payload")"
@@ -1,6 +1,7 @@
1
1
  #!/usr/bin/env bash
2
2
  set -euo pipefail
3
3
  source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
4
+ relay_install_hook_guard stop-failure
4
5
 
5
6
  payload="$(cat || true)"
6
7
  error="$(relay_json_string_field error "$payload")"
@@ -1,17 +1,29 @@
1
1
  #!/usr/bin/env bash
2
2
  set -euo pipefail
3
3
  source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
4
+ relay_install_hook_guard stop
5
+
6
+ # Clearing the turn's busy state is the critical path (#199). Register it on EXIT
7
+ # so it runs even if a side-call below fails or times out under `set -e`. The one
8
+ # exception is the reply-obligation block path, which deliberately keeps the agent
9
+ # busy to answer — it opts out via the flag before exiting.
10
+ _relay_clear_idle_on_exit=1
11
+ trap '[ "${_relay_clear_idle_on_exit:-0}" = "1" ] && relay_post_status_clearing_subagents idle' EXIT
4
12
 
5
13
  payload="$(cat || true)"
6
14
  stop_hook_active="$(relay_json_bool_field stop_hook_active "$payload")"
7
15
  if [ "$stop_hook_active" != "true" ]; then
8
16
  last_assistant_msg="$(echo "$payload" | jq -c '.last_assistant_message // empty' 2>/dev/null || true)"
9
17
  relay_post_session_turn "$(relay_json_string_field transcript_path "$payload")" "$last_assistant_msg"
10
- stop_decision="$(relay_pending_reply_stop_decision)"
18
+ # `|| true`: under `set -e`, a non-zero from the obligation check must never abort
19
+ # the hook before the idle-clear — clearing the turn is the critical path (#199).
20
+ stop_decision="$(relay_pending_reply_stop_decision || true)"
11
21
  if [ -n "$stop_decision" ]; then
22
+ _relay_clear_idle_on_exit=0
12
23
  printf '%s\n' "$stop_decision"
13
24
  exit 0
14
25
  fi
15
26
  fi
16
27
 
17
- relay_post_status_clearing_subagents idle
28
+ # Normal turn end → the EXIT trap posts idle (always, even on an unexpected abort above).
29
+ exit 0
@@ -4,6 +4,7 @@ set -euo pipefail
4
4
  PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}"
5
5
  # shellcheck source=/dev/null
6
6
  source "${PLUGIN_ROOT}/hooks/relay-status.sh"
7
+ relay_install_hook_guard subagent-start
7
8
 
8
9
  payload="$(cat || true)"
9
10
  agent_id="$(relay_json_string_field agent_id "$payload")"
@@ -4,6 +4,7 @@ set -euo pipefail
4
4
  PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}"
5
5
  # shellcheck source=/dev/null
6
6
  source "${PLUGIN_ROOT}/hooks/relay-status.sh"
7
+ relay_install_hook_guard subagent-stop
7
8
 
8
9
  payload="$(cat || true)"
9
10
  agent_id="$(relay_json_string_field agent_id "$payload")"
@@ -1,6 +1,7 @@
1
1
  #!/usr/bin/env bash
2
2
  set -euo pipefail
3
3
  source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
4
+ relay_install_hook_guard user-prompt-submit
4
5
  payload="$(cat || true)"
5
6
  relay_post_status busy
6
7
  # Mirror a terminal/TUI-typed prompt into the dashboard chat and start reasoning
@@ -4,6 +4,7 @@ import { basename, join, resolve } from "node:path";
4
4
  import type { ContextState, Message } from "agent-relay-sdk";
5
5
  import { profileAllowsRelayFeature, providerMessageText, RELAY_CONTEXT, type ManagedProcess, type ProviderAdapter, type ProviderConfig, type ProviderPermissionDecisionInput, type ProviderSessionEvent, type ProviderStatusUpdate, type RunnerSpawnConfig, type SpawnArgs, type TerminalAttachSpec } from "../adapter";
6
6
  import { workspaceDepsNoteFromEnv } from "../relay-instructions";
7
+ import { logger } from "../logger";
7
8
 
8
9
  /** Relay context prepended to a Codex agent's first turn: the standard relay
9
10
  * blurb plus, when running in an isolated workspace, the deps caveat (#159). */
@@ -199,7 +200,7 @@ export class CodexAdapter implements ProviderAdapter {
199
200
  input = codexRelayContextBlock() + "\n\n" + input;
200
201
  process.meta = { ...(process.meta ?? {}), relayContextSent: true };
201
202
  }
202
- console.error(`[agent-relay] starting Codex initial prompt in thread ${threadId}`);
203
+ logger.info("codex", `starting Codex initial prompt in thread ${threadId}`);
203
204
  const client = process.meta?.client as CodexAppClient;
204
205
  await client.turnStart(threadId, input);
205
206
  }
@@ -211,7 +212,7 @@ export class CodexAdapter implements ProviderAdapter {
211
212
  text = codexRelayContextBlock() + "\n\n" + text;
212
213
  process.meta = { ...(process.meta ?? {}), relayContextSent: true };
213
214
  }
214
- console.error(codexDeliveryNotice(messages, threadId));
215
+ logger.info("codex", codexDeliveryNotice(messages, threadId));
215
216
  const client = process.meta?.client as CodexAppClient;
216
217
  await client.turnStart(threadId, text);
217
218
  }
@@ -1,6 +1,16 @@
1
1
  import type { Server, ServerWebSocket } from "bun";
2
2
  import type { Message, ReplyObligation } from "agent-relay-sdk";
3
3
  import type { ProviderPermissionDecisionInput, ProviderStatusEvent, SemanticStatus, TerminalAttachSpec } from "./adapter";
4
+ import { logger, parseLogLevel, LOG_LEVELS } from "./logger";
5
+
6
+ // A hook that failed in a way it could not handle itself reports here so the
7
+ // failure is never silent (#198 item 5). Phase 1 logs it FATAL to the per-agent
8
+ // log; Phase 2 (#196) will additionally route it through the runner outbox to the
9
+ // server.
10
+ export interface HookFatalReport {
11
+ hook: string;
12
+ error: string;
13
+ }
4
14
 
5
15
  interface MonitorSocketData {
6
16
  kind: "monitor";
@@ -33,6 +43,10 @@ interface ControlServerOptions {
33
43
  // transcript. transcriptPath is optional — the runner falls back to the last
34
44
  // path it saw during the session.
35
45
  onSessionEnd?(input: { reason?: string; transcriptPath?: string }): Promise<void>;
46
+ // Phase 1 observability (#198): a hook reporting an unhandled failure. The
47
+ // control server already logs it FATAL; this is the seam for Phase 2 to also
48
+ // surface it to the server via the runner outbox.
49
+ onHookFatal?(report: HookFatalReport): void;
36
50
  }
37
51
 
38
52
  export function startControlServer(options: ControlServerOptions): ControlServer {
@@ -81,6 +95,15 @@ export function startControlServer(options: ControlServerOptions): ControlServer
81
95
  if (url.pathname === "/session-end" && req.method === "POST") {
82
96
  return handleSessionEnd(req, options);
83
97
  }
98
+ if (url.pathname === "/log-level" && req.method === "GET") {
99
+ return Response.json({ level: logger.getLevel(), levels: LOG_LEVELS });
100
+ }
101
+ if (url.pathname === "/log-level" && req.method === "POST") {
102
+ return handleLogLevel(req);
103
+ }
104
+ if (url.pathname === "/hook-fatal" && req.method === "POST") {
105
+ return handleHookFatal(req, options);
106
+ }
84
107
  if (url.pathname === "/monitor") {
85
108
  const upgraded = srv.upgrade(req, { data: { kind: "monitor" } });
86
109
  return upgraded ? undefined : new Response("WebSocket upgrade failed", { status: 400 });
@@ -361,6 +384,26 @@ async function handleSessionEnd(req: Request, options: ControlServerOptions): Pr
361
384
  return Response.json({ ok: true });
362
385
  }
363
386
 
387
+ async function handleLogLevel(req: Request): Promise<Response> {
388
+ const body = await req.json().catch(() => null);
389
+ const level = parseLogLevel(isRecord(body) && typeof body.level === "string" ? body.level : undefined);
390
+ if (!level) return Response.json({ error: `level must be one of: ${LOG_LEVELS.join(", ")}` }, { status: 400 });
391
+ const previous = logger.getLevel();
392
+ logger.setLevel(level);
393
+ logger.info("logger", `log level set to ${level} (was ${previous}) via control port`);
394
+ return Response.json({ ok: true, level, previous });
395
+ }
396
+
397
+ async function handleHookFatal(req: Request, options: ControlServerOptions): Promise<Response> {
398
+ const body = await req.json().catch(() => null);
399
+ const hook = isRecord(body) && typeof body.hook === "string" && body.hook.trim() ? body.hook.trim() : "unknown";
400
+ const error = isRecord(body) && typeof body.error === "string" ? body.error : "(no detail)";
401
+ // Never silent: a hook that couldn't handle its own failure lands here as FATAL.
402
+ logger.fatal(`hook:${hook}`, error);
403
+ try { options.onHookFatal?.({ hook, error }); } catch { /* reporting must never throw back at the hook */ }
404
+ return Response.json({ ok: true });
405
+ }
406
+
364
407
  async function handleStatus(req: Request, options: ControlServerOptions): Promise<Response> {
365
408
  const body = await req.json().catch(() => null) as Partial<ProviderStatusEvent> | null;
366
409
  const status = body?.status;
package/src/logger.ts ADDED
@@ -0,0 +1,97 @@
1
+ import { appendFileSync, mkdirSync } from "node:fs";
2
+ import { join } from "node:path";
3
+
4
+ // Phase 1 observability (#198): one leveled, runtime-togglable logger for the
5
+ // Runner and the provider adapters below it. Replaces the ad-hoc scatter of
6
+ // `console.error`, `logRunnerDiagnostic` (-> runner-<agent>.log) and
7
+ // `sessionLog`/`sessionDebug` (-> session-mirror-<agent>.log) with a single
8
+ // switch and a single greppable, ANSI-free sink.
9
+ //
10
+ // Sink: the per-agent `session-mirror-<agent>.log` — the file the orchestrator
11
+ // already surfaces to the dashboard log-viewer (captureSessionMirror). One place
12
+ // to look when anything in the Runner misbehaves.
13
+ //
14
+ // Level is read once from AGENT_RELAY_LOG_LEVEL (default "info") and can be
15
+ // flipped at runtime via the control port (no restart) — so a phase refactor can
16
+ // be watched at debug without bouncing the agent.
17
+
18
+ export type LogLevel = "debug" | "info" | "warn" | "error" | "fatal";
19
+
20
+ const ORDER: Record<LogLevel, number> = { debug: 10, info: 20, warn: 30, error: 40, fatal: 50 };
21
+ export const LOG_LEVELS = Object.keys(ORDER) as LogLevel[];
22
+
23
+ export function parseLogLevel(value: string | undefined | null): LogLevel | undefined {
24
+ if (!value) return undefined;
25
+ const v = value.trim().toLowerCase();
26
+ return (LOG_LEVELS as string[]).includes(v) ? (v as LogLevel) : undefined;
27
+ }
28
+
29
+ // Must stay identical to the orchestrator's safeMirrorLogName so both resolve the
30
+ // same filename for a given agent id (runner.ts no longer carries its own safeLogName copy).
31
+ function safeLogName(value: string): string {
32
+ return value.replace(/[^a-zA-Z0-9_.-]+/g, "_").slice(0, 180);
33
+ }
34
+
35
+ export interface LoggerConfig {
36
+ agentId?: string;
37
+ level?: LogLevel;
38
+ headless?: boolean;
39
+ logDir?: string;
40
+ }
41
+
42
+ export class Logger {
43
+ private level: LogLevel;
44
+ private agentId: string;
45
+ private headless: boolean;
46
+ private logDir: string;
47
+
48
+ constructor(config: LoggerConfig = {}) {
49
+ this.level = config.level ?? parseLogLevel(process.env.AGENT_RELAY_LOG_LEVEL) ?? "info";
50
+ this.agentId = config.agentId ?? "runner";
51
+ this.headless = config.headless ?? false;
52
+ this.logDir = config.logDir ?? join(process.env.HOME || ".", ".agent-relay", "logs");
53
+ }
54
+
55
+ // Bind the logger to a concrete agent once the runner knows its id. Preserves a
56
+ // level already set via env/runtime unless an explicit level is passed.
57
+ configure(config: LoggerConfig): void {
58
+ if (config.agentId !== undefined) this.agentId = config.agentId;
59
+ if (config.headless !== undefined) this.headless = config.headless;
60
+ if (config.logDir !== undefined) this.logDir = config.logDir;
61
+ if (config.level !== undefined) this.level = config.level;
62
+ }
63
+
64
+ setLevel(level: LogLevel): void { this.level = level; }
65
+ getLevel(): LogLevel { return this.level; }
66
+ isEnabled(level: LogLevel): boolean { return ORDER[level] >= ORDER[this.level]; }
67
+
68
+ debug(component: string, message: string): void { this.log("debug", component, message); }
69
+ info(component: string, message: string): void { this.log("info", component, message); }
70
+ warn(component: string, message: string): void { this.log("warn", component, message); }
71
+ error(component: string, message: string): void { this.log("error", component, message); }
72
+ fatal(component: string, message: string): void { this.log("fatal", component, message); }
73
+
74
+ log(level: LogLevel, component: string, message: string): void {
75
+ if (!this.isEnabled(level)) return;
76
+ const line = `[${new Date().toISOString()}] ${level.toUpperCase().padEnd(5)} [${component}] ${oneLine(message)}\n`;
77
+ try {
78
+ mkdirSync(this.logDir, { recursive: true });
79
+ appendFileSync(join(this.logDir, `session-mirror-${safeLogName(this.agentId)}.log`), line);
80
+ } catch {
81
+ // Best-effort. If the per-agent file can't be written, surface error/fatal to
82
+ // stderr so it is not lost entirely (headless: lands in the orchestrator log).
83
+ if (ORDER[level] >= ORDER.error) { try { console.error(line.trimEnd()); } catch { /* give up */ } }
84
+ }
85
+ }
86
+ }
87
+
88
+ // Newlines would split one record across several log lines and break greppability;
89
+ // collapse them so a multi-line message stays one line.
90
+ function oneLine(message: string): string {
91
+ return message.replace(/\r?\n/g, " ⏎ ");
92
+ }
93
+
94
+ // Process-global logger. A runner process serves exactly one agent, so a singleton
95
+ // is the right scope; the runner calls configure() once it knows its id, and
96
+ // adapters import this instance directly (no constructor threading).
97
+ export const logger = new Logger();
package/src/runner.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  import { hostname } from "node:os";
2
- import { appendFileSync, closeSync, mkdirSync, openSync, readSync, statSync, writeFileSync } from "node:fs";
2
+ import { closeSync, mkdirSync, openSync, readSync, statSync, writeFileSync } from "node:fs";
3
3
  import { readFile } from "node:fs/promises";
4
4
  import { dirname, join } from "node:path";
5
5
  import type { AgentProfile, ContextState, Message, MessageSessionMeta, ProviderCapabilities, TaskStatusInput, WorkspaceMetadata } from "agent-relay-sdk";
@@ -13,6 +13,7 @@ import { extractLastAssistantTurn, extractFinalAssistantMessage, extractHookAssi
13
13
  import { agentProfileProjectionReport } from "./profile-projection";
14
14
  import { profileUsesHostProviderGlobals } from "./profile-home";
15
15
  import { runtimeMetadata } from "./version";
16
+ import { logger, parseLogLevel } from "./logger";
16
17
  import { ensureSessionScratch, reapSessionScratch, sweepStaleSessions, type SessionScratchLayout } from "./session-scratch";
17
18
 
18
19
  interface RunnerOptions {
@@ -76,12 +77,18 @@ const LOG_TAIL_BYTES = 128 * 1024;
76
77
  const PROMPT_ECHO_DEDUP_MS = 30_000;
77
78
  // Busy reconciler: a conservative LAST-RESORT backstop for a turn that ended
78
79
  // without the provider's Stop hook clearing busy (e.g. ESC straight into the web
79
- // terminal). It must never fire during a live turn, so it (a) only counts idle
80
- // after it has actually observed the provider busy, and (b) requires a long,
80
+ // terminal). It must never fire during a live turn, so it requires a long,
81
81
  // unbroken idle streak — an active turn shows its working spinner well within
82
82
  // this window, which resets the streak. ~32s of uninterrupted idle = really done.
83
83
  const BUSY_RECONCILE_POLL_MS = 4_000;
84
84
  const BUSY_RECONCILE_IDLE_CONFIRM = 8;
85
+ // When the reconciler never observed the provider busy this turn (a turn faster
86
+ // than the 4s poll — common for short voice/autosend replies), it can't trust a
87
+ // quick idle the way it does after seeing the spinner. But refusing forever wedged
88
+ // fast turns in "busy" when the Stop hook's idle was lost (#199). So we still
89
+ // force-clear, just after a longer unbroken-idle window (15 probes ≈ 60s, vs 8 ≈ 32s) — an active turn would
90
+ // have flashed its spinner into at least one of these probes and reset the streak.
91
+ const BUSY_RECONCILE_IDLE_CONFIRM_NO_BUSY = 15;
85
92
  // After a dashboard interrupt, give the provider a moment to drop out of its turn,
86
93
  // then reconcile immediately so the user sees "stopped" without waiting for the backstop.
87
94
  const INTERRUPT_RECONCILE_DELAY_MS = 1_500;
@@ -171,6 +178,14 @@ export class AgentRunner {
171
178
 
172
179
  constructor(private readonly options: RunnerOptions) {
173
180
  this.agentId = options.agentId ?? options.runnerId;
181
+ // Bind the process-global logger to this agent. AGENT_RELAY_SESSION_DEBUG=1 is
182
+ // kept as a back-compat alias for the verbose probe/emit lines, now expressed
183
+ // as log level "debug" (AGENT_RELAY_LOG_LEVEL still wins when both are set).
184
+ logger.configure({
185
+ agentId: this.agentId,
186
+ headless: options.headless,
187
+ ...(this.sessionDebugVerbose && !parseLogLevel(process.env.AGENT_RELAY_LOG_LEVEL) ? { level: "debug" as const } : {}),
188
+ });
174
189
  this.currentToken = options.token;
175
190
  this.currentTokenJti = options.tokenJti;
176
191
  this.currentTokenProfileId = options.tokenProfileId;
@@ -381,7 +396,7 @@ export class AgentRunner {
381
396
  startedAt: this.options.startedAt,
382
397
  }, null, 2) + "\n", { mode: 0o600 });
383
398
  } catch (error) {
384
- console.error(`[runner] failed to write runner info file: ${error}`);
399
+ logger.error("runner", `failed to write runner info file: ${error}`);
385
400
  }
386
401
  }
387
402
 
@@ -397,7 +412,7 @@ export class AgentRunner {
397
412
  const messages = await this.http.pollMessages({ for: this.agentId, unread: true, limit: 100 });
398
413
  for (const message of messages) this.enqueueMessage(message);
399
414
  } catch (error) {
400
- console.error(`[runner] inbox bootstrap failed: ${error}`);
415
+ logger.error("runner", `inbox bootstrap failed: ${error}`);
401
416
  }
402
417
  }
403
418
 
@@ -407,7 +422,7 @@ export class AgentRunner {
407
422
  try {
408
423
  await this.options.adapter.deliverInitialPrompt(this.process, prompt);
409
424
  } catch (error) {
410
- console.error(`[runner] initial prompt delivery failed: ${error}`);
425
+ logger.error("runner", `initial prompt delivery failed: ${error}`);
411
426
  }
412
427
  }
413
428
 
@@ -444,7 +459,7 @@ export class AgentRunner {
444
459
  status: "in_progress",
445
460
  agentId: this.agentId,
446
461
  metadata: { messageId: message.id, completedBy: "runner" },
447
- }).catch((error) => console.error(`[runner] task ${taskId} in_progress update failed: ${error}`));
462
+ }).catch((error) => logger.error("task", `task ${taskId} in_progress update failed: ${error}`));
448
463
  // Runner owns claim + status here; drop the server's self-claim instruction
449
464
  // so the agent doesn't improvise a stray claim send (see stripRunnerClaimedGuidance).
450
465
  toDeliver = { ...message, body: stripRunnerClaimedGuidance(message.body) };
@@ -462,7 +477,7 @@ export class AgentRunner {
462
477
  try {
463
478
  const prepared = await messagesWithCachedAttachments(deliverable, this.http, {
464
479
  agentId: this.agentId,
465
- onError: (message) => console.error(`[runner] ${message}`),
480
+ onError: (message) => logger.error("runner", message),
466
481
  });
467
482
  await this.options.adapter.deliver(this.process, prepared);
468
483
  for (const message of deliverable) {
@@ -471,7 +486,7 @@ export class AgentRunner {
471
486
  }
472
487
  } catch (error) {
473
488
  failed = true;
474
- if (shouldLogDeliveryFailure(error)) console.error(`[runner] message delivery failed: ${error}`);
489
+ if (shouldLogDeliveryFailure(error)) logger.warn("delivery", `message delivery failed: ${error}`);
475
490
  for (const message of deliverable) {
476
491
  this.clearActiveClaim(message);
477
492
  this.pendingMessages.set(message.id, message);
@@ -539,7 +554,7 @@ export class AgentRunner {
539
554
  await this.http.deleteAgent(this.agentId).catch(() => {});
540
555
  if (this.options.exitProcessOnShutdown !== false) {
541
556
  setTimeout(() => void this.stop().catch((error) => {
542
- console.error(`[runner] stop after command failed: ${error}`);
557
+ logger.error("lifecycle", `stop after command failed: ${error}`);
543
558
  }).finally(() => process.exit(0)), 10);
544
559
  }
545
560
  } else if (!this.stopped) {
@@ -674,7 +689,7 @@ export class AgentRunner {
674
689
 
675
690
  if (this.shouldStopUnexpectedProviderExit(diagnostics)) {
676
691
  const hasResumeId = typeof diagnostics.claudeResumeId === "string" && diagnostics.claudeResumeId.length > 0;
677
- console.warn(`[runner] ${this.options.provider} exited; leaving agent offline for manual recovery`);
692
+ logger.warn("lifecycle", `${this.options.provider} exited; leaving agent offline for manual recovery`);
678
693
  this.publishRunnerTimelineEvent({
679
694
  status: "provider.restart_decision",
680
695
  id: `provider-restart-decision-${this.providerSessionId}-${now}`,
@@ -702,7 +717,7 @@ export class AgentRunner {
702
717
  }
703
718
 
704
719
  if (runtimeMs < RAPID_EXIT_MS && recent.length > MAX_RAPID_UNEXPECTED_EXITS) {
705
- console.error(`[runner] provider session exited ${recent.length} times within ${Math.round(UNEXPECTED_EXIT_WINDOW_MS / 1000)}s; giving up`);
720
+ logger.error("lifecycle", `provider session exited ${recent.length} times within ${Math.round(UNEXPECTED_EXIT_WINDOW_MS / 1000)}s; giving up`);
706
721
  this.publishRunnerTimelineEvent({
707
722
  status: "provider.restart_decision",
708
723
  id: `provider-restart-decision-${this.providerSessionId}-${now}`,
@@ -726,7 +741,7 @@ export class AgentRunner {
726
741
  }
727
742
 
728
743
  const delayMs = Math.min(10_000, Math.max(500, 500 * recent.length));
729
- console.warn(`[runner] provider session exited unexpectedly after ${Math.round(runtimeMs / 1000)}s; restarting in ${delayMs}ms`);
744
+ logger.warn("lifecycle", `provider session exited unexpectedly after ${Math.round(runtimeMs / 1000)}s; restarting in ${delayMs}ms`);
730
745
  this.publishRunnerTimelineEvent({
731
746
  status: "provider.restart_decision",
732
747
  id: `provider-restart-decision-${this.providerSessionId}-${now}`,
@@ -751,7 +766,7 @@ export class AgentRunner {
751
766
  this.publishStatus();
752
767
  this.scheduleDrain();
753
768
  } catch (error) {
754
- console.error(`[runner] provider restart after unexpected exit failed: ${error}`);
769
+ logger.error("lifecycle", `provider restart after unexpected exit failed: ${error}`);
755
770
  this.setProviderStatus("error");
756
771
  this.options.onProviderExit?.(1);
757
772
  }
@@ -826,10 +841,10 @@ export class AgentRunner {
826
841
  private handleBusError(code: string, message: string): void {
827
842
  const action = runnerBusErrorAction(code, this.stopped);
828
843
  if (action === "ignore") return;
829
- console.error(`[runner] bus error ${code}: ${message}`);
844
+ logger.error("bus", `bus error ${code}: ${message}`);
830
845
  if (action === "stop") {
831
846
  void this.stop().catch((error) => {
832
- console.error(`[runner] stop after bus error failed: ${error}`);
847
+ logger.error("bus", `stop after bus error failed: ${error}`);
833
848
  }).finally(() => process.exit(0));
834
849
  }
835
850
  }
@@ -1146,18 +1161,21 @@ export class AgentRunner {
1146
1161
  let activity: "busy" | "idle" | "unknown";
1147
1162
  try { activity = await this.options.adapter.probeActivity(this.process); } catch { return; }
1148
1163
  if (activity === "busy") this.busyReconcileSawBusy = true;
1149
- // Reset the streak on anything that isn't a confident idle — and never start
1150
- // counting until we've actually observed the provider busy this turn.
1151
- if (activity !== "idle" || !this.busyReconcileSawBusy) {
1152
- if (activity !== "idle") this.busyReconcileIdleStreak = 0;
1164
+ // Reset the streak on anything that isn't a confident idle.
1165
+ if (activity !== "idle") {
1166
+ this.busyReconcileIdleStreak = 0;
1153
1167
  this.sessionDebug(`reconcile probe=${activity} sawBusy=${this.busyReconcileSawBusy} streak=${this.busyReconcileIdleStreak}`);
1154
1168
  return;
1155
1169
  }
1156
1170
  this.busyReconcileIdleStreak += 1;
1157
- this.sessionDebug(`reconcile probe=idle streak=${this.busyReconcileIdleStreak}/${BUSY_RECONCILE_IDLE_CONFIRM}`);
1158
- if (this.busyReconcileIdleStreak < BUSY_RECONCILE_IDLE_CONFIRM) return;
1171
+ // Confirm faster once we've seen the spinner this turn; otherwise demand a much
1172
+ // longer all-idle window before trusting it (rescues fast turns without
1173
+ // false-clearing a live turn that simply hasn't flashed busy into a probe yet).
1174
+ const confirm = this.busyReconcileSawBusy ? BUSY_RECONCILE_IDLE_CONFIRM : BUSY_RECONCILE_IDLE_CONFIRM_NO_BUSY;
1175
+ this.sessionDebug(`reconcile probe=idle sawBusy=${this.busyReconcileSawBusy} streak=${this.busyReconcileIdleStreak}/${confirm}`);
1176
+ if (this.busyReconcileIdleStreak < confirm) return;
1159
1177
  this.disarmBusyReconciler();
1160
- this.forceClearProviderTurn("backstop reconciler");
1178
+ this.forceClearProviderTurn(this.busyReconcileSawBusy ? "backstop reconciler" : "backstop reconciler (no-busy-observed)");
1161
1179
  }
1162
1180
 
1163
1181
  // Force-clear a stuck provider-turn claim directly. Unlike the idle status path
@@ -1354,36 +1372,24 @@ export class AgentRunner {
1354
1372
  this.logRunnerDiagnostic(`[runner] HTTP liveness update failed: ${suffix}`);
1355
1373
  }
1356
1374
 
1375
+ // Runner operational diagnostics (HTTP liveness, token renewal failures). Routed
1376
+ // through the leveled logger at warn — see logger.ts. Kept as a thin wrapper so
1377
+ // the existing call sites stay put; their leading `[runner]` is stripped here because the logger already tags each line with the `[runner]` component.
1357
1378
  private logRunnerDiagnostic(message: string): void {
1358
- if (this.options.headless) {
1359
- console.error(message);
1360
- return;
1361
- }
1362
- try {
1363
- const logDir = join(process.env.HOME || ".", ".agent-relay", "logs");
1364
- mkdirSync(logDir, { recursive: true });
1365
- appendFileSync(join(logDir, `runner-${safeLogName(this.agentId)}.log`), `[${new Date().toISOString()}] ${message}\n`);
1366
- } catch {
1367
- // Do not write runner diagnostics into an interactive provider TUI.
1368
- }
1379
+ logger.warn("runner", message.replace(/^\[runner\]\s*/, ""));
1369
1380
  }
1370
1381
 
1371
- // Session-mirror diagnostics → a dedicated, ANSI-free, greppable log per agent
1372
- // (NOT the provider's TUI stdout, which is unreadable). This is the single place
1373
- // to look when chat/terminal sync misbehaves. Key transitions always log here.
1382
+ // Session-mirror diagnostics → the leveled logger (component "mirror"), written
1383
+ // to the dashboard-surfaced session-mirror-<agent>.log. Key transitions log at
1384
+ // info; the single place to look when chat/terminal sync misbehaves.
1374
1385
  private sessionLog(message: string): void {
1375
- try {
1376
- const logDir = join(process.env.HOME || ".", ".agent-relay", "logs");
1377
- mkdirSync(logDir, { recursive: true });
1378
- appendFileSync(join(logDir, `session-mirror-${safeLogName(this.agentId)}.log`), `[${new Date().toISOString()}] ${message}\n`);
1379
- } catch {
1380
- // best-effort
1381
- }
1386
+ logger.info("mirror", message);
1382
1387
  }
1383
1388
 
1384
- // Verbose, high-frequency lines (per-probe, per-emit) — only when AGENT_RELAY_SESSION_DEBUG=1.
1389
+ // Verbose, high-frequency lines (per-probe, per-emit) — surfaced only at log
1390
+ // level "debug" (AGENT_RELAY_LOG_LEVEL=debug, or flip live via /log-level).
1385
1391
  private sessionDebug(message: string): void {
1386
- if (this.sessionDebugVerbose) this.sessionLog(message);
1392
+ logger.debug("mirror", message);
1387
1393
  }
1388
1394
 
1389
1395
  private ensureScratch(): void {
@@ -1648,7 +1654,7 @@ export class AgentRunner {
1648
1654
  })
1649
1655
  .then(() => true)
1650
1656
  .catch((error) => {
1651
- console.error(`[runner] task ${claim.taskId} completion update failed: ${error}`);
1657
+ logger.error("task", `task ${claim.taskId} completion update failed: ${error}`);
1652
1658
  return false;
1653
1659
  });
1654
1660
  if (!ok) continue;
@@ -1953,10 +1959,6 @@ function httpErrorKey(error: unknown): string {
1953
1959
  return String(error);
1954
1960
  }
1955
1961
 
1956
- function safeLogName(value: string): string {
1957
- return value.replace(/[^a-zA-Z0-9_.-]+/g, "_").slice(0, 180);
1958
- }
1959
-
1960
1962
  function isContextState(value: unknown): value is ContextState {
1961
1963
  if (!value || typeof value !== "object" || Array.isArray(value)) return false;
1962
1964
  const state = value as Record<string, unknown>;