agent-relay-runner 0.15.1 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-relay-runner",
3
- "version": "0.15.1",
3
+ "version": "0.17.0",
4
4
  "description": "Unified provider lifecycle runner for Agent Relay",
5
5
  "type": "module",
6
6
  "bin": {
@@ -20,7 +20,7 @@
20
20
  "directory": "runner"
21
21
  },
22
22
  "dependencies": {
23
- "agent-relay-sdk": "0.2.7"
23
+ "agent-relay-sdk": "0.2.8"
24
24
  },
25
25
  "devDependencies": {
26
26
  "@types/bun": "latest",
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "agent-relay-runner",
3
3
  "description": "Thin Agent Relay runner bridge for Claude Code",
4
- "version": "0.15.1",
4
+ "version": "0.17.0",
5
5
  "agentRelayContracts": {
6
6
  "providerPluginProtocol": 1
7
7
  }
@@ -1,5 +1,7 @@
1
1
  #!/usr/bin/env bash
2
2
  set -euo pipefail
3
+ source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
4
+ relay_install_hook_guard permission-request
3
5
 
4
6
  port="${AGENT_RELAY_RUNNER_PORT:-}"
5
7
  if [[ -z "$port" ]]; then
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env bash
2
2
  set -euo pipefail
3
3
  source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
4
+ relay_install_hook_guard post-compact
4
5
 
5
6
  relay_post_timeline_status idle provider-turn "" compacted
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env bash
2
2
  set -euo pipefail
3
3
  source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
4
+ relay_install_hook_guard pre-compact
4
5
 
5
6
  relay_post_timeline_status busy provider-turn "" compacting
@@ -92,6 +92,30 @@ relay_post_session_end() {
92
92
  -d "$body" >/dev/null 2>&1 || true
93
93
  }
94
94
 
95
+ # --- Hook FATAL surfacing (#198) -------------------------------------------
96
+ # A hook that dies unexpectedly must never be silent. relay_install_hook_guard
97
+ # arms an ERR trap that reports the failure FATAL to the runner control port,
98
+ # which logs it to the dashboard-surfaced per-agent log. Best-effort and bounded
99
+ # (--max-time 2) so the report itself can never blow the hook's timeout budget.
100
+ relay_hook_fatal_report() {
101
+ local hook="${1:-unknown}" detail="${2:-}"
102
+ local port="${AGENT_RELAY_RUNNER_PORT:-}"
103
+ [ -z "$port" ] && return 0
104
+ local body="{\"hook\":\"$(relay_json_escape "$hook")\",\"error\":\"$(relay_json_escape "$detail")\"}"
105
+ curl -fsS --max-time 2 -X POST "http://127.0.0.1:${port}/hook-fatal" \
106
+ -H 'Content-Type: application/json' \
107
+ -d "$body" >/dev/null 2>&1 || true
108
+ }
109
+
110
+ relay_install_hook_guard() {
111
+ RELAY_HOOK_NAME="${1:-unknown}"
112
+ # Fires on any unhandled command failure under `set -e`/pipefail (a `set -u` unbound-variable abort exits without running ERR) in the hook's
113
+ # main body, just before the shell exits. Reports, then lets the exit proceed.
114
+ # (ERR is not inherited into functions without `set -E`; this covers the top-level
115
+ # flow, which is where a silent death actually wedges a turn.)
116
+ trap 'relay_hook_err_rc=$?; relay_hook_fatal_report "${RELAY_HOOK_NAME:-unknown}" "exit ${relay_hook_err_rc}: ${BASH_COMMAND}"' ERR
117
+ }
118
+
95
119
  relay_pending_reply_stop_decision() {
96
120
  local port="${AGENT_RELAY_RUNNER_PORT:-}"
97
121
  [ -z "$port" ] && return 0
@@ -1,6 +1,7 @@
1
1
  #!/usr/bin/env bash
2
2
  set -euo pipefail
3
3
  source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
4
+ relay_install_hook_guard session-end
4
5
 
5
6
  payload="$(cat || true)"
6
7
  reason="$(relay_json_string_field reason "$payload")"
@@ -1,6 +1,7 @@
1
1
  #!/usr/bin/env bash
2
2
  set -euo pipefail
3
3
  source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
4
+ relay_install_hook_guard session-start
4
5
 
5
6
  payload="$(cat || true)"
6
7
  source_kind="$(relay_json_string_field source "$payload")"
@@ -1,6 +1,7 @@
1
1
  #!/usr/bin/env bash
2
2
  set -euo pipefail
3
3
  source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
4
+ relay_install_hook_guard stop-failure
4
5
 
5
6
  payload="$(cat || true)"
6
7
  error="$(relay_json_string_field error "$payload")"
@@ -1,6 +1,14 @@
1
1
  #!/usr/bin/env bash
2
2
  set -euo pipefail
3
3
  source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
4
+ relay_install_hook_guard stop
5
+
6
+ # Clearing the turn's busy state is the critical path (#199). Register it on EXIT
7
+ # so it runs even if a side-call below fails or times out under `set -e`. The one
8
+ # exception is the reply-obligation block path, which deliberately keeps the agent
9
+ # busy to answer — it opts out via the flag before exiting.
10
+ _relay_clear_idle_on_exit=1
11
+ trap '[ "${_relay_clear_idle_on_exit:-0}" = "1" ] && relay_post_status_clearing_subagents idle' EXIT
4
12
 
5
13
  payload="$(cat || true)"
6
14
  stop_hook_active="$(relay_json_bool_field stop_hook_active "$payload")"
@@ -8,12 +16,14 @@ if [ "$stop_hook_active" != "true" ]; then
8
16
  last_assistant_msg="$(echo "$payload" | jq -c '.last_assistant_message // empty' 2>/dev/null || true)"
9
17
  relay_post_session_turn "$(relay_json_string_field transcript_path "$payload")" "$last_assistant_msg"
10
18
  # `|| true`: under `set -e`, a non-zero from the obligation check must never abort
11
- # the hook before the idle-clear below — clearing the turn is the critical path (#199).
19
+ # the hook before the idle-clear — clearing the turn is the critical path (#199).
12
20
  stop_decision="$(relay_pending_reply_stop_decision || true)"
13
21
  if [ -n "$stop_decision" ]; then
22
+ _relay_clear_idle_on_exit=0
14
23
  printf '%s\n' "$stop_decision"
15
24
  exit 0
16
25
  fi
17
26
  fi
18
27
 
19
- relay_post_status_clearing_subagents idle
28
+ # Normal turn end → the EXIT trap posts idle (always, even on an unexpected abort above).
29
+ exit 0
@@ -4,6 +4,7 @@ set -euo pipefail
4
4
  PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}"
5
5
  # shellcheck source=/dev/null
6
6
  source "${PLUGIN_ROOT}/hooks/relay-status.sh"
7
+ relay_install_hook_guard subagent-start
7
8
 
8
9
  payload="$(cat || true)"
9
10
  agent_id="$(relay_json_string_field agent_id "$payload")"
@@ -4,6 +4,7 @@ set -euo pipefail
4
4
  PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}"
5
5
  # shellcheck source=/dev/null
6
6
  source "${PLUGIN_ROOT}/hooks/relay-status.sh"
7
+ relay_install_hook_guard subagent-stop
7
8
 
8
9
  payload="$(cat || true)"
9
10
  agent_id="$(relay_json_string_field agent_id "$payload")"
@@ -1,6 +1,7 @@
1
1
  #!/usr/bin/env bash
2
2
  set -euo pipefail
3
3
  source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
4
+ relay_install_hook_guard user-prompt-submit
4
5
  payload="$(cat || true)"
5
6
  relay_post_status busy
6
7
  # Mirror a terminal/TUI-typed prompt into the dashboard chat and start reasoning
@@ -4,6 +4,7 @@ import { basename, join, resolve } from "node:path";
4
4
  import type { ContextState, Message } from "agent-relay-sdk";
5
5
  import { profileAllowsRelayFeature, providerMessageText, RELAY_CONTEXT, type ManagedProcess, type ProviderAdapter, type ProviderConfig, type ProviderPermissionDecisionInput, type ProviderSessionEvent, type ProviderStatusUpdate, type RunnerSpawnConfig, type SpawnArgs, type TerminalAttachSpec } from "../adapter";
6
6
  import { workspaceDepsNoteFromEnv } from "../relay-instructions";
7
+ import { logger } from "../logger";
7
8
 
8
9
  /** Relay context prepended to a Codex agent's first turn: the standard relay
9
10
  * blurb plus, when running in an isolated workspace, the deps caveat (#159). */
@@ -199,7 +200,7 @@ export class CodexAdapter implements ProviderAdapter {
199
200
  input = codexRelayContextBlock() + "\n\n" + input;
200
201
  process.meta = { ...(process.meta ?? {}), relayContextSent: true };
201
202
  }
202
- console.error(`[agent-relay] starting Codex initial prompt in thread ${threadId}`);
203
+ logger.info("codex", `starting Codex initial prompt in thread ${threadId}`);
203
204
  const client = process.meta?.client as CodexAppClient;
204
205
  await client.turnStart(threadId, input);
205
206
  }
@@ -211,7 +212,7 @@ export class CodexAdapter implements ProviderAdapter {
211
212
  text = codexRelayContextBlock() + "\n\n" + text;
212
213
  process.meta = { ...(process.meta ?? {}), relayContextSent: true };
213
214
  }
214
- console.error(codexDeliveryNotice(messages, threadId));
215
+ logger.info("codex", codexDeliveryNotice(messages, threadId));
215
216
  const client = process.meta?.client as CodexAppClient;
216
217
  await client.turnStart(threadId, text);
217
218
  }
@@ -1,6 +1,16 @@
1
1
  import type { Server, ServerWebSocket } from "bun";
2
2
  import type { Message, ReplyObligation } from "agent-relay-sdk";
3
3
  import type { ProviderPermissionDecisionInput, ProviderStatusEvent, SemanticStatus, TerminalAttachSpec } from "./adapter";
4
+ import { logger, parseLogLevel, LOG_LEVELS } from "./logger";
5
+
6
+ // A hook that failed in a way it could not handle itself reports here so the
7
+ // failure is never silent (#198 item 5). Phase 1 logs it FATAL to the per-agent
8
+ // log; Phase 2 (#196) will additionally route it through the runner outbox to the
9
+ // server.
10
+ export interface HookFatalReport {
11
+ hook: string;
12
+ error: string;
13
+ }
4
14
 
5
15
  interface MonitorSocketData {
6
16
  kind: "monitor";
@@ -33,6 +43,10 @@ interface ControlServerOptions {
33
43
  // transcript. transcriptPath is optional — the runner falls back to the last
34
44
  // path it saw during the session.
35
45
  onSessionEnd?(input: { reason?: string; transcriptPath?: string }): Promise<void>;
46
+ // Phase 1 observability (#198): a hook reporting an unhandled failure. The
47
+ // control server already logs it FATAL; this is the seam for Phase 2 to also
48
+ // surface it to the server via the runner outbox.
49
+ onHookFatal?(report: HookFatalReport): void;
36
50
  }
37
51
 
38
52
  export function startControlServer(options: ControlServerOptions): ControlServer {
@@ -81,6 +95,15 @@ export function startControlServer(options: ControlServerOptions): ControlServer
81
95
  if (url.pathname === "/session-end" && req.method === "POST") {
82
96
  return handleSessionEnd(req, options);
83
97
  }
98
+ if (url.pathname === "/log-level" && req.method === "GET") {
99
+ return Response.json({ level: logger.getLevel(), levels: LOG_LEVELS });
100
+ }
101
+ if (url.pathname === "/log-level" && req.method === "POST") {
102
+ return handleLogLevel(req);
103
+ }
104
+ if (url.pathname === "/hook-fatal" && req.method === "POST") {
105
+ return handleHookFatal(req, options);
106
+ }
84
107
  if (url.pathname === "/monitor") {
85
108
  const upgraded = srv.upgrade(req, { data: { kind: "monitor" } });
86
109
  return upgraded ? undefined : new Response("WebSocket upgrade failed", { status: 400 });
@@ -361,6 +384,26 @@ async function handleSessionEnd(req: Request, options: ControlServerOptions): Pr
361
384
  return Response.json({ ok: true });
362
385
  }
363
386
 
387
+ async function handleLogLevel(req: Request): Promise<Response> {
388
+ const body = await req.json().catch(() => null);
389
+ const level = parseLogLevel(isRecord(body) && typeof body.level === "string" ? body.level : undefined);
390
+ if (!level) return Response.json({ error: `level must be one of: ${LOG_LEVELS.join(", ")}` }, { status: 400 });
391
+ const previous = logger.getLevel();
392
+ logger.setLevel(level);
393
+ logger.info("logger", `log level set to ${level} (was ${previous}) via control port`);
394
+ return Response.json({ ok: true, level, previous });
395
+ }
396
+
397
+ async function handleHookFatal(req: Request, options: ControlServerOptions): Promise<Response> {
398
+ const body = await req.json().catch(() => null);
399
+ const hook = isRecord(body) && typeof body.hook === "string" && body.hook.trim() ? body.hook.trim() : "unknown";
400
+ const error = isRecord(body) && typeof body.error === "string" ? body.error : "(no detail)";
401
+ // Never silent: a hook that couldn't handle its own failure lands here as FATAL.
402
+ logger.fatal(`hook:${hook}`, error);
403
+ try { options.onHookFatal?.({ hook, error }); } catch { /* reporting must never throw back at the hook */ }
404
+ return Response.json({ ok: true });
405
+ }
406
+
364
407
  async function handleStatus(req: Request, options: ControlServerOptions): Promise<Response> {
365
408
  const body = await req.json().catch(() => null) as Partial<ProviderStatusEvent> | null;
366
409
  const status = body?.status;
package/src/logger.ts ADDED
@@ -0,0 +1,97 @@
1
+ import { appendFileSync, mkdirSync } from "node:fs";
2
+ import { join } from "node:path";
3
+
4
+ // Phase 1 observability (#198): one leveled, runtime-togglable logger for the
5
+ // Runner and the provider adapters below it. Replaces the ad-hoc scatter of
6
+ // `console.error`, `logRunnerDiagnostic` (-> runner-<agent>.log) and
7
+ // `sessionLog`/`sessionDebug` (-> session-mirror-<agent>.log) with a single
8
+ // switch and a single greppable, ANSI-free sink.
9
+ //
10
+ // Sink: the per-agent `session-mirror-<agent>.log` — the file the orchestrator
11
+ // already surfaces to the dashboard log-viewer (captureSessionMirror). One place
12
+ // to look when anything in the Runner misbehaves.
13
+ //
14
+ // Level is read once from AGENT_RELAY_LOG_LEVEL (default "info") and can be
15
+ // flipped at runtime via the control port (no restart) — so a phase refactor can
16
+ // be watched at debug without bouncing the agent.
17
+
18
+ export type LogLevel = "debug" | "info" | "warn" | "error" | "fatal";
19
+
20
+ const ORDER: Record<LogLevel, number> = { debug: 10, info: 20, warn: 30, error: 40, fatal: 50 };
21
+ export const LOG_LEVELS = Object.keys(ORDER) as LogLevel[];
22
+
23
+ export function parseLogLevel(value: string | undefined | null): LogLevel | undefined {
24
+ if (!value) return undefined;
25
+ const v = value.trim().toLowerCase();
26
+ return (LOG_LEVELS as string[]).includes(v) ? (v as LogLevel) : undefined;
27
+ }
28
+
29
+ // Matches the runner's safeLogName and the orchestrator's safeMirrorLogName so all
30
+ // three resolve the identical filename for a given agent id.
31
+ function safeLogName(value: string): string {
32
+ return value.replace(/[^a-zA-Z0-9_.-]+/g, "_").slice(0, 180);
33
+ }
34
+
35
+ export interface LoggerConfig {
36
+ agentId?: string;
37
+ level?: LogLevel;
38
+ headless?: boolean;
39
+ logDir?: string;
40
+ }
41
+
42
+ export class Logger {
43
+ private level: LogLevel;
44
+ private agentId: string;
45
+ private headless: boolean;
46
+ private logDir: string;
47
+
48
+ constructor(config: LoggerConfig = {}) {
49
+ this.level = config.level ?? parseLogLevel(process.env.AGENT_RELAY_LOG_LEVEL) ?? "info";
50
+ this.agentId = config.agentId ?? "runner";
51
+ this.headless = config.headless ?? false;
52
+ this.logDir = config.logDir ?? join(process.env.HOME || ".", ".agent-relay", "logs");
53
+ }
54
+
55
+ // Bind the logger to a concrete agent once the runner knows its id. Preserves a
56
+ // level already set via env/runtime unless an explicit level is passed.
57
+ configure(config: LoggerConfig): void {
58
+ if (config.agentId !== undefined) this.agentId = config.agentId;
59
+ if (config.headless !== undefined) this.headless = config.headless;
60
+ if (config.logDir !== undefined) this.logDir = config.logDir;
61
+ if (config.level !== undefined) this.level = config.level;
62
+ }
63
+
64
+ setLevel(level: LogLevel): void { this.level = level; }
65
+ getLevel(): LogLevel { return this.level; }
66
+ isEnabled(level: LogLevel): boolean { return ORDER[level] >= ORDER[this.level]; }
67
+
68
+ debug(component: string, message: string): void { this.log("debug", component, message); }
69
+ info(component: string, message: string): void { this.log("info", component, message); }
70
+ warn(component: string, message: string): void { this.log("warn", component, message); }
71
+ error(component: string, message: string): void { this.log("error", component, message); }
72
+ fatal(component: string, message: string): void { this.log("fatal", component, message); }
73
+
74
+ log(level: LogLevel, component: string, message: string): void {
75
+ if (!this.isEnabled(level)) return;
76
+ const line = `[${new Date().toISOString()}] ${level.toUpperCase().padEnd(5)} [${component}] ${oneLine(message)}\n`;
77
+ try {
78
+ mkdirSync(this.logDir, { recursive: true });
79
+ appendFileSync(join(this.logDir, `session-mirror-${safeLogName(this.agentId)}.log`), line);
80
+ } catch {
81
+ // Best-effort. If the per-agent file can't be written, surface error/fatal to
82
+ // stderr so it is not lost entirely (headless: lands in the orchestrator log).
83
+ if (ORDER[level] >= ORDER.error) { try { console.error(line.trimEnd()); } catch { /* give up */ } }
84
+ }
85
+ }
86
+ }
87
+
88
+ // Newlines would split one record across several log lines and break greppability;
89
+ // collapse them so a multi-line message stays one line.
90
+ function oneLine(message: string): string {
91
+ return message.replace(/\r?\n/g, " ⏎ ");
92
+ }
93
+
94
+ // Process-global logger. A runner process serves exactly one agent, so a singleton
95
+ // is the right scope; the runner calls configure() once it knows its id, and
96
+ // adapters import this instance directly (no constructor threading).
97
+ export const logger = new Logger();
package/src/outbox.ts ADDED
@@ -0,0 +1,303 @@
1
+ import { Database } from "bun:sqlite";
2
+ import { mkdirSync } from "node:fs";
3
+ import { dirname, join } from "node:path";
4
+ import { tmpdir } from "node:os";
5
+ import { logger } from "./logger";
6
+
7
+ // Phase 2 (#196) — the "nothing is ever lost" half. Runner→server events that used to be
8
+ // fire-and-forget over HTTP (session turns, reasoning/tool traces, prompt echoes, insights,
9
+ // hook-fatal reports) were silently dropped whenever the server was momentarily down. This
10
+ // is a durable, FIFO, disk-backed queue that:
11
+ // - survives Runner/server restart (bun:sqlite file in the runtime dir),
12
+ // - stamps true event time (`occurredAt`) once at enqueue and preserves it through retries,
13
+ // - retries with capped exponential backoff, strictly in order (an append log must not
14
+ // reorder turns),
15
+ // - poisons a permanently-failing head after maxAttempts so it can't block the queue,
16
+ // - is bounded with a logged drop policy (never silently truncates).
17
+ //
18
+ // Status deliberately does NOT go through here: it rides the WebSocket bus, which is
19
+ // last-wins and self-heals on reconnect (so it already satisfies "coalesce, don't replay
20
+ // stale busy states"). The coalesce mode below exists so a future state event could migrate here.
21
+
22
+ export type OutboxMode = "append" | "coalesce";
23
+
24
+ export interface OutboxEventInput {
25
+ kind: string;
26
+ payload: unknown;
27
+ mode?: OutboxMode;
28
+ // Required for coalesce mode: prior un-poisoned rows with the same dedupeKey are replaced.
29
+ dedupeKey?: string;
30
+ // Defaults to now. Set explicitly only to backdate (e.g. replaying a captured timestamp).
31
+ occurredAt?: number;
32
+ // Defaults to a stable derived key so server-side dedup makes retries exactly-once.
33
+ idempotencyKey?: string;
34
+ }
35
+
36
+ export interface OutboxRecord {
37
+ seq: number;
38
+ kind: string;
39
+ mode: OutboxMode;
40
+ occurredAt: number;
41
+ idempotencyKey: string;
42
+ payload: unknown;
43
+ attempts: number;
44
+ }
45
+
46
+ // The transport. Resolve = delivered (row deleted). Reject = failed (retried with backoff).
47
+ export type OutboxSend = (record: OutboxRecord) => Promise<void>;
48
+
49
+ export interface OutboxOptions {
50
+ agentId: string;
51
+ send: OutboxSend;
52
+ // Storage directory. Defaults to AGENT_RELAY_RUNNER_OUTBOX_DIR, else a per-host temp dir.
53
+ dir?: string;
54
+ maxRows?: number;
55
+ maxAttempts?: number;
56
+ baseBackoffMs?: number;
57
+ maxBackoffMs?: number;
58
+ pollMs?: number;
59
+ }
60
+
61
+ const DEFAULTS = {
62
+ maxRows: 5000,
63
+ maxAttempts: 12,
64
+ baseBackoffMs: 1_000,
65
+ maxBackoffMs: 60_000,
66
+ pollMs: 5_000,
67
+ };
68
+
69
+ interface Row {
70
+ seq: number;
71
+ kind: string;
72
+ mode: string;
73
+ occurred_at: number;
74
+ idempotency_key: string;
75
+ payload: string;
76
+ attempts: number;
77
+ next_attempt_at: number;
78
+ poisoned: number;
79
+ }
80
+
81
+ export class Outbox {
82
+ private readonly db: Database;
83
+ private readonly agentId: string;
84
+ private readonly send: OutboxSend;
85
+ private readonly maxRows: number;
86
+ private readonly maxAttempts: number;
87
+ private readonly baseBackoffMs: number;
88
+ private readonly maxBackoffMs: number;
89
+ private readonly pollMs: number;
90
+ readonly path: string;
91
+
92
+ private draining = false;
93
+ private rerun = false;
94
+ private pollTimer?: ReturnType<typeof setInterval>;
95
+ private dueTimer?: ReturnType<typeof setTimeout>;
96
+ private stopped = false;
97
+
98
+ constructor(options: OutboxOptions) {
99
+ this.agentId = options.agentId;
100
+ this.send = options.send;
101
+ this.maxRows = options.maxRows ?? DEFAULTS.maxRows;
102
+ this.maxAttempts = options.maxAttempts ?? DEFAULTS.maxAttempts;
103
+ this.baseBackoffMs = options.baseBackoffMs ?? DEFAULTS.baseBackoffMs;
104
+ this.maxBackoffMs = options.maxBackoffMs ?? DEFAULTS.maxBackoffMs;
105
+ this.pollMs = options.pollMs ?? DEFAULTS.pollMs;
106
+
107
+ const dir = options.dir ?? process.env.AGENT_RELAY_RUNNER_OUTBOX_DIR ?? join(tmpdir(), "agent-relay-outbox");
108
+ this.path = options.dir === ":memory:" ? ":memory:" : join(dir, `outbox-${safeName(this.agentId)}.sqlite`);
109
+ if (this.path !== ":memory:") mkdirSync(dirname(this.path), { recursive: true });
110
+
111
+ this.db = new Database(this.path, { create: true });
112
+ this.db.exec("PRAGMA journal_mode = WAL");
113
+ this.db.exec("PRAGMA busy_timeout = 2000");
114
+ this.db.exec(`
115
+ CREATE TABLE IF NOT EXISTS outbox (
116
+ seq INTEGER PRIMARY KEY AUTOINCREMENT,
117
+ kind TEXT NOT NULL,
118
+ mode TEXT NOT NULL DEFAULT 'append',
119
+ dedupe_key TEXT,
120
+ occurred_at INTEGER NOT NULL,
121
+ idempotency_key TEXT NOT NULL,
122
+ payload TEXT NOT NULL,
123
+ attempts INTEGER NOT NULL DEFAULT 0,
124
+ next_attempt_at INTEGER NOT NULL DEFAULT 0,
125
+ poisoned INTEGER NOT NULL DEFAULT 0,
126
+ created_at INTEGER NOT NULL
127
+ )
128
+ `);
129
+ // A restart is a fresh start: clear any backoff timers left by the prior process so
130
+ // pending events get an immediate retry (the down server may now be back). `attempts`
131
+ // is kept so the poison threshold still counts cumulative failures.
132
+ this.db.exec("UPDATE outbox SET next_attempt_at = 0 WHERE next_attempt_at > 0");
133
+ }
134
+
135
+ // Persist an event. Returns the assigned seq. Triggers a drain.
136
+ enqueue(input: OutboxEventInput): number {
137
+ if (this.stopped) throw new Error("outbox is stopped");
138
+ const mode: OutboxMode = input.mode ?? "append";
139
+ const occurredAt = input.occurredAt ?? Date.now();
140
+ const payloadJson = JSON.stringify(input.payload ?? null);
141
+ const idempotencyKey = input.idempotencyKey ?? `${this.agentId}:${input.kind}:${occurredAt}:${shortHash(payloadJson)}`;
142
+
143
+ if (mode === "coalesce") {
144
+ if (!input.dedupeKey) throw new Error("coalesce mode requires a dedupeKey");
145
+ this.db.query("DELETE FROM outbox WHERE dedupe_key = ? AND poisoned = 0").run(input.dedupeKey);
146
+ }
147
+
148
+ const info = this.db
149
+ .query(`INSERT INTO outbox (kind, mode, dedupe_key, occurred_at, idempotency_key, payload, created_at)
150
+ VALUES (?, ?, ?, ?, ?, ?, ?)`)
151
+ .run(input.kind, mode, input.dedupeKey ?? null, occurredAt, idempotencyKey, payloadJson, Date.now());
152
+ const seq = Number(info.lastInsertRowid);
153
+
154
+ this.enforceBound();
155
+ // Defer the drain to a microtask so a synchronous burst of enqueues (e.g. several
156
+ // coalesce updates) all land — and coalesce — before the pump pulls the head.
157
+ queueMicrotask(() => { void this.drain(); });
158
+ return seq;
159
+ }
160
+
161
+ // Bounded ring buffer: if over capacity, drop the oldest rows (defined overflow policy).
162
+ // Logged, never silent. Prefers dropping already-poisoned rows first, then oldest by seq.
163
+ private enforceBound(): void {
164
+ const { n } = this.db.query("SELECT count(*) AS n FROM outbox").get() as { n: number };
165
+ if (n <= this.maxRows) return;
166
+ const overflow = n - this.maxRows;
167
+ // Oldest poisoned first, then oldest live — both by seq.
168
+ const victims = this.db
169
+ .query("SELECT seq FROM outbox ORDER BY poisoned DESC, seq ASC LIMIT ?")
170
+ .all(overflow) as Array<{ seq: number }>;
171
+ const ids = victims.map((v) => v.seq);
172
+ if (ids.length === 0) return;
173
+ const placeholders = ids.map(() => "?").join(",");
174
+ this.db.query(`DELETE FROM outbox WHERE seq IN (${placeholders})`).run(...ids);
175
+ logger.warn("outbox", `bound exceeded (${n}/${this.maxRows}) — dropped ${ids.length} oldest event(s)`);
176
+ }
177
+
178
+ // Begin the background pump: an initial drain plus a poll timer as a backstop.
179
+ start(): void {
180
+ if (this.pollTimer || this.stopped) return;
181
+ void this.drain();
182
+ this.pollTimer = setInterval(() => { void this.drain(); }, this.pollMs);
183
+ this.pollTimer.unref?.();
184
+ }
185
+
186
+ // Process the queue strictly oldest-first. Coalesces concurrent calls; if a drain is
187
+ // requested while one is running, it re-runs once at the end (so an enqueue during a
188
+ // send isn't missed).
189
+ async drain(): Promise<void> {
190
+ if (this.stopped) return;
191
+ if (this.draining) { this.rerun = true; return; }
192
+ this.draining = true;
193
+ try {
194
+ do {
195
+ this.rerun = false;
196
+ await this.drainOnce();
197
+ } while (this.rerun && !this.stopped);
198
+ } finally {
199
+ this.draining = false;
200
+ }
201
+ }
202
+
203
+ private async drainOnce(): Promise<void> {
204
+ for (;;) {
205
+ if (this.stopped) return;
206
+ const row = this.db
207
+ .query("SELECT * FROM outbox WHERE poisoned = 0 ORDER BY seq ASC LIMIT 1")
208
+ .get() as Row | null;
209
+ if (!row) return;
210
+
211
+ const now = Date.now();
212
+ if (row.next_attempt_at > now) {
213
+ // Head isn't due yet. Don't reorder past it (FIFO) — schedule a wake-up and stop.
214
+ this.scheduleDue(row.next_attempt_at - now);
215
+ return;
216
+ }
217
+
218
+ const record: OutboxRecord = {
219
+ seq: row.seq,
220
+ kind: row.kind,
221
+ mode: row.mode as OutboxMode,
222
+ occurredAt: row.occurred_at,
223
+ idempotencyKey: row.idempotency_key,
224
+ payload: safeParse(row.payload),
225
+ attempts: row.attempts,
226
+ };
227
+
228
+ try {
229
+ await this.send(record);
230
+ this.db.query("DELETE FROM outbox WHERE seq = ?").run(row.seq);
231
+ } catch (error) {
232
+ const attempts = row.attempts + 1;
233
+ const reason = error instanceof Error ? error.message : String(error);
234
+ if (attempts >= this.maxAttempts) {
235
+ this.db.query("UPDATE outbox SET attempts = ?, poisoned = 1 WHERE seq = ?").run(attempts, row.seq);
236
+ logger.fatal("outbox", `event seq=${row.seq} kind=${row.kind} poisoned after ${attempts} attempts: ${reason}`);
237
+ // Move on — the next iteration picks the new head (poison no longer blocks).
238
+ continue;
239
+ }
240
+ const delay = this.backoff(attempts);
241
+ this.db.query("UPDATE outbox SET attempts = ?, next_attempt_at = ? WHERE seq = ?").run(attempts, now + delay, row.seq);
242
+ logger.debug("outbox", `event seq=${row.seq} kind=${row.kind} retry ${attempts}/${this.maxAttempts} in ${delay}ms: ${reason}`);
243
+ this.scheduleDue(delay);
244
+ return; // head is now scheduled; stop until it's due (preserve order)
245
+ }
246
+ }
247
+ }
248
+
249
+ private backoff(attempts: number): number {
250
+ const exp = Math.min(this.maxBackoffMs, this.baseBackoffMs * 2 ** (attempts - 1));
251
+ return Math.round(exp / 2 + Math.random() * (exp / 2)); // full-ish jitter, never below half
252
+ }
253
+
254
+ private scheduleDue(delayMs: number): void {
255
+ if (this.stopped || this.dueTimer) return;
256
+ this.dueTimer = setTimeout(() => {
257
+ this.dueTimer = undefined;
258
+ void this.drain();
259
+ }, Math.max(0, delayMs));
260
+ this.dueTimer.unref?.();
261
+ }
262
+
263
+ // Observability / tests.
264
+ pendingCount(): number {
265
+ return (this.db.query("SELECT count(*) AS n FROM outbox WHERE poisoned = 0").get() as { n: number }).n;
266
+ }
267
+
268
+ poisonedCount(): number {
269
+ return (this.db.query("SELECT count(*) AS n FROM outbox WHERE poisoned = 1").get() as { n: number }).n;
270
+ }
271
+
272
+ stop(): void {
273
+ this.stopped = true;
274
+ if (this.pollTimer) clearInterval(this.pollTimer);
275
+ this.pollTimer = undefined;
276
+ if (this.dueTimer) clearTimeout(this.dueTimer);
277
+ this.dueTimer = undefined;
278
+ }
279
+
280
+ close(): void {
281
+ this.stop();
282
+ try { this.db.close(); } catch { /* already closed */ }
283
+ }
284
+ }
285
+
286
+ function safeName(value: string): string {
287
+ return value.replace(/[^a-zA-Z0-9_.-]+/g, "_").slice(0, 180) || "agent";
288
+ }
289
+
290
+ function safeParse(json: string): unknown {
291
+ try { return JSON.parse(json); } catch { return null; }
292
+ }
293
+
294
+ // Small, fast, stable string hash (FNV-1a, 32-bit) — enough to disambiguate identical
295
+ // kind+timestamp payloads in the idempotency key. Not security-sensitive.
296
+ function shortHash(value: string): string {
297
+ let h = 0x811c9dc5;
298
+ for (let i = 0; i < value.length; i++) {
299
+ h ^= value.charCodeAt(i);
300
+ h = Math.imul(h, 0x01000193);
301
+ }
302
+ return (h >>> 0).toString(36);
303
+ }
@@ -45,15 +45,29 @@ export function workspaceDepsNote(input: { mode?: string | null; depsMode?: stri
45
45
  }
46
46
  }
47
47
 
48
- /** Resolve the workspace deps caveat from the runner/monitor environment.
48
+ /**
49
+ * Caveat for untracked paths symlinked from main into an isolated worktree
50
+ * (WorkspaceConfig.symlinkPaths, e.g. AGENTS.md, .claude-rig). Edits to these
51
+ * write THROUGH to the main checkout — the agent must know so it doesn't mutate
52
+ * shared config thinking it's worktree-local. Returns "" when nothing was linked.
53
+ */
54
+ export function workspaceSymlinksNote(linked: string[]): string {
55
+ if (!linked.length) return "";
56
+ return `[agent-relay] Isolated workspace: these untracked paths are SYMLINKED from the main checkout: ${linked.join(", ")}. They resolve to the real files in main, so editing or deleting them writes THROUGH to main — treat them as read-only unless you intend to change main.`;
57
+ }
58
+
59
+ /** Resolve the workspace caveats from the runner/monitor environment.
49
60
  * AGENT_RELAY_WORKSPACE_JSON carries the resolved workspace metadata (mode +
50
- * deps) and is the authoritative source. Best-effort: never throws. */
61
+ * deps + symlinks) and is the authoritative source. Best-effort: never throws. */
51
62
  export function workspaceDepsNoteFromEnv(env: Record<string, string | undefined> = process.env): string {
52
63
  const json = env.AGENT_RELAY_WORKSPACE_JSON;
53
64
  if (!json) return "";
54
65
  try {
55
- const parsed = JSON.parse(json) as { mode?: string; deps?: { mode?: string } };
56
- return workspaceDepsNote({ mode: parsed.mode ?? null, depsMode: parsed.deps?.mode ?? null });
66
+ const parsed = JSON.parse(json) as { mode?: string; deps?: { mode?: string }; symlinks?: { linked?: string[] } };
67
+ return [
68
+ workspaceDepsNote({ mode: parsed.mode ?? null, depsMode: parsed.deps?.mode ?? null }),
69
+ parsed.mode === "isolated" ? workspaceSymlinksNote(parsed.symlinks?.linked ?? []) : "",
70
+ ].filter(Boolean).join("\n\n");
57
71
  } catch {
58
72
  return "";
59
73
  }
@@ -0,0 +1,109 @@
1
+ import type { ReplyObligation } from "agent-relay-sdk";
2
+ import { logger } from "./logger";
3
+
4
+ // Phase 2 (#196) — the crux. The Claude Stop hook used to ask the server, synchronously
5
+ // and in the hot path, "does this agent owe a reply?" before clearing the turn. A slow
6
+ // server answer (the unindexed reply_to scan, #199) blew the hook's timeout and wedged the
7
+ // agent in `busy` forever. The fix: the hook asks the Runner, the Runner answers instantly
8
+ // from this local snapshot, and the snapshot is refreshed from the server only in the
9
+ // background — never on the path that ends a turn.
10
+ //
11
+ // Design rules:
12
+ // - `get()` is synchronous, never throws, never touches the network.
13
+ // - `refresh()` is the only thing that talks to the server; it coalesces concurrent calls
14
+ // and, on failure, keeps the last-known snapshot (stale-but-serving beats blocking).
15
+ // - A background interval keeps the snapshot warm; `markDirty()` requests an extra,
16
+ // debounced refresh when state likely just changed (a message arrived, a turn ended).
17
+
/** Async loader that resolves with the agent's current reply obligations. */
export type ReplyObligationFetch = () => Promise<ReplyObligation[]>;

/** Construction options for ReplyObligationCache. */
export interface ReplyObligationCacheOptions {
  /** Loader invoked by refresh(); nothing else in the cache calls it. */
  fetch: ReplyObligationFetch;
  /** Period of the background refresh loop, in milliseconds. Defaults to 10_000. */
  intervalMs?: number;
  /**
   * Debounce window for markDirty()-triggered refreshes, in milliseconds, so a burst of
   * events (e.g. a fan-out of messages) collapses into one server round-trip.
   * Defaults to 400.
   */
  dirtyDebounceMs?: number;
}

const DEFAULT_INTERVAL_MS = 10_000;
const DEFAULT_DIRTY_DEBOUNCE_MS = 400;

/**
 * Local snapshot of reply obligations.
 *
 * - get() is synchronous and only copies the in-memory snapshot.
 * - refresh() is the only method that calls the loader; concurrent calls share one
 *   in-flight request, and a failed fetch leaves the previous snapshot in place.
 * - start() primes the snapshot and begins a periodic refresh; markDirty() requests an
 *   extra, debounced refresh.
 * - stop() is terminal: after it, start(), markDirty() and refresh() do nothing.
 */
export class ReplyObligationCache {
  private readonly fetch: ReplyObligationFetch;
  private readonly intervalMs: number;
  private readonly dirtyDebounceMs: number;

  private snapshot: ReplyObligation[] = [];
  // Date.now() of the last successful refresh; 0 until one succeeds.
  private lastRefreshedAt = 0;
  private inFlight: Promise<void> | null = null;
  // Set when a markDirty() debounce fires while a fetch is already in flight. That fetch
  // may have been issued before the change that made us dirty, so its answer cannot be
  // trusted to include it — one follow-up refresh runs as soon as it settles.
  private refreshQueued = false;
  private intervalTimer?: ReturnType<typeof setInterval>;
  private dirtyTimer?: ReturnType<typeof setTimeout>;
  private stopped = false;

  constructor(options: ReplyObligationCacheOptions) {
    this.fetch = options.fetch;
    this.intervalMs = options.intervalMs ?? DEFAULT_INTERVAL_MS;
    this.dirtyDebounceMs = options.dirtyDebounceMs ?? DEFAULT_DIRTY_DEBOUNCE_MS;
  }

  /**
   * Synchronous, hot-path-safe read.
   * @returns A shallow copy of the snapshot, so callers cannot resize or reorder the
   *   cached array (the obligation objects themselves are shared).
   */
  get(): ReplyObligation[] {
    return this.snapshot.slice();
  }

  /** @returns Date.now() at the last successful refresh, or 0 if none has succeeded. */
  getLastRefreshedAt(): number {
    return this.lastRefreshedAt;
  }

  /** Begin the background freshness loop and prime the first snapshot immediately. */
  start(): void {
    if (this.intervalTimer || this.stopped) return;
    void this.refresh();
    this.intervalTimer = setInterval(() => { void this.refresh(); }, this.intervalMs);
    // Don't keep the process alive solely for cache refreshes.
    this.intervalTimer.unref?.();
  }

  /** Cancel all timers and pending follow-ups; later refreshes become no-ops. */
  stop(): void {
    this.stopped = true;
    this.refreshQueued = false;
    if (this.intervalTimer) clearInterval(this.intervalTimer);
    this.intervalTimer = undefined;
    if (this.dirtyTimer) clearTimeout(this.dirtyTimer);
    this.dirtyTimer = undefined;
  }

  /**
   * Request a refresh because state likely changed (message arrived / turn ended).
   * Debounced so a burst collapses into a single server round-trip. If a fetch is
   * already running when the debounce fires, a follow-up refresh is queued instead of
   * being merged into that (possibly older) request.
   */
  markDirty(): void {
    if (this.stopped || this.dirtyTimer) return;
    this.dirtyTimer = setTimeout(() => {
      this.dirtyTimer = undefined;
      if (this.inFlight) {
        this.refreshQueued = true;
        return;
      }
      void this.refresh();
    }, this.dirtyDebounceMs);
    this.dirtyTimer.unref?.();
  }

  /**
   * Fetch from the server and replace the snapshot. Concurrent callers share the single
   * in-flight request. A failed fetch is logged at debug level and leaves the prior
   * snapshot in place.
   * @returns A promise that settles when the shared fetch has been applied (or skipped
   *   because the cache was stopped).
   */
  refresh(): Promise<void> {
    if (this.stopped) return Promise.resolve();
    if (this.inFlight) return this.inFlight;
    const run = this.doRefresh().finally(() => {
      this.inFlight = null;
      if (!this.refreshQueued) return;
      // A dirty mark landed while this fetch was running — fetch once more so the
      // snapshot reflects the change now rather than at the next interval tick.
      this.refreshQueued = false;
      void this.refresh();
    });
    this.inFlight = run;
    return run;
  }

  // One fetch attempt: apply the result unless the cache was stopped meanwhile.
  private async doRefresh(): Promise<void> {
    try {
      const obligations = await this.fetch();
      if (this.stopped) return;
      // A non-array answer is treated as "no obligations".
      this.snapshot = Array.isArray(obligations) ? obligations : [];
      this.lastRefreshedAt = Date.now();
    } catch (error) {
      // Server-down is a non-event: keep serving the last snapshot. Debug, not error —
      // this is expected during outages and must not spam the log.
      logger.debug("obligation-cache", `refresh failed, serving cached snapshot (${this.snapshot.length}): ${error instanceof Error ? error.message : String(error)}`);
    }
  }
}
package/src/runner.ts CHANGED
@@ -1,18 +1,21 @@
1
1
  import { hostname } from "node:os";
2
- import { appendFileSync, closeSync, mkdirSync, openSync, readSync, statSync, writeFileSync } from "node:fs";
2
+ import { closeSync, mkdirSync, openSync, readSync, statSync, writeFileSync } from "node:fs";
3
3
  import { readFile } from "node:fs/promises";
4
4
  import { dirname, join } from "node:path";
5
- import type { AgentProfile, ContextState, Message, MessageSessionMeta, ProviderCapabilities, TaskStatusInput, WorkspaceMetadata } from "agent-relay-sdk";
5
+ import type { AgentProfile, ContextState, Message, MessageSessionMeta, ProviderCapabilities, SendMessageInput, TaskStatusInput, WorkspaceMetadata } from "agent-relay-sdk";
6
6
  import { RelayBusClient, RelayHttpClient } from "agent-relay-sdk";
7
7
  import { contextStateFromProbeMetrics, readContextProbeState } from "agent-relay-sdk/context-probe";
8
8
  import type { ManagedProcess, ProviderAdapter, ProviderConfig, ProviderPermissionDecision, ProviderPermissionDecisionInput, ProviderSessionEvent, ProviderStatusUpdate, RunnerSpawnConfig, SemanticStatus, TerminalAttachSpec } from "./adapter";
9
9
  import { messagesWithCachedAttachments } from "./attachment-cache";
10
10
  import { ClaimTracker } from "./claim-tracker";
11
11
  import { startControlServer, type ControlServer } from "./control-server";
12
+ import { ReplyObligationCache } from "./reply-obligation-cache";
13
+ import { Outbox, type OutboxRecord } from "./outbox";
12
14
  import { extractLastAssistantTurn, extractFinalAssistantMessage, extractHookAssistantMessage, extractLatestTurnSteps, transcriptLooksComplete, analyzeSession } from "./adapters/claude-transcript";
13
15
  import { agentProfileProjectionReport } from "./profile-projection";
14
16
  import { profileUsesHostProviderGlobals } from "./profile-home";
15
17
  import { runtimeMetadata } from "./version";
18
+ import { logger, parseLogLevel } from "./logger";
16
19
  import { ensureSessionScratch, reapSessionScratch, sweepStaleSessions, type SessionScratchLayout } from "./session-scratch";
17
20
 
18
21
  interface RunnerOptions {
@@ -115,6 +118,13 @@ export class AgentRunner {
115
118
  private readonly claims = new ClaimTracker();
116
119
  private readonly http: RelayHttpClient;
117
120
  private readonly bus: RelayBusClient;
121
+ // Phase 2 (#196): the Stop hook reads reply obligations from this local snapshot, never
122
+ // from the server — so a slow server can no longer wedge a turn (the crux fix).
123
+ private readonly obligationCache: ReplyObligationCache;
124
+ // Phase 2 (#196): Runner→server append-log events (session turns, reasoning, prompts,
125
+ // insights, hook-fatal) go through this durable, disk-backed, timestamped queue instead of
126
+ // direct fire-and-forget HTTP — so nothing is lost across a server/Runner restart.
127
+ private readonly outbox: Outbox;
118
128
  private currentToken?: string;
119
129
  private currentTokenJti?: string;
120
130
  private currentTokenProfileId?: string;
@@ -177,12 +187,26 @@ export class AgentRunner {
177
187
 
178
188
  constructor(private readonly options: RunnerOptions) {
179
189
  this.agentId = options.agentId ?? options.runnerId;
190
+ // Bind the process-global logger to this agent. AGENT_RELAY_SESSION_DEBUG=1 is
191
+ // kept as a back-compat alias for the verbose probe/emit lines, now expressed
192
+ // as log level "debug" (AGENT_RELAY_LOG_LEVEL still wins when both are set).
193
+ logger.configure({
194
+ agentId: this.agentId,
195
+ headless: options.headless,
196
+ ...(this.sessionDebugVerbose && !parseLogLevel(process.env.AGENT_RELAY_LOG_LEVEL) ? { level: "debug" as const } : {}),
197
+ });
180
198
  this.currentToken = options.token;
181
199
  this.currentTokenJti = options.tokenJti;
182
200
  this.currentTokenProfileId = options.tokenProfileId;
183
201
  this.currentTokenExpiresAt = options.tokenExpiresAt;
184
202
  const runtime = runtimeMetadata(options.provider);
185
203
  this.http = new RelayHttpClient({ baseUrl: options.relayUrl, token: this.currentToken });
204
+ this.obligationCache = new ReplyObligationCache({ fetch: () => this.http.listReplyObligations(this.agentId) });
205
+ // Co-locate the durable outbox with the runner's runtime state (survives reboot) when the
206
+ // orchestrator told us where that is; otherwise the Outbox falls back to a temp dir.
207
+ const outboxDir = process.env.AGENT_RELAY_RUNNER_OUTBOX_DIR
208
+ ?? (process.env.AGENT_RELAY_RUNNER_INFO_FILE ? join(dirname(process.env.AGENT_RELAY_RUNNER_INFO_FILE), "outbox") : undefined);
209
+ this.outbox = new Outbox({ agentId: this.agentId, dir: outboxDir, send: (record) => this.deliverOutboxEvent(record) });
186
210
  this.bus = new RelayBusClient({
187
211
  url: relayBusUrl(options.relayUrl),
188
212
  role: "provider",
@@ -251,10 +275,13 @@ export class AgentRunner {
251
275
  this.control = startControlServer({
252
276
  onStatus: (status) => this.setProviderStatus(status),
253
277
  onTerminalAttachSpec: () => this.terminalAttachSpec(),
254
- onReplyObligations: () => this.http.listReplyObligations(this.agentId),
278
+ // Hot-path-safe: answered instantly from the local snapshot, never a server
279
+ // round-trip. The snapshot is kept warm by the background refresh below (#196).
280
+ onReplyObligations: () => Promise.resolve(this.obligationCache.get()),
255
281
  onSessionTurn: (input) => this.publishSessionTurn(input),
256
282
  onUserPrompt: (input) => this.handleUserPrompt(input),
257
283
  onSessionEnd: (input) => this.handleSessionEnd(input),
284
+ onHookFatal: (report) => this.reportHookFatal(report),
258
285
  });
259
286
  this.writeRunnerInfoFile();
260
287
  this.options.adapter.onStatusChange((status) => {
@@ -268,12 +295,19 @@ export class AgentRunner {
268
295
  if (runnerShouldResolveProviderExit(semanticStatus, this.exitCommandInProgress)) this.options.onProviderExit?.(semanticStatus === "offline" ? 0 : 1);
269
296
  });
270
297
  this.options.adapter.onSessionEvent?.((event) => { void this.publishProviderSessionEvent(event); });
271
- this.bus.on("message.new", (message) => this.enqueueMessage(message as Message));
298
+ this.bus.on("message.new", (message) => {
299
+ // A delivered message may create a new reply obligation — warm the snapshot so the
300
+ // next turn-end sees it without a hot-path server read.
301
+ this.obligationCache.markDirty();
302
+ this.enqueueMessage(message as Message);
303
+ });
272
304
  this.bus.on("command", (type, params, commandId, command) => {
273
305
  void this.handleCommand(type, params, commandId, command);
274
306
  });
275
307
  this.bus.on("error", (code, message) => this.handleBusError(String(code), String(message)));
276
308
  await this.bus.connect();
309
+ this.obligationCache.start();
310
+ this.outbox.start();
277
311
  this.ensureScratch();
278
312
  void this.sweepStaleScratch();
279
313
  this.process = await this.spawnProvider();
@@ -313,6 +347,8 @@ export class AgentRunner {
313
347
  this.tokenRenewTimer = undefined;
314
348
  this.disarmBusyReconciler();
315
349
  this.stopReasoningTail();
350
+ this.obligationCache.stop();
351
+ this.outbox.close();
316
352
  this.control?.stop();
317
353
  await this.bus.close();
318
354
  }
@@ -387,7 +423,7 @@ export class AgentRunner {
387
423
  startedAt: this.options.startedAt,
388
424
  }, null, 2) + "\n", { mode: 0o600 });
389
425
  } catch (error) {
390
- console.error(`[runner] failed to write runner info file: ${error}`);
426
+ logger.error("runner", `failed to write runner info file: ${error}`);
391
427
  }
392
428
  }
393
429
 
@@ -403,7 +439,7 @@ export class AgentRunner {
403
439
  const messages = await this.http.pollMessages({ for: this.agentId, unread: true, limit: 100 });
404
440
  for (const message of messages) this.enqueueMessage(message);
405
441
  } catch (error) {
406
- console.error(`[runner] inbox bootstrap failed: ${error}`);
442
+ logger.error("runner", `inbox bootstrap failed: ${error}`);
407
443
  }
408
444
  }
409
445
 
@@ -413,7 +449,7 @@ export class AgentRunner {
413
449
  try {
414
450
  await this.options.adapter.deliverInitialPrompt(this.process, prompt);
415
451
  } catch (error) {
416
- console.error(`[runner] initial prompt delivery failed: ${error}`);
452
+ logger.error("runner", `initial prompt delivery failed: ${error}`);
417
453
  }
418
454
  }
419
455
 
@@ -450,7 +486,7 @@ export class AgentRunner {
450
486
  status: "in_progress",
451
487
  agentId: this.agentId,
452
488
  metadata: { messageId: message.id, completedBy: "runner" },
453
- }).catch((error) => console.error(`[runner] task ${taskId} in_progress update failed: ${error}`));
489
+ }).catch((error) => logger.error("task", `task ${taskId} in_progress update failed: ${error}`));
454
490
  // Runner owns claim + status here; drop the server's self-claim instruction
455
491
  // so the agent doesn't improvise a stray claim send (see stripRunnerClaimedGuidance).
456
492
  toDeliver = { ...message, body: stripRunnerClaimedGuidance(message.body) };
@@ -468,7 +504,7 @@ export class AgentRunner {
468
504
  try {
469
505
  const prepared = await messagesWithCachedAttachments(deliverable, this.http, {
470
506
  agentId: this.agentId,
471
- onError: (message) => console.error(`[runner] ${message}`),
507
+ onError: (message) => logger.error("runner", message),
472
508
  });
473
509
  await this.options.adapter.deliver(this.process, prepared);
474
510
  for (const message of deliverable) {
@@ -477,7 +513,7 @@ export class AgentRunner {
477
513
  }
478
514
  } catch (error) {
479
515
  failed = true;
480
- if (shouldLogDeliveryFailure(error)) console.error(`[runner] message delivery failed: ${error}`);
516
+ if (shouldLogDeliveryFailure(error)) logger.warn("delivery", `message delivery failed: ${error}`);
481
517
  for (const message of deliverable) {
482
518
  this.clearActiveClaim(message);
483
519
  this.pendingMessages.set(message.id, message);
@@ -545,7 +581,7 @@ export class AgentRunner {
545
581
  await this.http.deleteAgent(this.agentId).catch(() => {});
546
582
  if (this.options.exitProcessOnShutdown !== false) {
547
583
  setTimeout(() => void this.stop().catch((error) => {
548
- console.error(`[runner] stop after command failed: ${error}`);
584
+ logger.error("lifecycle", `stop after command failed: ${error}`);
549
585
  }).finally(() => process.exit(0)), 10);
550
586
  }
551
587
  } else if (!this.stopped) {
@@ -680,7 +716,7 @@ export class AgentRunner {
680
716
 
681
717
  if (this.shouldStopUnexpectedProviderExit(diagnostics)) {
682
718
  const hasResumeId = typeof diagnostics.claudeResumeId === "string" && diagnostics.claudeResumeId.length > 0;
683
- console.warn(`[runner] ${this.options.provider} exited; leaving agent offline for manual recovery`);
719
+ logger.warn("lifecycle", `${this.options.provider} exited; leaving agent offline for manual recovery`);
684
720
  this.publishRunnerTimelineEvent({
685
721
  status: "provider.restart_decision",
686
722
  id: `provider-restart-decision-${this.providerSessionId}-${now}`,
@@ -708,7 +744,7 @@ export class AgentRunner {
708
744
  }
709
745
 
710
746
  if (runtimeMs < RAPID_EXIT_MS && recent.length > MAX_RAPID_UNEXPECTED_EXITS) {
711
- console.error(`[runner] provider session exited ${recent.length} times within ${Math.round(UNEXPECTED_EXIT_WINDOW_MS / 1000)}s; giving up`);
747
+ logger.error("lifecycle", `provider session exited ${recent.length} times within ${Math.round(UNEXPECTED_EXIT_WINDOW_MS / 1000)}s; giving up`);
712
748
  this.publishRunnerTimelineEvent({
713
749
  status: "provider.restart_decision",
714
750
  id: `provider-restart-decision-${this.providerSessionId}-${now}`,
@@ -732,7 +768,7 @@ export class AgentRunner {
732
768
  }
733
769
 
734
770
  const delayMs = Math.min(10_000, Math.max(500, 500 * recent.length));
735
- console.warn(`[runner] provider session exited unexpectedly after ${Math.round(runtimeMs / 1000)}s; restarting in ${delayMs}ms`);
771
+ logger.warn("lifecycle", `provider session exited unexpectedly after ${Math.round(runtimeMs / 1000)}s; restarting in ${delayMs}ms`);
736
772
  this.publishRunnerTimelineEvent({
737
773
  status: "provider.restart_decision",
738
774
  id: `provider-restart-decision-${this.providerSessionId}-${now}`,
@@ -757,7 +793,7 @@ export class AgentRunner {
757
793
  this.publishStatus();
758
794
  this.scheduleDrain();
759
795
  } catch (error) {
760
- console.error(`[runner] provider restart after unexpected exit failed: ${error}`);
796
+ logger.error("lifecycle", `provider restart after unexpected exit failed: ${error}`);
761
797
  this.setProviderStatus("error");
762
798
  this.options.onProviderExit?.(1);
763
799
  }
@@ -832,10 +868,10 @@ export class AgentRunner {
832
868
  private handleBusError(code: string, message: string): void {
833
869
  const action = runnerBusErrorAction(code, this.stopped);
834
870
  if (action === "ignore") return;
835
- console.error(`[runner] bus error ${code}: ${message}`);
871
+ logger.error("bus", `bus error ${code}: ${message}`);
836
872
  if (action === "stop") {
837
873
  void this.stop().catch((error) => {
838
- console.error(`[runner] stop after bus error failed: ${error}`);
874
+ logger.error("bus", `stop after bus error failed: ${error}`);
839
875
  }).finally(() => process.exit(0));
840
876
  }
841
877
  }
@@ -918,13 +954,10 @@ export class AgentRunner {
918
954
  replyToMessageId = pendingPrompt;
919
955
  this.pendingPromptMessageId = undefined;
920
956
  } else {
921
- try {
922
- const obligations = await this.http.listReplyObligations(this.agentId);
923
- const obligation = [...obligations].reverse().find((o) => o.from === "user");
924
- replyToMessageId = obligation?.messageId;
925
- } catch {
926
- // fall through and capture without correlation
927
- }
957
+ // Correlation-only (threading + obligation clearing) — the local snapshot is fresh
958
+ // enough and never blocks the response-capture path (#196).
959
+ const obligation = [...this.obligationCache.get()].reverse().find((o) => o.from === "user");
960
+ replyToMessageId = obligation?.messageId;
928
961
  }
929
962
 
930
963
  // The Stop hook can fire before the final assistant entry is flushed to disk.
@@ -966,31 +999,86 @@ export class AgentRunner {
966
999
  ...(replyToMessageId ? { replyTo: replyToMessageId } : {}),
967
1000
  session: { type: "response", origin: "provider", ...(turnId ? { turnId } : {}) },
968
1001
  });
1002
+ // The agent's reply may have cleared an obligation — refresh the snapshot so the next
1003
+ // turn-end doesn't re-prompt for a message already answered (#196).
1004
+ if (replyToMessageId) this.obligationCache.markDirty();
969
1005
  }
970
1006
 
971
1007
  // Post one session-mirror event (prompt echo, assistant response, reasoning or
972
1008
  // tool step) as a `kind: "session"` relay message tagged with payload.session so
973
1009
  // the dashboard can render the live provider session faithfully. Display-only:
974
1010
  // session messages are never delivered back into a provider.
975
- private async publishSessionEvent(input: {
1011
+ private publishSessionEvent(input: {
976
1012
  from: string;
977
1013
  to: string;
978
1014
  body: string;
979
1015
  session: MessageSessionMeta;
980
1016
  replyTo?: number;
981
- }): Promise<void> {
982
- try {
983
- await this.http.sendMessage({
1017
+ }): void {
1018
+ // Durable, ordered, timestamped (#196): the actual POST happens in deliverOutboxEvent,
1019
+ // retried until it lands. occurredAt is stamped now so a queued event reports when it
1020
+ // truly happened, not when the server finally accepted it.
1021
+ this.outbox.enqueue({
1022
+ kind: "session-message",
1023
+ payload: {
984
1024
  from: input.from,
985
1025
  to: input.to,
986
1026
  ...(input.replyTo ? { replyTo: input.replyTo } : {}),
987
1027
  kind: "session",
988
1028
  body: input.body,
989
1029
  payload: { session: { provider: this.options.provider, ...input.session } },
1030
+ } satisfies SendMessageInput,
1031
+ });
1032
+ }
1033
+
// Outbox transport callback: turns one queued record into its HTTP call. Resolving
// acks the record (the outbox deletes it); rejecting hands it back for a retry. Every
// call carries the record's occurredAt, and session messages also carry its
// idempotencyKey, so a retried event keeps its original event time and key.
private async deliverOutboxEvent(record: OutboxRecord): Promise<void> {
  const { kind, occurredAt } = record;
  try {
    switch (kind) {
      case "session-message": {
        const message = record.payload as SendMessageInput;
        await this.http.sendMessage({ ...message, occurredAt, idempotencyKey: record.idempotencyKey });
        return;
      }
      case "insight": {
        const observation = record.payload as Parameters<RelayHttpClient["recordInsightObservation"]>[0];
        await this.http.recordInsightObservation({ ...observation, occurredAt });
        return;
      }
      default:
        // Nothing knows how to send this kind; returning acks it so it is not retried.
        logger.warn("outbox", `dropping event with unknown kind: ${record.kind}`);
        return;
    }
  } catch (error) {
    // 409 means the server deliberately refused the event (e.g. Insights/feature toggled
    // off) — a permanent answer, so ack instead of retrying.
    const refusedByServer = isHttpStatusError(error, 409);
    if (refusedByServer) return;
    // On 401/403 kick off token recovery first, then still rethrow so the outbox retries.
    if (isHttpAuthError(error)) this.recoverRuntimeTokenAfterAuthFailure("outbox");
    throw error;
  }
}
1063
+
1064
+ // A hook reported an unhandled failure (#198 seam). Already logged FATAL by the control
1065
+ // server; here we additionally surface it durably to the server as a generic insight so
1066
+ // it shows up in observability rather than only in the per-agent log (#196).
1067
+ private reportHookFatal(report: { hook: string; error: string }): void {
1068
+ try {
1069
+ this.outbox.enqueue({
1070
+ kind: "insight",
1071
+ payload: {
1072
+ sessionId: this.providerSessionId,
1073
+ project: this.options.cwd,
1074
+ agentId: this.agentId,
1075
+ signal: "hook_fatal",
1076
+ value: { hook: report.hook, error: report.error },
1077
+ source: "server",
1078
+ },
990
1079
  });
991
1080
  } catch (error) {
992
- this.logRunnerDiagnostic(`session ${input.session.type} capture failed: ${error instanceof Error ? error.message : String(error)}`);
993
- if (isHttpAuthError(error)) this.recoverRuntimeTokenAfterAuthFailure("session-capture");
1081
+ logger.error("outbox", `failed to queue hook-fatal report: ${error instanceof Error ? error.message : String(error)}`);
994
1082
  }
995
1083
  }
996
1084
 
@@ -1034,8 +1122,11 @@ export class AgentRunner {
1034
1122
  }
1035
1123
  const analysis = analyzeSession(jsonl);
1036
1124
  if (!analysis) return; // no tool calls = nothing substantive to measure
1037
- try {
1038
- await this.http.recordInsightObservation({
1125
+ // Durable + non-blocking (#196): queue it. SessionEnd can race provider shutdown, so a
1126
+ // direct POST risked being dropped if the server hiccuped; the outbox survives that.
1127
+ this.outbox.enqueue({
1128
+ kind: "insight",
1129
+ payload: {
1039
1130
  sessionId: this.providerSessionId,
1040
1131
  project: this.options.cwd,
1041
1132
  agentId: this.agentId,
@@ -1043,13 +1134,9 @@ export class AgentRunner {
1043
1134
  value: { ...analysis.metric, ...(input.reason ? { endReason: input.reason } : {}) },
1044
1135
  outcome: { ...analysis.outcome },
1045
1136
  source: "server",
1046
- });
1047
- this.sessionLog(`insights: context_ratio ${analysis.metric.ratio.toFixed(2)} (${analysis.metric.gatheringCalls}/${analysis.metric.totalToolCalls} gathering)`);
1048
- } catch (error) {
1049
- // 409 = Insights/feature toggled off; anything else is best-effort too.
1050
- this.sessionDebug(`insights context_ratio skipped: ${error instanceof Error ? error.message : String(error)}`);
1051
- if (isHttpAuthError(error)) this.recoverRuntimeTokenAfterAuthFailure("insights");
1052
- }
1137
+ },
1138
+ });
1139
+ this.sessionLog(`insights: context_ratio ${analysis.metric.ratio.toFixed(2)} (${analysis.metric.gatheringCalls}/${analysis.metric.totalToolCalls} gathering) queued`);
1053
1140
  }
1054
1141
 
1055
1142
  // Route a provider-emitted session event (Codex app-server) into the chat mirror.
@@ -1078,13 +1165,9 @@ export class AgentRunner {
1078
1165
  if (pendingPrompt) {
1079
1166
  replyToMessageId = pendingPrompt;
1080
1167
  this.pendingPromptMessageId = undefined;
1081
- } else {
1082
- try {
1083
- const obligations = await this.http.listReplyObligations(this.agentId);
1084
- if (obligations.some((o) => o.from === "user")) return;
1085
- } catch {
1086
- // capture anyway on lookup failure
1087
- }
1168
+ } else if (this.obligationCache.get().some((o) => o.from === "user")) {
1169
+ // The agent will answer the relay obligation itself — don't double-post (#196).
1170
+ return;
1088
1171
  }
1089
1172
  await this.publishSessionEvent({
1090
1173
  from: this.agentId,
@@ -1363,36 +1446,24 @@ export class AgentRunner {
1363
1446
  this.logRunnerDiagnostic(`[runner] HTTP liveness update failed: ${suffix}`);
1364
1447
  }
1365
1448
 
1449
+ // Runner operational diagnostics (HTTP liveness, token renewal failures). Routed
1450
+ // through the leveled logger at warn — see logger.ts. Kept as a thin wrapper so
1451
+ // the existing call sites and their `[runner]` framing stay put.
1366
1452
  private logRunnerDiagnostic(message: string): void {
1367
- if (this.options.headless) {
1368
- console.error(message);
1369
- return;
1370
- }
1371
- try {
1372
- const logDir = join(process.env.HOME || ".", ".agent-relay", "logs");
1373
- mkdirSync(logDir, { recursive: true });
1374
- appendFileSync(join(logDir, `runner-${safeLogName(this.agentId)}.log`), `[${new Date().toISOString()}] ${message}\n`);
1375
- } catch {
1376
- // Do not write runner diagnostics into an interactive provider TUI.
1377
- }
1453
+ logger.warn("runner", message.replace(/^\[runner\]\s*/, ""));
1378
1454
  }
1379
1455
 
1380
- // Session-mirror diagnostics → a dedicated, ANSI-free, greppable log per agent
1381
- // (NOT the provider's TUI stdout, which is unreadable). This is the single place
1382
- // to look when chat/terminal sync misbehaves. Key transitions always log here.
1456
+ // Session-mirror diagnostics → the leveled logger (component "mirror"), written
1457
+ // to the dashboard-surfaced session-mirror-<agent>.log. Key transitions log at
1458
+ // info; the single place to look when chat/terminal sync misbehaves.
1383
1459
  private sessionLog(message: string): void {
1384
- try {
1385
- const logDir = join(process.env.HOME || ".", ".agent-relay", "logs");
1386
- mkdirSync(logDir, { recursive: true });
1387
- appendFileSync(join(logDir, `session-mirror-${safeLogName(this.agentId)}.log`), `[${new Date().toISOString()}] ${message}\n`);
1388
- } catch {
1389
- // best-effort
1390
- }
1460
+ logger.info("mirror", message);
1391
1461
  }
1392
1462
 
1393
- // Verbose, high-frequency lines (per-probe, per-emit) — only when AGENT_RELAY_SESSION_DEBUG=1.
1463
+ // Verbose, high-frequency lines (per-probe, per-emit) — surfaced only at log
1464
+ // level "debug" (AGENT_RELAY_LOG_LEVEL=debug, or flip live via /log-level).
1394
1465
  private sessionDebug(message: string): void {
1395
- if (this.sessionDebugVerbose) this.sessionLog(message);
1466
+ logger.debug("mirror", message);
1396
1467
  }
1397
1468
 
1398
1469
  private ensureScratch(): void {
@@ -1657,7 +1728,7 @@ export class AgentRunner {
1657
1728
  })
1658
1729
  .then(() => true)
1659
1730
  .catch((error) => {
1660
- console.error(`[runner] task ${claim.taskId} completion update failed: ${error}`);
1731
+ logger.error("task", `task ${claim.taskId} completion update failed: ${error}`);
1661
1732
  return false;
1662
1733
  });
1663
1734
  if (!ok) continue;
@@ -1956,16 +2027,17 @@ function isHttpAuthError(error: unknown): boolean {
1956
2027
  return status === 401 || status === 403;
1957
2028
  }
1958
2029
 
/** True when `error` is a non-null object whose `status` property strictly equals `code`. */
function isHttpStatusError(error: unknown, code: number): boolean {
  const carrier = error !== null && typeof error === "object" ? (error as { status?: unknown }) : undefined;
  return carrier?.status === code;
}
2034
+
1959
2035
  function httpErrorKey(error: unknown): string {
1960
2036
  const status = typeof error === "object" && error !== null ? (error as { status?: unknown }).status : undefined;
1961
2037
  if (typeof status === "number") return `status:${status}`;
1962
2038
  return String(error);
1963
2039
  }
1964
2040
 
1965
- function safeLogName(value: string): string {
1966
- return value.replace(/[^a-zA-Z0-9_.-]+/g, "_").slice(0, 180);
1967
- }
1968
-
1969
2041
  function isContextState(value: unknown): value is ContextState {
1970
2042
  if (!value || typeof value !== "object" || Array.isArray(value)) return false;
1971
2043
  const state = value as Record<string, unknown>;